mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-27 08:21:57 -07:00
* Minor documentation updates, plus calenviroscreen S3 URL fix * Update score comparison docs and code * Add steps for running the comparison tool * Update HUD recap ETL to ensure GEOID is imported as a string (if it is imported as an integer by default it will strip the beginning "0" from many IDs) * Add note about execution time * Move step from paragraph to list * Update output dir in README for comp tool Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
2653 lines
110 KiB
Text
2653 lines
110 KiB
Text
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "93c7b73b",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import collections\n",
|
||
"import functools\n",
|
||
"import IPython\n",
|
||
"import itertools\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"import pandas as pd\n",
|
||
"import pathlib\n",
|
||
"import pypandoc\n",
|
||
"import requests\n",
|
||
"import string\n",
|
||
"import sys\n",
|
||
"import typing\n",
|
||
"import us\n",
|
||
"import zipfile\n",
|
||
"\n",
|
||
"from datetime import datetime\n",
|
||
"from tqdm.notebook import tqdm_notebook\n",
|
||
"\n",
|
||
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
|
||
"if module_path not in sys.path:\n",
|
||
" sys.path.append(module_path)\n",
|
||
"\n",
|
||
"from utils import remove_all_from_dir, get_excel_column_name\n",
|
||
"from etl.sources.census.etl_utils import get_state_information\n",
|
||
"\n",
|
||
"\n",
|
||
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
||
"tqdm_notebook.pandas()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "881424fd",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
|
||
"pd.options.display.float_format = \"{:.2f}\".format\n",
|
||
"\n",
|
||
"# Set some global parameters\n",
|
||
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
|
||
"TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n",
|
||
"COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n",
|
||
"\n",
|
||
"# Make the dirs if they don't exist\n",
|
||
"TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||
"COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||
"\n",
|
||
"CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n",
|
||
"\n",
|
||
"# Name fields using variables. (This makes it easy to reference the same fields frequently without using strings\n",
|
||
"# and introducing the risk of misspelling the field name.)\n",
|
||
"\n",
|
||
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
||
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
|
||
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
|
||
"COUNTRY_FIELD_NAME = \"Country\"\n",
|
||
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n",
|
||
"\n",
|
||
"CEJST_SCORE_FIELD = \"cejst_score\"\n",
|
||
"CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
|
||
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
|
||
"\n",
|
||
"# Define some suffixes\n",
|
||
"POPULATION_SUFFIX = \" (priority population)\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "c5f3eaa5",
|
||
"metadata": {
|
||
"scrolled": false
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3169: DtypeWarning: Columns (87,88,90) have mixed types.Specify dtype option on import or set low_memory=False.\n",
|
||
" has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10</th>\n",
|
||
" <th>Housing burden (percent)</th>\n",
|
||
" <th>Total population</th>\n",
|
||
" <th>Air toxics cancer risk</th>\n",
|
||
" <th>Respiratory hazard index</th>\n",
|
||
" <th>Diesel particulate matter</th>\n",
|
||
" <th>Particulate matter (PM2.5)</th>\n",
|
||
" <th>Ozone</th>\n",
|
||
" <th>Traffic proximity and volume</th>\n",
|
||
" <th>Proximity to RMP sites</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Score D (top 25th percentile)</th>\n",
|
||
" <th>Score E (percentile)</th>\n",
|
||
" <th>Score E (top 25th percentile)</th>\n",
|
||
" <th>GEOID</th>\n",
|
||
" <th>State Abbreviation</th>\n",
|
||
" <th>County Name</th>\n",
|
||
" <th>State Code</th>\n",
|
||
" <th>State Name</th>\n",
|
||
" <th>GEOID10_TRACT</th>\n",
|
||
" <th>GEOID10_STATE</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>010010201001</td>\n",
|
||
" <td>0.15</td>\n",
|
||
" <td>692</td>\n",
|
||
" <td>49.38</td>\n",
|
||
" <td>0.79</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>10.00</td>\n",
|
||
" <td>40.12</td>\n",
|
||
" <td>91.02</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.35</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Autauga County</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>Alabama</td>\n",
|
||
" <td>01001020100</td>\n",
|
||
" <td>01</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>010010201002</td>\n",
|
||
" <td>0.15</td>\n",
|
||
" <td>1153</td>\n",
|
||
" <td>49.38</td>\n",
|
||
" <td>0.79</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>10.00</td>\n",
|
||
" <td>40.12</td>\n",
|
||
" <td>2.62</td>\n",
|
||
" <td>0.07</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.11</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Baldwin County</td>\n",
|
||
" <td>2.00</td>\n",
|
||
" <td>Alaska</td>\n",
|
||
" <td>01001020100</td>\n",
|
||
" <td>01</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>010010202001</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>50.32</td>\n",
|
||
" <td>0.81</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>10.07</td>\n",
|
||
" <td>40.22</td>\n",
|
||
" <td>4.68</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.51</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Barbour County</td>\n",
|
||
" <td>4.00</td>\n",
|
||
" <td>Arizona</td>\n",
|
||
" <td>01001020200</td>\n",
|
||
" <td>01</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>010010202002</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>1152</td>\n",
|
||
" <td>50.32</td>\n",
|
||
" <td>0.81</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>10.07</td>\n",
|
||
" <td>40.22</td>\n",
|
||
" <td>218.65</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.59</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Bibb County</td>\n",
|
||
" <td>5.00</td>\n",
|
||
" <td>Arkansas</td>\n",
|
||
" <td>01001020200</td>\n",
|
||
" <td>01</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>010010203001</td>\n",
|
||
" <td>0.21</td>\n",
|
||
" <td>2555</td>\n",
|
||
" <td>50.77</td>\n",
|
||
" <td>0.82</td>\n",
|
||
" <td>0.36</td>\n",
|
||
" <td>10.12</td>\n",
|
||
" <td>40.31</td>\n",
|
||
" <td>69.64</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.47</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Blount County</td>\n",
|
||
" <td>6.00</td>\n",
|
||
" <td>California</td>\n",
|
||
" <td>01001020300</td>\n",
|
||
" <td>01</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 93 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10 Housing burden (percent) Total population \\\n",
|
||
"0 010010201001 0.15 692 \n",
|
||
"1 010010201002 0.15 1153 \n",
|
||
"2 010010202001 0.25 1020 \n",
|
||
"3 010010202002 0.25 1152 \n",
|
||
"4 010010203001 0.21 2555 \n",
|
||
"\n",
|
||
" Air toxics cancer risk Respiratory hazard index \\\n",
|
||
"0 49.38 0.79 \n",
|
||
"1 49.38 0.79 \n",
|
||
"2 50.32 0.81 \n",
|
||
"3 50.32 0.81 \n",
|
||
"4 50.77 0.82 \n",
|
||
"\n",
|
||
" Diesel particulate matter Particulate matter (PM2.5) Ozone \\\n",
|
||
"0 0.28 10.00 40.12 \n",
|
||
"1 0.28 10.00 40.12 \n",
|
||
"2 0.30 10.07 40.22 \n",
|
||
"3 0.30 10.07 40.22 \n",
|
||
"4 0.36 10.12 40.31 \n",
|
||
"\n",
|
||
" Traffic proximity and volume Proximity to RMP sites ... \\\n",
|
||
"0 91.02 0.09 ... \n",
|
||
"1 2.62 0.07 ... \n",
|
||
"2 4.68 0.08 ... \n",
|
||
"3 218.65 0.09 ... \n",
|
||
"4 69.64 0.08 ... \n",
|
||
"\n",
|
||
" Score D (top 25th percentile) Score E (percentile) \\\n",
|
||
"0 False 0.35 \n",
|
||
"1 False 0.11 \n",
|
||
"2 False 0.51 \n",
|
||
"3 False 0.59 \n",
|
||
"4 False 0.47 \n",
|
||
"\n",
|
||
" Score E (top 25th percentile) GEOID State Abbreviation County Name \\\n",
|
||
"0 False 1001 AL Autauga County \n",
|
||
"1 False 1001 AL Baldwin County \n",
|
||
"2 False 1001 AL Barbour County \n",
|
||
"3 False 1001 AL Bibb County \n",
|
||
"4 False 1001 AL Blount County \n",
|
||
"\n",
|
||
" State Code State Name GEOID10_TRACT GEOID10_STATE \n",
|
||
"0 1.00 Alabama 01001020100 01 \n",
|
||
"1 2.00 Alaska 01001020100 01 \n",
|
||
"2 4.00 Arizona 01001020200 01 \n",
|
||
"3 5.00 Arkansas 01001020200 01 \n",
|
||
"4 6.00 California 01001020300 01 \n",
|
||
"\n",
|
||
"[5 rows x 93 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Load CEJST score data\n",
|
||
"cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n",
|
||
"cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n",
|
||
"\n",
|
||
"# Create the CBG's census tract ID by dropping the last digit from the FIPS code of the CBG.\n",
|
||
"# The CBG ID is the final character of that code.\n",
|
||
"# For more information, see https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html.\n",
|
||
"cejst_df.loc[:, GEOID_TRACT_FIELD_NAME] = (\n",
|
||
" cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[:-1]\n",
|
||
")\n",
|
||
"\n",
|
||
"cejst_df.loc[:, GEOID_STATE_FIELD_NAME] = (\n",
|
||
" cejst_df.loc[:, GEOID_FIELD_NAME].astype(str).str[0:2]\n",
|
||
")\n",
|
||
"\n",
|
||
"cejst_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "a2448dcd",
|
||
"metadata": {
|
||
"scrolled": false
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10_TRACT</th>\n",
|
||
" <th>Total Population</th>\n",
|
||
" <th>California County</th>\n",
|
||
" <th>ZIP</th>\n",
|
||
" <th>Nearby City \\r\\n(to help approximate location only)</th>\n",
|
||
" <th>Longitude</th>\n",
|
||
" <th>Latitude</th>\n",
|
||
" <th>calenviroscreen_score</th>\n",
|
||
" <th>calenviroscreen_percentile</th>\n",
|
||
" <th>DRAFT CES 4.0\\r\\nPercentile Range</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Poverty</th>\n",
|
||
" <th>Poverty Pctl</th>\n",
|
||
" <th>Unemployment</th>\n",
|
||
" <th>Unemployment Pctl</th>\n",
|
||
" <th>Housing Burden</th>\n",
|
||
" <th>Housing Burden Pctl</th>\n",
|
||
" <th>Pop. Char.</th>\n",
|
||
" <th>Pop. Char. Score</th>\n",
|
||
" <th>Pop. Char. Pctl</th>\n",
|
||
" <th>calenviroscreen_priority_community</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>06019001100</td>\n",
|
||
" <td>2760</td>\n",
|
||
" <td>Fresno</td>\n",
|
||
" <td>93706</td>\n",
|
||
" <td>Fresno</td>\n",
|
||
" <td>-119.78</td>\n",
|
||
" <td>36.71</td>\n",
|
||
" <td>94.61</td>\n",
|
||
" <td>100.00</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>76.60</td>\n",
|
||
" <td>98.43</td>\n",
|
||
" <td>16.20</td>\n",
|
||
" <td>97.15</td>\n",
|
||
" <td>30.70</td>\n",
|
||
" <td>90.61</td>\n",
|
||
" <td>93.73</td>\n",
|
||
" <td>9.72</td>\n",
|
||
" <td>99.87</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>06077000700</td>\n",
|
||
" <td>4177</td>\n",
|
||
" <td>San Joaquin</td>\n",
|
||
" <td>95206</td>\n",
|
||
" <td>Stockton</td>\n",
|
||
" <td>-121.29</td>\n",
|
||
" <td>37.94</td>\n",
|
||
" <td>90.83</td>\n",
|
||
" <td>99.99</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>70.60</td>\n",
|
||
" <td>96.43</td>\n",
|
||
" <td>18.50</td>\n",
|
||
" <td>98.45</td>\n",
|
||
" <td>35.20</td>\n",
|
||
" <td>95.61</td>\n",
|
||
" <td>93.40</td>\n",
|
||
" <td>9.68</td>\n",
|
||
" <td>99.84</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>06077000100</td>\n",
|
||
" <td>4055</td>\n",
|
||
" <td>San Joaquin</td>\n",
|
||
" <td>95202</td>\n",
|
||
" <td>Stockton</td>\n",
|
||
" <td>-121.29</td>\n",
|
||
" <td>37.95</td>\n",
|
||
" <td>85.75</td>\n",
|
||
" <td>99.97</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>81.80</td>\n",
|
||
" <td>99.50</td>\n",
|
||
" <td>17.90</td>\n",
|
||
" <td>98.17</td>\n",
|
||
" <td>36.40</td>\n",
|
||
" <td>96.51</td>\n",
|
||
" <td>95.71</td>\n",
|
||
" <td>9.92</td>\n",
|
||
" <td>99.97</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>06071001600</td>\n",
|
||
" <td>5527</td>\n",
|
||
" <td>San Bernardino</td>\n",
|
||
" <td>91761</td>\n",
|
||
" <td>Ontario</td>\n",
|
||
" <td>-117.62</td>\n",
|
||
" <td>34.06</td>\n",
|
||
" <td>83.56</td>\n",
|
||
" <td>99.96</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>67.10</td>\n",
|
||
" <td>94.82</td>\n",
|
||
" <td>6.70</td>\n",
|
||
" <td>57.20</td>\n",
|
||
" <td>32.10</td>\n",
|
||
" <td>92.65</td>\n",
|
||
" <td>80.59</td>\n",
|
||
" <td>8.36</td>\n",
|
||
" <td>93.06</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>06037204920</td>\n",
|
||
" <td>2639</td>\n",
|
||
" <td>Los Angeles</td>\n",
|
||
" <td>90023</td>\n",
|
||
" <td>Los Angeles</td>\n",
|
||
" <td>-118.20</td>\n",
|
||
" <td>34.02</td>\n",
|
||
" <td>82.90</td>\n",
|
||
" <td>99.95</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>64.90</td>\n",
|
||
" <td>93.51</td>\n",
|
||
" <td>5.60</td>\n",
|
||
" <td>43.81</td>\n",
|
||
" <td>25.00</td>\n",
|
||
" <td>77.95</td>\n",
|
||
" <td>83.95</td>\n",
|
||
" <td>8.70</td>\n",
|
||
" <td>95.78</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 59 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10_TRACT Total Population California County ZIP \\\n",
|
||
"0 06019001100 2760 Fresno 93706 \n",
|
||
"1 06077000700 4177 San Joaquin 95206 \n",
|
||
"2 06077000100 4055 San Joaquin 95202 \n",
|
||
"3 06071001600 5527 San Bernardino 91761 \n",
|
||
"4 06037204920 2639 Los Angeles 90023 \n",
|
||
"\n",
|
||
" Nearby City \\r\\n(to help approximate location only) Longitude Latitude \\\n",
|
||
"0 Fresno -119.78 36.71 \n",
|
||
"1 Stockton -121.29 37.94 \n",
|
||
"2 Stockton -121.29 37.95 \n",
|
||
"3 Ontario -117.62 34.06 \n",
|
||
"4 Los Angeles -118.20 34.02 \n",
|
||
"\n",
|
||
" calenviroscreen_score calenviroscreen_percentile \\\n",
|
||
"0 94.61 100.00 \n",
|
||
"1 90.83 99.99 \n",
|
||
"2 85.75 99.97 \n",
|
||
"3 83.56 99.96 \n",
|
||
"4 82.90 99.95 \n",
|
||
"\n",
|
||
" DRAFT CES 4.0\\r\\nPercentile Range ... Poverty Poverty Pctl Unemployment \\\n",
|
||
"0 95-100% (highest scores) ... 76.60 98.43 16.20 \n",
|
||
"1 95-100% (highest scores) ... 70.60 96.43 18.50 \n",
|
||
"2 95-100% (highest scores) ... 81.80 99.50 17.90 \n",
|
||
"3 95-100% (highest scores) ... 67.10 94.82 6.70 \n",
|
||
"4 95-100% (highest scores) ... 64.90 93.51 5.60 \n",
|
||
"\n",
|
||
" Unemployment Pctl Housing Burden Housing Burden Pctl Pop. Char. \\\n",
|
||
"0 97.15 30.70 90.61 93.73 \n",
|
||
"1 98.45 35.20 95.61 93.40 \n",
|
||
"2 98.17 36.40 96.51 95.71 \n",
|
||
"3 57.20 32.10 92.65 80.59 \n",
|
||
"4 43.81 25.00 77.95 83.95 \n",
|
||
"\n",
|
||
" Pop. Char. Score Pop. Char. Pctl calenviroscreen_priority_community \n",
|
||
"0 9.72 99.87 True \n",
|
||
"1 9.68 99.84 True \n",
|
||
"2 9.92 99.97 True \n",
|
||
"3 8.36 93.06 True \n",
|
||
"4 8.70 95.78 True \n",
|
||
"\n",
|
||
"[5 rows x 59 columns]"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Load CalEnviroScreen 4.0\n",
|
||
"CALENVIROSCREEN_SCORE_FIELD = \"calenviroscreen_score\"\n",
|
||
"CALENVIROSCREEN_PERCENTILE_FIELD = \"calenviroscreen_percentile\"\n",
|
||
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD = \"calenviroscreen_priority_community\"\n",
|
||
"\n",
|
||
"calenviroscreen_data_path = DATA_DIR / \"dataset\" / \"calenviroscreen4\" / \"data06.csv\"\n",
|
||
"calenviroscreen_df = pd.read_csv(\n",
|
||
" calenviroscreen_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
|
||
")\n",
|
||
"\n",
|
||
"# Convert priority community field to a bool.\n",
|
||
"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD] = calenviroscreen_df[\n",
|
||
" CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD\n",
|
||
"].astype(bool)\n",
|
||
"\n",
|
||
"calenviroscreen_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f612a86a",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>FID</th>\n",
|
||
" <th>GEOID10_TRACT</th>\n",
|
||
" <th>STATE</th>\n",
|
||
" <th>STUSAB</th>\n",
|
||
" <th>STATE_NAME</th>\n",
|
||
" <th>COUNTY</th>\n",
|
||
" <th>COUNTY_NAME</th>\n",
|
||
" <th>CNTY_FIPS</th>\n",
|
||
" <th>TRACT</th>\n",
|
||
" <th>RCAP_90</th>\n",
|
||
" <th>RCAP_00</th>\n",
|
||
" <th>RCAP_10</th>\n",
|
||
" <th>hud_recap_priority_community</th>\n",
|
||
" <th>SHAPE_Length</th>\n",
|
||
" <th>SHAPE_Area</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>29993</td>\n",
|
||
" <td>01001020100</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Alabama</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>20100</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.15</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>30627</td>\n",
|
||
" <td>01001020200</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Alabama</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>20200</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>29992</td>\n",
|
||
" <td>01001020300</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Alabama</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>20300</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.10</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>30079</td>\n",
|
||
" <td>01001020400</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Alabama</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>20400</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.12</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>30078</td>\n",
|
||
" <td>01001020500</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>AL</td>\n",
|
||
" <td>Alabama</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001</td>\n",
|
||
" <td>20500</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.16</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" FID GEOID10_TRACT STATE STUSAB STATE_NAME COUNTY COUNTY_NAME \\\n",
|
||
"0 29993 01001020100 1 AL Alabama 1 Autauga \n",
|
||
"1 30627 01001020200 1 AL Alabama 1 Autauga \n",
|
||
"2 29992 01001020300 1 AL Alabama 1 Autauga \n",
|
||
"3 30079 01001020400 1 AL Alabama 1 Autauga \n",
|
||
"4 30078 01001020500 1 AL Alabama 1 Autauga \n",
|
||
"\n",
|
||
" CNTY_FIPS TRACT RCAP_90 RCAP_00 RCAP_10 hud_recap_priority_community \\\n",
|
||
"0 1001 20100 0.00 0.00 0.00 False \n",
|
||
"1 1001 20200 0.00 0.00 0.00 False \n",
|
||
"2 1001 20300 0.00 0.00 0.00 False \n",
|
||
"3 1001 20400 0.00 0.00 0.00 False \n",
|
||
"4 1001 20500 0.00 0.00 0.00 False \n",
|
||
"\n",
|
||
" SHAPE_Length SHAPE_Area \n",
|
||
"0 0.15 0.00 \n",
|
||
"1 0.09 0.00 \n",
|
||
"2 0.10 0.00 \n",
|
||
"3 0.12 0.00 \n",
|
||
"4 0.16 0.00 "
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Load HUD data\n",
|
||
"hud_recap_data_path = DATA_DIR / \"dataset\" / \"hud_recap\" / \"usa.csv\"\n",
|
||
"hud_recap_df = pd.read_csv(\n",
|
||
" hud_recap_data_path, dtype={GEOID_TRACT_FIELD_NAME: \"string\"}\n",
|
||
")\n",
|
||
"\n",
|
||
"hud_recap_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "4ee6e6ee",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10_TRACT</th>\n",
|
||
" <th>Total Population</th>\n",
|
||
" <th>California County</th>\n",
|
||
" <th>ZIP</th>\n",
|
||
" <th>Nearby City \\r\\n(to help approximate location only)</th>\n",
|
||
" <th>Longitude</th>\n",
|
||
" <th>Latitude</th>\n",
|
||
" <th>calenviroscreen_score</th>\n",
|
||
" <th>calenviroscreen_percentile</th>\n",
|
||
" <th>DRAFT CES 4.0\\r\\nPercentile Range</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>COUNTY</th>\n",
|
||
" <th>COUNTY_NAME</th>\n",
|
||
" <th>CNTY_FIPS</th>\n",
|
||
" <th>TRACT</th>\n",
|
||
" <th>RCAP_90</th>\n",
|
||
" <th>RCAP_00</th>\n",
|
||
" <th>RCAP_10</th>\n",
|
||
" <th>hud_recap_priority_community</th>\n",
|
||
" <th>SHAPE_Length</th>\n",
|
||
" <th>SHAPE_Area</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>06019001100</td>\n",
|
||
" <td>2760.00</td>\n",
|
||
" <td>Fresno</td>\n",
|
||
" <td>93706.00</td>\n",
|
||
" <td>Fresno</td>\n",
|
||
" <td>-119.78</td>\n",
|
||
" <td>36.71</td>\n",
|
||
" <td>94.61</td>\n",
|
||
" <td>100.00</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>19</td>\n",
|
||
" <td>Fresno</td>\n",
|
||
" <td>6019</td>\n",
|
||
" <td>1100</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>06077000700</td>\n",
|
||
" <td>4177.00</td>\n",
|
||
" <td>San Joaquin</td>\n",
|
||
" <td>95206.00</td>\n",
|
||
" <td>Stockton</td>\n",
|
||
" <td>-121.29</td>\n",
|
||
" <td>37.94</td>\n",
|
||
" <td>90.83</td>\n",
|
||
" <td>99.99</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>77</td>\n",
|
||
" <td>San Joaquin</td>\n",
|
||
" <td>6077</td>\n",
|
||
" <td>700</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0.07</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>06077000100</td>\n",
|
||
" <td>4055.00</td>\n",
|
||
" <td>San Joaquin</td>\n",
|
||
" <td>95202.00</td>\n",
|
||
" <td>Stockton</td>\n",
|
||
" <td>-121.29</td>\n",
|
||
" <td>37.95</td>\n",
|
||
" <td>85.75</td>\n",
|
||
" <td>99.97</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>77</td>\n",
|
||
" <td>San Joaquin</td>\n",
|
||
" <td>6077</td>\n",
|
||
" <td>100</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0.06</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>06071001600</td>\n",
|
||
" <td>5527.00</td>\n",
|
||
" <td>San Bernardino</td>\n",
|
||
" <td>91761.00</td>\n",
|
||
" <td>Ontario</td>\n",
|
||
" <td>-117.62</td>\n",
|
||
" <td>34.06</td>\n",
|
||
" <td>83.56</td>\n",
|
||
" <td>99.96</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>71</td>\n",
|
||
" <td>San Bernardino</td>\n",
|
||
" <td>6071</td>\n",
|
||
" <td>1600</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>06037204920</td>\n",
|
||
" <td>2639.00</td>\n",
|
||
" <td>Los Angeles</td>\n",
|
||
" <td>90023.00</td>\n",
|
||
" <td>Los Angeles</td>\n",
|
||
" <td>-118.20</td>\n",
|
||
" <td>34.02</td>\n",
|
||
" <td>82.90</td>\n",
|
||
" <td>99.95</td>\n",
|
||
" <td>95-100% (highest scores)</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>37</td>\n",
|
||
" <td>Los Angeles</td>\n",
|
||
" <td>6037</td>\n",
|
||
" <td>204920</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.04</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 73 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10_TRACT Total Population California County ZIP \\\n",
|
||
"0 06019001100 2760.00 Fresno 93706.00 \n",
|
||
"1 06077000700 4177.00 San Joaquin 95206.00 \n",
|
||
"2 06077000100 4055.00 San Joaquin 95202.00 \n",
|
||
"3 06071001600 5527.00 San Bernardino 91761.00 \n",
|
||
"4 06037204920 2639.00 Los Angeles 90023.00 \n",
|
||
"\n",
|
||
" Nearby City \\r\\n(to help approximate location only) Longitude Latitude \\\n",
|
||
"0 Fresno -119.78 36.71 \n",
|
||
"1 Stockton -121.29 37.94 \n",
|
||
"2 Stockton -121.29 37.95 \n",
|
||
"3 Ontario -117.62 34.06 \n",
|
||
"4 Los Angeles -118.20 34.02 \n",
|
||
"\n",
|
||
" calenviroscreen_score calenviroscreen_percentile \\\n",
|
||
"0 94.61 100.00 \n",
|
||
"1 90.83 99.99 \n",
|
||
"2 85.75 99.97 \n",
|
||
"3 83.56 99.96 \n",
|
||
"4 82.90 99.95 \n",
|
||
"\n",
|
||
" DRAFT CES 4.0\\r\\nPercentile Range ... COUNTY COUNTY_NAME CNTY_FIPS \\\n",
|
||
"0 95-100% (highest scores) ... 19 Fresno 6019 \n",
|
||
"1 95-100% (highest scores) ... 77 San Joaquin 6077 \n",
|
||
"2 95-100% (highest scores) ... 77 San Joaquin 6077 \n",
|
||
"3 95-100% (highest scores) ... 71 San Bernardino 6071 \n",
|
||
"4 95-100% (highest scores) ... 37 Los Angeles 6037 \n",
|
||
"\n",
|
||
" TRACT RCAP_90 RCAP_00 RCAP_10 hud_recap_priority_community \\\n",
|
||
"0 1100 0.00 1.00 1.00 True \n",
|
||
"1 700 0.00 0.00 0.00 True \n",
|
||
"2 100 1.00 1.00 1.00 True \n",
|
||
"3 1600 0.00 0.00 0.00 True \n",
|
||
"4 204920 0.00 0.00 0.00 False \n",
|
||
"\n",
|
||
" SHAPE_Length SHAPE_Area \n",
|
||
"0 0.09 0.00 \n",
|
||
"1 0.07 0.00 \n",
|
||
"2 0.06 0.00 \n",
|
||
"3 0.25 0.00 \n",
|
||
"4 0.04 0.00 \n",
|
||
"\n",
|
||
"[5 rows x 73 columns]"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Join all dataframes that use tracts\n",
|
||
"census_tract_dfs = [calenviroscreen_df, hud_recap_df]\n",
|
||
"\n",
|
||
"census_tract_df = functools.reduce(\n",
|
||
" lambda left, right: pd.merge(\n",
|
||
" left=left, right=right, on=GEOID_TRACT_FIELD_NAME, how=\"outer\"\n",
|
||
" ),\n",
|
||
" census_tract_dfs,\n",
|
||
")\n",
|
||
"\n",
|
||
"tract_values = census_tract_df[GEOID_TRACT_FIELD_NAME].str.len().unique()\n",
|
||
"if any(tract_values != [11]):\n",
|
||
" print(tract_values)\n",
|
||
" raise ValueError(\"Some of the census tract data has the wrong length.\")\n",
|
||
"\n",
|
||
"if len(census_tract_df) > 74134:\n",
|
||
" raise ValueError(\"Too many rows in the join.\")\n",
|
||
"\n",
|
||
"census_tract_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "70d76fbc",
|
||
"metadata": {
|
||
"scrolled": false
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10</th>\n",
|
||
" <th>Housing burden (percent)</th>\n",
|
||
" <th>Total population</th>\n",
|
||
" <th>Air toxics cancer risk</th>\n",
|
||
" <th>Respiratory hazard index</th>\n",
|
||
" <th>Diesel particulate matter</th>\n",
|
||
" <th>Particulate matter (PM2.5)</th>\n",
|
||
" <th>Ozone_x</th>\n",
|
||
" <th>Traffic proximity and volume</th>\n",
|
||
" <th>Proximity to RMP sites</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>COUNTY</th>\n",
|
||
" <th>COUNTY_NAME</th>\n",
|
||
" <th>CNTY_FIPS</th>\n",
|
||
" <th>TRACT</th>\n",
|
||
" <th>RCAP_90</th>\n",
|
||
" <th>RCAP_00</th>\n",
|
||
" <th>RCAP_10</th>\n",
|
||
" <th>hud_recap_priority_community</th>\n",
|
||
" <th>SHAPE_Length</th>\n",
|
||
" <th>SHAPE_Area</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>010010201001</td>\n",
|
||
" <td>0.15</td>\n",
|
||
" <td>692</td>\n",
|
||
" <td>49.38</td>\n",
|
||
" <td>0.79</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>10.00</td>\n",
|
||
" <td>40.12</td>\n",
|
||
" <td>91.02</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001.00</td>\n",
|
||
" <td>20100.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.15</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>010010201002</td>\n",
|
||
" <td>0.15</td>\n",
|
||
" <td>1153</td>\n",
|
||
" <td>49.38</td>\n",
|
||
" <td>0.79</td>\n",
|
||
" <td>0.28</td>\n",
|
||
" <td>10.00</td>\n",
|
||
" <td>40.12</td>\n",
|
||
" <td>2.62</td>\n",
|
||
" <td>0.07</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001.00</td>\n",
|
||
" <td>20100.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.15</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>010010202001</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>1020</td>\n",
|
||
" <td>50.32</td>\n",
|
||
" <td>0.81</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>10.07</td>\n",
|
||
" <td>40.22</td>\n",
|
||
" <td>4.68</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001.00</td>\n",
|
||
" <td>20200.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>010010202002</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>1152</td>\n",
|
||
" <td>50.32</td>\n",
|
||
" <td>0.81</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>10.07</td>\n",
|
||
" <td>40.22</td>\n",
|
||
" <td>218.65</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001.00</td>\n",
|
||
" <td>20200.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>010010203001</td>\n",
|
||
" <td>0.21</td>\n",
|
||
" <td>2555</td>\n",
|
||
" <td>50.77</td>\n",
|
||
" <td>0.82</td>\n",
|
||
" <td>0.36</td>\n",
|
||
" <td>10.12</td>\n",
|
||
" <td>40.31</td>\n",
|
||
" <td>69.64</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1.00</td>\n",
|
||
" <td>Autauga</td>\n",
|
||
" <td>1001.00</td>\n",
|
||
" <td>20300.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0.10</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 165 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10 Housing burden (percent) Total population \\\n",
|
||
"0 010010201001 0.15 692 \n",
|
||
"1 010010201002 0.15 1153 \n",
|
||
"2 010010202001 0.25 1020 \n",
|
||
"3 010010202002 0.25 1152 \n",
|
||
"4 010010203001 0.21 2555 \n",
|
||
"\n",
|
||
" Air toxics cancer risk Respiratory hazard index \\\n",
|
||
"0 49.38 0.79 \n",
|
||
"1 49.38 0.79 \n",
|
||
"2 50.32 0.81 \n",
|
||
"3 50.32 0.81 \n",
|
||
"4 50.77 0.82 \n",
|
||
"\n",
|
||
" Diesel particulate matter Particulate matter (PM2.5) Ozone_x \\\n",
|
||
"0 0.28 10.00 40.12 \n",
|
||
"1 0.28 10.00 40.12 \n",
|
||
"2 0.30 10.07 40.22 \n",
|
||
"3 0.30 10.07 40.22 \n",
|
||
"4 0.36 10.12 40.31 \n",
|
||
"\n",
|
||
" Traffic proximity and volume Proximity to RMP sites ... COUNTY \\\n",
|
||
"0 91.02 0.09 ... 1.00 \n",
|
||
"1 2.62 0.07 ... 1.00 \n",
|
||
"2 4.68 0.08 ... 1.00 \n",
|
||
"3 218.65 0.09 ... 1.00 \n",
|
||
"4 69.64 0.08 ... 1.00 \n",
|
||
"\n",
|
||
" COUNTY_NAME CNTY_FIPS TRACT RCAP_90 RCAP_00 RCAP_10 \\\n",
|
||
"0 Autauga 1001.00 20100.00 0.00 0.00 0.00 \n",
|
||
"1 Autauga 1001.00 20100.00 0.00 0.00 0.00 \n",
|
||
"2 Autauga 1001.00 20200.00 0.00 0.00 0.00 \n",
|
||
"3 Autauga 1001.00 20200.00 0.00 0.00 0.00 \n",
|
||
"4 Autauga 1001.00 20300.00 0.00 0.00 0.00 \n",
|
||
"\n",
|
||
" hud_recap_priority_community SHAPE_Length SHAPE_Area \n",
|
||
"0 False 0.15 0.00 \n",
|
||
"1 False 0.15 0.00 \n",
|
||
"2 False 0.09 0.00 \n",
|
||
"3 False 0.09 0.00 \n",
|
||
"4 False 0.10 0.00 \n",
|
||
"\n",
|
||
"[5 rows x 165 columns]"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Join tract indices and CEJST data.\n",
|
||
"# Note: we're joining on the census *tract*, so there will be multiple CBG entries joined to the same census tract row from CES,\n",
|
||
"# creating multiple rows of the same CES data.\n",
|
||
"merged_df = cejst_df.merge(\n",
|
||
" census_tract_df,\n",
|
||
" how=\"left\",\n",
|
||
" on=GEOID_TRACT_FIELD_NAME,\n",
|
||
")\n",
|
||
"\n",
|
||
"\n",
|
||
"if len(merged_df) > 220333:\n",
|
||
" raise ValueError(\"Too many rows in the join.\")\n",
|
||
"\n",
|
||
"merged_df.head()\n",
|
||
"\n",
|
||
"\n",
|
||
"# merged_df.to_csv(\n",
|
||
"# path_or_buf=COMPARISON_OUTPUTS_DIR / \"merged.csv\", na_rep=\"\", index=False\n",
|
||
"# )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "558a2cc1",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Define a namedtuple for indices.\n",
|
||
"Index = collections.namedtuple(\n",
|
||
" typename=\"Index\",\n",
|
||
" field_names=[\n",
|
||
" \"method_name\",\n",
|
||
" \"priority_communities_field\",\n",
|
||
" # Note: this field is only used by indices defined at the census tract level.\n",
|
||
" \"other_census_tract_fields_to_keep\",\n",
|
||
" ],\n",
|
||
")\n",
|
||
"\n",
|
||
"# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n",
|
||
"# (`census_tract_indices`).\n",
|
||
"census_block_group_indices = [\n",
|
||
" Index(\n",
|
||
" method_name=\"Score A\",\n",
|
||
" priority_communities_field=\"Score A (top 25th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"Score B\",\n",
|
||
" priority_communities_field=\"Score B (top 25th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"Score C\",\n",
|
||
" priority_communities_field=\"Score C (top 25th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"Score D (25th percentile)\",\n",
|
||
" priority_communities_field=\"Score D (top 25th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"Score D (30th percentile)\",\n",
|
||
" priority_communities_field=\"Score D (top 30th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"Score D (35th percentile)\",\n",
|
||
" priority_communities_field=\"Score D (top 35th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"Score D (40th percentile)\",\n",
|
||
" priority_communities_field=\"Score D (top 40th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"Poverty\",\n",
|
||
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
"]\n",
|
||
"\n",
|
||
"census_tract_indices = [\n",
|
||
" Index(\n",
|
||
" method_name=\"CalEnviroScreen 4.0\",\n",
|
||
" priority_communities_field=\"calenviroscreen_priority_community\",\n",
|
||
" other_census_tract_fields_to_keep=[\n",
|
||
" CALENVIROSCREEN_SCORE_FIELD,\n",
|
||
" CALENVIROSCREEN_PERCENTILE_FIELD,\n",
|
||
" ],\n",
|
||
" ),\n",
|
||
" Index(\n",
|
||
" method_name=\"HUD RECAP\",\n",
|
||
" priority_communities_field=\"hud_recap_priority_community\",\n",
|
||
" other_census_tract_fields_to_keep=[],\n",
|
||
" ),\n",
|
||
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "5b71b2ab",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Converting calenviroscreen_priority_community to boolean.\n",
|
||
"Converting hud_recap_priority_community to boolean.\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "223dcb75c0384fd5b93bc2ac3bc07656",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/52 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th>GEOID10_STATE</th>\n",
|
||
" <th>State name</th>\n",
|
||
" <th>Total CBGs in state</th>\n",
|
||
" <th>Total population in state</th>\n",
|
||
" <th>Score A (top 25th percentile) (priority population)</th>\n",
|
||
" <th>Score A (top 25th percentile) (total CBGs)</th>\n",
|
||
" <th>Score A (top 25th percentile) (percent CBGs)</th>\n",
|
||
" <th>Score A (top 25th percentile) (percent population)</th>\n",
|
||
" <th>Score B (top 25th percentile) (priority population)</th>\n",
|
||
" <th>Score B (top 25th percentile) (total CBGs)</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>Score E (top 25th percentile) (percent CBGs)</th>\n",
|
||
" <th>Score E (top 25th percentile) (percent population)</th>\n",
|
||
" <th>calenviroscreen_priority_community (priority population)</th>\n",
|
||
" <th>calenviroscreen_priority_community (total CBGs)</th>\n",
|
||
" <th>calenviroscreen_priority_community (percent CBGs)</th>\n",
|
||
" <th>calenviroscreen_priority_community (percent population)</th>\n",
|
||
" <th>hud_recap_priority_community (priority population)</th>\n",
|
||
" <th>hud_recap_priority_community (total CBGs)</th>\n",
|
||
" <th>hud_recap_priority_community (percent CBGs)</th>\n",
|
||
" <th>hud_recap_priority_community (percent population)</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>GEOID10_STATE</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>01</th>\n",
|
||
" <th>0</th>\n",
|
||
" <td>01</td>\n",
|
||
" <td>Alabama</td>\n",
|
||
" <td>3438</td>\n",
|
||
" <td>4850771</td>\n",
|
||
" <td>1547345</td>\n",
|
||
" <td>1326</td>\n",
|
||
" <td>0.39</td>\n",
|
||
" <td>0.32</td>\n",
|
||
" <td>1556417</td>\n",
|
||
" <td>1323</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.23</td>\n",
|
||
" <td>0.19</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>235117</td>\n",
|
||
" <td>258</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" <td>0.05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>02</th>\n",
|
||
" <th>0</th>\n",
|
||
" <td>02</td>\n",
|
||
" <td>Alaska</td>\n",
|
||
" <td>534</td>\n",
|
||
" <td>738565</td>\n",
|
||
" <td>63868</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>0.11</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>63868</td>\n",
|
||
" <td>57</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.14</td>\n",
|
||
" <td>0.12</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>6536</td>\n",
|
||
" <td>8</td>\n",
|
||
" <td>0.01</td>\n",
|
||
" <td>0.01</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>04</th>\n",
|
||
" <th>0</th>\n",
|
||
" <td>04</td>\n",
|
||
" <td>Arizona</td>\n",
|
||
" <td>4178</td>\n",
|
||
" <td>6809946</td>\n",
|
||
" <td>1956052</td>\n",
|
||
" <td>1230</td>\n",
|
||
" <td>0.29</td>\n",
|
||
" <td>0.29</td>\n",
|
||
" <td>1960856</td>\n",
|
||
" <td>1231</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>0.30</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>560353</td>\n",
|
||
" <td>378</td>\n",
|
||
" <td>0.09</td>\n",
|
||
" <td>0.08</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>05</th>\n",
|
||
" <th>0</th>\n",
|
||
" <td>05</td>\n",
|
||
" <td>Arkansas</td>\n",
|
||
" <td>2147</td>\n",
|
||
" <td>2977944</td>\n",
|
||
" <td>960799</td>\n",
|
||
" <td>817</td>\n",
|
||
" <td>0.38</td>\n",
|
||
" <td>0.32</td>\n",
|
||
" <td>975780</td>\n",
|
||
" <td>826</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.20</td>\n",
|
||
" <td>0.18</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>0.00</td>\n",
|
||
" <td>101200</td>\n",
|
||
" <td>106</td>\n",
|
||
" <td>0.05</td>\n",
|
||
" <td>0.03</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>06</th>\n",
|
||
" <th>0</th>\n",
|
||
" <td>06</td>\n",
|
||
" <td>California</td>\n",
|
||
" <td>23212</td>\n",
|
||
" <td>38982847</td>\n",
|
||
" <td>12610810</td>\n",
|
||
" <td>7102</td>\n",
|
||
" <td>0.31</td>\n",
|
||
" <td>0.32</td>\n",
|
||
" <td>12556846</td>\n",
|
||
" <td>7065</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.40</td>\n",
|
||
" <td>0.42</td>\n",
|
||
" <td>9610287</td>\n",
|
||
" <td>5690</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>0.25</td>\n",
|
||
" <td>1748765</td>\n",
|
||
" <td>1013</td>\n",
|
||
" <td>0.04</td>\n",
|
||
" <td>0.04</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 32 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" GEOID10_STATE State name Total CBGs in state \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 01 Alabama 3438 \n",
|
||
"02 0 02 Alaska 534 \n",
|
||
"04 0 04 Arizona 4178 \n",
|
||
"05 0 05 Arkansas 2147 \n",
|
||
"06 0 06 California 23212 \n",
|
||
"\n",
|
||
" Total population in state \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 4850771 \n",
|
||
"02 0 738565 \n",
|
||
"04 0 6809946 \n",
|
||
"05 0 2977944 \n",
|
||
"06 0 38982847 \n",
|
||
"\n",
|
||
" Score A (top 25th percentile) (priority population) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 1547345 \n",
|
||
"02 0 63868 \n",
|
||
"04 0 1956052 \n",
|
||
"05 0 960799 \n",
|
||
"06 0 12610810 \n",
|
||
"\n",
|
||
" Score A (top 25th percentile) (total CBGs) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 1326 \n",
|
||
"02 0 57 \n",
|
||
"04 0 1230 \n",
|
||
"05 0 817 \n",
|
||
"06 0 7102 \n",
|
||
"\n",
|
||
" Score A (top 25th percentile) (percent CBGs) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.39 \n",
|
||
"02 0 0.11 \n",
|
||
"04 0 0.29 \n",
|
||
"05 0 0.38 \n",
|
||
"06 0 0.31 \n",
|
||
"\n",
|
||
" Score A (top 25th percentile) (percent population) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.32 \n",
|
||
"02 0 0.09 \n",
|
||
"04 0 0.29 \n",
|
||
"05 0 0.32 \n",
|
||
"06 0 0.32 \n",
|
||
"\n",
|
||
" Score B (top 25th percentile) (priority population) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 1556417 \n",
|
||
"02 0 63868 \n",
|
||
"04 0 1960856 \n",
|
||
"05 0 975780 \n",
|
||
"06 0 12556846 \n",
|
||
"\n",
|
||
" Score B (top 25th percentile) (total CBGs) ... \\\n",
|
||
"GEOID10_STATE ... \n",
|
||
"01 0 1323 ... \n",
|
||
"02 0 57 ... \n",
|
||
"04 0 1231 ... \n",
|
||
"05 0 826 ... \n",
|
||
"06 0 7065 ... \n",
|
||
"\n",
|
||
" Score E (top 25th percentile) (percent CBGs) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.23 \n",
|
||
"02 0 0.14 \n",
|
||
"04 0 0.30 \n",
|
||
"05 0 0.20 \n",
|
||
"06 0 0.40 \n",
|
||
"\n",
|
||
" Score E (top 25th percentile) (percent population) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.19 \n",
|
||
"02 0 0.12 \n",
|
||
"04 0 0.30 \n",
|
||
"05 0 0.18 \n",
|
||
"06 0 0.42 \n",
|
||
"\n",
|
||
" calenviroscreen_priority_community (priority population) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0 \n",
|
||
"02 0 0 \n",
|
||
"04 0 0 \n",
|
||
"05 0 0 \n",
|
||
"06 0 9610287 \n",
|
||
"\n",
|
||
" calenviroscreen_priority_community (total CBGs) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0 \n",
|
||
"02 0 0 \n",
|
||
"04 0 0 \n",
|
||
"05 0 0 \n",
|
||
"06 0 5690 \n",
|
||
"\n",
|
||
" calenviroscreen_priority_community (percent CBGs) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.00 \n",
|
||
"02 0 0.00 \n",
|
||
"04 0 0.00 \n",
|
||
"05 0 0.00 \n",
|
||
"06 0 0.25 \n",
|
||
"\n",
|
||
" calenviroscreen_priority_community (percent population) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.00 \n",
|
||
"02 0 0.00 \n",
|
||
"04 0 0.00 \n",
|
||
"05 0 0.00 \n",
|
||
"06 0 0.25 \n",
|
||
"\n",
|
||
" hud_recap_priority_community (priority population) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 235117 \n",
|
||
"02 0 6536 \n",
|
||
"04 0 560353 \n",
|
||
"05 0 101200 \n",
|
||
"06 0 1748765 \n",
|
||
"\n",
|
||
" hud_recap_priority_community (total CBGs) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 258 \n",
|
||
"02 0 8 \n",
|
||
"04 0 378 \n",
|
||
"05 0 106 \n",
|
||
"06 0 1013 \n",
|
||
"\n",
|
||
" hud_recap_priority_community (percent CBGs) \\\n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.08 \n",
|
||
"02 0 0.01 \n",
|
||
"04 0 0.09 \n",
|
||
"05 0 0.05 \n",
|
||
"06 0 0.04 \n",
|
||
"\n",
|
||
" hud_recap_priority_community (percent population) \n",
|
||
"GEOID10_STATE \n",
|
||
"01 0 0.05 \n",
|
||
"02 0 0.01 \n",
|
||
"04 0 0.08 \n",
|
||
"05 0 0.03 \n",
|
||
"06 0 0.04 \n",
|
||
"\n",
|
||
"[5 rows x 32 columns]"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"def get_state_distributions(\n",
|
||
" df: pd.DataFrame, priority_communities_fields: typing.List[str]\n",
|
||
") -> pd.DataFrame:\n",
|
||
" \"\"\"For each boolean field of priority communities, calculate distribution across states and territories.\"\"\"\n",
|
||
"\n",
|
||
" # Ensure each field is boolean.\n",
|
||
" for priority_communities_field in priority_communities_fields:\n",
|
||
" if df[priority_communities_field].dtype != bool:\n",
|
||
" print(f\"Converting {priority_communities_field} to boolean.\")\n",
|
||
"\n",
|
||
" # Calculate the population included as priority communities per CBG. Will either be 0 or the population.\n",
|
||
" df[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = (\n",
|
||
" df[priority_communities_field] * df[CENSUS_BLOCK_GROUP_POPULATION_FIELD]\n",
|
||
" )\n",
|
||
"\n",
|
||
" def calculate_state_comparison(\n",
|
||
" frame: pd.DataFrame, geography_field: str\n",
|
||
" ) -> pd.DataFrame:\n",
|
||
" \"\"\"\n",
|
||
" This function is applied to each group of a pandas `groupby` object. Reads `priority_communities_fields` from the enclosing function's scope.\n",
|
||
"\n",
|
||
" \"\"\"\n",
|
||
" summary_dict = {}\n",
|
||
" summary_dict[COUNTRY_FIELD_NAME] = frame[COUNTRY_FIELD_NAME].unique()[0]\n",
|
||
"\n",
|
||
" if geography_field == COUNTRY_FIELD_NAME:\n",
|
||
" summary_dict[GEOID_STATE_FIELD_NAME] = \"00\"\n",
|
||
" summary_dict[\"Geography name\"] = \"(Entire USA)\"\n",
|
||
"\n",
|
||
" if geography_field == GEOID_STATE_FIELD_NAME:\n",
|
||
" state_id = frame[GEOID_STATE_FIELD_NAME].unique()[0]\n",
|
||
" summary_dict[GEOID_STATE_FIELD_NAME] = state_id\n",
|
||
" summary_dict[\"Geography name\"] = us.states.lookup(state_id).name\n",
|
||
"\n",
|
||
" # Also add region information\n",
|
||
" region_id = frame[\"region\"].unique()[0]\n",
|
||
" summary_dict[\"region\"] = region_id\n",
|
||
"\n",
|
||
" if geography_field == \"region\":\n",
|
||
" region_id = frame[\"region\"].unique()[0]\n",
|
||
" summary_dict[\"region\"] = region_id\n",
|
||
" summary_dict[\"Geography name\"] = region_id\n",
|
||
"\n",
|
||
" if geography_field == \"division\":\n",
|
||
" division_id = frame[\"division\"].unique()[0]\n",
|
||
" summary_dict[\"division\"] = division_id\n",
|
||
" summary_dict[\"Geography name\"] = division_id\n",
|
||
"\n",
|
||
" summary_dict[\"Total CBGs in geography\"] = len(frame)\n",
|
||
" summary_dict[\"Total population in geography\"] = frame[\n",
|
||
" CENSUS_BLOCK_GROUP_POPULATION_FIELD\n",
|
||
" ].sum()\n",
|
||
"\n",
|
||
" for priority_communities_field in priority_communities_fields:\n",
|
||
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"] = frame[\n",
|
||
" f\"{priority_communities_field}{POPULATION_SUFFIX}\"\n",
|
||
" ].sum()\n",
|
||
"\n",
|
||
" summary_dict[f\"{priority_communities_field} (total CBGs)\"] = frame[\n",
|
||
" f\"{priority_communities_field}\"\n",
|
||
" ].sum()\n",
|
||
"\n",
|
||
" # Calculate some combinations of other variables.\n",
|
||
" summary_dict[f\"{priority_communities_field} (percent CBGs)\"] = (\n",
|
||
" summary_dict[f\"{priority_communities_field} (total CBGs)\"]\n",
|
||
" / summary_dict[\"Total CBGs in geography\"]\n",
|
||
" )\n",
|
||
"\n",
|
||
" summary_dict[f\"{priority_communities_field} (percent population)\"] = (\n",
|
||
" summary_dict[f\"{priority_communities_field}{POPULATION_SUFFIX}\"]\n",
|
||
" / summary_dict[\"Total population in geography\"]\n",
|
||
" )\n",
|
||
"\n",
|
||
" df = pd.DataFrame(summary_dict, index=[0])\n",
|
||
"\n",
|
||
" return df\n",
|
||
"\n",
|
||
" # Add a field for country so we can do aggregations across the entire country.\n",
|
||
" df[COUNTRY_FIELD_NAME] = \"USA\"\n",
|
||
"\n",
|
||
" # First, run the comparison for the whole country\n",
|
||
" usa_grouped_df = df.groupby(COUNTRY_FIELD_NAME)\n",
|
||
"\n",
|
||
" # Run the comparison function on the groups.\n",
|
||
" usa_distribution_df = usa_grouped_df.progress_apply(\n",
|
||
" lambda frame: calculate_state_comparison(\n",
|
||
" frame, geography_field=COUNTRY_FIELD_NAME\n",
|
||
" )\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Next, run the comparison by state\n",
|
||
" state_grouped_df = df.groupby(GEOID_STATE_FIELD_NAME)\n",
|
||
"\n",
|
||
" # Run the comparison function on the groups.\n",
|
||
" state_distribution_df = state_grouped_df.progress_apply(\n",
|
||
" lambda frame: calculate_state_comparison(\n",
|
||
" frame, geography_field=GEOID_STATE_FIELD_NAME\n",
|
||
" )\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Next, run the comparison by region\n",
|
||
" region_grouped_df = df.groupby(\"region\")\n",
|
||
"\n",
|
||
" # Run the comparison function on the groups.\n",
|
||
" region_distribution_df = region_grouped_df.progress_apply(\n",
|
||
" lambda frame: calculate_state_comparison(frame, geography_field=\"region\")\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Next, run the comparison by division\n",
|
||
" division_grouped_df = df.groupby(\"division\")\n",
|
||
"\n",
|
||
" # Run the comparison function on the groups.\n",
|
||
" division_distribution_df = division_grouped_df.progress_apply(\n",
|
||
" lambda frame: calculate_state_comparison(frame, geography_field=\"division\")\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Combine the four (country, state, region, and division distributions)\n",
|
||
" combined_df = pd.concat(\n",
|
||
" [\n",
|
||
" usa_distribution_df,\n",
|
||
" state_distribution_df,\n",
|
||
" region_distribution_df,\n",
|
||
" division_distribution_df,\n",
|
||
" ]\n",
|
||
" )\n",
|
||
"\n",
|
||
" return combined_df\n",
|
||
"\n",
|
||
"\n",
|
||
"def write_state_distribution_excel(\n",
|
||
" state_distribution_df: pd.DataFrame, file_path: pathlib.PosixPath\n",
|
||
") -> None:\n",
|
||
" \"\"\"Write the dataframe to excel with special formatting.\"\"\"\n",
|
||
" # Create a Pandas Excel writer using XlsxWriter as the engine.\n",
|
||
" writer = pd.ExcelWriter(file_path, engine=\"xlsxwriter\")\n",
|
||
"\n",
|
||
" # Convert the dataframe to an XlsxWriter Excel object. We also turn off the\n",
|
||
" # index column at the left of the output dataframe.\n",
|
||
" state_distribution_df.to_excel(writer, sheet_name=\"Sheet1\", index=False)\n",
|
||
"\n",
|
||
" # Get the xlsxwriter workbook and worksheet objects.\n",
|
||
" workbook = writer.book\n",
|
||
" worksheet = writer.sheets[\"Sheet1\"]\n",
|
||
" worksheet.autofilter(\n",
|
||
" 0, 0, state_distribution_df.shape[0], state_distribution_df.shape[1]\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Set a width parameter for all columns\n",
|
||
" # Note: this is parameterized because every call to `set_column` requires setting the width.\n",
|
||
" column_width = 15\n",
|
||
"\n",
|
||
" for column in state_distribution_df.columns:\n",
|
||
" # Turn the column index into excel ranges (e.g., column #95 is \"CR\" and the range may be \"CR2:CR53\").\n",
|
||
" column_index = state_distribution_df.columns.get_loc(column)\n",
|
||
" column_character = get_excel_column_name(column_index)\n",
|
||
"\n",
|
||
" # Set all columns to larger width\n",
|
||
" worksheet.set_column(f\"{column_character}:{column_character}\", column_width)\n",
|
||
"\n",
|
||
" # Special formatting for all percent columns\n",
|
||
" # Note: we can't just search for `percent`, because that's included in the word `percentile`.\n",
|
||
" if \"percent \" in column or \"(percent)\" in column:\n",
|
||
" # Make these columns percentages.\n",
|
||
" percentage_format = workbook.add_format({\"num_format\": \"0%\"})\n",
|
||
" worksheet.set_column(\n",
|
||
" f\"{column_character}:{column_character}\",\n",
|
||
" column_width,\n",
|
||
" percentage_format,\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Special formatting for columns that capture the percent of population considered priority.\n",
|
||
" if \"(percent population)\" in column:\n",
|
||
" column_ranges = (\n",
|
||
" f\"{column_character}2:{column_character}{len(state_distribution_df)+1}\"\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Add green to red conditional formatting.\n",
|
||
" worksheet.conditional_format(\n",
|
||
" column_ranges,\n",
|
||
" # Min: green, max: red.\n",
|
||
" {\n",
|
||
" \"type\": \"2_color_scale\",\n",
|
||
" \"min_color\": \"#00FF7F\",\n",
|
||
" \"max_color\": \"#C82538\",\n",
|
||
" },\n",
|
||
" )\n",
|
||
"\n",
|
||
" header_format = workbook.add_format(\n",
|
||
" {\"bold\": True, \"text_wrap\": True, \"valign\": \"bottom\"}\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Overwrite both the value and the format of each header cell\n",
|
||
" # This is because xlsxwriter / pandas has a known bug where it can't wrap text for a dataframe.\n",
|
||
" # See https://stackoverflow.com/questions/42562977/xlsxwriter-text-wrap-not-working.\n",
|
||
" for col_num, value in enumerate(state_distribution_df.columns.values):\n",
|
||
" worksheet.write(0, col_num, value, header_format)\n",
|
||
"\n",
|
||
" writer.save()\n",
|
||
"\n",
|
||
"\n",
|
||
"fields_to_analyze = [\n",
|
||
" index.priority_communities_field\n",
|
||
" for index in census_block_group_indices + census_tract_indices\n",
|
||
"]\n",
|
||
"\n",
|
||
"state_fips_codes = get_state_information(DATA_DIR)\n",
|
||
"\n",
|
||
"merged_with_state_information_df = merged_df.merge(\n",
|
||
" right=state_fips_codes, left_on=GEOID_STATE_FIELD_NAME, right_on=\"fips\"\n",
|
||
")\n",
|
||
"\n",
|
||
"state_distribution_df = get_state_distributions(\n",
|
||
" df=merged_with_state_information_df,\n",
|
||
" priority_communities_fields=fields_to_analyze,\n",
|
||
")\n",
|
||
"\n",
|
||
"state_distribution_df.to_csv(\n",
|
||
" path_or_buf=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.csv\",\n",
|
||
" na_rep=\"\",\n",
|
||
" index=False,\n",
|
||
")\n",
|
||
"\n",
|
||
"write_state_distribution_excel(\n",
|
||
" state_distribution_df=state_distribution_df,\n",
|
||
" file_path=COMPARISON_OUTPUTS_DIR / \"Priority CBGs by state.xlsx\",\n",
|
||
")\n",
|
||
"\n",
|
||
"state_distribution_df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "f9b9a329",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def write_markdown_and_docx_content(\n",
|
||
" markdown_content: str, file_dir: pathlib.PosixPath, file_name_without_extension: str\n",
|
||
") -> pathlib.PosixPath:\n",
|
||
" \"\"\"Write Markdown content to both .md and .docx files.\"\"\"\n",
|
||
" # Set the file paths for both files.\n",
|
||
" markdown_file_path = file_dir / f\"{file_name_without_extension}.md\"\n",
|
||
" docx_file_path = file_dir / f\"{file_name_without_extension}.docx\"\n",
|
||
"\n",
|
||
" # Write the markdown content to file.\n",
|
||
" with open(markdown_file_path, \"w\") as text_file:\n",
|
||
" text_file.write(markdown_content)\n",
|
||
"\n",
|
||
" # Convert markdown file to Word doc.\n",
|
||
" pypandoc.convert_file(\n",
|
||
" source_file=str(markdown_file_path),\n",
|
||
" to=\"docx\",\n",
|
||
" outputfile=str(docx_file_path),\n",
|
||
" extra_args=[],\n",
|
||
" )\n",
|
||
"\n",
|
||
" return docx_file_path\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_markdown_comparing_census_block_group_indices(\n",
|
||
" census_block_group_indices=typing.List[Index],\n",
|
||
" df=pd.DataFrame,\n",
|
||
" state_field=GEOID_STATE_FIELD_NAME,\n",
|
||
") -> str:\n",
|
||
" \"\"\"Generate a Markdown string of analysis of multiple CBG indices.\"\"\"\n",
|
||
" count_field_name = \"Count of CBGs\"\n",
|
||
"\n",
|
||
" # All states/territories present in the data, as a sorted list of FIPS codes:\n",
|
||
" state_ids = sorted(df[state_field].unique())\n",
|
||
" state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
|
||
"\n",
|
||
" # Create markdown content for comparisons.\n",
|
||
" markdown_content = f\"\"\"\n",
|
||
"# Comparing multiple indices at the census block group level\n",
|
||
" \n",
|
||
"(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)\n",
|
||
"\n",
|
||
"This report compares the following indices: {\", \".join([index.method_name for index in census_block_group_indices])}.\n",
|
||
"\n",
|
||
"This report analyzes the following US states and territories: {state_names}.\n",
|
||
"\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
" for (index1, index2) in itertools.combinations(census_block_group_indices, 2):\n",
|
||
" # Group all data by their different values on Priority Communities Field for Index1 vs Priority Communities Field for Index2.\n",
|
||
" count_df = (\n",
|
||
" df.groupby(\n",
|
||
" [index1.priority_communities_field, index2.priority_communities_field]\n",
|
||
" )[GEOID_FIELD_NAME]\n",
|
||
" .count()\n",
|
||
" .reset_index(name=count_field_name)\n",
|
||
" )\n",
|
||
"\n",
|
||
" total_cbgs = count_df[count_field_name].sum()\n",
|
||
"\n",
|
||
" # Returns a series\n",
|
||
" true_true_cbgs_series = count_df.loc[\n",
|
||
" count_df[index1.priority_communities_field]\n",
|
||
" & count_df[index2.priority_communities_field],\n",
|
||
" count_field_name,\n",
|
||
" ]\n",
|
||
" true_false_cbgs_series = count_df.loc[\n",
|
||
" count_df[index1.priority_communities_field]\n",
|
||
" & ~count_df[index2.priority_communities_field],\n",
|
||
" count_field_name,\n",
|
||
" ]\n",
|
||
" false_true_cbgs_series = count_df.loc[\n",
|
||
" ~count_df[index1.priority_communities_field]\n",
|
||
" & count_df[index2.priority_communities_field],\n",
|
||
" count_field_name,\n",
|
||
" ]\n",
|
||
" false_false_cbgs_series = count_df.loc[\n",
|
||
" ~count_df[index1.priority_communities_field]\n",
|
||
" & ~count_df[index2.priority_communities_field],\n",
|
||
" count_field_name,\n",
|
||
" ]\n",
|
||
"\n",
|
||
" # Convert from series to a scalar value, including accounting for if no data exists for that pairing.\n",
|
||
" true_true_cbgs = (\n",
|
||
" true_true_cbgs_series.iloc[0] if len(true_true_cbgs_series) > 0 else 0\n",
|
||
" )\n",
|
||
" true_false_cbgs = (\n",
|
||
" true_false_cbgs_series.iloc[0] if len(true_false_cbgs_series) > 0 else 0\n",
|
||
" )\n",
|
||
" false_true_cbgs = (\n",
|
||
" false_true_cbgs_series.iloc[0] if len(false_true_cbgs_series) > 0 else 0\n",
|
||
" )\n",
|
||
" false_false_cbgs = (\n",
|
||
" false_false_cbgs_series.iloc[0] if len(false_false_cbgs_series) > 0 else 0\n",
|
||
" )\n",
|
||
"\n",
|
||
" markdown_content += (\n",
|
||
" \"*** \\n\\n\"\n",
|
||
" \"There are \"\n",
|
||
" f\"{true_true_cbgs} ({true_true_cbgs / total_cbgs:.0%}) \"\n",
|
||
" f\"census block groups that are both {index1.method_name} priority communities and {index2.method_name} priority communities.\\n\\n\"\n",
|
||
" \"There are \"\n",
|
||
" f\"{true_false_cbgs} ({true_false_cbgs / total_cbgs:.0%}) \"\n",
|
||
" f\"census block groups that are {index1.method_name} priority communities but not {index2.method_name} priority communities.\\n\\n\"\n",
|
||
" \"There are \"\n",
|
||
" f\"{false_true_cbgs} ({false_true_cbgs / total_cbgs:.0%}) \"\n",
|
||
" f\"census block groups that are not {index1.method_name} priority communities but are {index2.method_name} priority communities.\\n\\n\"\n",
|
||
" \"There are \"\n",
|
||
" f\"{false_false_cbgs} ({false_false_cbgs / total_cbgs:.0%}) \"\n",
|
||
" f\"census block groups that are neither {index1.method_name} priority communities nor {index2.method_name} priority communities.\\n\\n\"\n",
|
||
" \"\\n\\n\"\n",
|
||
" )\n",
|
||
"\n",
|
||
" return markdown_content\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_comparison_census_block_group_indices(\n",
|
||
" census_block_group_indices=typing.List[Index],\n",
|
||
" df=pd.DataFrame,\n",
|
||
" state_field=GEOID_STATE_FIELD_NAME,\n",
|
||
") -> pathlib.PosixPath:\n",
|
||
" markdown_content = get_markdown_comparing_census_block_group_indices(\n",
|
||
" census_block_group_indices=census_block_group_indices,\n",
|
||
" df=merged_with_state_information_df,\n",
|
||
" )\n",
|
||
"\n",
|
||
" comparison_docx_file_path = write_markdown_and_docx_content(\n",
|
||
" markdown_content=markdown_content,\n",
|
||
" file_dir=COMPARISON_OUTPUTS_DIR,\n",
|
||
" file_name_without_extension=f\"Comparison report - All CBG indices\",\n",
|
||
" )\n",
|
||
"\n",
|
||
" return comparison_docx_file_path\n",
|
||
"\n",
|
||
"\n",
|
||
"# Compare multiple scores at the CBG level\n",
|
||
"get_comparison_census_block_group_indices(\n",
|
||
" census_block_group_indices=census_block_group_indices,\n",
|
||
" df=merged_with_state_information_df,\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "25a10027",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# This cell defines a variety of comparison functions. It does not run them.\n",
|
||
"\n",
|
||
"# Define a namedtuple for column names, which need to be shared between multiple parts of this comparison pipeline.\n",
|
||
"# Named tuples are useful here because they provide guarantees that for each instance, all properties are defined and\n",
|
||
"# can be accessed as properties (rather than as strings).\n",
|
||
"\n",
|
||
"# Note: if you'd like to add a field used throughout the comparison process, add it in three places.\n",
|
||
"# For an example `new_field`,\n",
|
||
"# 1. in this namedtuple, add the field as a string in `field_names` (e.g., `field_names=[..., \"new_field\"]`)\n",
|
||
"# 2. in the function `get_comparison_field_names`, define how the field name should be created from input data\n",
|
||
"# (e.g., `...new_field=f\"New field compares {method_a_name} to {method_b_name}\"`)\n",
|
||
"# 3. In the function `get_comparison_markdown_content`, add some reporting on the new field to the markdown content.\n",
|
||
"# (e.g., `The statistics indicate that {calculation_based_on_new_field} percent of census tracts are different between scores.`)\n",
|
||
"ComparisonFieldNames = collections.namedtuple(\n",
|
||
" typename=\"ComparisonFieldNames\",\n",
|
||
" field_names=[\n",
|
||
" \"any_tract_has_at_least_one_method_a_cbg\",\n",
|
||
" \"method_b_tract_has_at_least_one_method_a_cbg\",\n",
|
||
" \"method_b_tract_has_100_percent_method_a_cbg\",\n",
|
||
" \"method_b_non_priority_tract_has_at_least_one_method_a_cbg\",\n",
|
||
" \"method_b_non_priority_tract_has_100_percent_method_a_cbg\",\n",
|
||
" ],\n",
|
||
")\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_comparison_field_names(\n",
|
||
" method_a_name: str,\n",
|
||
" method_b_name: str,\n",
|
||
") -> ComparisonFieldNames:\n",
|
||
" comparison_field_names = ComparisonFieldNames(\n",
|
||
" any_tract_has_at_least_one_method_a_cbg=(\n",
|
||
" f\"Any tract has at least one {method_a_name} Priority CBG?\"\n",
|
||
" ),\n",
|
||
" method_b_tract_has_at_least_one_method_a_cbg=(\n",
|
||
" f\"{method_b_name} priority tract has at least one {method_a_name} CBG?\"\n",
|
||
" ),\n",
|
||
" method_b_tract_has_100_percent_method_a_cbg=(\n",
|
||
" f\"{method_b_name} tract has 100% {method_a_name} priority CBGs?\"\n",
|
||
" ),\n",
|
||
" method_b_non_priority_tract_has_at_least_one_method_a_cbg=(\n",
|
||
" f\"Non-priority {method_b_name} tract has at least one {method_a_name} priority CBG?\"\n",
|
||
" ),\n",
|
||
" method_b_non_priority_tract_has_100_percent_method_a_cbg=(\n",
|
||
" f\"Non-priority {method_b_name} tract has 100% {method_a_name} priority CBGs?\"\n",
|
||
" ),\n",
|
||
" )\n",
|
||
" return comparison_field_names\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_df_with_only_shared_states(\n",
|
||
" df: pd.DataFrame,\n",
|
||
" field_a: str,\n",
|
||
" field_b: str,\n",
|
||
" state_field=GEOID_STATE_FIELD_NAME,\n",
|
||
") -> pd.DataFrame:\n",
|
||
" \"\"\"\n",
|
||
" Useful for looking at shared geographies across two fields.\n",
|
||
"\n",
|
||
" For a data frame and two fields, return a data frame only for states where there are non-null\n",
|
||
" values for both fields in that state (or territory).\n",
|
||
"\n",
|
||
" This is useful, for example, when running a comparison of CalEnviroScreen (only in California) against\n",
|
||
" a draft score that's national, and returning only the data for California for the entire data frame.\n",
|
||
" \"\"\"\n",
|
||
" field_a_states = df.loc[df[field_a].notnull(), state_field].unique()\n",
|
||
" field_b_states = df.loc[df[field_b].notnull(), state_field].unique()\n",
|
||
"\n",
|
||
" shared_states = list(set(field_a_states) & set(field_b_states))\n",
|
||
"\n",
|
||
" df = df.loc[df[state_field].isin(shared_states), :]\n",
|
||
"\n",
|
||
" return df\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_comparison_df(\n",
|
||
" df: pd.DataFrame,\n",
|
||
" method_a_priority_census_block_groups_field: str,\n",
|
||
" method_b_priority_census_tracts_field: str,\n",
|
||
" other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n",
|
||
" comparison_field_names: ComparisonFieldNames,\n",
|
||
" output_dir: pathlib.PosixPath,\n",
|
||
") -> None:\n",
|
||
" \"\"\"Produces a comparison report for any two given boolean columns representing priority fields.\n",
|
||
"\n",
|
||
" Args:\n",
|
||
" df: a pandas dataframe including the data for this comparison.\n",
|
||
" method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n",
|
||
" community field that defines communities at the level of census block groups (CBGs).\n",
|
||
" method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n",
|
||
" community field that defines communities at the level of census tracts.\n",
|
||
" other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
"        df: a pandas dataframe with one row per census tract with the results of this comparison\n",
|
||
" \"\"\"\n",
|
||
"\n",
|
||
" def calculate_comparison(frame: pd.DataFrame) -> pd.DataFrame:\n",
|
||
" \"\"\"\n",
|
||
" This method will be applied to a `group_by` object.\n",
|
||
"\n",
|
||
" Note: It inherits from outer scope `method_a_priority_census_block_groups_field`, `method_b_priority_census_tracts_field`,\n",
|
||
" and `other_census_tract_fields_to_keep`.\n",
|
||
" \"\"\"\n",
|
||
" # Keep all the tract values at the Census Tract Level\n",
|
||
" for field in other_census_tract_fields_to_keep:\n",
|
||
" if len(frame[field].unique()) != 1:\n",
|
||
" raise ValueError(\n",
|
||
" f\"There are different values per CBG for field {field}.\"\n",
|
||
" \"`other_census_tract_fields_to_keep` can only be used for fields at the census tract level.\"\n",
|
||
" )\n",
|
||
"\n",
|
||
" df = frame.loc[\n",
|
||
" frame.index[0],\n",
|
||
" [\n",
|
||
" GEOID_TRACT_FIELD_NAME,\n",
|
||
" method_b_priority_census_tracts_field,\n",
|
||
" ]\n",
|
||
" + other_census_tract_fields_to_keep,\n",
|
||
" ]\n",
|
||
"\n",
|
||
" # Convenience constant for whether the tract is or is not a method B priority community.\n",
|
||
" is_a_method_b_priority_tract = frame.loc[\n",
|
||
" frame.index[0], [method_b_priority_census_tracts_field]\n",
|
||
" ][0]\n",
|
||
"\n",
|
||
" # Recall that NaN values are not falsy, so we need to check if `is_a_method_b_priority_tract` is True.\n",
|
||
" is_a_method_b_priority_tract = is_a_method_b_priority_tract is True\n",
|
||
"\n",
|
||
" # Calculate whether the tract (whether or not it is a comparison priority tract) includes CBGs that are priority\n",
|
||
" # according to the current CBG score.\n",
|
||
" df[comparison_field_names.any_tract_has_at_least_one_method_a_cbg] = (\n",
|
||
" frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Calculate comparison\n",
|
||
" # A comparison priority tract has at least one CBG that is a priority CBG.\n",
|
||
" df[comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg] = (\n",
|
||
" frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
|
||
" if is_a_method_b_priority_tract\n",
|
||
" else None\n",
|
||
" )\n",
|
||
"\n",
|
||
" # A comparison priority tract has all of its contained CBGs as CBG priority CBGs.\n",
|
||
" df[comparison_field_names.method_b_tract_has_100_percent_method_a_cbg] = (\n",
|
||
" frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
|
||
" if is_a_method_b_priority_tract\n",
|
||
" else None\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Calculate the inverse\n",
|
||
" # A tract that is _not_ a comparison priority has at least one CBG priority CBG.\n",
|
||
" df[\n",
|
||
" comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg\n",
|
||
" ] = (\n",
|
||
" frame.loc[:, method_a_priority_census_block_groups_field].sum() > 0\n",
|
||
" if not is_a_method_b_priority_tract\n",
|
||
" else None\n",
|
||
" )\n",
|
||
"\n",
|
||
" # A tract that is _not_ a comparison priority has all of its contained CBGs as CBG priority CBGs.\n",
|
||
" df[\n",
|
||
" comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg\n",
|
||
" ] = (\n",
|
||
" frame.loc[:, method_a_priority_census_block_groups_field].mean() == 1\n",
|
||
" if not is_a_method_b_priority_tract\n",
|
||
" else None\n",
|
||
" )\n",
|
||
"\n",
|
||
" # For all remaining fields, calculate the average\n",
|
||
" # TODO: refactor to vectorize to make faster.\n",
|
||
" for field in [\n",
|
||
" \"Poverty (Less than 200% of federal poverty line)\",\n",
|
||
" \"Percent of households in linguistic isolation\",\n",
|
||
" \"Percent individuals age 25 or over with less than high school degree\",\n",
|
||
" \"Unemployed civilians (percent)\",\n",
|
||
" ]:\n",
|
||
" df[f\"{field} (average of CBGs)\"] = frame.loc[:, field].mean()\n",
|
||
"\n",
|
||
" return df\n",
|
||
"\n",
|
||
" # Group all data by the census tract.\n",
|
||
" grouped_df = df.groupby(GEOID_TRACT_FIELD_NAME)\n",
|
||
"\n",
|
||
" # Run the comparison function on the groups.\n",
|
||
" comparison_df = grouped_df.progress_apply(calculate_comparison)\n",
|
||
"\n",
|
||
" return comparison_df\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_comparison_markdown_content(\n",
|
||
" original_df: pd.DataFrame,\n",
|
||
" comparison_df: pd.DataFrame,\n",
|
||
" comparison_field_names: ComparisonFieldNames,\n",
|
||
" method_a_name: str,\n",
|
||
" method_b_name: str,\n",
|
||
" method_a_priority_census_block_groups_field: str,\n",
|
||
" method_b_priority_census_tracts_field: str,\n",
|
||
" state_field: str = GEOID_STATE_FIELD_NAME,\n",
|
||
") -> str:\n",
|
||
" # Prepare some constants for use in the following Markdown content.\n",
|
||
" total_cbgs = len(original_df)\n",
|
||
"\n",
|
||
" # List of all states/territories in their FIPS codes:\n",
|
||
" state_ids = sorted(original_df[state_field].unique())\n",
|
||
" state_names = \", \".join([us.states.lookup(state_id).name for state_id in state_ids])\n",
|
||
"\n",
|
||
"    # Note: using squeeze throughout to reduce result of `sum()` to a scalar.\n",
|
||
" # TODO: investigate why sums are sometimes series and sometimes scalar.\n",
|
||
" method_a_priority_cbgs = (\n",
|
||
" original_df.loc[:, method_a_priority_census_block_groups_field].sum().squeeze()\n",
|
||
" )\n",
|
||
" method_a_priority_cbgs_percent = f\"{method_a_priority_cbgs / total_cbgs:.0%}\"\n",
|
||
"\n",
|
||
" total_tracts_count = len(comparison_df)\n",
|
||
"\n",
|
||
" method_b_priority_tracts_count = comparison_df.loc[\n",
|
||
" :, method_b_priority_census_tracts_field\n",
|
||
" ].sum()\n",
|
||
"\n",
|
||
" method_b_priority_tracts_count_percent = (\n",
|
||
" f\"{method_b_priority_tracts_count / total_tracts_count:.0%}\"\n",
|
||
" )\n",
|
||
" method_b_non_priority_tracts_count = (\n",
|
||
" total_tracts_count - method_b_priority_tracts_count\n",
|
||
" )\n",
|
||
"\n",
|
||
" method_a_tracts_count = (\n",
|
||
" comparison_df.loc[\n",
|
||
" :, comparison_field_names.any_tract_has_at_least_one_method_a_cbg\n",
|
||
" ]\n",
|
||
" .sum()\n",
|
||
" .squeeze()\n",
|
||
" )\n",
|
||
" method_a_tracts_count_percent = f\"{method_a_tracts_count / total_tracts_count:.0%}\"\n",
|
||
"\n",
|
||
"    # Stats for Method B priority tracts (how many contain Method A priority CBGs)\n",
|
||
" method_b_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",
|
||
" :, comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg\n",
|
||
" ].sum()\n",
|
||
" method_b_tracts_with_at_least_one_method_a_cbg_percent = f\"{method_b_tracts_with_at_least_one_method_a_cbg / method_b_priority_tracts_count:.0%}\"\n",
|
||
"\n",
|
||
" method_b_tracts_with_at_100_percent_method_a_cbg = comparison_df.loc[\n",
|
||
" :, comparison_field_names.method_b_tract_has_100_percent_method_a_cbg\n",
|
||
" ].sum()\n",
|
||
" method_b_tracts_with_at_100_percent_method_a_cbg_percent = f\"{method_b_tracts_with_at_100_percent_method_a_cbg / method_b_priority_tracts_count:.0%}\"\n",
|
||
"\n",
|
||
"    # Stats for Method B non-priority tracts (how many contain Method A priority CBGs)\n",
|
||
" method_b_non_priority_tracts_with_at_least_one_method_a_cbg = comparison_df.loc[\n",
|
||
" :,\n",
|
||
" comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,\n",
|
||
" ].sum()\n",
|
||
"\n",
|
||
" method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent = f\"{method_b_non_priority_tracts_with_at_least_one_method_a_cbg / method_b_non_priority_tracts_count:.0%}\"\n",
|
||
"\n",
|
||
" method_b_non_priority_tracts_with_100_percent_method_a_cbg = comparison_df.loc[\n",
|
||
" :,\n",
|
||
" comparison_field_names.method_b_non_priority_tract_has_100_percent_method_a_cbg,\n",
|
||
" ].sum()\n",
|
||
" method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent = f\"{method_b_non_priority_tracts_with_100_percent_method_a_cbg / method_b_non_priority_tracts_count:.0%}\"\n",
|
||
"\n",
|
||
" # Create markdown content for comparisons.\n",
|
||
" markdown_content = f\"\"\"\n",
|
||
"# {method_a_name} compared to {method_b_name}\n",
|
||
"\n",
|
||
"(This report was calculated on {datetime.today().strftime('%Y-%m-%d')}.)\n",
|
||
"\n",
|
||
"This report analyzes the following US states and territories: {state_names}.\n",
|
||
"\n",
|
||
"Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n",
|
||
"\n",
|
||
"Within the geographic area analyzed, there are {method_b_priority_tracts_count} census tracts designated as priority communities by {method_b_name}, out of {total_tracts_count} total tracts ({method_b_priority_tracts_count_percent}). \n",
|
||
"\n",
|
||
"Within the geographic region analyzed, there are {method_a_priority_cbgs} census block groups considered as priority communities by {method_a_name}, out of {total_cbgs} CBGs ({method_a_priority_cbgs_percent}). They occupy {method_a_tracts_count} census tracts ({method_a_tracts_count_percent}) of the geographic area analyzed.\n",
|
||
"\n",
|
||
"Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_least_one_method_a_cbg} ({method_b_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by {method_a_name}.\n",
|
||
"\n",
|
||
"Out of every {method_b_name} priority census tract, {method_b_tracts_with_at_100_percent_method_a_cbg} ({method_b_tracts_with_at_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by {method_a_name}.\n",
|
||
"\n",
|
||
"Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_at_least_one_method_a_cbg} ({method_b_non_priority_tracts_with_at_least_one_method_a_cbg_percent}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n",
|
||
"\n",
|
||
"Out of every census tract that is __not__ marked as a priority community by {method_b_name}, {method_b_non_priority_tracts_with_100_percent_method_a_cbg} ({method_b_non_priority_tracts_with_100_percent_method_a_cbg_percent}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score.\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
" return markdown_content\n",
|
||
"\n",
|
||
"\n",
|
||
"def get_secondary_comparison_df(\n",
|
||
" comparison_df: pd.DataFrame,\n",
|
||
" comparison_field_names: ComparisonFieldNames,\n",
|
||
" method_b_priority_census_tracts_field: str,\n",
|
||
") -> pd.DataFrame:\n",
|
||
" \"\"\"A secondary level of comparison.\n",
|
||
"\n",
|
||
" The first level of comparison identifies census tracts prioritized by Method A,\n",
|
||
" compared to whether or not they're prioritized by Method B.\n",
|
||
"\n",
|
||
" This comparison method analyzes characteristics of those census tracts, based on whether or not they are prioritized\n",
|
||
"    by Method A and/or Method B.\n",
|
||
"\n",
|
||
"\n",
|
||
" E.g., it might show that tracts prioritized by A but not B have a higher average income,\n",
|
||
" or that tracts prioritized by B but not A have a lower percent of unemployed people.\"\"\"\n",
|
||
" grouped_df = comparison_df.groupby(\n",
|
||
" [\n",
|
||
" method_b_priority_census_tracts_field,\n",
|
||
" comparison_field_names.method_b_tract_has_at_least_one_method_a_cbg,\n",
|
||
" comparison_field_names.method_b_non_priority_tract_has_at_least_one_method_a_cbg,\n",
|
||
" ],\n",
|
||
" dropna=False,\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Run the comparison function on the groups.\n",
|
||
" secondary_comparison_df = grouped_df.mean().reset_index()\n",
|
||
"\n",
|
||
" return secondary_comparison_df\n",
|
||
"\n",
|
||
"\n",
|
||
"def execute_comparison(\n",
|
||
" df: pd.DataFrame,\n",
|
||
" method_a_name: str,\n",
|
||
" method_b_name: str,\n",
|
||
" method_a_priority_census_block_groups_field: str,\n",
|
||
" method_b_priority_census_tracts_field: str,\n",
|
||
" other_census_tract_fields_to_keep: typing.Optional[typing.List[str]],\n",
|
||
") -> pathlib.PosixPath:\n",
|
||
" \"\"\"Execute an individual comparison by creating the data frame and writing the report.\n",
|
||
"\n",
|
||
" Args:\n",
|
||
" df: a pandas dataframe including the data for this comparison.\n",
|
||
" method_a_priority_census_block_groups_field: the name of a boolean column in `df`, such as the CEJST priority\n",
|
||
" community field that defines communities at the level of census block groups (CBGs).\n",
|
||
" method_b_priority_census_tracts_field: the name of a boolean column in `df`, such as the CalEnviroScreen priority\n",
|
||
" community field that defines communities at the level of census tracts.\n",
|
||
" other_census_tract_fields_to_keep (optional): a list of field names to preserve at the census tract level\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
"        comparison_docx_file_path: the path returned by `write_markdown_and_docx_content` for the comparison report\n",
|
||
"\n",
|
||
" \"\"\"\n",
|
||
" comparison_field_names = get_comparison_field_names(\n",
|
||
" method_a_name=method_a_name, method_b_name=method_b_name\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Create or use a directory for outputs grouped by Method A.\n",
|
||
" output_dir = COMPARISON_OUTPUTS_DIR / method_a_name\n",
|
||
" output_dir.mkdir(parents=True, exist_ok=True)\n",
|
||
"\n",
|
||
" df_with_only_shared_states = get_df_with_only_shared_states(\n",
|
||
" df=df,\n",
|
||
" field_a=method_a_priority_census_block_groups_field,\n",
|
||
" field_b=method_b_priority_census_tracts_field,\n",
|
||
" )\n",
|
||
"\n",
|
||
" comparison_df = get_comparison_df(\n",
|
||
" df=df_with_only_shared_states,\n",
|
||
" method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n",
|
||
" method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n",
|
||
" comparison_field_names=comparison_field_names,\n",
|
||
" other_census_tract_fields_to_keep=other_census_tract_fields_to_keep,\n",
|
||
" output_dir=output_dir,\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Write comparison to CSV.\n",
|
||
" file_path = (\n",
|
||
" output_dir / f\"Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
|
||
" )\n",
|
||
" comparison_df.to_csv(\n",
|
||
" path_or_buf=file_path,\n",
|
||
" na_rep=\"\",\n",
|
||
" index=False,\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Secondary comparison DF\n",
|
||
" secondary_comparison_df = get_secondary_comparison_df(\n",
|
||
" comparison_df=comparison_df,\n",
|
||
" comparison_field_names=comparison_field_names,\n",
|
||
" method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n",
|
||
" )\n",
|
||
"\n",
|
||
" # Write secondary comparison to CSV.\n",
|
||
" file_path = (\n",
|
||
" output_dir\n",
|
||
" / f\"Secondary Comparison Output - {method_a_name} and {method_b_name}.csv\"\n",
|
||
" )\n",
|
||
" secondary_comparison_df.to_csv(\n",
|
||
" path_or_buf=file_path,\n",
|
||
" na_rep=\"\",\n",
|
||
" index=False,\n",
|
||
" )\n",
|
||
"\n",
|
||
" markdown_content = get_comparison_markdown_content(\n",
|
||
" original_df=df_with_only_shared_states,\n",
|
||
" comparison_df=comparison_df,\n",
|
||
" comparison_field_names=comparison_field_names,\n",
|
||
" method_a_name=method_a_name,\n",
|
||
" method_b_name=method_b_name,\n",
|
||
" method_a_priority_census_block_groups_field=method_a_priority_census_block_groups_field,\n",
|
||
" method_b_priority_census_tracts_field=method_b_priority_census_tracts_field,\n",
|
||
" )\n",
|
||
"\n",
|
||
" comparison_docx_file_path = write_markdown_and_docx_content(\n",
|
||
" markdown_content=markdown_content,\n",
|
||
" file_dir=output_dir,\n",
|
||
" file_name_without_extension=f\"Comparison report - {method_a_name} and {method_b_name}\",\n",
|
||
" )\n",
|
||
"\n",
|
||
" return comparison_docx_file_path\n",
|
||
"\n",
|
||
"\n",
|
||
"def execute_comparisons(\n",
|
||
" df: pd.DataFrame,\n",
|
||
" census_block_group_indices: typing.List[Index],\n",
|
||
" census_tract_indices: typing.List[Index],\n",
|
||
"):\n",
|
||
" \"\"\"Create multiple comparison reports.\"\"\"\n",
|
||
" comparison_docx_file_paths = []\n",
|
||
" for cbg_index in census_block_group_indices:\n",
|
||
" for census_tract_index in census_tract_indices:\n",
|
||
" print(\n",
|
||
" f\"Running comparisons for {cbg_index.method_name} against {census_tract_index.method_name}...\"\n",
|
||
" )\n",
|
||
"\n",
|
||
" comparison_docx_file_path = execute_comparison(\n",
|
||
" df=df,\n",
|
||
" method_a_name=cbg_index.method_name,\n",
|
||
" method_b_name=census_tract_index.method_name,\n",
|
||
" method_a_priority_census_block_groups_field=cbg_index.priority_communities_field,\n",
|
||
" method_b_priority_census_tracts_field=census_tract_index.priority_communities_field,\n",
|
||
" other_census_tract_fields_to_keep=census_tract_index.other_census_tract_fields_to_keep,\n",
|
||
" )\n",
|
||
"\n",
|
||
" comparison_docx_file_paths.append(comparison_docx_file_path)\n",
|
||
"\n",
|
||
" return comparison_docx_file_paths"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9b8b6d1e",
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Running comparisons for Score A against CalEnviroScreen 4.0...\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "531ec4deb2f54c26ad0f5311fdea0e60",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/8057 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Running comparisons for Score A against HUD RECAP...\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "170da68ae0734892bef4a452b5de45f7",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/74001 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Actually execute the functions\n",
|
||
"file_paths = execute_comparisons(\n",
|
||
" df=merged_df,\n",
|
||
" census_block_group_indices=census_block_group_indices,\n",
|
||
" census_tract_indices=census_tract_indices,\n",
|
||
")\n",
|
||
"\n",
|
||
"print(file_paths)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "887ee948",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|