{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "93c7b73b", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import collections\n", "import functools\n", "import IPython\n", "import itertools\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import pathlib\n", "import pypandoc\n", "import requests\n", "import string\n", "import sys\n", "import typing\n", "import us\n", "import zipfile\n", "\n", "from datetime import datetime\n", "from tqdm.notebook import tqdm_notebook\n", "\n", "module_path = os.path.abspath(os.path.join(\"..\"))\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", "\n", "from utils import remove_all_from_dir, get_excel_column_name\n", "from etl.sources.census.etl_utils import get_state_information\n", "\n", "\n", "# Register tqdm with pandas so that `progress_apply` (the progress-bar variant of `apply`) is available.\n", "tqdm_notebook.pandas()" ] }, { "cell_type": "code", "execution_count": null, "id": "881424fd", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Show floats with two decimal places rather than scientific notation (which otherwise shows up for census tract IDs).\n", "pd.options.display.float_format = \"{:.2f}\".format\n", "\n", "# Set some global parameters\n", "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n", "TEMP_DATA_DIR = DATA_DIR / \"tmp\"\n", "COMPARISON_OUTPUTS_DIR = DATA_DIR / \"comparison_outputs\"\n", "\n", "# Make the dirs if they don't exist\n", "TEMP_DATA_DIR.mkdir(parents=True, exist_ok=True)\n", "COMPARISON_OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)\n", "\n", "CEJST_PRIORITY_COMMUNITY_THRESHOLD = 0.75\n", "\n", "# Name fields using variables. 
(This makes it easy to reference the same fields frequently without using strings\n", "# and introducing the risk of misspelling the field name.)\n", "\n", "GEOID_FIELD_NAME = \"GEOID10\"\n", "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n", "COUNTRY_FIELD_NAME = \"Country\"\n", "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"\n", "\n", "CEJST_SCORE_FIELD = \"cejst_score\"\n", "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n", "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", "\n", "# Define some suffixes\n", "POPULATION_SUFFIX = \" (priority population)\"" ] }, { "cell_type": "code", "execution_count": null, "id": "c5f3eaa5", "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3169: DtypeWarning: Columns (87,88,90) have mixed types.Specify dtype option on import or set low_memory=False.\n", " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n" ] }, { "data": { "text/html": [ "
\n", " | GEOID10 | \n", "Housing burden (percent) | \n", "Total population | \n", "Air toxics cancer risk | \n", "Respiratory hazard index | \n", "Diesel particulate matter | \n", "Particulate matter (PM2.5) | \n", "Ozone | \n", "Traffic proximity and volume | \n", "Proximity to RMP sites | \n", "... | \n", "Score D (top 25th percentile) | \n", "Score E (percentile) | \n", "Score E (top 25th percentile) | \n", "GEOID | \n", "State Abbreviation | \n", "County Name | \n", "State Code | \n", "State Name | \n", "GEOID10_TRACT | \n", "GEOID10_STATE | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "010010201001 | \n", "0.15 | \n", "692 | \n", "49.38 | \n", "0.79 | \n", "0.28 | \n", "10.00 | \n", "40.12 | \n", "91.02 | \n", "0.09 | \n", "... | \n", "False | \n", "0.35 | \n", "False | \n", "1001 | \n", "AL | \n", "Autauga County | \n", "1.00 | \n", "Alabama | \n", "01001020100 | \n", "01 | \n", "
1 | \n", "010010201002 | \n", "0.15 | \n", "1153 | \n", "49.38 | \n", "0.79 | \n", "0.28 | \n", "10.00 | \n", "40.12 | \n", "2.62 | \n", "0.07 | \n", "... | \n", "False | \n", "0.11 | \n", "False | \n", "1001 | \n", "AL | \n", "Baldwin County | \n", "2.00 | \n", "Alaska | \n", "01001020100 | \n", "01 | \n", "
2 | \n", "010010202001 | \n", "0.25 | \n", "1020 | \n", "50.32 | \n", "0.81 | \n", "0.30 | \n", "10.07 | \n", "40.22 | \n", "4.68 | \n", "0.08 | \n", "... | \n", "False | \n", "0.51 | \n", "False | \n", "1001 | \n", "AL | \n", "Barbour County | \n", "4.00 | \n", "Arizona | \n", "01001020200 | \n", "01 | \n", "
3 | \n", "010010202002 | \n", "0.25 | \n", "1152 | \n", "50.32 | \n", "0.81 | \n", "0.30 | \n", "10.07 | \n", "40.22 | \n", "218.65 | \n", "0.09 | \n", "... | \n", "False | \n", "0.59 | \n", "False | \n", "1001 | \n", "AL | \n", "Bibb County | \n", "5.00 | \n", "Arkansas | \n", "01001020200 | \n", "01 | \n", "
4 | \n", "010010203001 | \n", "0.21 | \n", "2555 | \n", "50.77 | \n", "0.82 | \n", "0.36 | \n", "10.12 | \n", "40.31 | \n", "69.64 | \n", "0.08 | \n", "... | \n", "False | \n", "0.47 | \n", "False | \n", "1001 | \n", "AL | \n", "Blount County | \n", "6.00 | \n", "California | \n", "01001020300 | \n", "01 | \n", "
5 rows × 93 columns
\n", "\n", " | GEOID10_TRACT | \n", "Total Population | \n", "California County | \n", "ZIP | \n", "Nearby City \\r\\n(to help approximate location only) | \n", "Longitude | \n", "Latitude | \n", "calenviroscreen_score | \n", "calenviroscreen_percentile | \n", "DRAFT CES 4.0\\r\\nPercentile Range | \n", "... | \n", "Poverty | \n", "Poverty Pctl | \n", "Unemployment | \n", "Unemployment Pctl | \n", "Housing Burden | \n", "Housing Burden Pctl | \n", "Pop. Char. | \n", "Pop. Char. Score | \n", "Pop. Char. Pctl | \n", "calenviroscreen_priority_community | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "06019001100 | \n", "2760 | \n", "Fresno | \n", "93706 | \n", "Fresno | \n", "-119.78 | \n", "36.71 | \n", "94.61 | \n", "100.00 | \n", "95-100% (highest scores) | \n", "... | \n", "76.60 | \n", "98.43 | \n", "16.20 | \n", "97.15 | \n", "30.70 | \n", "90.61 | \n", "93.73 | \n", "9.72 | \n", "99.87 | \n", "True | \n", "
1 | \n", "06077000700 | \n", "4177 | \n", "San Joaquin | \n", "95206 | \n", "Stockton | \n", "-121.29 | \n", "37.94 | \n", "90.83 | \n", "99.99 | \n", "95-100% (highest scores) | \n", "... | \n", "70.60 | \n", "96.43 | \n", "18.50 | \n", "98.45 | \n", "35.20 | \n", "95.61 | \n", "93.40 | \n", "9.68 | \n", "99.84 | \n", "True | \n", "
2 | \n", "06077000100 | \n", "4055 | \n", "San Joaquin | \n", "95202 | \n", "Stockton | \n", "-121.29 | \n", "37.95 | \n", "85.75 | \n", "99.97 | \n", "95-100% (highest scores) | \n", "... | \n", "81.80 | \n", "99.50 | \n", "17.90 | \n", "98.17 | \n", "36.40 | \n", "96.51 | \n", "95.71 | \n", "9.92 | \n", "99.97 | \n", "True | \n", "
3 | \n", "06071001600 | \n", "5527 | \n", "San Bernardino | \n", "91761 | \n", "Ontario | \n", "-117.62 | \n", "34.06 | \n", "83.56 | \n", "99.96 | \n", "95-100% (highest scores) | \n", "... | \n", "67.10 | \n", "94.82 | \n", "6.70 | \n", "57.20 | \n", "32.10 | \n", "92.65 | \n", "80.59 | \n", "8.36 | \n", "93.06 | \n", "True | \n", "
4 | \n", "06037204920 | \n", "2639 | \n", "Los Angeles | \n", "90023 | \n", "Los Angeles | \n", "-118.20 | \n", "34.02 | \n", "82.90 | \n", "99.95 | \n", "95-100% (highest scores) | \n", "... | \n", "64.90 | \n", "93.51 | \n", "5.60 | \n", "43.81 | \n", "25.00 | \n", "77.95 | \n", "83.95 | \n", "8.70 | \n", "95.78 | \n", "True | \n", "
5 rows × 59 columns
\n", "\n", " | FID | \n", "GEOID10_TRACT | \n", "STATE | \n", "STUSAB | \n", "STATE_NAME | \n", "COUNTY | \n", "COUNTY_NAME | \n", "CNTY_FIPS | \n", "TRACT | \n", "RCAP_90 | \n", "RCAP_00 | \n", "RCAP_10 | \n", "hud_recap_priority_community | \n", "SHAPE_Length | \n", "SHAPE_Area | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "29993 | \n", "01001020100 | \n", "1 | \n", "AL | \n", "Alabama | \n", "1 | \n", "Autauga | \n", "1001 | \n", "20100 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.15 | \n", "0.00 | \n", "
1 | \n", "30627 | \n", "01001020200 | \n", "1 | \n", "AL | \n", "Alabama | \n", "1 | \n", "Autauga | \n", "1001 | \n", "20200 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.09 | \n", "0.00 | \n", "
2 | \n", "29992 | \n", "01001020300 | \n", "1 | \n", "AL | \n", "Alabama | \n", "1 | \n", "Autauga | \n", "1001 | \n", "20300 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.10 | \n", "0.00 | \n", "
3 | \n", "30079 | \n", "01001020400 | \n", "1 | \n", "AL | \n", "Alabama | \n", "1 | \n", "Autauga | \n", "1001 | \n", "20400 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.12 | \n", "0.00 | \n", "
4 | \n", "30078 | \n", "01001020500 | \n", "1 | \n", "AL | \n", "Alabama | \n", "1 | \n", "Autauga | \n", "1001 | \n", "20500 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.16 | \n", "0.00 | \n", "
\n", " | GEOID10_TRACT | \n", "Total Population | \n", "California County | \n", "ZIP | \n", "Nearby City \\r\\n(to help approximate location only) | \n", "Longitude | \n", "Latitude | \n", "calenviroscreen_score | \n", "calenviroscreen_percentile | \n", "DRAFT CES 4.0\\r\\nPercentile Range | \n", "... | \n", "COUNTY | \n", "COUNTY_NAME | \n", "CNTY_FIPS | \n", "TRACT | \n", "RCAP_90 | \n", "RCAP_00 | \n", "RCAP_10 | \n", "hud_recap_priority_community | \n", "SHAPE_Length | \n", "SHAPE_Area | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "06019001100 | \n", "2760.00 | \n", "Fresno | \n", "93706.00 | \n", "Fresno | \n", "-119.78 | \n", "36.71 | \n", "94.61 | \n", "100.00 | \n", "95-100% (highest scores) | \n", "... | \n", "19 | \n", "Fresno | \n", "6019 | \n", "1100 | \n", "0.00 | \n", "1.00 | \n", "1.00 | \n", "True | \n", "0.09 | \n", "0.00 | \n", "
1 | \n", "06077000700 | \n", "4177.00 | \n", "San Joaquin | \n", "95206.00 | \n", "Stockton | \n", "-121.29 | \n", "37.94 | \n", "90.83 | \n", "99.99 | \n", "95-100% (highest scores) | \n", "... | \n", "77 | \n", "San Joaquin | \n", "6077 | \n", "700 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "True | \n", "0.07 | \n", "0.00 | \n", "
2 | \n", "06077000100 | \n", "4055.00 | \n", "San Joaquin | \n", "95202.00 | \n", "Stockton | \n", "-121.29 | \n", "37.95 | \n", "85.75 | \n", "99.97 | \n", "95-100% (highest scores) | \n", "... | \n", "77 | \n", "San Joaquin | \n", "6077 | \n", "100 | \n", "1.00 | \n", "1.00 | \n", "1.00 | \n", "True | \n", "0.06 | \n", "0.00 | \n", "
3 | \n", "06071001600 | \n", "5527.00 | \n", "San Bernardino | \n", "91761.00 | \n", "Ontario | \n", "-117.62 | \n", "34.06 | \n", "83.56 | \n", "99.96 | \n", "95-100% (highest scores) | \n", "... | \n", "71 | \n", "San Bernardino | \n", "6071 | \n", "1600 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "True | \n", "0.25 | \n", "0.00 | \n", "
4 | \n", "06037204920 | \n", "2639.00 | \n", "Los Angeles | \n", "90023.00 | \n", "Los Angeles | \n", "-118.20 | \n", "34.02 | \n", "82.90 | \n", "99.95 | \n", "95-100% (highest scores) | \n", "... | \n", "37 | \n", "Los Angeles | \n", "6037 | \n", "204920 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.04 | \n", "0.00 | \n", "
5 rows × 73 columns
\n", "\n", " | GEOID10 | \n", "Housing burden (percent) | \n", "Total population | \n", "Air toxics cancer risk | \n", "Respiratory hazard index | \n", "Diesel particulate matter | \n", "Particulate matter (PM2.5) | \n", "Ozone_x | \n", "Traffic proximity and volume | \n", "Proximity to RMP sites | \n", "... | \n", "COUNTY | \n", "COUNTY_NAME | \n", "CNTY_FIPS | \n", "TRACT | \n", "RCAP_90 | \n", "RCAP_00 | \n", "RCAP_10 | \n", "hud_recap_priority_community | \n", "SHAPE_Length | \n", "SHAPE_Area | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "010010201001 | \n", "0.15 | \n", "692 | \n", "49.38 | \n", "0.79 | \n", "0.28 | \n", "10.00 | \n", "40.12 | \n", "91.02 | \n", "0.09 | \n", "... | \n", "1.00 | \n", "Autauga | \n", "1001.00 | \n", "20100.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.15 | \n", "0.00 | \n", "
1 | \n", "010010201002 | \n", "0.15 | \n", "1153 | \n", "49.38 | \n", "0.79 | \n", "0.28 | \n", "10.00 | \n", "40.12 | \n", "2.62 | \n", "0.07 | \n", "... | \n", "1.00 | \n", "Autauga | \n", "1001.00 | \n", "20100.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.15 | \n", "0.00 | \n", "
2 | \n", "010010202001 | \n", "0.25 | \n", "1020 | \n", "50.32 | \n", "0.81 | \n", "0.30 | \n", "10.07 | \n", "40.22 | \n", "4.68 | \n", "0.08 | \n", "... | \n", "1.00 | \n", "Autauga | \n", "1001.00 | \n", "20200.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.09 | \n", "0.00 | \n", "
3 | \n", "010010202002 | \n", "0.25 | \n", "1152 | \n", "50.32 | \n", "0.81 | \n", "0.30 | \n", "10.07 | \n", "40.22 | \n", "218.65 | \n", "0.09 | \n", "... | \n", "1.00 | \n", "Autauga | \n", "1001.00 | \n", "20200.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.09 | \n", "0.00 | \n", "
4 | \n", "010010203001 | \n", "0.21 | \n", "2555 | \n", "50.77 | \n", "0.82 | \n", "0.36 | \n", "10.12 | \n", "40.31 | \n", "69.64 | \n", "0.08 | \n", "... | \n", "1.00 | \n", "Autauga | \n", "1001.00 | \n", "20300.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "False | \n", "0.10 | \n", "0.00 | \n", "
5 rows × 165 columns
\n", "\n", " | \n", " | GEOID10_STATE | \n", "State name | \n", "Total CBGs in state | \n", "Total population in state | \n", "Score A (top 25th percentile) (priority population) | \n", "Score A (top 25th percentile) (total CBGs) | \n", "Score A (top 25th percentile) (percent CBGs) | \n", "Score A (top 25th percentile) (percent population) | \n", "Score B (top 25th percentile) (priority population) | \n", "Score B (top 25th percentile) (total CBGs) | \n", "... | \n", "Score E (top 25th percentile) (percent CBGs) | \n", "Score E (top 25th percentile) (percent population) | \n", "calenviroscreen_priority_community (priority population) | \n", "calenviroscreen_priority_community (total CBGs) | \n", "calenviroscreen_priority_community (percent CBGs) | \n", "calenviroscreen_priority_community (percent population) | \n", "hud_recap_priority_community (priority population) | \n", "hud_recap_priority_community (total CBGs) | \n", "hud_recap_priority_community (percent CBGs) | \n", "hud_recap_priority_community (percent population) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
GEOID10_STATE | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
01 | \n", "0 | \n", "01 | \n", "Alabama | \n", "3438 | \n", "4850771 | \n", "1547345 | \n", "1326 | \n", "0.39 | \n", "0.32 | \n", "1556417 | \n", "1323 | \n", "... | \n", "0.23 | \n", "0.19 | \n", "0 | \n", "0 | \n", "0.00 | \n", "0.00 | \n", "235117 | \n", "258 | \n", "0.08 | \n", "0.05 | \n", "
02 | \n", "0 | \n", "02 | \n", "Alaska | \n", "534 | \n", "738565 | \n", "63868 | \n", "57 | \n", "0.11 | \n", "0.09 | \n", "63868 | \n", "57 | \n", "... | \n", "0.14 | \n", "0.12 | \n", "0 | \n", "0 | \n", "0.00 | \n", "0.00 | \n", "6536 | \n", "8 | \n", "0.01 | \n", "0.01 | \n", "
04 | \n", "0 | \n", "04 | \n", "Arizona | \n", "4178 | \n", "6809946 | \n", "1956052 | \n", "1230 | \n", "0.29 | \n", "0.29 | \n", "1960856 | \n", "1231 | \n", "... | \n", "0.30 | \n", "0.30 | \n", "0 | \n", "0 | \n", "0.00 | \n", "0.00 | \n", "560353 | \n", "378 | \n", "0.09 | \n", "0.08 | \n", "
05 | \n", "0 | \n", "05 | \n", "Arkansas | \n", "2147 | \n", "2977944 | \n", "960799 | \n", "817 | \n", "0.38 | \n", "0.32 | \n", "975780 | \n", "826 | \n", "... | \n", "0.20 | \n", "0.18 | \n", "0 | \n", "0 | \n", "0.00 | \n", "0.00 | \n", "101200 | \n", "106 | \n", "0.05 | \n", "0.03 | \n", "
06 | \n", "0 | \n", "06 | \n", "California | \n", "23212 | \n", "38982847 | \n", "12610810 | \n", "7102 | \n", "0.31 | \n", "0.32 | \n", "12556846 | \n", "7065 | \n", "... | \n", "0.40 | \n", "0.42 | \n", "9610287 | \n", "5690 | \n", "0.25 | \n", "0.25 | \n", "1748765 | \n", "1013 | \n", "0.04 | \n", "0.04 | \n", "
5 rows × 32 columns
\n", "