Scores D & E (#266)

* running black throughout

* adding housing

* hud housing etl working

* got score d and e working

* updating scoring comparison

* minor fixes

* small changes

* small comments
This commit is contained in:
Lucas Merrill Brown 2021-06-29 08:20:23 -07:00 committed by Nat Hillard
parent 902e8037fe
commit caf9fe0e00
6 changed files with 625 additions and 74 deletions

View file

@ -14,7 +14,7 @@
"import os\n", "import os\n",
"import sys\n", "import sys\n",
"\n", "\n",
"module_path = os.path.abspath(os.path.join('..'))\n", "module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n", "if module_path not in sys.path:\n",
" sys.path.append(module_path)\n", " sys.path.append(module_path)\n",
"\n", "\n",
@ -26,13 +26,36 @@
"OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n", "OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
"\n", "\n",
"GEOID_FIELD_NAME = \"GEOID10\"\n", "GEOID_FIELD_NAME = \"GEOID10\"\n",
"UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n", "UNEMPLOYED_FIELD_NAME = \"Unemployed civilians (percent)\"\n",
"LINGUISTIC_ISOLATION_FIELD_NAME = \"Linguistic isolation (percent)\"\n",
"LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = \"Linguistic isolation (total)\"\n",
"\n",
"LINGUISTIC_ISOLATION_FIELDS = [\n",
" \"C16002_001E\",\n",
" \"C16002_004E\",\n",
" \"C16002_007E\",\n",
" \"C16002_010E\",\n",
" \"C16002_013E\",\n",
"]\n",
"\n", "\n",
"# Some display settings to make pandas outputs more readable.\n", "# Some display settings to make pandas outputs more readable.\n",
"pd.set_option(\"display.expand_frame_repr\", False)\n", "pd.set_option(\"display.expand_frame_repr\", False)\n",
"pd.set_option(\"display.precision\", 2)" "pd.set_option(\"display.precision\", 2)"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "64df0b63",
"metadata": {},
"outputs": [],
"source": [
"# For variable discovery, if necessary.\n",
"# censusdata.search(\n",
"# \"acs5\", 2019, \"label\", \"Limited English speaking\"\n",
"# )"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -44,7 +67,8 @@
"source": [ "source": [
"# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n", "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
"# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n", "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B23025\"))" "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B23025\"))\n",
"censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"C16002\"))"
] ]
}, },
{ {
@ -73,10 +97,16 @@
" geo=censusdata.censusgeo(\n", " geo=censusdata.censusgeo(\n",
" [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n", " [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n",
" ),\n", " ),\n",
" var=[\"B23025_005E\", \"B23025_003E\"],\n", " var=[\n",
 "        # Employment fields\n",
" \"B23025_005E\",\n",
" \"B23025_003E\",\n",
" ]\n",
" + LINGUISTIC_ISOLATION_FIELDS,\n",
" )\n", " )\n",
" )\n", " )\n",
"\n", "\n",
"\n",
"df = pd.concat(dfs)\n", "df = pd.concat(dfs)\n",
"\n", "\n",
"df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n", "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
@ -97,7 +127,34 @@
"# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.\n", "# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.\n",
"df[UNEMPLOYED_FIELD_NAME] = df.B23025_005E / df.B23025_003E\n", "df[UNEMPLOYED_FIELD_NAME] = df.B23025_005E / df.B23025_003E\n",
"\n", "\n",
"df.head()" "df[UNEMPLOYED_FIELD_NAME].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e475472c",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Calculate linguistic isolation.\n",
"individual_limited_english_fields = [\n",
" \"C16002_004E\",\n",
" \"C16002_007E\",\n",
" \"C16002_010E\",\n",
" \"C16002_013E\",\n",
"]\n",
"\n",
"df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME] = df[individual_limited_english_fields].sum(\n",
" axis=1, skipna=True\n",
")\n",
"df[LINGUISTIC_ISOLATION_FIELD_NAME] = (\n",
" df[LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME].astype(float) / df[\"C16002_001E\"]\n",
")\n",
"\n",
"df[LINGUISTIC_ISOLATION_FIELD_NAME].describe()"
] ]
}, },
{ {
@ -112,18 +169,14 @@
"# mkdir census\n", "# mkdir census\n",
"OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n", "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n", "\n",
"columns_to_include = [GEOID_FIELD_NAME, UNEMPLOYED_FIELD_NAME]\n", "columns_to_include = [\n",
" GEOID_FIELD_NAME,\n",
" UNEMPLOYED_FIELD_NAME,\n",
" LINGUISTIC_ISOLATION_FIELD_NAME,\n",
"]\n",
"\n", "\n",
"df[columns_to_include].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)" "df[columns_to_include].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91932af5",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -142,7 +195,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.0" "version": "3.7.1"
} }
}, },
"nbformat": 4, "nbformat": 4,

View file

@ -14,7 +14,7 @@
"import sys\n", "import sys\n",
"import os\n", "import os\n",
"\n", "\n",
"module_path = os.path.abspath(os.path.join('..'))\n", "module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n", "if module_path not in sys.path:\n",
" sys.path.append(module_path)\n", " sys.path.append(module_path)\n",
"\n", "\n",
@ -23,7 +23,9 @@
"\n", "\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n", "DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH = DATA_PATH / \"tmp\"\n", "TMP_PATH = DATA_PATH / \"tmp\"\n",
"EJSCREEN_FTP_URL = \"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\"\n", "EJSCREEN_FTP_URL = (\n",
" \"https://gaftp.epa.gov/EJSCREEN/2020/EJSCREEN_2020_StatePctile.csv.zip\"\n",
")\n",
"EJSCREEN_CSV = TMP_PATH / \"EJSCREEN_2020_StatePctile.csv\"\n", "EJSCREEN_CSV = TMP_PATH / \"EJSCREEN_2020_StatePctile.csv\"\n",
"CSV_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2020\"\n", "CSV_PATH = DATA_PATH / \"dataset\" / \"ejscreen_2020\"\n",
"print(DATA_PATH)" "print(DATA_PATH)"
@ -49,7 +51,13 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"df = pd.read_csv(EJSCREEN_CSV, dtype={\"ID\": \"string\"}, low_memory=False)" "df = pd.read_csv(\n",
" EJSCREEN_CSV,\n",
" dtype={\"ID\": \"string\"},\n",
" # EJSCREEN writes the word \"None\" for NA data.\n",
" na_values=[\"None\"],\n",
" low_memory=False,\n",
")"
] ]
}, },
{ {
@ -89,14 +97,6 @@
"# cleanup\n", "# cleanup\n",
"remove_all_from_dir(TMP_PATH)" "remove_all_from_dir(TMP_PATH)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d4f74d7",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -115,7 +115,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.0" "version": "3.7.1"
} }
}, },
"nbformat": 4, "nbformat": 4,

View file

@ -14,7 +14,7 @@
"import os\n", "import os\n",
"import sys\n", "import sys\n",
"\n", "\n",
"module_path = os.path.abspath(os.path.join('..'))\n", "module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n", "if module_path not in sys.path:\n",
" sys.path.append(module_path)\n", " sys.path.append(module_path)\n",
"\n", "\n",
@ -44,7 +44,7 @@
"for fips in get_state_fips_codes(DATA_PATH):\n", "for fips in get_state_fips_codes(DATA_PATH):\n",
" print(f\"Downloading housing data for state/territory with FIPS code {fips}\")\n", " print(f\"Downloading housing data for state/territory with FIPS code {fips}\")\n",
" unzip_file_from_url(f\"{HOUSING_FTP_URL}{fips}\", TMP_PATH, zip_file_dir)\n", " unzip_file_from_url(f\"{HOUSING_FTP_URL}{fips}\", TMP_PATH, zip_file_dir)\n",
" \n", "\n",
" # New file name:\n", " # New file name:\n",
" tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n", " tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n",
" tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n", " tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
@ -90,14 +90,6 @@
"# cleanup\n", "# cleanup\n",
"remove_all_from_dir(TMP_PATH)" "remove_all_from_dir(TMP_PATH)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9269e497",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -116,7 +108,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.0" "version": "3.7.1"
} }
}, },
"nbformat": 4, "nbformat": 4,

View file

@ -0,0 +1,274 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "c21b63a3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import censusdata\n",
"import csv\n",
"from pathlib import Path\n",
"import os\n",
"import re\n",
"import sys\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from etl.sources.census.etl_utils import get_state_fips_codes\n",
"from utils import unzip_file_from_url, remove_all_from_dir\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH = DATA_PATH / \"tmp\"\n",
"OUTPUT_PATH = DATA_PATH / \"dataset\" / \"hud_housing\"\n",
"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"\n",
"# We measure households earning less than 80% of HUD Area Median Family Income by county\n",
"# and paying greater than 30% of their income to housing costs.\n",
"HOUSING_BURDEN_FIELD_NAME = \"Housing burden (percent)\"\n",
"HOUSING_BURDEN_NUMERATOR_FIELD_NAME = \"HOUSING_BURDEN_NUMERATOR\"\n",
"HOUSING_BURDEN_DENOMINATOR_FIELD_NAME = \"HOUSING_BURDEN_DENOMINATOR\"\n",
"\n",
"# Note: some variable definitions.\n",
"# HUD-adjusted median family income (HAMFI).\n",
"# The four housing problems are: incomplete kitchen facilities, incomplete plumbing facilities, more than 1 person per room, and cost burden greater than 30%.\n",
"# Table 8 is the desired table."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6696bc66",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Download the data.\n",
"dfs = []\n",
"zip_file_dir = TMP_PATH / \"hud_housing\"\n",
"\n",
"print(f\"Downloading 225MB housing data\")\n",
"unzip_file_from_url(\n",
" \"https://www.huduser.gov/portal/datasets/cp/2012thru2016-140-csv.zip\",\n",
" TMP_PATH,\n",
" zip_file_dir,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e954589",
"metadata": {},
"outputs": [],
"source": [
"# New file name:\n",
"tmp_csv_file_path = (\n",
" zip_file_dir\n",
" / \"2012thru2016-140-csv\"\n",
" / \"2012thru2016-140-csv\"\n",
" / \"140\"\n",
" / \"Table8.csv\"\n",
")\n",
"df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "244e0d03",
"metadata": {},
"outputs": [],
"source": [
"# Rename and reformat block group ID\n",
"df.rename(columns={\"geoid\": GEOID_TRACT_FIELD_NAME}, inplace=True)\n",
"\n",
"# The CHAS data has census tract ids such as `14000US01001020100`\n",
"# Whereas the rest of our data uses, for the same tract, `01001020100`.\n",
 "# We remove the characters before `US`:\n",
"df[GEOID_TRACT_FIELD_NAME] = df[GEOID_TRACT_FIELD_NAME].str.replace(\n",
" r\"^.*?US\", \"\", regex=True\n",
")\n",
"\n",
"df[GEOID_TRACT_FIELD_NAME].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03250026",
"metadata": {},
"outputs": [],
"source": [
"# Calculate housing burden\n",
"# This is quite a number of steps. It does not appear to be accessible nationally in a simpler format, though.\n",
"# See \"CHAS data dictionary 12-16.xlsx\"\n",
"\n",
"# Owner occupied numerator fields\n",
"OWNER_OCCUPIED_NUMERATOR_FIELDS = [\n",
" # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
" # T8_est7\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
" \"T8_est7\",\n",
" # T8_est10\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tgreater than 50%\tAll\n",
" \"T8_est10\",\n",
" # T8_est20\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
" \"T8_est20\",\n",
" # T8_est23\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 50%\tAll\n",
" \"T8_est23\",\n",
" # T8_est33\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
" \"T8_est33\",\n",
" # T8_est36\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 50%\tAll\n",
" \"T8_est36\",\n",
"]\n",
"\n",
"# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
"OWNER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
" # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
" # T8_est13\tSubtotal\tOwner occupied\tless than or equal to 30% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est13\",\n",
" # T8_est26\tSubtotal\tOwner occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est26\",\n",
" # T8_est39\tSubtotal\tOwner occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est39\",\n",
" # T8_est52\tSubtotal\tOwner occupied\tgreater than 80% but less than or equal to 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est52\",\n",
" # T8_est65\tSubtotal\tOwner occupied\tgreater than 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est65\",\n",
"]\n",
"\n",
"# T8_est2\tSubtotal\tOwner occupied\tAll\tAll\tAll\n",
"OWNER_OCCUPIED_POPULATION_FIELD = \"T8_est2\"\n",
"\n",
"# Renter occupied numerator fields\n",
"RENTER_OCCUPIED_NUMERATOR_FIELDS = [\n",
" # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
" # T8_est73\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
" \"T8_est73\",\n",
" # T8_est76\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tgreater than 50%\tAll\n",
" \"T8_est76\",\n",
" # T8_est86\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
" \"T8_est86\",\n",
" # T8_est89\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tgreater than 50%\tAll\n",
" \"T8_est89\",\n",
" # T8_est99\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 30% but less than or equal to 50%\tAll\n",
" \"T8_est99\",\n",
" # T8_est102\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tgreater than 50%\tAll\n",
" \"T8_est102\",\n",
"]\n",
"\n",
"# These rows have the values where HAMFI was not computed, b/c of no or negative income.\n",
"RENTER_OCCUPIED_NOT_COMPUTED_FIELDS = [\n",
" # Key: Column Name\tLine_Type\tTenure\tHousehold income\tCost burden\tFacilities\n",
" # T8_est79\tSubtotal\tRenter occupied\tless than or equal to 30% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est79\",\n",
" # T8_est92\tSubtotal\tRenter occupied\tgreater than 30% but less than or equal to 50% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est92\",\n",
" # T8_est105\tSubtotal\tRenter occupied\tgreater than 50% but less than or equal to 80% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est105\",\n",
" # T8_est118\tSubtotal\tRenter occupied\tgreater than 80% but less than or equal to 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est118\",\n",
" # T8_est131\tSubtotal\tRenter occupied\tgreater than 100% of HAMFI\tnot computed (no/negative income)\tAll\n",
" \"T8_est131\",\n",
"]\n",
"\n",
"\n",
"# T8_est68\tSubtotal\tRenter occupied\tAll\tAll\tAll\n",
"RENTER_OCCUPIED_POPULATION_FIELD = \"T8_est68\"\n",
"\n",
"\n",
"# Math:\n",
"# (\n",
"# # of Owner Occupied Units Meeting Criteria\n",
"# + # of Renter Occupied Units Meeting Criteria\n",
"# )\n",
"# divided by\n",
"# (\n",
"# Total # of Owner Occupied Units\n",
"# + Total # of Renter Occupied Units\n",
"# - # of Owner Occupied Units with HAMFI Not Computed\n",
"# - # of Renter Occupied Units with HAMFI Not Computed\n",
"# )\n",
"\n",
"df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME] = df[OWNER_OCCUPIED_NUMERATOR_FIELDS].sum(\n",
" axis=1\n",
") + df[RENTER_OCCUPIED_NUMERATOR_FIELDS].sum(axis=1)\n",
"\n",
"df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME] = (\n",
" df[OWNER_OCCUPIED_POPULATION_FIELD]\n",
" + df[RENTER_OCCUPIED_POPULATION_FIELD]\n",
" - df[OWNER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
" - df[RENTER_OCCUPIED_NOT_COMPUTED_FIELDS].sum(axis=1)\n",
")\n",
"\n",
"# TODO: add small sample size checks\n",
"df[HOUSING_BURDEN_FIELD_NAME] = df[HOUSING_BURDEN_NUMERATOR_FIELD_NAME].astype(\n",
" float\n",
") / df[HOUSING_BURDEN_DENOMINATOR_FIELD_NAME].astype(float)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8275c1ef",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Drop unnecessary fields\n",
"df[\n",
" [\n",
" GEOID_TRACT_FIELD_NAME,\n",
" HOUSING_BURDEN_NUMERATOR_FIELD_NAME,\n",
" HOUSING_BURDEN_DENOMINATOR_FIELD_NAME,\n",
" HOUSING_BURDEN_FIELD_NAME,\n",
" ]\n",
"].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef5bb862",
"metadata": {},
"outputs": [],
"source": [
"# cleanup\n",
"remove_all_from_dir(TMP_PATH)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -11,6 +11,7 @@
"# 1. `ejscreen_etl.ipynb`\n", "# 1. `ejscreen_etl.ipynb`\n",
"# 2. `census_etl.ipynb`\n", "# 2. `census_etl.ipynb`\n",
"# 3. `housing_and_transportation_etl.ipynb`\n", "# 3. `housing_and_transportation_etl.ipynb`\n",
"# 4. `hud_housing_etl.ipynb`\n",
"\n", "\n",
"import collections\n", "import collections\n",
"import functools\n", "import functools\n",
@ -20,7 +21,7 @@
"import os\n", "import os\n",
"import sys\n", "import sys\n",
"\n", "\n",
"module_path = os.path.abspath(os.path.join('..'))\n", "module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n", "if module_path not in sys.path:\n",
" sys.path.append(module_path)\n", " sys.path.append(module_path)\n",
"\n", "\n",
@ -28,6 +29,7 @@
"\n", "\n",
"# Define some global parameters\n", "# Define some global parameters\n",
"GEOID_FIELD_NAME = \"GEOID10\"\n", "GEOID_FIELD_NAME = \"GEOID10\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"BUCKET_SOCIOECONOMIC = \"Socioeconomic Factors\"\n", "BUCKET_SOCIOECONOMIC = \"Socioeconomic Factors\"\n",
"BUCKET_SENSITIVE = \"Sensitive populations\"\n", "BUCKET_SENSITIVE = \"Sensitive populations\"\n",
"BUCKET_ENVIRONMENTAL = \"Environmental effects\"\n", "BUCKET_ENVIRONMENTAL = \"Environmental effects\"\n",
@ -39,11 +41,22 @@
" BUCKET_EXPOSURES,\n", " BUCKET_EXPOSURES,\n",
"]\n", "]\n",
"\n", "\n",
"# A few specific field names\n",
"# TODO: clean this up, I name some fields but not others.\n",
"UNEMPLOYED_FIELD_NAME = \"Unemployed civilians (percent)\"\n",
"LINGUISTIC_ISOLATION_FIELD_NAME = \"Linguistic isolation (percent)\"\n",
"HOUSING_BURDEN_FIELD_NAME = \"Housing burden (percent)\"\n",
"POVERTY_FIELD_NAME = \"Poverty (Less than 200% of federal poverty line)\"\n",
"HIGH_SCHOOL_FIELD_NAME = (\n",
" \"Percent individuals age 25 or over with less than high school degree\"\n",
")\n",
"\n",
"# There's another aggregation level (a second level of \"buckets\").\n", "# There's another aggregation level (a second level of \"buckets\").\n",
"AGGREGATION_POLLUTION = \"Pollution Burden\"\n", "AGGREGATION_POLLUTION = \"Pollution Burden\"\n",
"AGGREGATION_POPULATION = \"Population Characteristics\"\n", "AGGREGATION_POPULATION = \"Population Characteristics\"\n",
"\n", "\n",
"PERCENTILE_FIELD_SUFFIX = \" (percentile)\"\n", "PERCENTILE_FIELD_SUFFIX = \" (percentile)\"\n",
"MIN_MAX_FIELD_SUFFIX = \" (min-max normalized)\"\n",
"\n", "\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n", "DATA_PATH = Path.cwd().parent / \"data\"\n",
"SCORE_CSV_PATH = DATA_PATH / \"score\" / \"csv\"\n", "SCORE_CSV_PATH = DATA_PATH / \"score\" / \"csv\"\n",
@ -102,6 +115,23 @@
"housing_and_transportation_df.head()" "housing_and_transportation_df.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "a9202e5d",
"metadata": {},
"outputs": [],
"source": [
"# Load HUD housing data\n",
"hud_housing_csv = DATA_PATH / \"dataset\" / \"hud_housing\" / \"usa.csv\"\n",
"hud_housing_df = pd.read_csv(\n",
" hud_housing_csv,\n",
" dtype={GEOID_TRACT_FIELD_NAME: \"string\"},\n",
" low_memory=False,\n",
")\n",
"hud_housing_df.head()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -110,15 +140,59 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Join all the data sources that use census block groups\n", "# Join all the data sources that use census block groups\n",
"dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n", "census_block_group_dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n",
"\n", "\n",
"df = functools.reduce(\n", "census_block_group_df = functools.reduce(\n",
" lambda left, right: pd.merge(\n", " lambda left, right: pd.merge(\n",
" left=left, right=right, on=GEOID_FIELD_NAME, how=\"outer\"\n", " left=left, right=right, on=GEOID_FIELD_NAME, how=\"outer\"\n",
" ),\n", " ),\n",
" dfs,\n", " census_block_group_dfs,\n",
")\n", ")\n",
"\n", "\n",
"\n",
"if len(census_block_group_df) > 220333:\n",
" raise ValueError(\"Too many rows in the join.\")\n",
"\n",
"census_block_group_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e79ec27a",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check the join.\n",
"if len(census_block_group_df[GEOID_FIELD_NAME].str.len().unique()) != 1:\n",
" raise ValueError(\n",
" f\"One of the input CSVs uses {GEOID_FIELD_NAME} with a different length.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d0d2915",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Join all the data sources that use census tracts\n",
"# TODO: when there's more than one data source using census tract, reduce/merge them here.\n",
"census_tract_df = hud_housing_df\n",
"\n",
"# Calculate the tract for the CBG data.\n",
"census_block_group_df[GEOID_TRACT_FIELD_NAME] = census_block_group_df[\n",
" GEOID_FIELD_NAME\n",
"].str[0:11]\n",
"\n",
"df = census_block_group_df.merge(census_tract_df, on=GEOID_TRACT_FIELD_NAME)\n",
"\n",
"if len(census_block_group_df) > 220333:\n",
" raise ValueError(\"Too many rows in the join.\")\n",
"\n",
"df.head()" "df.head()"
] ]
}, },
@ -135,13 +209,18 @@
")\n", ")\n",
"\n", "\n",
"data_sets = [\n", "data_sets = [\n",
" # The following data sets have `bucket=None`, because it's not used in the score.\n", " # The following data sets have `bucket=None`, because it's not used in the bucket based score (\"Score C\").\n",
" DataSet(\n", " DataSet(\n",
" input_field=GEOID_FIELD_NAME,\n", " input_field=GEOID_FIELD_NAME,\n",
" # Use the name `GEOID10` to enable geoplatform.gov's workflow.\n", " # Use the name `GEOID10` to enable geoplatform.gov's workflow.\n",
" renamed_field=GEOID_FIELD_NAME,\n", " renamed_field=GEOID_FIELD_NAME,\n",
" bucket=None,\n", " bucket=None,\n",
" ),\n", " ),\n",
" DataSet(\n",
" input_field=HOUSING_BURDEN_FIELD_NAME,\n",
" renamed_field=HOUSING_BURDEN_FIELD_NAME,\n",
" bucket=None,\n",
" ),\n",
" DataSet(input_field=\"ACSTOTPOP\", renamed_field=\"Total population\", bucket=None),\n", " DataSet(input_field=\"ACSTOTPOP\", renamed_field=\"Total population\", bucket=None),\n",
" # The following data sets have buckets, because they're used in the score\n", " # The following data sets have buckets, because they're used in the score\n",
" DataSet(\n", " DataSet(\n",
@ -206,24 +285,28 @@
" bucket=BUCKET_SENSITIVE,\n", " bucket=BUCKET_SENSITIVE,\n",
" ),\n", " ),\n",
" DataSet(\n", " DataSet(\n",
" input_field=LINGUISTIC_ISOLATION_FIELD_NAME,\n",
" renamed_field=LINGUISTIC_ISOLATION_FIELD_NAME,\n",
" bucket=BUCKET_SENSITIVE,\n",
" ),\n",
" DataSet(\n",
" input_field=\"LINGISOPCT\",\n", " input_field=\"LINGISOPCT\",\n",
" renamed_field=\"Percent of households in linguistic isolation\",\n", " renamed_field=\"Percent of households in linguistic isolation\",\n",
" bucket=BUCKET_SOCIOECONOMIC,\n", " bucket=BUCKET_SOCIOECONOMIC,\n",
" ),\n", " ),\n",
" DataSet(\n", " DataSet(\n",
" input_field=\"LOWINCPCT\",\n", " input_field=\"LOWINCPCT\",\n",
" renamed_field=\"Poverty (Less than 200% of federal poverty line)\",\n", " renamed_field=POVERTY_FIELD_NAME,\n",
" bucket=BUCKET_SOCIOECONOMIC,\n", " bucket=BUCKET_SOCIOECONOMIC,\n",
" ),\n", " ),\n",
" DataSet(\n", " DataSet(\n",
" input_field=\"LESSHSPCT\",\n", " input_field=\"LESSHSPCT\",\n",
" renamed_field=\"Percent individuals age 25 or over with less than high school degree\",\n", " renamed_field=HIGH_SCHOOL_FIELD_NAME,\n",
" bucket=BUCKET_SOCIOECONOMIC,\n", " bucket=BUCKET_SOCIOECONOMIC,\n",
" ),\n", " ),\n",
" DataSet(\n", " DataSet(\n",
" input_field=\"Unemployed Civilians (fraction)\",\n", " input_field=UNEMPLOYED_FIELD_NAME,\n",
" # Following EJSCREEN conventions, where fractional data is named as a percent.\n", " renamed_field=UNEMPLOYED_FIELD_NAME,\n",
" renamed_field=\"Unemployed Civilians (percent)\",\n",
" bucket=BUCKET_SOCIOECONOMIC,\n", " bucket=BUCKET_SOCIOECONOMIC,\n",
" ),\n", " ),\n",
" DataSet(\n", " DataSet(\n",
@ -256,6 +339,21 @@
"df.head()" "df.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "1280cbd4",
"metadata": {},
"outputs": [],
"source": [
"# Convert all columns to numeric.\n",
"for data_set in data_sets:\n",
" # Skip GEOID_FIELD_NAME, because it's a string.\n",
" if data_set.renamed_field == GEOID_FIELD_NAME:\n",
" continue\n",
" df[f\"{data_set.renamed_field}\"] = pd.to_numeric(df[data_set.renamed_field])"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -274,6 +372,44 @@
"df.head()" "df.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "f2088013",
"metadata": {},
"outputs": [],
"source": [
"# calculate min max\n",
"# Math:\n",
"# (\n",
"# Observed value\n",
"# - minimum of all values\n",
"# )\n",
"# divided by\n",
"# (\n",
"# Maximum of all values\n",
"# - minimum of all values\n",
"# )\n",
"for data_set in data_sets:\n",
" # Skip GEOID_FIELD_NAME, because it's a string.\n",
" if data_set.renamed_field == GEOID_FIELD_NAME:\n",
" continue\n",
"\n",
" min_value = df[data_set.renamed_field].min(skipna=True)\n",
"\n",
" max_value = df[data_set.renamed_field].max(skipna=True)\n",
"\n",
" print(\n",
" f\"For data set {data_set.renamed_field}, the min value is {min_value} and the max value is {max_value}.\"\n",
" )\n",
"\n",
" df[f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"] = (\n",
" df[data_set.renamed_field] - min_value\n",
" ) / (max_value - min_value)\n",
"\n",
"df.head()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -333,6 +469,35 @@
"df.head()" "df.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "f70106f5",
"metadata": {},
"outputs": [],
"source": [
"fields_to_use_in_score = [\n",
" UNEMPLOYED_FIELD_NAME,\n",
" LINGUISTIC_ISOLATION_FIELD_NAME,\n",
" HOUSING_BURDEN_FIELD_NAME,\n",
" POVERTY_FIELD_NAME,\n",
" HIGH_SCHOOL_FIELD_NAME,\n",
"]\n",
"\n",
"fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n",
"fields_percentile = [\n",
" f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n",
"]\n",
"\n",
"# Calculate \"Score D\", which uses min-max normalization\n",
"# and calculate \"Score E\", which uses percentile normalization for the same fields\n",
"df[\"Score D\"] = df[fields_min_max].mean(axis=1)\n",
"df[\"Score E\"] = df[fields_percentile].mean(axis=1)\n",
"\n",
"print(df[\"Score D\"].describe())\n",
"print(df[\"Score E\"].describe())"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -343,7 +508,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Create percentiles for the scores\n", "# Create percentiles for the scores\n",
"for score_field in [\"Score A\", \"Score B\", \"Score C\"]:\n", "for score_field in [\"Score A\", \"Score B\", \"Score C\", \"Score D\", \"Score E\"]:\n",
" df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[score_field].rank(pct=True)\n", " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[score_field].rank(pct=True)\n",
" df[f\"{score_field} (top 25th percentile)\"] = (\n", " df[f\"{score_field} (top 25th percentile)\"] = (\n",
" df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] >= 0.75\n", " df[f\"{score_field}{PERCENTILE_FIELD_SUFFIX}\"] >= 0.75\n",
@ -376,14 +541,6 @@
" # we need to name the file data01.csv for ogr2ogr csv merge to work\n", " # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
" df1.to_csv(SCORE_CSV_PATH / f\"data{states_fips}.csv\", index=False)" " df1.to_csv(SCORE_CSV_PATH / f\"data{states_fips}.csv\", index=False)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "167ebba3",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -402,7 +559,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.0" "version": "3.7.1"
} }
}, },
"nbformat": 4, "nbformat": 4,

View file

@ -15,7 +15,12 @@
"import pandas as pd\n", "import pandas as pd\n",
"from pathlib import Path\n", "from pathlib import Path\n",
"import requests\n", "import requests\n",
"import zipfile" "import zipfile\n",
"from datetime import datetime\n",
"from tqdm.notebook import tqdm_notebook\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()"
] ]
}, },
{ {
@ -50,8 +55,11 @@
"CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n", "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
"\n", "\n",
"# Comparison field names\n", "# Comparison field names\n",
"any_tract_has_at_least_one_cbg = \"Tract has at least one CEJST CBG?\"\n",
"tract_has_at_least_one_cbg = \"CES Tract has at least one CEJST CBG?\"\n", "tract_has_at_least_one_cbg = \"CES Tract has at least one CEJST CBG?\"\n",
"tract_has_100_percent_cbg = \"CES Tract has 100% CEJST CBGs?\"" "tract_has_100_percent_cbg = \"CES Tract has 100% CEJST CBGs?\"\n",
"non_ces_tract_has_at_least_one_cbg = \"Non-CES Tract has at least one CEJST CBG?\"\n",
"non_ces_tract_has_100_percent_cbg = \"Non-CES Tract has 100% CEJST CBGs?\""
] ]
}, },
{ {
@ -69,12 +77,15 @@
"cejst_df.head()\n", "cejst_df.head()\n",
"\n", "\n",
"# Rename unclear name \"id\" to \"census_block_group_id\", as well as other renamings.\n", "# Rename unclear name \"id\" to \"census_block_group_id\", as well as other renamings.\n",
"\n",
"score_used = \"Score A\"\n",
"\n",
"cejst_df.rename(\n", "cejst_df.rename(\n",
" columns={\n", " columns={\n",
" \"GEOID10\": CENSUS_BLOCK_GROUP_ID_FIELD,\n", " \"GEOID10\": CENSUS_BLOCK_GROUP_ID_FIELD,\n",
" \"Total population\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n", " \"Total population\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n",
" \"Score C\": CEJST_SCORE_FIELD,\n", " score_used: CEJST_SCORE_FIELD,\n",
" \"Score C (percentile)\": CEJST_PERCENTILE_FIELD,\n", " f\"{score_used} (percentile)\": CEJST_PERCENTILE_FIELD,\n",
" },\n", " },\n",
" inplace=True,\n", " inplace=True,\n",
" errors=\"raise\",\n", " errors=\"raise\",\n",
@ -239,18 +250,42 @@
" # Recall that NaN values are not falsy, so we need to check if `is_a_ces_priority_tract` is True.\n", " # Recall that NaN values are not falsy, so we need to check if `is_a_ces_priority_tract` is True.\n",
" is_a_ces_priority_tract = is_a_ces_priority_tract is True\n", " is_a_ces_priority_tract = is_a_ces_priority_tract is True\n",
"\n", "\n",
" # Calculate whether the tract (whether or not it is a CES priority tract) includes CBGs that are priority\n",
" # according to the current CEJST score.\n",
" df[any_tract_has_at_least_one_cbg] = (\n",
" frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n",
" )\n",
"\n",
" # Calculate comparison\n", " # Calculate comparison\n",
" # A CES priority tract has at least one CEJST priority CBG.\n",
" df[tract_has_at_least_one_cbg] = (\n", " df[tract_has_at_least_one_cbg] = (\n",
" frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n", " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n",
" if is_a_ces_priority_tract\n", " if is_a_ces_priority_tract\n",
" else None\n", " else None\n",
" )\n", " )\n",
"\n",
" # A CES priority tract has all of its contained CBGs as CEJST priority CBGs.\n",
" df[tract_has_100_percent_cbg] = (\n", " df[tract_has_100_percent_cbg] = (\n",
" frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n", " frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n",
" if is_a_ces_priority_tract\n", " if is_a_ces_priority_tract\n",
" else None\n", " else None\n",
" )\n", " )\n",
"\n", "\n",
" # Calculate the inverse\n",
" # A tract that is _not_ a CES priority has at least one CEJST priority CBG.\n",
" df[non_ces_tract_has_at_least_one_cbg] = (\n",
" frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum() > 0\n",
" if not is_a_ces_priority_tract\n",
" else None\n",
" )\n",
"\n",
" # A tract that is _not_ a CES priority has all of its contained CBGs as CEJST priority CBGs.\n",
" df[non_ces_tract_has_100_percent_cbg] = (\n",
" frame.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].mean() == 1\n",
" if not is_a_ces_priority_tract\n",
" else None\n",
" )\n",
"\n",
" return df\n", " return df\n",
"\n", "\n",
"\n", "\n",
@ -258,7 +293,7 @@
"grouped_df = merged_df.groupby(CENSUS_TRACT_ID_FIELD)\n", "grouped_df = merged_df.groupby(CENSUS_TRACT_ID_FIELD)\n",
"\n", "\n",
"# Run the comparison function on the groups.\n", "# Run the comparison function on the groups.\n",
"comparison_df = grouped_df.apply(calculate_comparison)\n", "comparison_df = grouped_df.progress_apply(calculate_comparison)\n",
"\n", "\n",
"# Sort descending by highest CES Score for convenience when viewing output file\n", "# Sort descending by highest CES Score for convenience when viewing output file\n",
"comparison_df.sort_values(\n", "comparison_df.sort_values(\n",
@ -283,15 +318,37 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Prepare some constants for use in the following Markdown cell.\n", "# Prepare some constants for use in the following Markdown cell.\n",
"\n", "total_cbgs_ca_only = len(cejst_df)\n",
"cejst_cbgs_ca_only = cejst_df.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum()\n", "cejst_cbgs_ca_only = cejst_df.loc[:, CEJST_PRIORITY_COMMUNITY_FIELD].sum()\n",
"cejst_cbgs_ca_only_percent = f\"{cejst_cbgs_ca_only / total_cbgs_ca_only:.0%}\"\n",
"\n",
"total_tracts_count = len(comparison_df)\n",
"ces_tracts_count = comparison_df.loc[:, CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].sum()\n", "ces_tracts_count = comparison_df.loc[:, CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD].sum()\n",
"ces_tracts_count_percent = f\"{ces_tracts_count / total_tracts_count:.0%}\"\n",
"non_ces_tracts_count = total_tracts_count - ces_tracts_count\n",
"\n",
"total_tracts_count = len(comparison_df[CENSUS_TRACT_ID_FIELD])\n",
"cejst_tracts_count = comparison_df.loc[:, any_tract_has_at_least_one_cbg].sum()\n",
"cejst_tracts_count_percent = f\"{cejst_tracts_count / total_tracts_count:.0%}\"\n",
"\n",
"# CES stats\n",
"at_least_one_sum = comparison_df.loc[:, tract_has_at_least_one_cbg].sum()\n", "at_least_one_sum = comparison_df.loc[:, tract_has_at_least_one_cbg].sum()\n",
"at_least_one_sum_percent = f\"{at_least_one_sum / ces_tracts_count:.0%}\"\n", "at_least_one_sum_percent = f\"{at_least_one_sum / ces_tracts_count:.0%}\"\n",
"\n", "\n",
"all_100_sum = comparison_df.loc[:, tract_has_100_percent_cbg].sum()\n", "all_100_sum = comparison_df.loc[:, tract_has_100_percent_cbg].sum()\n",
"all_100_sum_percent = f\"{all_100_sum / ces_tracts_count:.0%}\"\n", "all_100_sum_percent = f\"{all_100_sum / ces_tracts_count:.0%}\"\n",
"\n", "\n",
"# Non-CES stats:\n",
"non_ces_at_least_one_sum = comparison_df.loc[\n",
" :, non_ces_tract_has_at_least_one_cbg\n",
"].sum()\n",
"non_ces_at_least_one_sum_percent = (\n",
" f\"{non_ces_at_least_one_sum / non_ces_tracts_count:.0%}\"\n",
")\n",
"\n",
"non_ces_all_100_sum = comparison_df.loc[:, non_ces_tract_has_100_percent_cbg].sum()\n",
"non_ces_all_100_sum_percent = f\"{non_ces_all_100_sum / non_ces_tracts_count:.0%}\"\n",
"\n",
"# Note, for the following Markdown cell to render the variables properly, follow the steps at\n", "# Note, for the following Markdown cell to render the variables properly, follow the steps at\n",
"# \"Activating variable-enabled Markdown for Jupyter notebooks\" within `score/README.md`." "# \"Activating variable-enabled Markdown for Jupyter notebooks\" within `score/README.md`."
] ]
@ -301,26 +358,44 @@
"id": "0c534966", "id": "0c534966",
"metadata": { "metadata": {
"variables": { "variables": {
"all_100_sum": "1373", " total_tracts_count": "8057",
"all_100_sum_percent": "69%", "all_100_sum": "1168",
"at_least_one_sum": "1866", "all_100_sum_percent": "59%",
"at_least_one_sum_percent": "94%", "at_least_one_sum": "1817",
"cejst_cbgs_ca_only": "10849", "at_least_one_sum_percent": "92%",
"ces_tracts_count": "1983" "cejst_cbgs_ca_only": "6987",
"cejst_cbgs_ca_only_percent": "30%",
"cejst_tracts_count": "3516",
"cejst_tracts_count_percent": "44%",
"ces_tracts_count": "1983",
"ces_tracts_count_percent": "25%",
"datetime.today().strftime('%Y-%m-%d')": "2021-06-28",
"non_ces_all_100_sum": "438",
"non_ces_all_100_sum_percent": "7%",
"non_ces_at_least_one_sum": "1699",
"non_ces_at_least_one_sum_percent": "28%",
"score_used": "Score A",
"total_cbgs_ca_only": "23212"
} }
}, },
"source": [ "source": [
"# Summary of findings\n", "# Summary of findings for {{score_used}}\n",
"\n",
"(Calculated on {{datetime.today().strftime('%Y-%m-%d')}})\n",
"\n", "\n",
"Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n", "Recall that census tracts contain one or more census block groups, with up to nine census block groups per tract.\n",
"\n", "\n",
"There are {{ces_tracts_count}} census tracts designated as Disadvantaged Communities by CalEnviroScreen 4.0. \n", "There are {{ces_tracts_count}} census tracts designated as Disadvantaged Communities by CalEnviroScreen 4.0, out of {{total_tracts_count}} total tracts ({{ces_tracts_count_percent}}). \n",
"\n", "\n",
"Within California, there are {{cejst_cbgs_ca_only}} census block groups considered as priority communities by the current version of the CEJST score used in this analysis.\n", "Within California, there are {{cejst_cbgs_ca_only}} census block groups considered as priority communities by the current version of the CEJST score used in this analysis, out of {{total_cbgs_ca_only}} CBGs in the state ({{cejst_cbgs_ca_only_percent}}). They occupy {{cejst_tracts_count}} ({{cejst_tracts_count_percent}}) of all the census tracts in California.\n",
"\n", "\n",
"Out of every CalEnviroScreen Disadvantaged Community census tract, {{at_least_one_sum}} ({{at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n", "Out of every CalEnviroScreen Disadvantaged Community census tract, {{at_least_one_sum}} ({{at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n",
"\n", "\n",
"Out of every CalEnviroScreen Disadvantaged Community census tract, {{all_100_sum}} ({{all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score." "Out of every CalEnviroScreen Disadvantaged Community census tract, {{all_100_sum}} ({{all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score.\n",
"\n",
"Out of every census tract in California that is __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_at_least_one_sum}} ({{non_ces_at_least_one_sum_percent}}) of these census tracts have at least one census block group within them that is considered a priority community by the current version of the CEJST score.\n",
"\n",
"Out of every census tract in California that is __not__ marked as a CalEnviroScreen Disadvantaged Community, {{non_ces_all_100_sum}} ({{non_ces_all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score."
] ]
} }
], ],