Comparison tool refactor & ETL HUD RECAP (#272)

* Refactoring comparison tool and creating two new ETL notebooks
This commit is contained in:
Lucas Merrill Brown 2021-07-06 12:10:58 -05:00 committed by GitHub
parent e8385e1439
commit 11d13e034e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 2071 additions and 274 deletions

View file

@ -0,0 +1,141 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "20aa3891",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import numpy as np\n",
"import pandas as pd\n",
"import csv\n",
"import sys\n",
"import os\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from etl.sources.census.etl_utils import get_state_fips_codes\n",
"from utils import unzip_file_from_url, remove_all_from_dir\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH = DATA_PATH / \"tmp\"\n",
"CALENVIROSCREEN_FTP_URL = \"https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip\"\n",
"CSV_PATH = DATA_PATH / \"dataset\" / \"calenviroscreen4\"\n",
"\n",
"# Definining some variable names\n",
"CALENVIROSCREEN_SCORE_FIELD_NAME = \"calenviroscreen_score\"\n",
"CALENVIROSCREEN_PERCENTILE_FIELD_NAME = \"calenviroscreen_percentile\"\n",
"CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME = \"calenviroscreen_priority_community\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"\n",
"# Choosing constants.\n",
"# None of these numbers are final, but just for the purposes of comparison.\n",
"CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD = 75\n",
"\n",
"print(DATA_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc3fb9ec",
"metadata": {},
"outputs": [],
"source": [
"# download file from ejscreen ftp\n",
"unzip_file_from_url(CALENVIROSCREEN_FTP_URL, TMP_PATH, TMP_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15f66756",
"metadata": {},
"outputs": [],
"source": [
"# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:\n",
"# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip\n",
"calenviroscreen_4_csv_name = \"CalEnviroScreen_4.0_2021.csv\"\n",
"calenviroscreen_data_path = TMP_PATH.joinpath(calenviroscreen_4_csv_name)\n",
"\n",
"# Load comparison index (CalEnviroScreen 4)\n",
"calenviroscreen_df = pd.read_csv(\n",
" calenviroscreen_data_path, dtype={\"Census Tract\": \"string\"}\n",
")\n",
"\n",
"calenviroscreen_df.rename(\n",
" columns={\n",
" \"Census Tract\": GEOID_TRACT_FIELD_NAME,\n",
" \"DRAFT CES 4.0 Score\": CALENVIROSCREEN_SCORE_FIELD_NAME,\n",
" \"DRAFT CES 4.0 Percentile\": CALENVIROSCREEN_PERCENTILE_FIELD_NAME,\n",
" },\n",
" inplace=True,\n",
")\n",
"\n",
"# Add a leading \"0\" to the Census Tract to match our format in other data frames.\n",
"\n",
"calenviroscreen_df[GEOID_TRACT_FIELD_NAME] = (\n",
" \"0\" + calenviroscreen_df[GEOID_TRACT_FIELD_NAME]\n",
")\n",
"\n",
"# Calculate the top K% of prioritized communities\n",
"calenviroscreen_df[CALENVIROSCREEN_PRIORITY_COMMUNITY_FIELD_NAME] = (\n",
" calenviroscreen_df[CALENVIROSCREEN_PERCENTILE_FIELD_NAME]\n",
" >= CALENVIROSCREEN_PRIORITY_COMMUNITY_THRESHOLD\n",
")\n",
"\n",
"calenviroscreen_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fa2077a",
"metadata": {},
"outputs": [],
"source": [
"# write csv\n",
"CSV_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Matching other conventions in the ETL scripts, write only for the state (FIPS code 06).\n",
"calenviroscreen_df.to_csv(CSV_PATH / \"data06.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "81b977f8",
"metadata": {},
"outputs": [],
"source": [
"# cleanup\n",
"remove_all_from_dir(TMP_PATH)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "20aa3891",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import numpy as np\n",
"import pandas as pd\n",
"import csv\n",
"import sys\n",
"import os\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from etl.sources.census.etl_utils import get_state_fips_codes\n",
"from utils import unzip_file_from_url, remove_all_from_dir\n",
"\n",
"DATA_PATH = Path.cwd().parent / \"data\"\n",
"TMP_PATH = DATA_PATH / \"tmp\"\n",
"HUD_RECAP_CSV_URL = \"https://opendata.arcgis.com/api/v3/datasets/56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326\"\n",
"CSV_PATH = DATA_PATH / \"dataset\" / \"hud_recap\"\n",
"\n",
"# Definining some variable names\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = \"hud_recap_priority_community\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9455da5",
"metadata": {},
"outputs": [],
"source": [
"# Data from https://hudgis-hud.opendata.arcgis.com/datasets/HUD::racially-or-ethnically-concentrated-areas-of-poverty-r-ecaps/about\n",
"df = pd.read_csv(HUD_RECAP_CSV_URL, dtype={\"GEOID\": \"string\"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca63e66c",
"metadata": {},
"outputs": [],
"source": [
"# Rename some fields\n",
"df.rename(\n",
" columns={\n",
" \"GEOID\": GEOID_TRACT_FIELD_NAME,\n",
" # Interestingly, there's no data dictionary for the RECAP data that I could find.\n",
" # However, this site (http://www.schousing.com/library/Tax%20Credit/2020/QAP%20Instructions%20(2).pdf)\n",
" # suggests:\n",
" # \"If RCAP_Current for the tract in which the site is located is 1, the tract is an R/ECAP. If RCAP_Current is 0, it is not.\"\n",
" \"RCAP_Current\": HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME,\n",
" },\n",
" inplace=True,\n",
")\n",
"\n",
"# Convert to boolean\n",
"df[HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME] = df[\n",
" HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME\n",
"].astype(\"bool\")\n",
"\n",
"df[HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME].value_counts()\n",
"\n",
"df.sort_values(by=GEOID_TRACT_FIELD_NAME, inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fa2077a",
"metadata": {},
"outputs": [],
"source": [
"# write csv\n",
"CSV_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Drop unnecessary columns.\n",
"df[[GEOID_TRACT_FIELD_NAME, HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME]].to_csv(\n",
" CSV_PATH / \"usa.csv\", index=False\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -16,6 +16,7 @@
"import collections\n", "import collections\n",
"import functools\n", "import functools\n",
"from pathlib import Path\n", "from pathlib import Path\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n", "import pandas as pd\n",
"import csv\n", "import csv\n",
"import os\n", "import os\n",
@ -363,7 +364,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# calculate percentiles\n", "# Calculate percentiles for each data set.\n",
"for data_set in data_sets:\n", "for data_set in data_sets:\n",
" df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n", " df[f\"{data_set.renamed_field}{PERCENTILE_FIELD_SUFFIX}\"] = df[\n",
" data_set.renamed_field\n", " data_set.renamed_field\n",
@ -379,7 +380,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# calculate min max\n", "# Calculate min-max for each data set.\n",
"# Math:\n", "# Math:\n",
"# (\n", "# (\n",
"# Observed value\n", "# Observed value\n",
@ -410,6 +411,28 @@
"df.head()" "df.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "f4eec326",
"metadata": {},
"outputs": [],
"source": [
"# Graph distributions and correlations.\n",
"min_max_fields = [\n",
" f\"{data_set.renamed_field}{MIN_MAX_FIELD_SUFFIX}\"\n",
" for data_set in data_sets\n",
" if data_set.renamed_field != GEOID_FIELD_NAME\n",
"]\n",
"df.hist(\n",
" column=min_max_fields, layout=(len(min_max_fields), 1), figsize=(10, 30), bins=30\n",
")\n",
"\n",
"plt.tight_layout()\n",
"\n",
"plt.show()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -476,7 +499,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"fields_to_use_in_score = [\n", "# Calculate scores D and E.\n",
"fields_to_use_in_score_d_and_e = [\n",
" UNEMPLOYED_FIELD_NAME,\n", " UNEMPLOYED_FIELD_NAME,\n",
" LINGUISTIC_ISOLATION_FIELD_NAME,\n", " LINGUISTIC_ISOLATION_FIELD_NAME,\n",
" HOUSING_BURDEN_FIELD_NAME,\n", " HOUSING_BURDEN_FIELD_NAME,\n",
@ -484,9 +508,11 @@
" HIGH_SCHOOL_FIELD_NAME,\n", " HIGH_SCHOOL_FIELD_NAME,\n",
"]\n", "]\n",
"\n", "\n",
"fields_min_max = [f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score]\n", "fields_min_max = [\n",
" f\"{field}{MIN_MAX_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
"]\n",
"fields_percentile = [\n", "fields_percentile = [\n",
" f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score\n", " f\"{field}{PERCENTILE_FIELD_SUFFIX}\" for field in fields_to_use_in_score_d_and_e\n",
"]\n", "]\n",
"\n", "\n",
"# Calculate \"Score D\", which uses min-max normalization\n", "# Calculate \"Score D\", which uses min-max normalization\n",
@ -498,6 +524,32 @@
"print(df[\"Score E\"].describe())" "print(df[\"Score E\"].describe())"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "a02e5bac",
"metadata": {},
"outputs": [],
"source": [
"# Graph distributions\n",
"df.hist(\n",
" column=fields_min_max, layout=(len(fields_min_max), 1), figsize=(10, 30), bins=30\n",
")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0e608c8",
"metadata": {},
"outputs": [],
"source": [
"# Calculate correlations\n",
"df[fields_min_max].corr()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff