This commit is contained in:
lucasmbrown-usds 2021-08-09 22:24:14 -05:00
parent cf13036d20
commit ebe6180f7c
3 changed files with 180 additions and 4 deletions

View file

@ -31,6 +31,9 @@ class ScoreETL(ExtractTransformLoad):
self.HIGH_SCHOOL_FIELD_NAME = ( self.HIGH_SCHOOL_FIELD_NAME = (
"Percent individuals age 25 or over with less than high school degree" "Percent individuals age 25 or over with less than high school degree"
) )
self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
"Median household income (% of state median household income)"
)
# There's another aggregation level (a second level of "buckets"). # There's another aggregation level (a second level of "buckets").
self.AGGREGATION_POLLUTION = "Pollution Burden" self.AGGREGATION_POLLUTION = "Pollution Burden"
@ -145,7 +148,12 @@ class ScoreETL(ExtractTransformLoad):
renamed_field="Total population", renamed_field="Total population",
bucket=None, bucket=None,
), ),
# The following data sets have buckets, because they're used in the score DataSet(
input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
bucket=None,
),
# The following data sets have buckets, because they're used in Score C
DataSet( DataSet(
input_field="CANCER", input_field="CANCER",
renamed_field="Air toxics cancer risk", renamed_field="Air toxics cancer risk",
@ -375,9 +383,6 @@ class ScoreETL(ExtractTransformLoad):
self.df["Score D"] = self.df[fields_min_max].mean(axis=1) self.df["Score D"] = self.df[fields_min_max].mean(axis=1)
self.df["Score E"] = self.df[fields_percentile].mean(axis=1) self.df["Score E"] = self.df[fields_percentile].mean(axis=1)
# Calculate correlations
self.df[fields_min_max].corr()
# Create percentiles for the scores # Create percentiles for the scores
for score_field in [ for score_field in [
"Score A", "Score A",
@ -400,9 +405,40 @@ class ScoreETL(ExtractTransformLoad):
>= 1 - threshold >= 1 - threshold
) )
# Now for the binary (non-index) scores.
# Calculate "Score F", which uses "either/or" thresholds.
ami_and_high_school_field_name = "Low AMI, Low HS graduation"
meets_socio_field_name = "Meets socioeconomic criteria"
meets_burden_field_name = "Meets burden criteria"
self.df[ami_and_high_school_field_name] = (
self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] < 0.80
) & (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.2)
self.df[meets_socio_field_name] = (
self.df[ami_and_high_school_field_name]
| (self.df[self.POVERTY_FIELD_NAME] > 0.40)
| (self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] > 0.10)
| (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.4)
)
self.df[meets_burden_field_name] = (
self.df["Particulate matter (PM2.5)"] > 10
) | (self.df["Respiratory hazard " "index"] > 0.75)
self.df["Score F (communities)"] = (
self.df[ami_and_high_school_field_name] & self.df[meets_burden_field_name]
)
def load(self) -> None: def load(self) -> None:
logger.info("Saving Score CSV") logger.info("Saving Score CSV")
# write nationwide csv # write nationwide csv
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True) self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
# TODO: drop this export of only the first 10,000 rows (usa-10000.csv).
self.df[0:10000].to_csv(self.SCORE_CSV_PATH / "usa-10000.csv", index=False)
self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False) self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)

View file

@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1b0b3db8",
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"import functools\n",
"import IPython\n",
"import itertools\n",
"import matplotlib\n",
"import numpy as np\n",
"import os\n",
"import pandas as pd\n",
"import pathlib\n",
"import pypandoc\n",
"import requests\n",
"import string\n",
"import sys\n",
"import typing\n",
"import us\n",
"import zipfile\n",
"\n",
"from datetime import datetime\n",
"from tqdm.notebook import tqdm_notebook\n",
"\n",
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
"if module_path not in sys.path:\n",
" sys.path.append(module_path)\n",
"\n",
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
"\n",
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
"tqdm_notebook.pandas()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be9bff9f",
"metadata": {},
"outputs": [],
"source": [
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
"pd.options.display.float_format = \"{:.2f}\".format\n",
"\n",
"# Set some global parameters\n",
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
"\n",
"GEOID_FIELD_NAME = \"GEOID10\"\n",
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
"COUNTRY_FIELD_NAME = \"Country\"\n",
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94407baa",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Load CEJST score data\n",
"cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n",
"cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n",
"\n",
"cejst_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29babd55",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"columns_to_plot = [\n",
" \"Respiratory hazard index\",\n",
" \"Particulate matter (PM2.5)\",\n",
" \"Poverty (Less than 200% of federal poverty line)\",\n",
" \"Percent individuals age 25 or over with less than high school degree\",\n",
" \"Unemployed civilians (percent)\",\n",
" \"Linguistic isolation (percent)\"\n",
"]\n",
"\n",
"column_to_plot = columns_to_plot[0]\n",
"print(f\"Plotting {column_to_plot}\")\n",
"print(cejst_df[\n",
" column_to_plot\n",
"].hist())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e5c8dcf",
"metadata": {},
"outputs": [],
"source": [
"for i in cejst_df.columns:\n",
" print(i)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -12,11 +12,16 @@ geopandas = "^0.9.0"
ipython = "^7.24.1" ipython = "^7.24.1"
jupyter = "^1.0.0" jupyter = "^1.0.0"
jupyter-contrib-nbextensions = "^0.5.1" jupyter-contrib-nbextensions = "^0.5.1"
matplotlib = "^3.4.2"
numpy = "^1.21.0" numpy = "^1.21.0"
pandas = "^1.2.5" pandas = "^1.2.5"
python = "^3.7.1" python = "^3.7.1"
pypandoc = "^1.6.3"
requests = "^2.25.1" requests = "^2.25.1"
tqdm = "4.62.0"
types-requests = "^2.25.0" types-requests = "^2.25.0"
us = "^2.0.2"
xlsxwriter = "^2.0.0"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
black = {version = "^21.6b0", allow-prereleases = true} black = {version = "^21.6b0", allow-prereleases = true}