mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 17:44:20 -08:00
wip
This commit is contained in:
parent
cf13036d20
commit
ebe6180f7c
3 changed files with 180 additions and 4 deletions
|
@ -31,6 +31,9 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
self.HIGH_SCHOOL_FIELD_NAME = (
|
self.HIGH_SCHOOL_FIELD_NAME = (
|
||||||
"Percent individuals age 25 or over with less than high school degree"
|
"Percent individuals age 25 or over with less than high school degree"
|
||||||
)
|
)
|
||||||
|
self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
|
||||||
|
"Median household income (% of state median household income)"
|
||||||
|
)
|
||||||
|
|
||||||
# There's another aggregation level (a second level of "buckets").
|
# There's another aggregation level (a second level of "buckets").
|
||||||
self.AGGREGATION_POLLUTION = "Pollution Burden"
|
self.AGGREGATION_POLLUTION = "Pollution Burden"
|
||||||
|
@ -145,7 +148,12 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
renamed_field="Total population",
|
renamed_field="Total population",
|
||||||
bucket=None,
|
bucket=None,
|
||||||
),
|
),
|
||||||
# The following data sets have buckets, because they're used in the score
|
DataSet(
|
||||||
|
input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
|
||||||
|
renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
|
||||||
|
bucket=None,
|
||||||
|
),
|
||||||
|
# The following data sets have buckets, because they're used in Score C
|
||||||
DataSet(
|
DataSet(
|
||||||
input_field="CANCER",
|
input_field="CANCER",
|
||||||
renamed_field="Air toxics cancer risk",
|
renamed_field="Air toxics cancer risk",
|
||||||
|
@ -375,9 +383,6 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
self.df["Score D"] = self.df[fields_min_max].mean(axis=1)
|
self.df["Score D"] = self.df[fields_min_max].mean(axis=1)
|
||||||
self.df["Score E"] = self.df[fields_percentile].mean(axis=1)
|
self.df["Score E"] = self.df[fields_percentile].mean(axis=1)
|
||||||
|
|
||||||
# Calculate correlations
|
|
||||||
self.df[fields_min_max].corr()
|
|
||||||
|
|
||||||
# Create percentiles for the scores
|
# Create percentiles for the scores
|
||||||
for score_field in [
|
for score_field in [
|
||||||
"Score A",
|
"Score A",
|
||||||
|
@ -400,9 +405,40 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
>= 1 - threshold
|
>= 1 - threshold
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Now for binary (non-index) scores.
|
||||||
|
|
||||||
|
# Calculate "Score F", which uses "either/or" thresholds.
|
||||||
|
ami_and_high_school_field_name = "Low AMI, Low HS graduation"
|
||||||
|
meets_socio_field_name = "Meets socioeconomic criteria"
|
||||||
|
meets_burden_field_name = "Meets burden criteria"
|
||||||
|
|
||||||
|
self.df[ami_and_high_school_field_name] = (
|
||||||
|
self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] < 0.80
|
||||||
|
) & (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.2)
|
||||||
|
|
||||||
|
self.df[meets_socio_field_name] = (
|
||||||
|
self.df[ami_and_high_school_field_name]
|
||||||
|
| (self.df[self.POVERTY_FIELD_NAME] > 0.40)
|
||||||
|
| (self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] > 0.10)
|
||||||
|
| (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.4)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.df[meets_burden_field_name] = (
|
||||||
|
self.df["Particulate matter (PM2.5)"] > 10
|
||||||
|
) | (self.df["Respiratory hazard " "index"] > 0.75)
|
||||||
|
|
||||||
|
self.df["Score F (communities)"] = (
|
||||||
|
self.df[ami_and_high_school_field_name] & self.df[meets_burden_field_name]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def load(self) -> None:
|
def load(self) -> None:
|
||||||
logger.info("Saving Score CSV")
|
logger.info("Saving Score CSV")
|
||||||
|
|
||||||
# write nationwide csv
|
# write nationwide csv
|
||||||
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
|
self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# TODO: drop this 10,000-row sample export (usa-10000.csv).
|
||||||
|
self.df[0:10000].to_csv(self.SCORE_CSV_PATH / "usa-10000.csv", index=False)
|
||||||
|
|
||||||
self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)
|
self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)
|
||||||
|
|
135
data/data-pipeline/data_pipeline/ipython/score_explore.ipynb
Normal file
135
data/data-pipeline/data_pipeline/ipython/score_explore.ipynb
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "1b0b3db8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import collections\n",
|
||||||
|
"import functools\n",
|
||||||
|
"import IPython\n",
|
||||||
|
"import itertools\n",
|
||||||
|
"import matplotlib\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import os\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import pathlib\n",
|
||||||
|
"import pypandoc\n",
|
||||||
|
"import requests\n",
|
||||||
|
"import string\n",
|
||||||
|
"import sys\n",
|
||||||
|
"import typing\n",
|
||||||
|
"import us\n",
|
||||||
|
"import zipfile\n",
|
||||||
|
"\n",
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"from tqdm.notebook import tqdm_notebook\n",
|
||||||
|
"\n",
|
||||||
|
"module_path = os.path.abspath(os.path.join(\"../..\"))\n",
|
||||||
|
"if module_path not in sys.path:\n",
|
||||||
|
" sys.path.append(module_path)\n",
|
||||||
|
"\n",
|
||||||
|
"from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
|
||||||
|
"from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
|
||||||
|
"\n",
|
||||||
|
"# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
|
||||||
|
"tqdm_notebook.pandas()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "be9bff9f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
|
||||||
|
"pd.options.display.float_format = \"{:.2f}\".format\n",
|
||||||
|
"\n",
|
||||||
|
"# Set some global parameters\n",
|
||||||
|
"DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
|
||||||
|
"\n",
|
||||||
|
"GEOID_FIELD_NAME = \"GEOID10\"\n",
|
||||||
|
"GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
|
||||||
|
"GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
|
||||||
|
"COUNTRY_FIELD_NAME = \"Country\"\n",
|
||||||
|
"CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "94407baa",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load CEJST score data\n",
|
||||||
|
"cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n",
|
||||||
|
"cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n",
|
||||||
|
"\n",
|
||||||
|
"cejst_df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "29babd55",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"columns_to_plot = [\n",
|
||||||
|
" \"Respiratory hazard index\",\n",
|
||||||
|
" \"Particulate matter (PM2.5)\",\n",
|
||||||
|
" \"Poverty (Less than 200% of federal poverty line)\",\n",
|
||||||
|
" \"Percent individuals age 25 or over with less than high school degree\",\n",
|
||||||
|
" \"Unemployed civilians (percent)\",\n",
|
||||||
|
" \"Linguistic isolation (percent)\"\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"column_to_plot = columns_to_plot[0]\n",
|
||||||
|
"print(f\"Plotting {column_to_plot}\")\n",
|
||||||
|
"print(cejst_df[\n",
|
||||||
|
" column_to_plot\n",
|
||||||
|
"].hist())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3e5c8dcf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for i in cejst_df.columns:\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
|
@ -12,11 +12,16 @@ geopandas = "^0.9.0"
|
||||||
ipython = "^7.24.1"
|
ipython = "^7.24.1"
|
||||||
jupyter = "^1.0.0"
|
jupyter = "^1.0.0"
|
||||||
jupyter-contrib-nbextensions = "^0.5.1"
|
jupyter-contrib-nbextensions = "^0.5.1"
|
||||||
|
matplotlib = "^3.4.2"
|
||||||
numpy = "^1.21.0"
|
numpy = "^1.21.0"
|
||||||
pandas = "^1.2.5"
|
pandas = "^1.2.5"
|
||||||
python = "^3.7.1"
|
python = "^3.7.1"
|
||||||
|
pypandoc = "^1.6.3"
|
||||||
requests = "^2.25.1"
|
requests = "^2.25.1"
|
||||||
|
tqdm = "4.62.0"
|
||||||
types-requests = "^2.25.0"
|
types-requests = "^2.25.0"
|
||||||
|
us = "^2.0.2"
|
||||||
|
xlsxwriter = "^2.0.0"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
black = {version = "^21.6b0", allow-prereleases = true}
|
black = {version = "^21.6b0", allow-prereleases = true}
|
||||||
|
|
Loading…
Add table
Reference in a new issue