diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 98e61260..87150092 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -31,6 +31,9 @@ class ScoreETL(ExtractTransformLoad): self.HIGH_SCHOOL_FIELD_NAME = ( "Percent individuals age 25 or over with less than high school degree" ) + self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = ( + "Median household income (% of state median household income)" + ) # There's another aggregation level (a second level of "buckets"). self.AGGREGATION_POLLUTION = "Pollution Burden" @@ -145,7 +148,12 @@ class ScoreETL(ExtractTransformLoad): renamed_field="Total population", bucket=None, ), - # The following data sets have buckets, because they're used in the score + DataSet( + input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME, + renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME, + bucket=None, + ), + # The following data sets have buckets, because they're used in Score C DataSet( input_field="CANCER", renamed_field="Air toxics cancer risk", @@ -375,9 +383,6 @@ class ScoreETL(ExtractTransformLoad): self.df["Score D"] = self.df[fields_min_max].mean(axis=1) self.df["Score E"] = self.df[fields_percentile].mean(axis=1) - # Calculate correlations - self.df[fields_min_max].corr() - # Create percentiles for the scores for score_field in [ "Score A", @@ -400,9 +405,40 @@ class ScoreETL(ExtractTransformLoad): >= 1 - threshold ) + # Now for binary (non index) scores. + + # Calculate "Score F", which uses "either/or" thresholds. 
+ ami_and_high_school_field_name = "Low AMI, Low HS graduation" + meets_socio_field_name = "Meets socioeconomic criteria" + meets_burden_field_name = "Meets burden criteria" + + self.df[ami_and_high_school_field_name] = ( + self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] < 0.80 + ) & (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.2) + + self.df[meets_socio_field_name] = ( + self.df[ami_and_high_school_field_name] + | (self.df[self.POVERTY_FIELD_NAME] > 0.40) + | (self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] > 0.10) + | (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.4) + ) + + self.df[meets_burden_field_name] = ( + self.df["Particulate matter (PM2.5)"] > 10 + ) | (self.df["Respiratory hazard index"] > 0.75) + + self.df["Score F (communities)"] = ( + self.df[meets_socio_field_name] & self.df[meets_burden_field_name] + ) + + def load(self) -> None: logger.info("Saving Score CSV") # write nationwide csv self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True) + + # TODO: drop this temporary export of the first 10,000 rows ("usa-10000.csv") + self.df[0:10000].to_csv(self.SCORE_CSV_PATH / "usa-10000.csv", index=False) + self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False) diff --git a/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb b/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb new file mode 100644 index 00000000..eb5b55bf --- /dev/null +++ b/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb @@ -0,0 +1,135 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "1b0b3db8", + "metadata": {}, + "outputs": [], + "source": [ + "import collections\n", + "import functools\n", + "import IPython\n", + "import itertools\n", + "import matplotlib\n", + "import numpy as np\n", + "import os\n", + "import pandas as pd\n", + "import pathlib\n", + "import pypandoc\n", + "import requests\n", + "import string\n", + "import sys\n", + "import typing\n", + "import us\n", + "import zipfile\n", + "\n", + "from datetime import datetime\n", + "from tqdm.notebook import
tqdm_notebook\n", + "\n", + "module_path = os.path.abspath(os.path.join(\"../..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n", + "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n", + "\n", + "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n", + "tqdm_notebook.pandas()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be9bff9f", + "metadata": {}, + "outputs": [], + "source": [ + "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "\n", + "# Set some global parameters\n", + "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n", + "\n", + "GEOID_FIELD_NAME = \"GEOID10\"\n", + "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n", + "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n", + "COUNTRY_FIELD_NAME = \"Country\"\n", + "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94407baa", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Load CEJST score data\n", + "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n", + "cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n", + "\n", + "cejst_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29babd55", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "columns_to_plot = [\n", + " \"Respiratory hazard index\",\n", + " \"Particulate matter (PM2.5)\",\n", + " \"Poverty (Less than 200% of federal poverty line)\",\n", + " \"Percent individuals age 25 or over with less than high school degree\",\n", + " \"Unemployed civilians (percent)\",\n", + " \"Linguistic isolation (percent)\"\n", + "]\n", + "\n", + "column_to_plot = 
columns_to_plot[0]\n", + "print(f\"Plotting {column_to_plot}\")\n", + "print(cejst_df[\n", + " column_to_plot\n", + "].hist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e5c8dcf", + "metadata": {}, + "outputs": [], + "source": [ + "for i in cejst_df.columns:\n", + " print(i)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml index 1f6fdbf2..c6dd9722 100644 --- a/data/data-pipeline/pyproject.toml +++ b/data/data-pipeline/pyproject.toml @@ -12,11 +12,16 @@ geopandas = "^0.9.0" ipython = "^7.24.1" jupyter = "^1.0.0" jupyter-contrib-nbextensions = "^0.5.1" +matplotlib = "^3.4.2" numpy = "^1.21.0" pandas = "^1.2.5" python = "^3.7.1" +pypandoc = "^1.6.3" requests = "^2.25.1" +tqdm = "4.62.0" types-requests = "^2.25.0" +us = "^2.0.2" +xlsxwriter = "^2.0.0" [tool.poetry.dev-dependencies] black = {version = "^21.6b0", allow-prereleases = true}