wip

2025-02-23 01:54:18 -08:00 · 2021-08-09 22:24:14 -05:00 · 2021-08-09 22:24:14 -05:00 · ebe6180f7c
commit ebe6180f7c
parent cf13036d20
3 changed files with 180 additions and 4 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -31,6 +31,9 @@ class ScoreETL(ExtractTransformLoad):
        self.HIGH_SCHOOL_FIELD_NAME = (
            "Percent individuals age 25 or over with less than high school degree"
        )
+        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
+            "Median household income (% of state median household income)"
+        )

        # There's another aggregation level (a second level of "buckets").
        self.AGGREGATION_POLLUTION = "Pollution Burden"
@ -145,7 +148,12 @@ class ScoreETL(ExtractTransformLoad):
                renamed_field="Total population",
                bucket=None,
            ),
-            # The following data sets have buckets, because they're used in the score
+            DataSet(
+                input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
+                renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
+                bucket=None,
+            ),
+            # The following data sets have buckets, because they're used in Score C
            DataSet(
                input_field="CANCER",
                renamed_field="Air toxics cancer risk",
@ -375,9 +383,6 @@ class ScoreETL(ExtractTransformLoad):
        self.df["Score D"] = self.df[fields_min_max].mean(axis=1)
        self.df["Score E"] = self.df[fields_percentile].mean(axis=1)

-        # Calculate correlations
-        self.df[fields_min_max].corr()
-
        # Create percentiles for the scores
        for score_field in [
            "Score A",
@ -400,9 +405,40 @@ class ScoreETL(ExtractTransformLoad):
                    >= 1 - threshold
                )

+        # Now for binary (non-index) scores.
+
+        # Calculate "Score F", which uses "either/or" thresholds.
+        ami_and_high_school_field_name = "Low AMI, Low HS graduation"
+        meets_socio_field_name = "Meets socioeconomic criteria"
+        meets_burden_field_name = "Meets burden criteria"
+
+        self.df[ami_and_high_school_field_name] = (
+            self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] < 0.80
+        ) & (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.2)
+
+        self.df[meets_socio_field_name] = (
+            self.df[ami_and_high_school_field_name]
+            | (self.df[self.POVERTY_FIELD_NAME] > 0.40)
+            | (self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] > 0.10)
+            | (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.4)
+        )
+
+        self.df[meets_burden_field_name] = (
+            self.df["Particulate matter (PM2.5)"] > 10
+        ) | (self.df["Respiratory hazard " "index"] > 0.75)
+
+        self.df["Score F (communities)"] = (
+            self.df[ami_and_high_school_field_name] & self.df[meets_burden_field_name]
+        )
+
+
    def load(self) -> None:
        logger.info("Saving Score CSV")

        # write nationwide csv
        self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
+
+        # TODO: drop this temporary 10,000-row sample export (usa-10000.csv).
+        self.df[0:10000].to_csv(self.SCORE_CSV_PATH / "usa-10000.csv", index=False)
+
        self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)
--- a/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb
@ -0,0 +1,135 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b0b3db8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import collections\n",
+    "import functools\n",
+    "import IPython\n",
+    "import itertools\n",
+    "import matplotlib\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "import pathlib\n",
+    "import pypandoc\n",
+    "import requests\n",
+    "import string\n",
+    "import sys\n",
+    "import typing\n",
+    "import us\n",
+    "import zipfile\n",
+    "\n",
+    "from datetime import datetime\n",
+    "from tqdm.notebook import tqdm_notebook\n",
+    "\n",
+    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)\n",
+    "\n",
+    "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
+    "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
+    "\n",
+    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
+    "tqdm_notebook.pandas()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be9bff9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
+    "pd.options.display.float_format = \"{:.2f}\".format\n",
+    "\n",
+    "# Set some global parameters\n",
+    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
+    "\n",
+    "GEOID_FIELD_NAME = \"GEOID10\"\n",
+    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
+    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
+    "COUNTRY_FIELD_NAME = \"Country\"\n",
+    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94407baa",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Load CEJST score data\n",
+    "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n",
+    "cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n",
+    "\n",
+    "cejst_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29babd55",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "columns_to_plot = [\n",
+    "    \"Respiratory hazard index\",\n",
+    "    \"Particulate matter (PM2.5)\",\n",
+    "    \"Poverty (Less than 200% of federal poverty line)\",\n",
+    "    \"Percent individuals age 25 or over with less than high school degree\",\n",
+    "    \"Unemployed civilians (percent)\",\n",
+    "    \"Linguistic isolation (percent)\"\n",
+    "]\n",
+    "\n",
+    "column_to_plot = columns_to_plot[0]\n",
+    "print(f\"Plotting {column_to_plot}\")\n",
+    "print(cejst_df[\n",
+    "    column_to_plot\n",
+    "].hist())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e5c8dcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in cejst_df.columns:\n",
+    "    print(i)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/data/data-pipeline/pyproject.toml
+++ b/data/data-pipeline/pyproject.toml
@ -12,11 +12,16 @@ geopandas = "^0.9.0"
 ipython = "^7.24.1"
 jupyter = "^1.0.0"
 jupyter-contrib-nbextensions = "^0.5.1"
+matplotlib = "^3.4.2"
 numpy = "^1.21.0"
 pandas = "^1.2.5"
 python = "^3.7.1"
+pypandoc = "^1.6.3"
 requests = "^2.25.1"
+tqdm = "4.62.0"
 types-requests = "^2.25.0"
+us = "^2.0.2"
+xlsxwriter = "^2.0.0"

 [tool.poetry.dev-dependencies]
 black = {version = "^21.6b0", allow-prereleases = true}