wip

2025-02-22 17:44:20 -08:00 · 2021-08-09 22:24:14 -05:00 · 2021-08-09 22:24:14 -05:00 · ebe6180f7c
commit ebe6180f7c
parent cf13036d20
3 changed files with 180 additions and 4 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -31,6 +31,9 @@ class ScoreETL(ExtractTransformLoad):
        self.HIGH_SCHOOL_FIELD_NAME = (
            "Percent individuals age 25 or over with less than high school degree"
        )
        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
            "Median household income (% of state median household income)"
        )
        # There's another aggregation level (a second level of "buckets").
        self.AGGREGATION_POLLUTION = "Pollution Burden"
@ -145,7 +148,12 @@ class ScoreETL(ExtractTransformLoad):
                renamed_field="Total population",
                bucket=None,
            ),
-            # The following data sets have buckets, because they're used in the score
+            DataSet(
                input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
                renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
                bucket=None,
            ),
            # The following data sets have buckets, because they're used in Score C
            DataSet(
                input_field="CANCER",
                renamed_field="Air toxics cancer risk",
@ -375,9 +383,6 @@ class ScoreETL(ExtractTransformLoad):
        self.df["Score D"] = self.df[fields_min_max].mean(axis=1)
        self.df["Score E"] = self.df[fields_percentile].mean(axis=1)
        # Calculate correlations
        self.df[fields_min_max].corr()
        # Create percentiles for the scores
        for score_field in [
            "Score A",
@ -400,9 +405,40 @@ class ScoreETL(ExtractTransformLoad):
                    >= 1 - threshold
                )
        # Now for binary (non-index) scores.
        # Calculate "Score F", which uses "either/or" thresholds.
        ami_and_high_school_field_name = "Low AMI, Low HS graduation"
        meets_socio_field_name = "Meets socioeconomic criteria"
        meets_burden_field_name = "Meets burden criteria"
        self.df[ami_and_high_school_field_name] = (
            self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] < 0.80
        ) & (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.2)
        self.df[meets_socio_field_name] = (
            self.df[ami_and_high_school_field_name]
            | (self.df[self.POVERTY_FIELD_NAME] > 0.40)
            | (self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME] > 0.10)
            | (self.df[self.HIGH_SCHOOL_FIELD_NAME] > 0.4)
        )
        self.df[meets_burden_field_name] = (
            self.df["Particulate matter (PM2.5)"] > 10
        ) | (self.df["Respiratory hazard " "index"] > 0.75)
        self.df["Score F (communities)"] = (
            self.df[ami_and_high_school_field_name] & self.df[meets_burden_field_name]
        )
    def load(self) -> None:
        """Save the score data in ``self.df`` as CSV files under ``SCORE_CSV_PATH``.

        Two files are written, both without the index column:
        ``usa-10000.csv`` (the ``self.df[0:10000]`` slice) and ``usa.csv``
        (the full frame). The output directory is created if it is missing.
        """
        logger.info("Saving Score CSV")

        # Create the output directory (and any missing parents) up front so
        # the writes below cannot fail on a missing path.
        output_dir = self.SCORE_CSV_PATH
        output_dir.mkdir(parents=True, exist_ok=True)

        # TODO: drop this 10,000-row sample export.
        sample_df = self.df[0:10000]
        sample_df.to_csv(output_dir / "usa-10000.csv", index=False)

        # Write the nationwide CSV.
        self.df.to_csv(output_dir / "usa.csv", index=False)
--- a/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/score_explore.ipynb
@ -0,0 +1,135 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b0b3db8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import functools\n",
    "import IPython\n",
    "import itertools\n",
    "import matplotlib\n",
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "import pathlib\n",
    "import pypandoc\n",
    "import requests\n",
    "import string\n",
    "import sys\n",
    "import typing\n",
    "import us\n",
    "import zipfile\n",
    "\n",
    "from datetime import datetime\n",
    "from tqdm.notebook import tqdm_notebook\n",
    "\n",
    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "\n",
    "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
    "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
    "\n",
    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
    "tqdm_notebook.pandas()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be9bff9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Suppress scientific notation in pandas (this shows up for census tract IDs)\n",
    "pd.options.display.float_format = \"{:.2f}\".format\n",
    "\n",
    "# Set some global parameters\n",
    "DATA_DIR = pathlib.Path.cwd().parent / \"data\"\n",
    "\n",
    "GEOID_FIELD_NAME = \"GEOID10\"\n",
    "GEOID_TRACT_FIELD_NAME = \"GEOID10_TRACT\"\n",
    "GEOID_STATE_FIELD_NAME = \"GEOID10_STATE\"\n",
    "COUNTRY_FIELD_NAME = \"Country\"\n",
    "CENSUS_BLOCK_GROUP_POPULATION_FIELD = \"Total population\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94407baa",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Load CEJST score data\n",
    "cejst_data_path = DATA_DIR / \"score\" / \"csv\" / \"full\" / \"usa.csv\"\n",
    "cejst_df = pd.read_csv(cejst_data_path, dtype={GEOID_FIELD_NAME: \"string\"})\n",
    "\n",
    "cejst_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29babd55",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "columns_to_plot = [\n",
    "    \"Respiratory hazard index\",\n",
    "    \"Particulate matter (PM2.5)\",\n",
    "    \"Poverty (Less than 200% of federal poverty line)\",\n",
    "    \"Percent individuals age 25 or over with less than high school degree\",\n",
    "    \"Unemployed civilians (percent)\",\n",
    "    \"Linguistic isolation (percent)\"\n",
    "]\n",
    "\n",
    "column_to_plot = columns_to_plot[0]\n",
    "print(f\"Plotting {column_to_plot}\")\n",
    "print(cejst_df[\n",
    "    column_to_plot\n",
    "].hist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e5c8dcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in cejst_df.columns:\n",
    "    print(i)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/data/data-pipeline/pyproject.toml
+++ b/data/data-pipeline/pyproject.toml
@ -12,11 +12,16 @@ geopandas = "^0.9.0"
 ipython = "^7.24.1"
 jupyter = "^1.0.0"
 jupyter-contrib-nbextensions = "^0.5.1"
 matplotlib = "^3.4.2"
 numpy = "^1.21.0"
 pandas = "^1.2.5"
 python = "^3.7.1"
 pypandoc = "^1.6.3"
 requests = "^2.25.1"
 tqdm = "4.62.0"
 types-requests = "^2.25.0"
 us = "^2.0.2"
 xlsxwriter = "^2.0.0"
 [tool.poetry.dev-dependencies]
 black = {version = "^21.6b0", allow-prereleases = true}