From 589ec483e3ecdb24d24c72227eb754aa9449d0d3 Mon Sep 17 00:00:00 2001
From: Lucas Merrill Brown <lucas.m.brown@omb.eop.gov>
Date: Thu, 24 Jun 2021 14:11:07 -0700
Subject: [PATCH] Ingest census data directly, add unemployment to the score
 (#214)

* Ingest two data sources and add to score

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
---
 score/data/dataset/ejscreen_2020/__init__.py  |   0
 score/ipython/census_etl.ipynb                | 152 ++++++++++++++++++
 score/ipython/ejscreen_etl.ipynb              |  11 +-
 .../housing_and_transportation_etl.ipynb      | 132 +++++++++++++++
 score/ipython/score_calc.ipynb                |  90 ++++++++++-
 score/ipython/scoring_comparison.ipynb        |  35 ++--
 score/requirements.txt                        |   1 +
 7 files changed, 395 insertions(+), 26 deletions(-)
 delete mode 100644 score/data/dataset/ejscreen_2020/__init__.py
 create mode 100644 score/ipython/census_etl.ipynb
 create mode 100644 score/ipython/housing_and_transportation_etl.ipynb

diff --git a/score/data/dataset/ejscreen_2020/__init__.py b/score/data/dataset/ejscreen_2020/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/score/ipython/census_etl.ipynb b/score/ipython/census_etl.ipynb
new file mode 100644
index 00000000..291ce9b2
--- /dev/null
+++ b/score/ipython/census_etl.ipynb
@@ -0,0 +1,152 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0491828b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import censusdata\n",
+    "import csv\n",
+    "from pathlib import Path\n",
+    "import os\n",
+    "\n",
+    "ACS_YEAR = 2019\n",
+    "\n",
+    "DATA_PATH = Path.cwd().parent / \"data\"\n",
+    "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
+    "OUTPUT_PATH = DATA_PATH / \"dataset\" / f\"census_acs_{ACS_YEAR}\"\n",
+    "\n",
+    "GEOID_FIELD_NAME = \"GEOID10\"\n",
+    "UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n",
+    "\n",
+    "# Some display settings to make pandas outputs more readable.\n",
+    "pd.set_option(\"display.expand_frame_repr\", False)\n",
+    "pd.set_option(\"display.precision\", 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "654f25a1",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
+    "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
+    "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B23025\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8999cea4",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
+    "    \"\"\"Join the values of the censusgeo index's params into a single FIPS code.\"\"\"\n",
+    "    geo_values = (value for _key, value in censusgeo.params())\n",
+    "    return \"\".join(geo_values)\n",
+    "\n",
+    "\n",
+    "dfs = []\n",
+    "with open(FIPS_CSV_PATH) as csv_file:\n",
+    "    csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
+    "    line_count = 0\n",
+    "    # The first row is skipped (treated as a header); column 0 is used as the FIPS code.\n",
+    "    for row in csv_reader:\n",
+    "        if line_count == 0:\n",
+    "            line_count += 1\n",
+    "        else:\n",
+    "            fips = row[0].strip()\n",
+    "            print(f\"Downloading data for state/territory with FIPS code {fips}\")\n",
+    "            # One download per state: all counties and all block groups (wildcards).\n",
+    "            dfs.append(\n",
+    "                censusdata.download(\n",
+    "                    src=\"acs5\",\n",
+    "                    year=ACS_YEAR,\n",
+    "                    geo=censusdata.censusgeo(\n",
+    "                        [(\"state\", fips), (\"county\", \"*\"), (\"block group\", \"*\")]\n",
+    "                    ),\n",
+    "                    var=[\"B23025_005E\", \"B23025_003E\"],\n",
+    "                )\n",
+    "            )\n",
+    "\n",
+    "df = pd.concat(dfs)\n",
+    "# Convert each index entry into a plain GEOID string column for later joins.\n",
+    "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "803cce31",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# Calculate unemployment as a fraction, matching UNEMPLOYED_FIELD_NAME (not a percentage).\n",
+    "# TODO: remove small-sample data that should be `None` instead of a high-variance fraction.\n",
+    "df[UNEMPLOYED_FIELD_NAME] = df.B23025_005E / df.B23025_003E\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a269bb1",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Create the output directory (and any missing parents) if it does not exist.\n",
+    "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "columns_to_include = [GEOID_FIELD_NAME, UNEMPLOYED_FIELD_NAME]\n",
+    "\n",
+    "df[columns_to_include].to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91932af5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/score/ipython/ejscreen_etl.ipynb b/score/ipython/ejscreen_etl.ipynb
index 48807596..b91549d2 100644
--- a/score/ipython/ejscreen_etl.ipynb
+++ b/score/ipython/ejscreen_etl.ipynb
@@ -69,6 +69,7 @@
    "outputs": [],
    "source": [
     "# write nationwide csv\n",
+    "csv_path.mkdir(parents=True, exist_ok=True)\n",
     "df.to_csv(csv_path / f\"usa.csv\", index=False)"
    ]
   },
@@ -94,6 +95,14 @@
     "            # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
     "            df1.to_csv(csv_path / f\"data{fips}.csv\", index=False)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81b977f8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -112,7 +121,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.9.0"
   }
  },
  "nbformat": 4,
diff --git a/score/ipython/housing_and_transportation_etl.ipynb b/score/ipython/housing_and_transportation_etl.ipynb
new file mode 100644
index 00000000..63a86216
--- /dev/null
+++ b/score/ipython/housing_and_transportation_etl.ipynb
@@ -0,0 +1,132 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c21b63a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import censusdata\n",
+    "import csv\n",
+    "import requests\n",
+    "import zipfile\n",
+    "\n",
+    "from pathlib import Path\n",
+    "\n",
+    "ACS_YEAR = 2019\n",
+    "\n",
+    "DATA_PATH = Path.cwd().parent / \"data\"\n",
+    "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
+    "OUTPUT_PATH = DATA_PATH / \"dataset\" / \"housing_and_transportation_index\"\n",
+    "\n",
+    "GEOID_FIELD_NAME = \"GEOID10\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6696bc66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example download URL: https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid=01\n",
+    "\n",
+    "# Download each state / territory individually\n",
+    "dfs = []\n",
+    "with open(FIPS_CSV_PATH) as csv_file:\n",
+    "    csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
+    "    line_count = 0\n",
+    "\n",
+    "    for row in csv_reader:\n",
+    "        if line_count == 0:\n",
+    "            line_count += 1\n",
+    "        else:\n",
+    "            fips = row[0].strip()\n",
+    "\n",
+    "            print(f\"Downloading data for state/territory with FIPS code {fips}\")\n",
+    "            # NOTE(review): verify=False skips TLS certificate checks; confirm it is needed.\n",
+    "            download = requests.get(\n",
+    "                f\"https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid={fips}\",\n",
+    "                verify=False,\n",
+    "            )\n",
+    "            # Fail fast on an HTTP error instead of saving an error page as a zip.\n",
+    "            download.raise_for_status()\n",
+    "            zip_file_dir = DATA_PATH / \"tmp\" / \"housing_and_transportation_index\"\n",
+    "            # Make the directory if it doesn't exist\n",
+    "            zip_file_dir.mkdir(parents=True, exist_ok=True)\n",
+    "            zip_file_path = zip_file_dir / f\"{fips}-downloaded.zip\"\n",
+    "            # The context manager closes the file even if the write raises.\n",
+    "            with open(zip_file_path, \"wb\") as zip_file:\n",
+    "                zip_file.write(download.content)\n",
+    "\n",
+    "            with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
+    "                zip_ref.extractall(zip_file_dir)\n",
+    "\n",
+    "            # The extracted CSV is expected at this path:\n",
+    "            tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n",
+    "            tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)\n",
+    "\n",
+    "            dfs.append(tmp_df)\n",
+    "\n",
+    "df = pd.concat(dfs)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "244e0d03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rename and reformat block group ID\n",
+    "df.rename(columns={\"blkgrp\": GEOID_FIELD_NAME}, inplace=True)\n",
+    "df[GEOID_FIELD_NAME] = df[GEOID_FIELD_NAME].str.replace('\"', \"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8275c1ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "OUTPUT_PATH.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "df.to_csv(path_or_buf=OUTPUT_PATH / \"usa.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef5bb862",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/score/ipython/score_calc.ipynb b/score/ipython/score_calc.ipynb
index f2f032c8..853473f7 100644
--- a/score/ipython/score_calc.ipynb
+++ b/score/ipython/score_calc.ipynb
@@ -7,14 +7,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Before running this notebook, you must run the notebook `ejscreen_etl.ipynb`.\n",
+    "# Before running this notebook, you must run the following notebooks (in any order):\n",
+    "# 1. `ejscreen_etl.ipynb`\n",
+    "# 2. `census_etl.ipynb`\n",
+    "# 3. `housing_and_transportation_etl.ipynb`\n",
     "\n",
     "import collections\n",
+    "import functools\n",
     "from pathlib import Path\n",
     "import pandas as pd\n",
     "import csv\n",
     "\n",
     "# Define some global parameters\n",
+    "GEOID_FIELD_NAME = \"GEOID10\"\n",
     "BUCKET_SOCIOECONOMIC = \"Socioeconomic Factors\"\n",
     "BUCKET_SENSITIVE = \"Sensitive populations\"\n",
     "BUCKET_ENVIRONMENTAL = \"Environmental effects\"\n",
@@ -51,7 +56,62 @@
    "source": [
     "# EJSCreen csv Load\n",
     "ejscreen_csv = data_path / \"dataset\" / \"ejscreen_2020\" / \"usa.csv\"\n",
-    "df = pd.read_csv(ejscreen_csv, dtype={\"ID\": \"string\"}, low_memory=False)\n",
+    "ejscreen_df = pd.read_csv(ejscreen_csv, dtype={\"ID\": \"string\"}, low_memory=False)\n",
+    "ejscreen_df.rename(columns={\"ID\": GEOID_FIELD_NAME}, inplace=True)\n",
+    "ejscreen_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "daba69fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load census data\n",
+    "census_csv = data_path / \"dataset\" / \"census_acs_2019\" / \"usa.csv\"\n",
+    "census_df = pd.read_csv(\n",
+    "    census_csv, dtype={GEOID_FIELD_NAME: \"string\"}, low_memory=False\n",
+    ")\n",
+    "census_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "144bdde2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load housing and transportation data\n",
+    "housing_and_transportation_index_csv = (\n",
+    "    data_path / \"dataset\" / \"housing_and_transportation_index\" / \"usa.csv\"\n",
+    ")\n",
+    "housing_and_transportation_df = pd.read_csv(\n",
+    "    housing_and_transportation_index_csv,\n",
+    "    dtype={GEOID_FIELD_NAME: \"string\"},\n",
+    "    low_memory=False,\n",
+    ")\n",
+    "housing_and_transportation_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf89efd8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Join all the data sources that use census block groups\n",
+    "dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n",
+    "\n",
+    "df = functools.reduce(\n",
+    "    lambda left, right: pd.merge(\n",
+    "        left=left, right=right, on=GEOID_FIELD_NAME, how=\"outer\"\n",
+    "    ),\n",
+    "    dfs,\n",
+    ")\n",
+    "\n",
     "df.head()"
    ]
   },
@@ -70,9 +130,10 @@
     "data_sets = [\n",
     "    # The following data sets have `bucket=None`, because it's not used in the score.\n",
     "    DataSet(\n",
-    "        input_field=\"ID\", \n",
+    "        input_field=GEOID_FIELD_NAME,\n",
     "        # Use the name `GEOID10` to enable geoplatform.gov's workflow.\n",
-    "        renamed_field=\"GEOID10\", bucket=None\n",
+    "        renamed_field=GEOID_FIELD_NAME,\n",
+    "        bucket=None,\n",
     "    ),\n",
     "    DataSet(input_field=\"ACSTOTPOP\", renamed_field=\"Total population\", bucket=None),\n",
     "    # The following data sets have buckets, because they're used in the score\n",
@@ -152,6 +213,17 @@
     "        renamed_field=\"Percent individuals age 25 or over with less than high school degree\",\n",
     "        bucket=BUCKET_SOCIOECONOMIC,\n",
     "    ),\n",
+    "    DataSet(\n",
+    "        input_field=\"Unemployed Civilians (fraction)\",\n",
+    "        # Following EJSCREEN conventions, where fractional data is named as a percent.\n",
+    "        renamed_field=\"Unemployed Civilians (percent)\",\n",
+    "        bucket=BUCKET_SOCIOECONOMIC,\n",
+    "    ),\n",
+    "    DataSet(\n",
+    "        input_field=\"ht_ami\",\n",
+    "        renamed_field=\"Housing + Transportation Costs % Income for the Regional Typical Household\",\n",
+    "        bucket=BUCKET_SOCIOECONOMIC,\n",
+    "    ),\n",
     "]"
    ]
   },
@@ -305,6 +377,14 @@
     "            # we need to name the file data01.csv for ogr2ogr csv merge to work\n",
     "            df1.to_csv(score_csv_path / f\"data{states_fips}.csv\", index=False)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "167ebba3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -323,7 +403,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.1"
+   "version": "3.9.0"
   }
  },
  "nbformat": 4,
diff --git a/score/ipython/scoring_comparison.ipynb b/score/ipython/scoring_comparison.ipynb
index 6bd965c2..49e66835 100644
--- a/score/ipython/scoring_comparison.ipynb
+++ b/score/ipython/scoring_comparison.ipynb
@@ -71,10 +71,10 @@
     "# Rename unclear name \"id\" to \"census_block_group_id\", as well as other renamings.\n",
     "cejst_df.rename(\n",
     "    columns={\n",
-    "        \"ID\": CENSUS_BLOCK_GROUP_ID_FIELD,\n",
-    "        \"ACSTOTPOP\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n",
-    "        \"score_a\": CEJST_SCORE_FIELD,\n",
-    "        \"score_a_percentile\": CEJST_PERCENTILE_FIELD,\n",
+    "        \"GEOID10\": CENSUS_BLOCK_GROUP_ID_FIELD,\n",
+    "        \"Total population\": CENSUS_BLOCK_GROUP_POPULATION_FIELD,\n",
+    "        \"Score C\": CEJST_SCORE_FIELD,\n",
+    "        \"Score C (percentile)\": CEJST_PERCENTILE_FIELD,\n",
     "    },\n",
     "    inplace=True,\n",
     "    errors=\"raise\",\n",
@@ -110,10 +110,13 @@
     "# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:\n",
     "# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip\n",
     "\n",
-    "download = requests.get(\"https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip\", verify=False)\n",
+    "download = requests.get(\n",
+    "    \"https://justice40-data.s3.amazonaws.com/CalEnviroScreen/CalEnviroScreen_4.0_2021.zip\",\n",
+    "    verify=False,\n",
+    ")\n",
     "file_contents = download.content\n",
     "zip_file_path = TEMP_DATA_DIR\n",
-    "zip_file = open(zip_file_path  / \"downloaded.zip\", \"wb\")\n",
+    "zip_file = open(zip_file_path / \"downloaded.zip\", \"wb\")\n",
     "zip_file.write(file_contents)\n",
     "zip_file.close()"
    ]
@@ -298,11 +301,11 @@
    "id": "0c534966",
    "metadata": {
     "variables": {
-     "all_100_sum": "1168",
-     "all_100_sum_percent": "59%",
-     "at_least_one_sum": "1817",
-     "at_least_one_sum_percent": "92%",
-     "cejst_cbgs_ca_only": "6987",
+     "all_100_sum": "1373",
+     "all_100_sum_percent": "69%",
+     "at_least_one_sum": "1866",
+     "at_least_one_sum_percent": "94%",
+     "cejst_cbgs_ca_only": "10849",
      "ces_tracts_count": "1983"
     }
    },
@@ -319,14 +322,6 @@
     "\n",
     "Out of every CalEnviroScreen Disadvantaged Community census tract, {{all_100_sum}} ({{all_100_sum_percent}}) of these census tracts have 100% of the included census block groups within them considered priority communities by the current version of the CEJST score."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "db3c7d38",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -345,7 +340,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.0"
+   "version": "3.7.1"
   }
  },
  "nbformat": 4,
diff --git a/score/requirements.txt b/score/requirements.txt
index 87d00c77..e8b20fbe 100644
--- a/score/requirements.txt
+++ b/score/requirements.txt
@@ -1,3 +1,4 @@
+censusdata
 ipython
 jupyter
 jupyter_contrib_nbextensions