From 678ab8c081f0bb3d5c3dc79eb9bc4e025ae745d4 Mon Sep 17 00:00:00 2001
From: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Date: Wed, 23 Jun 2021 12:26:45 -0700
Subject: [PATCH] adding housing CSV to score

---
 .../housing_and_transportation_etl.ipynb      | 124 ++++++++++++++++++
 score/ipython/score_calc.ipynb                |  41 +++++-
 score/ipython/scoring_comparison.ipynb        |  12 +-
 3 files changed, 165 insertions(+), 12 deletions(-)
 create mode 100644 score/ipython/housing_and_transportation_etl.ipynb

diff --git a/score/ipython/housing_and_transportation_etl.ipynb b/score/ipython/housing_and_transportation_etl.ipynb
new file mode 100644
index 00000000..e7fb0daa
--- /dev/null
+++ b/score/ipython/housing_and_transportation_etl.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c21b63a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import censusdata\n",
+    "import csv\n",
+    "import requests\n",
+    "import zipfile\n",
+    "\n",
+    "from pathlib import Path\n",
+    "\n",
+    "ACS_YEAR = 2019\n",
+    "# Input and output locations, resolved from the parent of the working directory\n",
+    "DATA_PATH = Path.cwd().parent.joinpath(\"data\")\n",
+    "FIPS_CSV_PATH = DATA_PATH.joinpath(\"fips_states_2010.csv\")\n",
+    "OUTPUT_PATH = DATA_PATH.joinpath(\"dataset\", \"housing_and_transportation_index\")\n",
+    "# Column name used for the block group identifier\n",
+    "GEOID_FIELD_NAME = \"GEOID10\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6696bc66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid=01\n",
+    "\n",
+    "# Working directory for the downloaded archives; make it if it doesn't exist\n",
+    "zip_file_dir = DATA_PATH / \"tmp\" / \"housing_and_transportation_index\"\n",
+    "zip_file_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "# Download each state / territory individually\n",
+    "dfs = []\n",
+    "with open(FIPS_CSV_PATH) as csv_file:\n",
+    "    csv_reader = csv.reader(csv_file, delimiter=\",\")\n",
+    "\n",
+    "    # Skip the header row\n",
+    "    next(csv_reader, None)\n",
+    "\n",
+    "    for row in csv_reader:\n",
+    "        fips = row[0].strip()\n",
+    "\n",
+    "        print(f\"Downloading data for state/territory with FIPS code {fips}\")\n",
+    "\n",
+    "        # NOTE(review): verify=False disables TLS certificate checks for this\n",
+    "        # request; kept as in the original, confirm it is still required.\n",
+    "        download = requests.get(\n",
+    "            f\"https://htaindex.cnt.org/download/download.php?focus=blkgrp&geoid={fips}\",\n",
+    "            verify=False,\n",
+    "        )\n",
+    "        # Fail on an HTTP error instead of saving an error page as a zip\n",
+    "        download.raise_for_status()\n",
+    "\n",
+    "        # write_bytes opens, writes and closes the file in one call\n",
+    "        zip_file_path = zip_file_dir / f\"{fips}-downloaded.zip\"\n",
+    "        zip_file_path.write_bytes(download.content)\n",
+    "\n",
+    "        with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n",
+    "            zip_ref.extractall(zip_file_dir)\n",
+    "\n",
+    "        # Read the per-state CSV from the directory the archive was extracted into\n",
+    "        tmp_csv_file_path = zip_file_dir / f\"htaindex_data_blkgrps_{fips}.csv\"\n",
+    "        dfs.append(pd.read_csv(filepath_or_buffer=tmp_csv_file_path))\n",
+    "\n",
+    "df = pd.concat(dfs)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "244e0d03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use the shared GEOID column name and strip the quote characters from its values\n",
+    "df = df.rename(columns={\"blkgrp\": GEOID_FIELD_NAME})\n",
+    "df[GEOID_FIELD_NAME] = df[GEOID_FIELD_NAME].str.replace('\"', \"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8275c1ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ensure the output directory exists, then write the combined data without the index\n",
+    "OUTPUT_PATH.mkdir(exist_ok=True, parents=True)\n",
+    "df.to_csv(OUTPUT_PATH.joinpath(\"usa.csv\"), index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/score/ipython/score_calc.ipynb b/score/ipython/score_calc.ipynb
index 277c9646..a4a9652e 100644
--- a/score/ipython/score_calc.ipynb
+++ b/score/ipython/score_calc.ipynb
@@ -7,11 +7,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Before running this notebook, you must run the following notebooks:\n",
+    "# Before running this notebook, you must run the following notebooks (in any order):\n",
     "# 1. `ejscreen_etl.ipynb`\n",
     "# 2. `census_etl.ipynb`\n",
+    "# 3. `housing_and_transportation_etl.ipynb`\n",
     "\n",
     "import collections\n",
+    "import functools\n",
     "from pathlib import Path\n",
     "import pandas as pd\n",
     "import csv\n",
@@ -74,6 +76,25 @@
     "census_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "144bdde2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the housing and transportation index CSV, keeping the GEOID column as a string\n",
+    "housing_and_transportation_index_csv = data_path.joinpath(\n",
+    "    \"dataset\", \"housing_and_transportation_index\", \"usa.csv\"\n",
+    ")\n",
+    "housing_and_transportation_df = pd.read_csv(\n",
+    "    filepath_or_buffer=housing_and_transportation_index_csv,\n",
+    "    low_memory=False,\n",
+    "    dtype={GEOID_FIELD_NAME: \"string\"},\n",
+    ")\n",
+    "housing_and_transportation_df.head()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -81,11 +102,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Join the two datasets\n",
-    "df = ejscreen_df.merge(\n",
-    "    census_df,\n",
-    "    how=\"left\",\n",
-    "    on=GEOID_FIELD_NAME,\n",
+    "# Join all the data sources that use census block groups\n",
+    "dfs = [ejscreen_df, census_df, housing_and_transportation_df]\n",
+    "\n",
+    "df = functools.reduce(\n",
+    "    lambda left, right: pd.merge(\n",
+    "        left=left, right=right, on=GEOID_FIELD_NAME, how=\"outer\"\n",
+    "    ),\n",
+    "    dfs,\n",
     ")\n",
     "\n",
     "df.head()"
@@ -195,6 +219,11 @@
     "        renamed_field=\"Unemployed Civilians (percent)\",\n",
     "        bucket=BUCKET_SOCIOECONOMIC,\n",
     "    ),\n",
+    "    DataSet(\n",
+    "        input_field=\"ht_ami\",\n",
+    "        renamed_field=\"Housing + Transportation Costs % Income for the Regional Typical Household\",\n",
+    "        bucket=BUCKET_SOCIOECONOMIC,\n",
+    "    ),\n",
     "]"
    ]
   },
diff --git a/score/ipython/scoring_comparison.ipynb b/score/ipython/scoring_comparison.ipynb
index 1b3f2683..49e66835 100644
--- a/score/ipython/scoring_comparison.ipynb
+++ b/score/ipython/scoring_comparison.ipynb
@@ -301,12 +301,12 @@
    "id": "0c534966",
    "metadata": {
     "variables": {
-     "all_100_sum": {},
-     "all_100_sum_percent": {},
-     "at_least_one_sum": {},
-     "at_least_one_sum_percent": {},
-     "cejst_cbgs_ca_only": {},
-     "ces_tracts_count": {}
+     "all_100_sum": "1373",
+     "all_100_sum_percent": "69%",
+     "at_least_one_sum": "1866",
+     "at_least_one_sum_percent": "94%",
+     "cejst_cbgs_ca_only": "10849",
+     "ces_tracts_count": "1983"
     }
    },
    "source": [