including fraction of state AMI

2025-07-28 14:11:17 -07:00 · 2021-08-09 21:30:41 -05:00 · 2021-08-09 21:30:41 -05:00 · ce5e8c5351
commit ce5e8c5351
parent 4ae7eff4c4
4 changed files with 1016 additions and 775 deletions
--- a/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv
+++ b/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv
@ -0,0 +1,53 @@
+GEOID2,Median household income (State)
+01,50536
+02,77640
+04,58945
+05,47597
+06,75235
+08,72331
+09,78444
+10,68287
+11,86420
+12,55660
+13,58700
+15,81275
+16,55785
+17,65886
+18,56303
+19,60523
+20,59597
+21,50589
+22,49469
+23,57918
+24,84805
+25,81215
+26,57144
+27,71306
+28,45081
+29,55461
+30,54970
+31,61439
+32,60365
+33,76768
+34,82545
+35,49754
+36,68486
+37,54602
+38,64894
+39,56602
+40,52919
+41,62818
+42,61744
+44,67167
+45,53199
+46,58275
+47,53320
+48,61874
+49,71621
+50,61973
+51,74222
+53,73775
+54,46711
+55,61747
+56,64049
+72,20539
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@ -24,7 +24,20 @@ class CensusACSETL(ExtractTransformLoad):
        ]
        self.MEDIAN_INCOME_FIELD = "B19013_001E"
        self.MEDIAN_INCOME_FIELD_NAME = "Median household income in the past 12 months"
+        self.MEDIAN_INCOME_STATE_FIELD_NAME = "Median household income (State)"
+        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
+            "Median household income (% of state median household income)"
+        )
+        self.STATE_GEOID_FIELD_NAME = "GEOID2"
        self.df: pd.DataFrame
+        self.state_median_income_df: pd.DataFrame
+
+        # TODO: refactor this to put this file on s3 and download it from there
+        self.STATE_MEDIAN_INCOME_FILE_PATH = (
+            self.DATA_PATH
+            / "needs_to_be_moved_to_s3"
+            / "2014_to_2019_state_median_income.csv"
+        )

    def _fips_from_censusdata_censusgeo(self, censusgeo: censusdata.censusgeo) -> str:
        """Create a FIPS code from the proprietary censusgeo index."""
@ -59,12 +72,36 @@ class CensusACSETL(ExtractTransformLoad):
            func=self._fips_from_censusdata_censusgeo
        )

+        self.state_median_income_df = pd.read_csv(
+            # TODO: Replace with reading from S3.
+            filepath_or_buffer=self.STATE_MEDIAN_INCOME_FILE_PATH,
+            dtype={self.STATE_GEOID_FIELD_NAME: "string"},
+        )
+
    def transform(self) -> None:
        logger.info("Starting Census ACS Transform")

        # Rename median income
        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[self.MEDIAN_INCOME_FIELD]

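+        # TODO: handle null values for CBG median income, which appear as `-666666666`; until then, the percent-of-state value computed below is a meaningless negative number for those rows.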
+
+        # Join state data on CBG data:
+        self.df[self.STATE_GEOID_FIELD_NAME] = (
+            self.df[self.GEOID_FIELD_NAME].astype(str).str[0:2]
+        )
+        self.df = self.df.merge(
+            self.state_median_income_df,
+            how="left",
+            on=self.STATE_GEOID_FIELD_NAME,
+        )
+
+        # Calculate the block group's median income as a fraction of the state median income (a plain ratio, not multiplied by 100, despite the "%" in the field name):
+        self.df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] = (
+            self.df[self.MEDIAN_INCOME_FIELD_NAME]
+            / self.df[self.MEDIAN_INCOME_STATE_FIELD_NAME]
+        )
+
        # Calculate percent unemployment.
        # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
        self.df[self.UNEMPLOYED_FIELD_NAME] = self.df.B23025_005E / self.df.B23025_003E
@ -98,6 +135,8 @@ class CensusACSETL(ExtractTransformLoad):
            self.UNEMPLOYED_FIELD_NAME,
            self.LINGUISTIC_ISOLATION_FIELD_NAME,
            self.MEDIAN_INCOME_FIELD_NAME,
+            self.MEDIAN_INCOME_STATE_FIELD_NAME,
+            self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME,
        ]

        self.df[columns_to_include].to_csv(
--- a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb
@ -0,0 +1,136 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0491828b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import censusdata\n",
+    "import csv\n",
+    "from pathlib import Path\n",
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "module_path = os.path.abspath(os.path.join(\"../..\"))\n",
+    "if module_path not in sys.path:\n",
+    "    sys.path.append(module_path)\n",
+    "\n",
+    "from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes\n",
+    "\n",
+    "\n",
+    "ACS_YEAR = 2019\n",
+    "\n",
+    "DATA_PATH = Path.cwd().parent / \"data\"\n",
+    "FIPS_CSV_PATH = DATA_PATH / \"fips_states_2010.csv\"\n",
+    "\n",
+    "GEOID_FIELD_NAME = \"GEOID10\"\n",
+    "UNEMPLOYED_FIELD_NAME = \"Unemployed Civilians (fraction)\"\n",
+    "\n",
+    "# Some display settings to make pandas outputs more readable.\n",
+    "pd.set_option(\"display.expand_frame_repr\", False)\n",
+    "pd.set_option(\"display.precision\", 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "654f25a1",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
+    "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
+    "censusdata.printtable(censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8999cea4",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
+    "    \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
+    "    fips = \"\".join([value for (key, value) in censusgeo.params()])\n",
+    "    return fips\n",
+    "\n",
+    "\n",
+    "dfs = []\n",
+    "for fips in get_state_fips_codes(DATA_PATH):\n",
+    "    print(f\"Fetching data for fips {fips}\")\n",
+    "    dfs.append(\n",
+    "        censusdata.download(\n",
+    "            src=\"acs5\",\n",
+    "            year=ACS_YEAR,\n",
+    "            geo=censusdata.censusgeo(\n",
+    "                [\n",
+    "                    (\"state\", fips) \n",
+    "                 #, (\"county\", \"*\"), (\"block group\", \"*\")\n",
+    "                ]\n",
+    "            ),\n",
+    "            var=[\"B23025_005E\", \"B23025_003E\", \"B19013_001E\"],\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "df = pd.concat(dfs)\n",
+    "\n",
+    "df[GEOID_FIELD_NAME] = df.index.to_series().apply(func=fips_from_censusdata_censusgeo)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a269bb1",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
+    "\n",
+    "df.rename(columns={\"GEOID10\": \"GEOID2\", \"B19013_001E\": \"Median household income (State)\"}, inplace=True)\n",
+    "\n",
+    "df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91932af5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb