From 05ebf9b48cab92fbe5166ddaaae43a4f0a236907 Mon Sep 17 00:00:00 2001
From: Lucas Merrill Brown <lucas.m.brown@omb.eop.gov>
Date: Sat, 13 Nov 2021 10:29:23 -0500
Subject: [PATCH] Add median house value to Definition L (#882)

* Add house value to ETL

* Add house value to score formula and comparison tool
---
 .../data_pipeline/etl/score/etl_score.py      |  1 +
 .../etl/sources/census_acs/etl.py             | 46 ++++++++++------
 .../etl/sources/census_decennial/etl.py       | 10 ++--
 .../ipython/census_explore.ipynb              | 53 +++++++++++--------
 .../ipython/scoring_comparison.ipynb          |  2 +
 .../data_pipeline/score/field_names.py        |  2 +
 .../data_pipeline/score/score_l.py            | 24 +++++++--
 7 files changed, 90 insertions(+), 48 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index 729586f0..8c610fac 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -312,6 +312,7 @@ class ScoreETL(ExtractTransformLoad):
             field_names.HIGH_SCHOOL_ED_FIELD,
             field_names.UNEMPLOYMENT_FIELD,
             field_names.HT_INDEX_FIELD,
+            field_names.MEDIAN_HOUSE_VALUE_FIELD,
         ]
 
         non_numeric_columns = [
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
index 25b1bf06..dda15015 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -50,6 +50,11 @@ class CensusACSETL(ExtractTransformLoad):
             "Percent of individuals < 200% Federal Poverty Line"
         )
 
+        self.MEDIAN_HOUSE_VALUE_FIELD = "B25077_001E"
+        self.MEDIAN_HOUSE_VALUE_FIELD_NAME = (
+            "Median value ($) of owner-occupied housing units"
+        )
+
         self.STATE_GEOID_FIELD_NAME = "GEOID2"
         self.df: pd.DataFrame
 
@@ -78,7 +83,10 @@ class CensusACSETL(ExtractTransformLoad):
                         # Emploment fields
                         "B23025_005E",
                         "B23025_003E",
+                        # Income field
                         self.MEDIAN_INCOME_FIELD,
+                        # House value
+                        self.MEDIAN_HOUSE_VALUE_FIELD,
                     ]
                     + self.LINGUISTIC_ISOLATION_FIELDS
                     + self.POVERTY_FIELDS,
@@ -94,22 +102,27 @@ class CensusACSETL(ExtractTransformLoad):
     def transform(self) -> None:
         logger.info("Starting Census ACS Transform")
 
-        # Rename median income
-        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
-            self.MEDIAN_INCOME_FIELD
-        ]
+        # Rename the median house value and median income columns.
+        self.df = self.df.rename(
+            columns={
+                self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
+                self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
+            }
+        )
 
-        # Handle null values for CBG median income, which are `-666666666`.
-        missing_value_count = sum(
-            self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666
-        )
-        logger.info(
-            f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of "
-            + f"`{self.MEDIAN_INCOME_FIELD_NAME}` being marked as null values."
-        )
-        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
-            self.MEDIAN_INCOME_FIELD_NAME
-        ].replace(to_replace=-666666666, value=None)
+        # Mark the sentinel value `-666666666` as null in each field below.
+        for field in [
+            self.MEDIAN_INCOME_FIELD_NAME,
+            self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
+        ]:
+            missing_value_count = sum(self.df[field] == -666666666)
+            logger.info(
+                f"There are {missing_value_count} ({int(100*missing_value_count/self.df[field].count())}%) values of "
+                + f"`{field}` being marked as null values."
+            )
+            self.df[field] = self.df[field].replace(
+                to_replace=-666666666, value=None
+            )
 
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
@@ -133,8 +146,6 @@ class CensusACSETL(ExtractTransformLoad):
             / self.df["C16002_001E"]
         )
 
-        self.df[self.LINGUISTIC_ISOLATION_FIELD_NAME].describe()
-
         # Calculate percent at different poverty thresholds
         self.df[self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME] = (
             self.df["C17002_002E"] + self.df["C17002_003E"]
@@ -170,6 +181,7 @@ class CensusACSETL(ExtractTransformLoad):
             self.POVERTY_LESS_THAN_100_PERCENT_FPL_FIELD_NAME,
             self.POVERTY_LESS_THAN_150_PERCENT_FPL_FIELD_NAME,
             self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
+            self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
         ]
 
         self.df[columns_to_include].to_csv(
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
index 6b51a9c7..f3ba33fe 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py
@@ -56,13 +56,17 @@ class CensusDecennialETL(ExtractTransformLoad):
 
         self.MALE_HIGH_SCHOOL_ED_FIELD = "PBG026005"
         self.MALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032011"
-        self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Male!!High school graduate, GED, or alternative; "\
+        self.MALE_HIGH_SCHOOL_ED_FIELD_NAME = (
+            "Total!!Male!!High school graduate, GED, or alternative; "
             "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
+        )
 
         self.FEMALE_HIGH_SCHOOL_ED_FIELD = "PBG026012"
         self.FEMALE_HIGH_SCHOOL_ED_VI_FIELD = "PCT032028"
-        self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = "Total!!Female!!High school graduate, GED, or alternative; "\
-           "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
+        self.FEMALE_HIGH_SCHOOL_ED_FIELD_NAME = (
+            "Total!!Female!!High school graduate, GED, or alternative; "
+            "SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER"
+        )
 
         self.PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME = (
             "PERCENTAGE_HIGH_SCHOOL_ED_FIELD_NAME"
diff --git a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb
index 731b7371..d29076d2 100644
--- a/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/census_explore.ipynb
@@ -3,6 +3,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "4899d2ef",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import censusdata\n",
@@ -29,28 +32,34 @@
     "# Some display settings to make pandas outputs more readable.\n",
     "pd.set_option(\"display.expand_frame_repr\", False)\n",
     "pd.set_option(\"display.precision\", 2)"
-   ],
-   "outputs": [],
-   "metadata": {}
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "4dd8feec",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
    "source": [
     "# Following the tutorial at https://jtleider.github.io/censusdata/example1.html.\n",
     "# Full list of fields is at https://www2.census.gov/programs-surveys/acs/summary_file/2019/documentation/user_tools/ACS2019_Table_Shells.xlsx\n",
     "censusdata.printtable(\n",
-    "    censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B19013\")\n",
-    ")"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+    "    censusdata.censustable(src=\"acs5\", year=ACS_YEAR, table=\"B25077\")\n",
+    ")\n",
+    "\n",
+    "# censusdata.search(src=\"acs5\", year=ACS_YEAR, field='label', criterion='Owner-occupied units!!Median')"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "7b40afd3",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
    "source": [
     "def fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:\n",
     "    \"\"\"Create a FIPS code from the proprietary censusgeo index.\"\"\"\n",
@@ -82,15 +91,16 @@
     ")\n",
     "\n",
     "df.head()"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "caa0b502",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
    "source": [
     "columns_to_include = [\"GEOID2\", \"Median household income (State)\"]\n",
     "\n",
@@ -103,18 +113,15 @@
     ")\n",
     "\n",
     "# df[columns_to_include].to_csv(path_or_buf= \"/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/data/needs_to_be_moved_to_s3/2014_to_2019_state_median_income.csv\", index=False)"
-   ],
-   "outputs": [],
-   "metadata": {
-    "scrolled": true
-   }
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "source": [],
+   "id": "f2bddf6a",
+   "metadata": {},
    "outputs": [],
-   "metadata": {}
+   "source": []
   }
  ],
  "metadata": {
@@ -138,4 +145,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
index 3beb7a0f..b33ae10d 100644
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -86,6 +86,7 @@
     "BAD_HEALTH_FIELD = (\n",
     "    \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
     ")\n",
+    "MEDIAN_HOUSE_VALUE_FIELD = \"Median value ($) of owner-occupied housing units\"\n",
     "\n",
     "# Define some suffixes\n",
     "POPULATION_SUFFIX = \" (priority population)\""
@@ -186,6 +187,7 @@
     "    \"Particulate matter (PM2.5) (percentile)\",\n",
     "    \"Traffic proximity and volume (percentile)\",\n",
     "    \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n",
+    "    MEDIAN_HOUSE_VALUE_FIELD,\n",
     "]:\n",
     "    print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
     "    print(cejst_df[field].describe())\n",
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index 9c7c22d9..f1f30a76 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -114,6 +114,8 @@ OVER_64_FIELD = "Individuals over 64 years old"
 # Urban Rural Map
 URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
 
+# Housing value
+MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"
 
 # EJSCREEN Areas of Concern
 EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
diff --git a/data/data-pipeline/data_pipeline/score/score_l.py b/data/data-pipeline/data_pipeline/score/score_l.py
index fa149300..6eba9cb1 100644
--- a/data/data-pipeline/data_pipeline/score/score_l.py
+++ b/data/data-pipeline/data_pipeline/score/score_l.py
@@ -11,6 +11,7 @@ class ScoreL(Score):
     def __init__(self, df: pd.DataFrame) -> None:
         self.LOW_INCOME_THRESHOLD: float = 0.65
         self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
+        self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
         super().__init__(df)
 
     def add_columns(self) -> pd.DataFrame:
@@ -135,8 +136,12 @@ class ScoreL(Score):
         ) & transportation_criteria
 
     def _housing_factor(self) -> bool:
+        # (
         # In Xth percentile or above for lead paint (Source: Census's American Community Survey’s
         # percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
+        # AND
+        # In Yth percentile or below for Median House Value (Source: Census's American Community Survey)
+        # )
         # or
         # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset
         # AND
@@ -144,11 +149,20 @@ class ScoreL(Score):
         # of households where household income is less than or equal to twice the federal
         # poverty level. Source: Census's American Community Survey]
         housing_criteria = (
-            self.df[
-                field_names.LEAD_PAINT_FIELD
-                + field_names.PERCENTILE_FIELD_SUFFIX
-            ]
-            > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            (
+                self.df[
+                    field_names.LEAD_PAINT_FIELD
+                    + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            & (
+                self.df[
+                    field_names.MEDIAN_HOUSE_VALUE_FIELD
+                    + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                < self.MEDIAN_HOUSE_VALUE_THRESHOLD
+            )
         ) | (
             self.df[
                 field_names.HOUSING_BURDEN_FIELD