Issue 242: Add HOLC Grades to data inputs (#978)

* Add mapping inequality data to data inputs
* Add mapping inequality data to comparison tool
2025-07-25 07:10:16 -07:00 · 2021-12-04 12:23:01 -05:00 · 2021-12-04 12:23:01 -05:00 · c5dff6e5f7
commit c5dff6e5f7
parent 1d101c93d2
10 changed files with 317 additions and 15 deletions
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@@ -49,6 +49,11 @@ DATASET_LIST = [
        "module_dir": "geocorr",
        "class_name": "GeoCorrETL",
    },
+    {
+        "name": "mapping_inequality",
+        "module_dir": "mapping_inequality",
+        "class_name": "MappingInequalityETL",
+    },
    {
        "name": "persistent_poverty",
        "module_dir": "persistent_poverty",
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -1,7 +1,9 @@
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
+from data_pipeline.etl.sources.census_acs.etl_utils import (
+    retrieve_census_acs_data,
+)
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_utils.py
@@ -9,9 +9,7 @@ from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)


-def _fips_from_censusdata_censusgeo(
-        censusgeo: censusdata.censusgeo
-) -> str:
+def _fips_from_censusdata_censusgeo(censusgeo: censusdata.censusgeo) -> str:
    """Create a FIPS code from the proprietary censusgeo index."""
    fips = "".join([value for (key, value) in censusgeo.params()])
    return fips
@@ -19,12 +17,12 @@ def _fips_from_censusdata_censusgeo(

 # pylint: disable=too-many-arguments
 def retrieve_census_acs_data(
-        acs_year: int,
-        variables: List[str],
-        tract_output_field_name: str,
-        data_path_for_fips_codes: Path,
-        acs_type="acs5",
-        raise_errors: bool = False,
+    acs_year: int,
+    variables: List[str],
+    tract_output_field_name: str,
+    data_path_for_fips_codes: Path,
+    acs_type="acs5",
+    raise_errors: bool = False,
 ) -> pd.DataFrame:
    """Retrieves and combines census ACS data for a given year."""
    dfs = []
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
@@ -1,7 +1,9 @@
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.sources.census_acs.etl_utils import retrieve_census_acs_data
+from data_pipeline.etl.sources.census_acs.etl_utils import (
+    retrieve_census_acs_data,
+)
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
--- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/README.md
+++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/README.md
--- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/__init__.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/__init__.py
--- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/data/holc_grades_manually_mapped.csv
+++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/data/holc_grades_manually_mapped.csv
@@ -0,0 +1,40 @@
+city,holc_id,HOLC Grade (manually mapped)
+Providence,25,D
+Providence,26,D
+Oklahoma City,46R,D
+Oklahoma City,47R,D
+Oklahoma City,48R,D
+Oklahoma City,49R,D
+Oklahoma City,50R,D
+Oklahoma City,51R,D
+Oklahoma City,52R,D
+Oklahoma City,53R,D
+Oklahoma City,54R,D
+Oklahoma City,55R,D
+Oklahoma City,56R,D
+Oklahoma City,57R,D
+Oklahoma City,58R,D
+Oklahoma City,59R,D
+Oklahoma City,60R,D
+Oklahoma City,61R,D
+Oklahoma City,62B,D
+Oklahoma City,63R,D
+Oklahoma City,64R,D
+Oklahoma City,65R,D
+Oklahoma City,66R,D
+Oklahoma City,67R,D
+Oklahoma City,68R,D
+Oklahoma City,69R,D
+Oklahoma City,70R,D
+Oklahoma City,80R,D
+Oklahoma City,81R,D
+Oklahoma City,85R,D
+Oklahoma City,86R,D
+Oklahoma City,87R,D
+Oklahoma City,88R,D
+Oklahoma City,89R,D
+Oklahoma City,90R,D
+Milwaukee Co.,S-D1,D
+Milwaukee Co.,S-D2,D
+Milwaukee Co.,S-D3,D
+Milwaukee Co.,S-D4,D
--- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py
@@ -0,0 +1,177 @@
+import pathlib
+import numpy as np
+import pandas as pd
+
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.score import field_names
+from data_pipeline.utils import download_file_from_url, get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class MappingInequalityETL(ExtractTransformLoad):
+    """Load Mapping Inequality data.
+
+    Information on the source data is available at
+    https://dsl.richmond.edu/panorama/redlining/.
+
+    Information on the mapping of this data to census tracts is available at
+    https://github.com/americanpanorama/Census_HOLC_Research.
+    """
+
+    def __init__(self):
+        """Set the source URL, file paths, and field names for this ETL."""
+        self.MAPPING_INEQUALITY_CSV_URL = (
+            "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
+            "main/2010_Census_Tracts/holc_tract_lookup.csv"
+        )
+        self.MAPPING_INEQUALITY_CSV = self.TMP_PATH / "holc_tract_lookup.csv"
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
+
+        self.HOLC_MANUAL_MAPPING_CSV_PATH = (
+            pathlib.Path(__file__).parent
+            / "data"
+            / "holc_grades_manually_mapped.csv"
+        )
+
+        # Some input field names. From documentation: 'Census Tracts were intersected
+        # with HOLC Polygons. Census information can be joined via the "geoid" field.
+        # There are two field "holc_prop" and "tract_prop" which give the proportion
+        # of the HOLC polygon in the Census Tract and the proportion of Census Tract
+        # in the HOLC Polygon respectively.'
+        # https://github.com/americanpanorama/Census_HOLC_Research/blob/main/2010_Census_Tracts/README.md
+        self.TRACT_INPUT_FIELD: str = "geoid"
+        self.TRACT_PROPORTION_FIELD: str = "tract_prop"
+        self.HOLC_GRADE_AND_ID_FIELD: str = "holc_id"
+        self.CITY_INPUT_FIELD: str = "city"
+
+        self.HOLC_GRADE_D_FIELD: str = "HOLC Grade D"
+        self.HOLC_GRADE_MANUAL_FIELD: str = "HOLC Grade (manually mapped)"
+        self.HOLC_GRADE_DERIVED_FIELD: str = "HOLC Grade (derived)"
+
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD,
+            field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD,
+            field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,
+            field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,
+        ]
+
+        self.df: pd.DataFrame
+
+    def extract(self) -> None:
+        """Download the HOLC tract lookup CSV to self.MAPPING_INEQUALITY_CSV."""
+        logger.info("Downloading Mapping Inequality Data")
+        download_file_from_url(
+            file_url=self.MAPPING_INEQUALITY_CSV_URL,
+            download_file_name=self.MAPPING_INEQUALITY_CSV,
+        )
+
+    def transform(self) -> None:
+        """Compute the share of each tract that is HOLC grade D.
+
+        Stores one row per tract that has any grade D area in `self.df`.
+        """
+        logger.info("Transforming Mapping Inequality Data")
+        df: pd.DataFrame = pd.read_csv(
+            self.MAPPING_INEQUALITY_CSV,
+            dtype={self.TRACT_INPUT_FIELD: "string"},
+            low_memory=False,
+        )
+
+        # rename Tract ID
+        df.rename(
+            columns={self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME},
+            inplace=True,
+        )
+
+        # Keep the first character, which is the HOLC grade (A, B, C, D).
+        # TODO: investigate why this dataframe triggers these pylint errors.
+        # pylint: disable=unsupported-assignment-operation, unsubscriptable-object
+        df[self.HOLC_GRADE_DERIVED_FIELD] = df[
+            self.HOLC_GRADE_AND_ID_FIELD
+        ].str[0:1]
+
+        # Remove nonsense when the field has no grade or invalid grades.
+        valid_grades = ["A", "B", "C", "D"]
+        df.loc[
+            # pylint: disable=unsubscriptable-object
+            ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
+            self.HOLC_GRADE_DERIVED_FIELD,
+        ] = None
+
+        # Some data needs to be manually mapped to its grade.
+        # TODO: Investigate more data that may need to be manually mapped.
+        holc_manually_mapped_df = pd.read_csv(
+            filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
+            low_memory=False,
+        )
+
+        # Join on the existing data
+        merged_df = df.merge(
+            right=holc_manually_mapped_df,
+            on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
+            how="left",
+        )
+
+        # Create a single field that combines the 'derived' grade D field with the
+        # manually mapped grade D field into a single grade D field.
+        merged_df[self.HOLC_GRADE_D_FIELD] = np.where(
+            (merged_df[self.HOLC_GRADE_DERIVED_FIELD] == "D")
+            | (merged_df[self.HOLC_GRADE_MANUAL_FIELD] == "D"),
+            True,
+            None,
+        )
+
+        # Start grouping by, to sum all of the grade D parts of each tract.
+        grouped_df = (
+            merged_df.groupby(
+                by=[
+                    self.GEOID_TRACT_FIELD_NAME,
+                    self.HOLC_GRADE_D_FIELD,
+                ],
+                # Keep the nulls, so we know the non-D proportion.
+                dropna=False,
+            )[self.TRACT_PROPORTION_FIELD]
+            .sum()
+            .reset_index()
+        )
+
+        # Drop the non-True values of `self.HOLC_GRADE_D_FIELD` -- we only
+        # want one row per tract for future joins. Copy the result so the
+        # column assignments below do not write to a slice.
+        # Note this means not all tracts will be in this data.
+        # Note: this singleton comparison warning may be a pylint bug:
+        # https://stackoverflow.com/questions/51657715/pylint-pandas-comparison-to-true-should-be-just-expr-or-expr-is-true-sin#comment90876517_51657715
+        # pylint: disable=singleton-comparison
+        grouped_df = grouped_df[
+            grouped_df[self.HOLC_GRADE_D_FIELD] == True  # noqa: E712
+        ].copy()
+
+        # Every remaining row is grade D, so its proportion is the D percent.
+        grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] = grouped_df[
+            self.TRACT_PROPORTION_FIELD
+        ]
+
+        # Calculate some specific threshold cutoffs, for convenience.
+        grouped_df[field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD] = (
+            grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.2
+        )
+        grouped_df[field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD] = (
+            grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.5
+        )
+        grouped_df[field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD] = (
+            grouped_df[field_names.HOLC_GRADE_D_TRACT_PERCENT_FIELD] > 0.75
+        )
+
+        # Sort for convenience and save to self.
+        self.df = grouped_df.sort_values(by=self.GEOID_TRACT_FIELD_NAME)
+
+    def load(self) -> None:
+        """Write the `COLUMNS_TO_KEEP` columns of `self.df` to usa.csv."""
+        logger.info("Saving Mapping Inequality CSV")
+        # write nationwide csv
+        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            self.CSV_PATH / "usa.csv", index=False
+        )
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -3,6 +3,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "71c4acd0",
   "metadata": {
    "scrolled": true
   },
@@ -48,6 +49,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "2ce3170c",
   "metadata": {
    "scrolled": true
   },
@@ -79,6 +81,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "8bd39090",
   "metadata": {
    "scrolled": true
   },
@@ -105,6 +108,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "a251a0fb",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -138,6 +142,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "e43a9e23",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -160,6 +165,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "38c0dc2f",
   "metadata": {
    "scrolled": false
   },
@@ -186,8 +192,9 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "8c3e462c",
   "metadata": {
-    "scrolled": false
+    "scrolled": true
   },
   "outputs": [],
   "source": [
@@ -215,6 +222,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "d8ec43dc",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -247,13 +255,43 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "81826d29",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load mapping inequality data\n",
+    "HOLC_FACTORS = [\n",
+    "    field_names.HOLC_GRADE_D_TRACT_20_PERCENT_FIELD,\n",
+    "    field_names.HOLC_GRADE_D_TRACT_50_PERCENT_FIELD,\n",
+    "    field_names.HOLC_GRADE_D_TRACT_75_PERCENT_FIELD,\n",
+    "]\n",
+    "mapping_inequality_path = (\n",
+    "    DATA_DIR / \"dataset\" / \"mapping_inequality\" / \"usa.csv\"\n",
+    ")\n",
+    "mapping_inequality_df = pd.read_csv(\n",
+    "    mapping_inequality_path,\n",
+    "    dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
+    ")\n",
+    "\n",
+    "mapping_inequality_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65659c26",
   "metadata": {
-    "scrolled": false
+    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Join all dataframes that use tracts\n",
-    "census_tract_dfs = [cejst_df, calenviroscreen_df, persistent_poverty_df]\n",
+    "census_tract_dfs = [\n",
+    "    cejst_df,\n",
+    "    calenviroscreen_df,\n",
+    "    persistent_poverty_df,\n",
+    "    mapping_inequality_df,\n",
+    "]\n",
    "\n",
    "merged_df = functools.reduce(\n",
    "    lambda left, right: pd.merge(\n",
@@ -281,6 +319,23 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "2de78f71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Special handling for HOLC.\n",
+    "# Fill in the null HOLC values as `False`. Otherwise the comparison tool will not run comparisons in states\n",
+    "# without HOLC scores, and for HOLC, we'd like to see it across the whole US.\n",
+    "for holc_factor in HOLC_FACTORS:\n",
+    "    merged_df[holc_factor] = merged_df[holc_factor].fillna(False)\n",
+    "\n",
+    "merged_df[HOLC_FACTORS].head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "980c0f66",
   "metadata": {
    "scrolled": true
   },
@@ -377,6 +432,16 @@
    "            other_census_tract_fields_to_keep=[],\n",
    "        ),\n",
    "    ]\n",
+    "    # Insert indices for each of the HOLC factors.\n",
+    "    # Note: since these involve no renaming, we write them using list comprehension.\n",
+    "    + [\n",
+    "        Index(\n",
+    "            method_name=factor,\n",
+    "            priority_communities_field=factor,\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        )\n",
+    "        for factor in HOLC_FACTORS\n",
+    "    ]\n",
    ")\n",
    "\n",
    "\n",
@@ -429,6 +494,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "4b510cb1",
   "metadata": {
    "scrolled": true
   },
@@ -711,6 +777,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "2bcbcabf",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -816,6 +883,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "d1eec560",
   "metadata": {
    "scrolled": true
   },
@@ -1014,6 +1082,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "48005fad",
   "metadata": {
    "scrolled": true
   },
@@ -1190,6 +1259,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "7d095ebd",
   "metadata": {},
   "outputs": [],
   "source": [
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -126,7 +126,9 @@ CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = (
    "Percentage households below 100% of federal poverty line in 2009"
 )
 CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009"
-CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009"
+CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = (
+    "Unemployed civilians (percent) in 2009"
+)
 CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009"

 # Fields from 2010 ACS (loaded for comparison with the territories)
@@ -188,3 +190,9 @@ EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = (
 EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, State, 95th percentile (communities)"
 )
+
+# Mapping inequality data.
+HOLC_GRADE_D_TRACT_PERCENT_FIELD: str = "Percent of tract that is HOLC Grade D"
+HOLC_GRADE_D_TRACT_20_PERCENT_FIELD: str = "Tract is >20% HOLC Grade D"
+HOLC_GRADE_D_TRACT_50_PERCENT_FIELD: str = "Tract is >50% HOLC Grade D"
+HOLC_GRADE_D_TRACT_75_PERCENT_FIELD: str = "Tract is >75% HOLC Grade D"