Adding HOLC indicator (#1579)

Added the HOLC indicator (Historic Redlining Score) from NCRC's work; included the 3.25 cutoff and low income as part of the housing burden category.
Emma Nechamkin authored on 2022-05-12 12:07:08 -04:00; committed by Emma Nechamkin
commit 1782d022a9
10 changed files with 202 additions and 40 deletions
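The diff below threads the Historic Redlining Score through the score ETL and the tile export. For orientation, a minimal sketch of what the 3.25 cutoff amounts to, assuming a raw HRS column named HRS2010 (this diff does not show the actual field name):

import pandas as pd

# Sketch only: flag tracts at or above the 3.25 cutoff named in the commit
# message. "HRS2010" is an assumed column name, not necessarily the repo's.
def historic_redlining_exceeded(df: pd.DataFrame, hrs_column: str = "HRS2010") -> pd.Series:
    # NaN scores compare as False, so unscored tracts are not flagged.
    return df[hrs_column] >= 3.25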


@@ -205,7 +205,8 @@ TILES_SCORE_COLUMNS = {
field_names.M_HEALTH: "M_HLTH",
# temporarily update this so that it's the Narwhal score that gets visualized on the map
field_names.SCORE_N_COMMUNITIES: "SM_C",
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
field_names.SCORE_N_COMMUNITIES
+ field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EBLRLI",
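TILES_SCORE_COLUMNS maps verbose internal field names to short codes for the map tiles (and for the Esri shapefile, whose DBF format caps column names at 10 characters). Applying the map is a plain rename-and-subset; a minimal sketch, with the input frame assumed:

import pandas as pd

def subset_for_tiles(score_df: pd.DataFrame, tiles_columns: dict) -> pd.DataFrame:
    # Rename internal field names to their short tile codes, then keep only
    # those columns. tiles_columns is a mapping like TILES_SCORE_COLUMNS.
    return score_df.rename(columns=tiles_columns)[list(tiles_columns.values())]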


@@ -1,5 +1,6 @@
import functools
from collections import namedtuple
from attr import field
import numpy as np
import pandas as pd
@@ -36,6 +37,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_decennial_df: pd.DataFrame
self.census_2010_df: pd.DataFrame
self.child_opportunity_index_df: pd.DataFrame
self.hrs_df: pd.DataFrame
def extract(self) -> None:
logger.info("Loading data sets from disk.")
@@ -172,6 +174,17 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False,
)
# Load HRS data
hrs_csv = (
constants.DATA_PATH / "dataset" / "historic_redlining" / "usa.csv"
)
self.hrs_df = pd.read_csv(
hrs_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Tract dataframes")
@@ -376,6 +389,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_decennial_df,
self.census_2010_df,
self.child_opportunity_index_df,
self.hrs_df,
]
# Sanity check each data frame before merging.
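With hrs_df appended here, _join_tract_dfs merges every frame on the tract GEOID. The method body is outside this diff; a minimal sketch of the usual reduce-and-merge pattern, consistent with the functools import added above (the outer join is an assumption):

import functools
import pandas as pd

def join_tract_dfs(census_tract_dfs: list, geoid_field: str) -> pd.DataFrame:
    # Fold the tract-level frames into one wide frame, outer-joining on the
    # shared GEOID so tracts missing from any single source are kept.
    return functools.reduce(
        lambda left, right: pd.merge(left, right, on=geoid_field, how="outer"),
        census_tract_dfs,
    )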
@@ -405,7 +419,6 @@ class ScoreETL(ExtractTransformLoad):
df[field_names.MEDIAN_INCOME_FIELD] / df[field_names.AMI_FIELD]
)
# QQ: why don't we just filter to the numeric columns by type?
numeric_columns = [
field_names.HOUSING_BURDEN_FIELD,
field_names.TOTAL_POP_FIELD,
@@ -465,6 +478,7 @@ class ScoreETL(ExtractTransformLoad):
non_numeric_columns = [
self.GEOID_TRACT_FIELD_NAME,
field_names.PERSISTENT_POVERTY_FIELD,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
]
# For some columns, high values are "good", so we want to reverse the percentile
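The trailing comment refers to fields where a high raw value is desirable (median income, for example), whose percentiles get flipped before scoring. A short sketch with illustrative names:

import pandas as pd

# Rank to a [0, 1] percentile, then flip it so high "good" raw values land
# at low percentiles. Column names here are illustrative only.
df = pd.DataFrame({"median_income": [30_000, 55_000, 90_000]})
df["median_income_reversed_pfs"] = 1 - df["median_income"].rank(pct=True)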


@@ -46,10 +46,11 @@ class GeoScoreETL(ExtractTransformLoad):
self.DATA_PATH / "census" / "geojson" / "us.json"
)
# Import the shortened name for Score M percentile ("SM_PFS") that's used on the
# Import the shortened name for Score N percentile ("SM_PFS") that's used on the
# tiles.
## TEMPORARY update
self.TARGET_SCORE_SHORT_FIELD = constants.TILES_SCORE_COLUMNS[
field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX
field_names.SCORE_N + field_names.PERCENTILE_FIELD_SUFFIX
]
self.TARGET_SCORE_RENAME_TO = "M_SCORE"
@@ -284,21 +285,28 @@ class GeoScoreETL(ExtractTransformLoad):
def create_esri_codebook(codebook):
"""temporary: helper to make a codebook for esri shapefile only"""
shapefile_column_field = "shapefile_column"
internal_column_name_field = "column_name"
column_description_field = "column_description"
logger.info("Creating a codebook that uses the csv names")
codebook = (
pd.Series(codebook)
.reset_index()
.rename(
# kept as strings because no downstream impacts
columns={
0: internal_column_name_field,
"index": shapefile_column_field,
}
)
)
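The Series round-trip above turns a plain dict into a two-column lookup pairing each shortened shapefile name with its internal CSV column. A hypothetical input and result (the excerpt cuts off before the function returns):

import pandas as pd

# Hypothetical codebook: shapefile short name -> internal CSV column name.
codebook = {"SM_C": "Score N (communities)", "SM_PFS": "Score N (percentile)"}
codebook_df = (
    pd.Series(codebook)
    .reset_index()
    .rename(columns={"index": "shapefile_column", 0: "column_name"})
)
#   shapefile_column            column_name
# 0             SM_C  Score N (communities)
# 1           SM_PFS   Score N (percentile)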
@@ -374,7 +382,7 @@ class GeoScoreETL(ExtractTransformLoad):
for task in [
write_high_to_file,
write_low_to_file,
write_esri_shapefile,
# write_esri_shapefile,
]
}
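The comprehension above collects named writer tasks, with the Esri shapefile writer temporarily disabled. How the tasks run is outside this hunk; a minimal sketch of one way to fan them out, assuming each value is a zero-argument callable:

from concurrent.futures import ThreadPoolExecutor

def run_write_tasks(tasks: dict) -> None:
    # Fan the writer callables out to worker threads, then block on each
    # result so any exception raised inside a task surfaces here.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(task) for task in tasks.values()]
        for future in futures:
            future.result()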