Add donut hole calculation to score (#1828)

Adds adjacency index to the pipeline. Requires thorough QA
2025-08-03 18:44:18 -07:00 · 2022-08-18 12:04:46 -04:00 · 2022-08-18 12:04:46 -04:00 · 6e41e0d9f0
commit 6e41e0d9f0
parent 88dc2e5a8e
17 changed files with 969 additions and 8 deletions
--- a/data/data-pipeline/data_pipeline/content/config/csv.yml
+++ b/data/data-pipeline/data_pipeline/content/config/csv.yml
@ -20,9 +20,21 @@ fields:
  - score_name: Total categories exceeded
    label: Total categories exceeded
    format: int64
-  - score_name: Definition M (communities)
+  - score_name: Definition N (communities)
    label: Identified as disadvantaged
    format: bool
  - score_name: Definition N (communities) (including adjacency index)
    label: Identified as disadvantaged (including adjacency index)
    format: bool
  - score_name: Is the tract surrounded by disadvantaged communities?
    label: Is the tract surrounded by disadvantaged communities?
    format: bool
  - score_name: Meets the less stringent low income criterion for the adjacency index?
    label: Meets the less stringent low income criterion for the adjacency index?
    format: bool
  - score_name: Definition N (communities) (average of neighbors)
    label: Share of neighbors that are identified as disadvantaged
    format: percentage
  - score_name: Total population
    label: Total population
    format: float
--- a/data/data-pipeline/data_pipeline/content/config/excel.yml
+++ b/data/data-pipeline/data_pipeline/content/config/excel.yml
@ -24,9 +24,21 @@ sheets:
      - score_name: Total categories exceeded
        label: Total categories exceeded
        format: int64
-      - score_name: Definition M (communities)
+      - score_name: Definition N (communities)
        label: Identified as disadvantaged
        format: bool
      - score_name: Definition N (communities) (including adjacency index)
        label: Identified as disadvantaged (including adjacency index)
        format: bool
      - score_name: Is the tract surrounded by disadvantaged communities?
        label: Is the tract surrounded by disadvantaged communities?
        format: bool
      - score_name: Meets the less stringent low income criterion for the adjacency index?
        label: Meets the less stringent low income criterion for the adjacency index?
        format: bool
      - score_name: Definition N (communities) (average of neighbors)
        label: Share of neighbors that are identified as disadvantaged
        format: percentage
      - score_name: Total population
        label: Total population
        format: float
@ -315,4 +327,3 @@ sheets:
      - score_name: Does the tract have at least 35 acres in it?
        label: Does the tract have at least 35 acres in it?
        format: bool
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -207,7 +207,8 @@ TILES_SCORE_COLUMNS = {
    field_names.M_POLLUTION: "M_PLN",
    field_names.M_HEALTH: "M_HLTH",
    # temporarily update this so that it's the Narwhal score that gets visualized on the map
-    field_names.SCORE_N_COMMUNITIES: "SM_C",
+    # The NEW final score value INCLUDES the adjacency index.
    field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX: "SM_C",
    field_names.SCORE_N_COMMUNITIES
    + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
    field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
@ -305,6 +306,9 @@ TILES_SCORE_COLUMNS = {
    + field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
    field_names.HIGH_FUTURE_FLOOD_RISK_FIELD: "FLD_ET",
    field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD: "WF_ET",
    field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD: "ADJ_ET",
    field_names.SCORE_N_COMMUNITIES
    + field_names.ADJACENCY_INDEX_SUFFIX: "ADJ_PFS",
    field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
    + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
    field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
@ -364,6 +368,7 @@ TILES_SCORE_FLOAT_COLUMNS = [
    field_names.FUTURE_FLOOD_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.FUTURE_WILDFIRE_RISK_FIELD
    + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.SCORE_N_COMMUNITIES + field_names.ADJACENCY_INDEX_SUFFIX,
    field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
    + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/ipython/explore_adjacency.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/explore_adjacency.ipynb
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@ -1,6 +1,8 @@
 # Suffixes
 PERCENTILE_FIELD_SUFFIX = " (percentile)"
 ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
 # NOTE: despite its name, this suffix does not label a mean. ScoreNarwhal
 # appends it to SCORE_N_COMMUNITIES to name the boolean column that
 # incorporates the adjacency (donut hole) result.
 ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
 # Appended to a score column name by calculate_tract_adjacency_scores to name
 # the column holding the mean of that score across touching tracts.
 ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
 # Geographic field names
 GEOID_TRACT_FIELD = "GEOID10_TRACT"
@ -691,6 +693,9 @@ CATEGORY_COUNT = "Total categories exceeded"
 FPL_200_SERIES = "Is low income?"
 FPL_200_SERIES_IMPUTED_AND_ADJUSTED = "Is low income (imputed and adjusted)?"
 FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS = (
    "Meets the less stringent low income criterion for the adjacency index?"
 )
 FPL_200_AND_COLLEGE_ATTENDANCE_SERIES = (
    "Is low income and has a low percent of higher ed students?"
 )
@ -715,5 +720,10 @@ HISTORIC_REDLINING_SCORE_EXCEEDED_LOW_INCOME_FIELD = (
    "Tract-level redlining score meets or exceeds 3.25 and is low income"
 )
 ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD = (
    "Is the tract surrounded by disadvantaged communities?"
 )
 # End of names for individual factors being exceeded
 ####
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@ -6,6 +6,7 @@ from data_pipeline.score.score import Score
 import data_pipeline.score.field_names as field_names
 from data_pipeline.utils import get_module_logger
 import data_pipeline.etl.score.constants as constants
 from data_pipeline.score.utils import calculate_tract_adjacency_scores
 logger = get_module_logger(__name__)
@ -20,6 +21,12 @@ class ScoreNarwhal(Score):
        self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
        self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
        # We define a donut hole DAC as a tract that is entirely surrounded by
        # DACs (score threshold = 1) and above median for low income, as a starting
        # point. As we ground-truth, these thresholds might change.
        self.LOW_INCOME_THRESHOLD_DONUT: float = 0.50
        self.SCORE_THRESHOLD_DONUT: float = 1.00
        super().__init__(df)
    def _combine_island_areas_with_states_and_set_thresholds(
@ -907,6 +914,54 @@ class ScoreNarwhal(Score):
            | workforce_combined_criteria_for_island_areas
        )
    def _mark_donut_hole_tracts(self) -> pd.DataFrame:
        """Mark tracts that do not qualify on their own but are surrounded by tracts that do.

        A donut hole is a tract whose adjacent tracts are marked for inclusion
        by the scoring system AND that itself meets a less stringent low-income
        threshold. This runs after the initial score generation because it
        reads field_names.SCORE_N_COMMUNITIES.

        Adds the following columns to self.df:
          * field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS: whether the
            imputed low-income percentile is at least
            self.LOW_INCOME_THRESHOLD_DONUT.
          * SCORE_N_COMMUNITIES + ADJACENCY_INDEX_SUFFIX: merged in from
            calculate_tract_adjacency_scores.
          * field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD: whether the
            merged value is at least self.SCORE_THRESHOLD_DONUT.
          * SCORE_N_COMMUNITIES + ADJACENT_MEAN_SUFFIX: the final boolean, True
            when the tract is a DAC on its own OR is a donut hole.

        Returns:
            pd.DataFrame: self.df, with the columns above added.
        """
        logger.info("Marking donut hole tracts")

        # This is the boolean we pass to the front end for the
        # donut-hole-specific low income criterion.
        self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS] = (
            self.df[
                field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.LOW_INCOME_THRESHOLD_DONUT
        )

        # Left merge keeps every row of self.df; rows with no match in the
        # adjacency result get NaN in the merged column.
        self.df = self.df.merge(
            calculate_tract_adjacency_scores(
                self.df, field_names.SCORE_N_COMMUNITIES
            ),
            how="left",
            on=field_names.GEOID_TRACT_FIELD,
        )

        # This is the boolean we pass to the front end for color. A NaN
        # adjacency value compares as False here.
        self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD] = (
            self.df[
                field_names.SCORE_N_COMMUNITIES
                + field_names.ADJACENCY_INDEX_SUFFIX
            ]
            >= self.SCORE_THRESHOLD_DONUT
        )

        # This is the "final list" of Score Narwhal communities: True if the
        # tract is already a DAC OR the tract is a donut hole community.
        # Without the OR, tracts that qualify on their own would be dropped
        # from the final list unless they also happened to be donut holes.
        # NOTE(review): assumes SCORE_N_COMMUNITIES is a boolean column.
        self.df[
            field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
        ] = self.df[field_names.SCORE_N_COMMUNITIES] | (
            self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED_DONUTS]
            & self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
        )

        return self.df
    def add_columns(self) -> pd.DataFrame:
        logger.info("Adding Score Narhwal")
@ -946,5 +1001,6 @@ class ScoreNarwhal(Score):
            field_names.SCORE_N_COMMUNITIES
            + field_names.PERCENTILE_FIELD_SUFFIX
        ] = self.df[field_names.SCORE_N_COMMUNITIES].astype(int)
        self._mark_donut_hole_tracts()
        return self.df
--- a/data/data-pipeline/data_pipeline/score/utils.py
+++ b/data/data-pipeline/data_pipeline/score/utils.py
@ -0,0 +1,56 @@
 """Utilities to help generate the score."""
 import pandas as pd
 import geopandas as gpd
 import data_pipeline.score.field_names as field_names
 # XXX: @jorge I am torn between the coupling that importing from
 # etl.sources introduces and keeping the code DRY. Thoughts?
 from data_pipeline.etl.sources.geo_utils import get_tract_geojson
 from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)
 def calculate_tract_adjacency_scores(
    df: pd.DataFrame, score_column: str
 ) -> pd.DataFrame:
    """Average score_column over the tracts that touch each tract.

    Args:
        df (pandas.DataFrame): A dataframe with at least the following columns:
          * field_names.GEOID_TRACT_FIELD
          * score_column
        score_column (str): The name of the column whose values are averaged.

    Returns:
        pandas.DataFrame: A dataframe with two columns:
          * field_names.GEOID_TRACT_FIELD
          * score_column + field_names.ADJACENCY_INDEX_SUFFIX, the mean of
            score_column across the tracts that touch the tract identified
            in field_names.GEOID_TRACT_FIELD
    """
    ORIGINAL_TRACT = "ORIGINAL_TRACT"
    logger.debug("Calculating tract adjacency scores")
    tract_data = get_tract_geojson()

    # Attach the scores to the tract data (default inner merge), then rename
    # the tract id so that after the spatial join GEOID_TRACT_FIELD comes only
    # from the right-hand side.
    scored_tracts: gpd.GeoDataFrame = tract_data.merge(
        df, on=field_names.GEOID_TRACT_FIELD
    ).rename(columns={field_names.GEOID_TRACT_FIELD: ORIGINAL_TRACT})

    logger.debug("Perfoming spatial join to find all adjacent tracts")
    adjacent_tracts: gpd.GeoDataFrame = scored_tracts.sjoin(
        tract_data, predicate="touches"
    )

    logger.debug("Calculating means based on adjacency")
    adjacency_column = f"{score_column}{field_names.ADJACENCY_INDEX_SUFFIX}"
    # Each joined row pairs a scored tract (left) with a tract it touches
    # (right); grouping by the right-hand id averages the touching scores.
    neighbor_means = adjacent_tracts.groupby(field_names.GEOID_TRACT_FIELD)[
        [score_column]
    ].mean()
    return neighbor_means.reset_index().rename(
        columns={score_column: adjacency_column}
    )
--- a/data/data-pipeline/data_pipeline/tests/score/init.py
+++ b/data/data-pipeline/data_pipeline/tests/score/init.py
--- a/data/data-pipeline/data_pipeline/tests/score/test_utils/init.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/init.py
--- a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/scores.csv
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/scores.csv
@ -0,0 +1,10 @@
 GEOID10_TRACT,included
 24027602100,True
 24027602303,True
 24027605503,True
 24027605502,True
 24027603004,False
 24027605104,True
 24027603003,True
 24027603001,True
 24027602201,True
--- a/data/data-pipeline/data_pipeline/tests/score/test_utils/data/us.geojson
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/data/us.geojson
--- a/data/data-pipeline/data_pipeline/tests/score/test_utils/test_adjacency.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_utils/test_adjacency.py
@ -0,0 +1,71 @@
 # pylint: disable=protected-access
 # flake8: noqa=F841
 from pathlib import Path
 from unittest import mock
 from functools import partial
 from contextlib import contextmanager
 import pytest
 import pandas as pd
 from data_pipeline.score.utils import (
    calculate_tract_adjacency_scores as original_calculate_tract_adjacency_score,
 )
 from data_pipeline.etl.sources.geo_utils import get_tract_geojson
 from data_pipeline.score import field_names
@contextmanager
def patch_calculate_tract_adjacency_scores():
    """Yield calculate_tract_adjacency_scores with its tract loader patched.

    While the context is active, data_pipeline.score.utils.get_tract_geojson
    is replaced by a partial of the real loader that is pointed at the
    us.geojson file in this test package's data directory.
    """
    fixture_geojson = Path(__file__).parent / "data" / "us.geojson"
    with mock.patch(
        "data_pipeline.score.utils.get_tract_geojson",
        new=partial(get_tract_geojson, _tract_data_path=fixture_geojson),
    ):
        yield original_calculate_tract_adjacency_score
@pytest.fixture
def score_data():
    """Load data/scores.csv with string tract ids and a boolean 'included' column."""
    column_types = {field_names.GEOID_TRACT_FIELD: str, "included": bool}
    return pd.read_csv(
        Path(__file__).parent / "data" / "scores.csv", dtype=column_types
    )
 def test_all_adjacent_are_true(score_data):
    """A tract marked False whose neighbors are all True gets an adjacency mean of 1.0."""
    target_tract = "24027603004"
    mean_column = "included" + field_names.ADJACENCY_INDEX_SUFFIX
    # Every tract is included except the one under test.
    score_data["included"] = score_data.GEOID10_TRACT != target_tract
    with patch_calculate_tract_adjacency_scores() as calculate_tract_adjacency_scores:
        adjacency_scores = calculate_tract_adjacency_scores(
            score_data, "included"
        )
        is_target = adjacency_scores.GEOID10_TRACT == target_tract
        assert adjacency_scores.loc[is_target, mean_column].iloc[0] == 1.0
 def test_all_adjacent_are_false(score_data):
    """A tract marked True whose neighbors are all False gets an adjacency mean of 0.0.

    The tract under test is deliberately marked True. If its own score leaked
    into the average, the result would be non-zero and this test would fail.
    """
    target_tract = "24027603004"
    mean_column = "included" + field_names.ADJACENCY_INDEX_SUFFIX
    score_data["included"] = False
    # Mirror test_all_adjacent_are_true: give the target the opposite value of
    # its neighbors, so the assertion depends only on the neighbors.
    score_data.loc[score_data.GEOID10_TRACT == target_tract, "included"] = True
    with patch_calculate_tract_adjacency_scores() as calculate_tract_adjacency_scores:
        adjacency_scores = calculate_tract_adjacency_scores(
            score_data, "included"
        )
        is_target = adjacency_scores.GEOID10_TRACT == target_tract
        assert adjacency_scores.loc[is_target, mean_column].iloc[0] == 0.0