Added grandfathering of v1.0 DACS

Carlos Felix 2024-12-04 10:00:14 -05:00 committed by Carlos Felix
parent 77e0996441
commit e0bb33211a
13 changed files with 74,271 additions and 15 deletions

View file

@ -6,6 +6,7 @@ from pathlib import Path
from data_pipeline.etl.score import constants
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.application import log_title, log_info, log_goodbye
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@ -176,6 +177,7 @@ def compare_score(
production_row_count = len(production_score_df.index)
local_row_count = len(local_score_df.index)
# Tract comparison
_add_text(
f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
)
@ -191,8 +193,11 @@ def compare_score(
"\n"
)
production_total_population = production_score_df["Total population"].sum()
local_total_population = local_score_df["Total population"].sum()
# Population comparison
production_total_population = production_score_df[
field_names.TOTAL_POP_FIELD
].sum()
local_total_population = local_score_df[field_names.TOTAL_POP_FIELD].sum()
_add_text(
f"* The total population in all census tracts in the production score is {production_total_population:,}. "
@ -204,12 +209,9 @@ def compare_score(
else f"The difference is {abs(production_total_population - local_total_population):,}.\n"
)
production_disadvantaged_tracts_df = production_score_df.query(
"`Definition N community, including adjacency index tracts` == True"
)
local_disadvantaged_tracts_df = local_score_df.query(
"`Definition N community, including adjacency index tracts` == True"
)
dacs_query = f"`{field_names.FINAL_SCORE_N_BOOLEAN}` == True"
production_disadvantaged_tracts_df = production_score_df.query(dacs_query)
local_disadvantaged_tracts_df = local_score_df.query(dacs_query)
production_disadvantaged_tracts_set = set(
production_disadvantaged_tracts_df.index.array
@ -219,14 +221,15 @@ def compare_score(
)
production_pct_of_population_represented = (
production_disadvantaged_tracts_df["Total population"].sum()
production_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
/ production_total_population
)
local_pct_of_population_represented = (
local_disadvantaged_tracts_df["Total population"].sum()
local_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
/ local_total_population
)
# DACS comparison
_add_text(
f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
@ -252,15 +255,27 @@ def compare_score(
f" generated score (i.e. disadvantaged tracts that were removed by the new score). "
)
if len(removed_tracts) > 0:
_add_text(f"Those tracts are:\n{removed_tracts}\n")
_add_text(f"Those tracts are:\n{removed_tracts}")
_add_text(
f"* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
f"\n* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
f" production score (i.e. disadvantaged tracts that were added by the new score). "
)
if len(added_tracts) > 0:
_add_text(f"Those tracts are:\n{added_tracts}\n")
# Grandfathered tracts from v1.0
grandfathered_tracts = local_score_df.loc[
local_score_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
].index
if len(grandfathered_tracts) > 0:
_add_text(
f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring. They are:\n"
f"{grandfathered_tracts.to_list()}\n"
)
else:
_add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
################
# Create a delta
################

View file

@ -0,0 +1,5 @@
This directory contains static data used in the computation of the score.
# v1.0-score-results-usa.csv
These are the v1.0 score results used for grandfathering computations. The file is a
stripped-down version of the v1.0 score that includes only the columns needed.
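If the file ever needs to be regenerated, a minimal sketch of the idea, assuming a full v1.0 score export that carries the pipeline's column names (the input path here is hypothetical):

```python
import pandas as pd

# Keep only the tract ID and the final v1.0 DAC boolean -- the columns the
# grandfathering computation reads. Tract IDs are read as strings so that
# leading zeros are preserved, mirroring how the ETL loads this file.
full_v1_df = pd.read_csv(
    "v1.0-score-results-full.csv",  # hypothetical path to a full v1.0 export
    dtype={"GEOID10_TRACT": "string"},
    low_memory=False,
)
full_v1_df[
    ["GEOID10_TRACT", "Definition N community, including adjacency index tracts"]
].to_csv("v1.0-score-results-usa.csv", index=False)
```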

File diff suppressed because it is too large

View file

@ -9,6 +9,7 @@ from data_pipeline.score import field_names
# Base Paths
DATA_PATH = Path(settings.APP_ROOT) / "data"
STATIC_DATA_PATH = Path(settings.APP_ROOT) / "content" / "static_data"
TMP_PATH = DATA_PATH / "tmp"
FILES_PATH = Path(settings.APP_ROOT) / "files"
@ -275,6 +276,7 @@ TILES_SCORE_COLUMNS = {
# temporarily update this so that it's the Narwhal score that gets visualized on the map
# The NEW final score value INCLUDES the adjacency index.
field_names.FINAL_SCORE_N_BOOLEAN: "SN_C",
field_names.FINAL_SCORE_N_BOOLEAN_V1_0: "SN_C_V10",
field_names.IS_TRIBAL_DAC: "SN_T",
field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",

View file

@ -54,6 +54,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df: pd.DataFrame
self.fuds_df: pd.DataFrame
self.tribal_overlap_df: pd.DataFrame
self.v1_0_score_results_df: pd.DataFrame
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
@ -205,6 +206,22 @@ class ScoreETL(ExtractTransformLoad):
header=None,
)
# Load v1.0 score results for grandfathering purposes
score_v1_0_csv = (
constants.STATIC_DATA_PATH / "v1.0-score-results-usa.csv"
)
self.v1_0_score_results_df = pd.read_csv(
score_v1_0_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
self.v1_0_score_results_df.rename(
columns={
field_names.FINAL_SCORE_N_BOOLEAN: field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
},
inplace=True,
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.debug("Joining Census Tract dataframes")
@ -364,6 +381,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df,
self.fuds_df,
self.tribal_overlap_df,
self.v1_0_score_results_df,
]
# Sanity check each data frame before merging.
@ -514,6 +532,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.IS_TRIBAL_DAC,
field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
]
# For some columns, high values are "good", so we want to reverse the percentile

View file

@ -129,6 +129,16 @@ def tile_data_expected():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "tile_data_expected.pkl")
@pytest.fixture()
def create_tile_score_data_input():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_score_data_input.pkl")
@pytest.fixture()
def create_tile_data_expected():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_data_expected.pkl")
@pytest.fixture()
def downloadable_data_expected():
return pd.read_pickle(

View file

@ -0,0 +1,23 @@
These files are used as inputs to unit tests. Some notes on their creation are below.
### create_tile_data_expected.pkl
1. Set a breakpoint in the `test_create_tile_data` method in `data_pipeline/etl/score/tests/test_score_post.py`
after the call to `_create_tile_data` and debug the test.
2. Extract a subset of the `output_tiles_df_actual` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. Also, you need to extract the same tracts that are in
the `create_tile_score_data_input.pkl` input data. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
import pandas as pd
pd.concat([output_tiles_df_actual.head(3), output_tiles_df_actual.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_data_expected.pkl')
```
### create_tile_score_data_input.pkl
1. Set a breakpoint in the transform method in `data_pipeline/etl/score/etl_score_post.py` before the call to
`_create_tile_data` and run the post scoring.
2. Extract a subset of the `output_score_county_state_merged_df` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
import pandas as pd
pd.concat([output_score_county_state_merged_df.head(3), output_score_county_state_merged_df.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_score_data_input.pkl')
```

View file

@ -80,11 +80,11 @@ def test_create_score_data(
)
def test_create_tile_data(etl, score_data_expected, tile_data_expected):
output_tiles_df_actual = etl._create_tile_data(score_data_expected)
def test_create_tile_data(etl, create_tile_score_data_input, create_tile_data_expected):
output_tiles_df_actual = etl._create_tile_data(create_tile_score_data_input)
pdt.assert_frame_equal(
output_tiles_df_actual,
tile_data_expected,
create_tile_data_expected,
)

View file

@ -4,6 +4,7 @@ ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
ISLAND_AREA_BACKFILL_SUFFIX = " in 2009"
V1_0_RESULTS_SUFFIX = " v1.0"
# Geographic field names
GEOID_TRACT_FIELD = "GEOID10_TRACT"
@ -27,6 +28,10 @@ N_NON_WORKFORCE = "Any Non-Workforce Factor (Definition N)"
FINAL_SCORE_N_BOOLEAN = (
"Definition N community, including adjacency index tracts"
)
FINAL_SCORE_N_BOOLEAN_V1_0 = f"{FINAL_SCORE_N_BOOLEAN}{V1_0_RESULTS_SUFFIX}"
GRANDFATHERED_N_COMMUNITIES_V1_0 = (
f"Grandfathered {SCORE_N_COMMUNITIES} from v1.0"
)
PERCENTILE = 90
MEDIAN_HOUSE_VALUE_PERCENTILE = 90

View file

@ -1024,6 +1024,20 @@ class ScoreNarwhal(Score):
self.df[field_names.SCORE_N_COMMUNITIES],
)
def _mark_grandfathered_dacs(self) -> None:
"""Territory tracts that are flagged as DACS in the V1.0 score are also marked."""
self.df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0] = np.where(
self.df[field_names.FINAL_SCORE_N_BOOLEAN_V1_0]
& ~self.df[field_names.FINAL_SCORE_N_BOOLEAN],
True,
False,
)
self.df[field_names.FINAL_SCORE_N_BOOLEAN] = np.where(
self.df[field_names.FINAL_SCORE_N_BOOLEAN_V1_0],
True,
self.df[field_names.FINAL_SCORE_N_BOOLEAN],
)
def _mark_poverty_flag(self) -> None:
"""Combine poverty less than 200% for territories and update the income flag."""
# First we set the low income flag for non-territories by themselves, this
@ -1111,6 +1125,7 @@ class ScoreNarwhal(Score):
] = self.df[field_names.SCORE_N_COMMUNITIES].astype(int)
self._mark_donut_hole_tracts()
self._mark_grandfathered_dacs()
self.df[
field_names.PERCENT_OF_TRACT_IS_DAC
] = self._get_percent_of_tract_that_is_dac()
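
The grandfathering rule itself is two boolean operations. A standalone sketch of its truth table, with hypothetical column names standing in for the `field_names` constants:

```python
import numpy as np
import pandas as pd

# A tract is grandfathered when it was a DAC under v1.0 but is not one under
# the current score; every v1.0 DAC stays a DAC in the final boolean.
df = pd.DataFrame(
    {
        "dac_v1_0": [False, False, True, True],
        "dac_current": [False, True, False, True],
    }
)
df["grandfathered"] = df["dac_v1_0"] & ~df["dac_current"]
df["dac_final"] = np.where(df["dac_v1_0"], True, df["dac_current"])
print(df)
# Only the (True, False) row is grandfathered, matching the unit test below.
```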

View file

@ -128,3 +128,30 @@ def test_mark_poverty_flag():
assert not test_data[~expected_low_income_filter][
field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED
].all()
def test_mark_grandfathered_dacs():
data = {
field_names.GEOID_TRACT_FIELD: [
"78010971500",
"78010970500",
"66010954400",
"66010953400",
],
field_names.FINAL_SCORE_N_BOOLEAN_V1_0: [False, False, True, True],
field_names.FINAL_SCORE_N_BOOLEAN: [False, True, False, True],
}
test_df = pd.DataFrame(data)
scorer = ScoreNarwhal(test_df)
scorer._mark_grandfathered_dacs()
result = scorer.df
assert field_names.GRANDFATHERED_N_COMMUNITIES_V1_0 in result.columns
assert not result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][0]
assert not result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][1]
assert result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][2]
assert not result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][3]
assert not result[field_names.FINAL_SCORE_N_BOOLEAN][0]
assert result[field_names.FINAL_SCORE_N_BOOLEAN][1]
assert result[field_names.FINAL_SCORE_N_BOOLEAN][2]
assert result[field_names.FINAL_SCORE_N_BOOLEAN][3]