Added grandfathering of v1.0 DACS

This commit is contained in:
Carlos Felix 2024-12-04 10:00:14 -05:00 committed by Carlos Felix
commit e0bb33211a
13 changed files with 74271 additions and 15 deletions

View file

@ -9,6 +9,7 @@ from data_pipeline.score import field_names
# Base Paths
DATA_PATH = Path(settings.APP_ROOT) / "data"
STATIC_DATA_PATH = Path(settings.APP_ROOT) / "content" / "static_data"
TMP_PATH = DATA_PATH / "tmp"
FILES_PATH = Path(settings.APP_ROOT) / "files"
@ -275,6 +276,7 @@ TILES_SCORE_COLUMNS = {
# temporarily update this so that it's the Narwhal score that gets visualized on the map
# The NEW final score value INCLUDES the adjacency index.
field_names.FINAL_SCORE_N_BOOLEAN: "SN_C",
field_names.FINAL_SCORE_N_BOOLEAN_V1_0: "SN_C_V10",
field_names.IS_TRIBAL_DAC: "SN_T",
field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",

View file

@ -54,6 +54,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df: pd.DataFrame
self.fuds_df: pd.DataFrame
self.tribal_overlap_df: pd.DataFrame
self.v1_0_score_results_df: pd.DataFrame
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
@ -205,6 +206,22 @@ class ScoreETL(ExtractTransformLoad):
header=None,
)
# Load v1.0 score results for grandfathering purposes
score_v1_0_csv = (
constants.STATIC_DATA_PATH / "v1.0-score-results-usa.csv"
)
self.v1_0_score_results_df = pd.read_csv(
score_v1_0_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
self.v1_0_score_results_df.rename(
columns={
field_names.FINAL_SCORE_N_BOOLEAN: field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
},
inplace=True,
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.debug("Joining Census Tract dataframes")
@ -364,6 +381,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df,
self.fuds_df,
self.tribal_overlap_df,
self.v1_0_score_results_df,
]
# Sanity check each data frame before merging.
@ -514,6 +532,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.IS_TRIBAL_DAC,
field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
]
# For some columns, high values are "good", so we want to reverse the percentile

View file

@ -129,6 +129,16 @@ def tile_data_expected():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "tile_data_expected.pkl")
@pytest.fixture()
def create_tile_score_data_input():
    """Load the pickled score dataframe used as input to `_create_tile_data` tests.

    The snapshot file is a small subset of the full score output; see the
    snapshots README for how it was generated.
    """
    return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_score_data_input.pkl")
@pytest.fixture()
def create_tile_data_expected():
    """Load the pickled dataframe expected from `_create_tile_data`.

    The snapshot file is a small subset of the full tile output; see the
    snapshots README for how it was generated.
    """
    return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_data_expected.pkl")
@pytest.fixture()
def downloadable_data_expected():
return pd.read_pickle(

View file

@ -0,0 +1,23 @@
These files are used as inputs to unit tests. Some notes on their creation are below.
### create_tile_data_expected.pkl
1. Set a breakpoint in the `test_create_tile_data` method in `data_pipeline/etl/score/tests/test_score_post.py`
after the call to `_create_tile_data` and debug the test.
2. Extract a subset of the `output_tiles_df_actual` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. Also, you need to extract the same tracts that are in
the `create_tile_score_data_input.pkl` input data. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
import pandas as pd
pd.concat([output_tiles_df_actual.head(3), output_tiles_df_actual.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_data_expected.pkl')
```
### create_tile_score_data_input.pkl
1. Set a breakpoint in the transform method in `data_pipeline/etl/score/etl_score_post.py` before the call to
`_create_tile_data` and run the post scoring.
2. Extract a subset of the `output_score_county_state_merged_df` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
pd.concat([output_score_county_state_merged_df.head(3), output_score_county_state_merged_df.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_score_data_input.pkl')
```

View file

@ -80,11 +80,11 @@ def test_create_score_data(
)
def test_create_tile_data(
    etl, create_tile_score_data_input, create_tile_data_expected
):
    """Verify `_create_tile_data` reproduces the expected tile snapshot.

    The scraped diff interleaved the old and new versions of this test
    (duplicated `def` lines and a leftover `tile_data_expected` argument);
    this is the reconstructed post-change version, which feeds the snapshot
    score input through `_create_tile_data` and compares against the
    snapshot expected output.
    """
    output_tiles_df_actual = etl._create_tile_data(create_tile_score_data_input)
    # assert_frame_equal raises with a detailed diff on any mismatch in
    # values, dtypes, or column/index structure.
    pdt.assert_frame_equal(
        output_tiles_df_actual,
        create_tile_data_expected,
    )