Added grandfathering of v1.0 DACS

This commit is contained in:
Carlos Felix 2024-12-04 10:00:14 -05:00 committed by Carlos Felix
commit e0bb33211a
13 changed files with 74271 additions and 15 deletions

View file

@ -9,6 +9,7 @@ from data_pipeline.score import field_names
# Base Paths
DATA_PATH = Path(settings.APP_ROOT) / "data"
STATIC_DATA_PATH = Path(settings.APP_ROOT) / "content" / "static_data"
TMP_PATH = DATA_PATH / "tmp"
FILES_PATH = Path(settings.APP_ROOT) / "files"
@ -275,6 +276,7 @@ TILES_SCORE_COLUMNS = {
# temporarily update this so that it's the Narwhal score that gets visualized on the map
# The NEW final score value INCLUDES the adjacency index.
field_names.FINAL_SCORE_N_BOOLEAN: "SN_C",
field_names.FINAL_SCORE_N_BOOLEAN_V1_0: "SN_C_V10",
field_names.IS_TRIBAL_DAC: "SN_T",
field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",

View file

@ -54,6 +54,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df: pd.DataFrame
self.fuds_df: pd.DataFrame
self.tribal_overlap_df: pd.DataFrame
self.v1_0_score_results_df: pd.DataFrame
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
@ -205,6 +206,22 @@ class ScoreETL(ExtractTransformLoad):
header=None,
)
# Load v1.0 score results for grandfathering purposes
score_v1_0_csv = (
constants.STATIC_DATA_PATH / "v1.0-score-results-usa.csv"
)
self.v1_0_score_results_df = pd.read_csv(
score_v1_0_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
self.v1_0_score_results_df.rename(
columns={
field_names.FINAL_SCORE_N_BOOLEAN: field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
},
inplace=True,
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.debug("Joining Census Tract dataframes")
@ -364,6 +381,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df,
self.fuds_df,
self.tribal_overlap_df,
self.v1_0_score_results_df,
]
# Sanity check each data frame before merging.
@ -514,6 +532,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.IS_TRIBAL_DAC,
field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
]
# For some columns, high values are "good", so we want to reverse the percentile

View file

@ -129,6 +129,16 @@ def tile_data_expected():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "tile_data_expected.pkl")
@pytest.fixture()
def create_tile_score_data_input():
    """Load the pickled score dataframe used as input to `_create_tile_data` tests.

    The snapshot file is a small subset of the full score output; see the
    snapshots README for how it was generated.
    """
    return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_score_data_input.pkl")
@pytest.fixture()
def create_tile_data_expected():
    """Load the pickled dataframe expected from `_create_tile_data`.

    The snapshot file is a small subset of the full tile output; see the
    snapshots README for how it was generated.
    """
    return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_data_expected.pkl")
@pytest.fixture()
def downloadable_data_expected():
return pd.read_pickle(

View file

@ -0,0 +1,23 @@
These files are used as inputs to unit tests. Some notes on their creation are below.
### create_tile_data_expected.pkl
1. Set a breakpoint in the `test_create_tile_data` method in `data_pipeline/etl/score/tests/test_score_post.py`
after the call to `_create_tile_data` and debug the test.
2. Extract a subset of the `output_tiles_df_actual` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. Also, you need to extract the same tracts that are in
the `create_tile_score_data_input.pkl` input data. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
import pandas as pd
pd.concat([output_tiles_df_actual.head(3), output_tiles_df_actual.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_data_expected.pkl')
```
### create_tile_score_data_input.pkl
1. Set a breakpoint in the transform method in `data_pipeline/etl/score/etl_score_post.py` before the call to
`_create_tile_data` and run the post scoring.
2. Extract a subset of the `output_score_county_state_merged_df` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
pd.concat([output_score_county_state_merged_df.head(3), output_score_county_state_merged_df.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_score_data_input.pkl')
```

View file

@ -80,11 +80,11 @@ def test_create_score_data(
)
def test_create_tile_data(
    etl, create_tile_score_data_input, create_tile_data_expected
):
    """Verify `_create_tile_data` reproduces the expected tile snapshot.

    The scraped diff interleaved the old and new versions of this test
    (duplicated `def` lines and a leftover `tile_data_expected` argument);
    this is the reconstructed post-change version, which feeds the snapshot
    score input through `_create_tile_data` and compares against the
    snapshot expected output.
    """
    output_tiles_df_actual = etl._create_tile_data(create_tile_score_data_input)
    # assert_frame_equal raises with a detailed diff on any mismatch in
    # values, dtypes, or column/index structure.
    pdt.assert_frame_equal(
        output_tiles_df_actual,
        create_tile_data_expected,
    )