Added grandfathering of v1.0 DACS

Carlos Felix 2024-12-04 10:00:14 -05:00 committed by Carlos Felix
parent 77e0996441
commit e0bb33211a
13 changed files with 74,271 additions and 15 deletions

View file

@ -6,6 +6,7 @@ from pathlib import Path
from data_pipeline.etl.score import constants
from data_pipeline.utils import get_module_logger, download_file_from_url
from data_pipeline.application import log_title, log_info, log_goodbye
from data_pipeline.score import field_names
logger = get_module_logger(__name__)
@ -176,6 +177,7 @@ def compare_score(
production_row_count = len(production_score_df.index)
local_row_count = len(local_score_df.index)
# Tract comparison
_add_text(
f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
)
@ -191,8 +193,11 @@ def compare_score(
"\n"
)
production_total_population = production_score_df["Total population"].sum()
local_total_population = local_score_df["Total population"].sum()
# Population comparison
production_total_population = production_score_df[
field_names.TOTAL_POP_FIELD
].sum()
local_total_population = local_score_df[field_names.TOTAL_POP_FIELD].sum()
_add_text(
f"* The total population in all census tracts in the production score is {production_total_population:,}. "
@ -204,12 +209,9 @@ def compare_score(
else f"The difference is {abs(production_total_population - local_total_population):,}.\n"
)
production_disadvantaged_tracts_df = production_score_df.query(
"`Definition N community, including adjacency index tracts` == True"
)
local_disadvantaged_tracts_df = local_score_df.query(
"`Definition N community, including adjacency index tracts` == True"
)
dacs_query = f"`{field_names.FINAL_SCORE_N_BOOLEAN}` == True"
production_disadvantaged_tracts_df = production_score_df.query(dacs_query)
local_disadvantaged_tracts_df = local_score_df.query(dacs_query)
production_disadvantaged_tracts_set = set(
production_disadvantaged_tracts_df.index.array
@ -219,14 +221,15 @@ def compare_score(
)
production_pct_of_population_represented = (
production_disadvantaged_tracts_df["Total population"].sum()
production_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
/ production_total_population
)
local_pct_of_population_represented = (
local_disadvantaged_tracts_df["Total population"].sum()
local_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
/ local_total_population
)
# DACS comparison
_add_text(
f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
@ -252,15 +255,27 @@ def compare_score(
f" generated score (i.e. disadvantaged tracts that were removed by the new score). "
)
if len(removed_tracts) > 0:
_add_text(f"Those tracts are:\n{removed_tracts}\n")
_add_text(f"Those tracts are:\n{removed_tracts}")
_add_text(
f"* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
f"\n* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
f" production score (i.e. disadvantaged tracts that were added by the new score). "
)
if len(added_tracts) > 0:
_add_text(f"Those tracts are:\n{added_tracts}\n")
# Grandfathered tracts from v1.0
grandfathered_tracts = local_score_df.loc[
local_score_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
].index
if len(grandfathered_tracts) > 0:
_add_text(
f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring. They are:\n"
f"{grandfathered_tracts.to_list()}\n"
)
else:
_add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
################
# Create a delta
################

View file

@ -0,0 +1,5 @@
This directory contains static data used in the computation of the score.
# v1.0-score-results-usa.csv
These are the v1.0 score results used for grandfathering computations. The file is a
stripped-down version of the v1.0 score that includes only the columns needed.
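If the file ever needs to be regenerated, a minimal sketch of the idea, assuming a full v1.0 score export that carries the pipeline's column names (the input path here is hypothetical):

```python
import pandas as pd

# Keep only the tract ID and the final v1.0 DAC boolean -- the columns the
# grandfathering computation reads. Tract IDs are read as strings so that
# leading zeros are preserved, mirroring how the ETL loads this file.
full_v1_df = pd.read_csv(
    "v1.0-score-results-full.csv",  # hypothetical path to a full v1.0 export
    dtype={"GEOID10_TRACT": "string"},
    low_memory=False,
)
full_v1_df[
    ["GEOID10_TRACT", "Definition N community, including adjacency index tracts"]
].to_csv("v1.0-score-results-usa.csv", index=False)
```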

File diff suppressed because it is too large

View file

@ -9,6 +9,7 @@ from data_pipeline.score import field_names
# Base Paths
DATA_PATH = Path(settings.APP_ROOT) / "data"
STATIC_DATA_PATH = Path(settings.APP_ROOT) / "content" / "static_data"
TMP_PATH = DATA_PATH / "tmp"
FILES_PATH = Path(settings.APP_ROOT) / "files"
@ -275,6 +276,7 @@ TILES_SCORE_COLUMNS = {
# temporarily update this so that it's the Narwhal score that gets visualized on the map
# The NEW final score value INCLUDES the adjacency index.
field_names.FINAL_SCORE_N_BOOLEAN: "SN_C",
field_names.FINAL_SCORE_N_BOOLEAN_V1_0: "SN_C_V10",
field_names.IS_TRIBAL_DAC: "SN_T",
field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",

View file

@ -54,6 +54,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df: pd.DataFrame
self.fuds_df: pd.DataFrame
self.tribal_overlap_df: pd.DataFrame
self.v1_0_score_results_df: pd.DataFrame
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
@ -205,6 +206,22 @@ class ScoreETL(ExtractTransformLoad):
header=None,
)
# Load v1.0 score results for grandfathering purposes
score_v1_0_csv = (
constants.STATIC_DATA_PATH / "v1.0-score-results-usa.csv"
)
self.v1_0_score_results_df = pd.read_csv(
score_v1_0_csv,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)
self.v1_0_score_results_df.rename(
columns={
field_names.FINAL_SCORE_N_BOOLEAN: field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
},
inplace=True,
)
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
logger.debug("Joining Census Tract dataframes")
@ -364,6 +381,7 @@ class ScoreETL(ExtractTransformLoad):
self.eamlis_df,
self.fuds_df,
self.tribal_overlap_df,
self.v1_0_score_results_df,
]
# Sanity check each data frame before merging.
@ -514,6 +532,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
field_names.IS_TRIBAL_DAC,
field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
]
# For some columns, high values are "good", so we want to reverse the percentile

View file

@ -129,6 +129,16 @@ def tile_data_expected():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "tile_data_expected.pkl")
@pytest.fixture()
def create_tile_score_data_input():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_score_data_input.pkl")
@pytest.fixture()
def create_tile_data_expected():
return pd.read_pickle(pytest.SNAPSHOT_DIR / "create_tile_data_expected.pkl")
@pytest.fixture()
def downloadable_data_expected():
return pd.read_pickle(

View file

@ -0,0 +1,23 @@
These files are used as inputs to unit tests. Some notes on their creation are below.
### create_tile_data_expected.pkl
1. Set a breakpoint in the `test_create_tile_data` method in `data_pipeline/etl/score/tests/test_score_post.py`
after the call to `_create_tile_data` and debug the test.
2. Extract a subset of the `output_tiles_df_actual` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. Also, you need to extract the same tracts that are in
the `create_tile_score_data_input.pkl` input data. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
import pandas as pd
pd.concat([output_tiles_df_actual.head(3), output_tiles_df_actual.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_data_expected.pkl')
```
### create_tile_score_data_input.pkl
1. Set a breakpoint in the transform method in `data_pipeline/etl/score/etl_score_post.py` before the call to
`_create_tile_data` and run the post scoring.
2. Extract a subset of the `output_score_county_state_merged_df` dataframe. Do not extract the whole score, as the file
will be too big and the test will run slowly. For example, use the following command once the breakpoint is reached
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
```python
import pandas as pd
pd.concat([output_score_county_state_merged_df.head(3), output_score_county_state_merged_df.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_score_data_input.pkl')
```

View file

@ -80,11 +80,11 @@ def test_create_score_data(
)
def test_create_tile_data(etl, score_data_expected, tile_data_expected):
output_tiles_df_actual = etl._create_tile_data(score_data_expected)
def test_create_tile_data(etl, create_tile_score_data_input, create_tile_data_expected):
output_tiles_df_actual = etl._create_tile_data(create_tile_score_data_input)
pdt.assert_frame_equal(
output_tiles_df_actual,
tile_data_expected,
create_tile_data_expected,
)

View file

@ -4,6 +4,7 @@ ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
ISLAND_AREA_BACKFILL_SUFFIX = " in 2009"
V1_0_RESULTS_SUFFIX = " v1.0"
# Geographic field names
GEOID_TRACT_FIELD = "GEOID10_TRACT"
@ -27,6 +28,10 @@ N_NON_WORKFORCE = "Any Non-Workforce Factor (Definition N)"
FINAL_SCORE_N_BOOLEAN = (
"Definition N community, including adjacency index tracts"
)
FINAL_SCORE_N_BOOLEAN_V1_0 = f"{FINAL_SCORE_N_BOOLEAN}{V1_0_RESULTS_SUFFIX}"
GRANDFATHERED_N_COMMUNITIES_V1_0 = (
f"Grandfathered {SCORE_N_COMMUNITIES} from v1.0"
)
PERCENTILE = 90
MEDIAN_HOUSE_VALUE_PERCENTILE = 90

View file

@ -1024,6 +1024,20 @@ class ScoreNarwhal(Score):
self.df[field_names.SCORE_N_COMMUNITIES],
)
def _mark_grandfathered_dacs(self) -> None:
"""Territory tracts that are flagged as DACS in the V1.0 score are also marked."""
self.df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0] = np.where(
self.df[field_names.FINAL_SCORE_N_BOOLEAN_V1_0]
& ~self.df[field_names.FINAL_SCORE_N_BOOLEAN],
True,
False,
)
self.df[field_names.FINAL_SCORE_N_BOOLEAN] = np.where(
self.df[field_names.FINAL_SCORE_N_BOOLEAN_V1_0],
True,
self.df[field_names.FINAL_SCORE_N_BOOLEAN],
)
def _mark_poverty_flag(self) -> None:
"""Combine poverty less than 200% for territories and update the income flag."""
# First we set the low income flag for non-territories by themselves, this
@ -1111,6 +1125,7 @@ class ScoreNarwhal(Score):
] = self.df[field_names.SCORE_N_COMMUNITIES].astype(int)
self._mark_donut_hole_tracts()
self._mark_grandfathered_dacs()
self.df[
field_names.PERCENT_OF_TRACT_IS_DAC
] = self._get_percent_of_tract_that_is_dac()
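
The grandfathering rule itself is two boolean operations. A standalone sketch of its truth table, with hypothetical column names standing in for the `field_names` constants:

```python
import numpy as np
import pandas as pd

# A tract is grandfathered when it was a DAC under v1.0 but is not one under
# the current score; every v1.0 DAC stays a DAC in the final boolean.
df = pd.DataFrame(
    {
        "dac_v1_0": [False, False, True, True],
        "dac_current": [False, True, False, True],
    }
)
df["grandfathered"] = df["dac_v1_0"] & ~df["dac_current"]
df["dac_final"] = np.where(df["dac_v1_0"], True, df["dac_current"])
print(df)
# Only the (True, False) row is grandfathered, matching the unit test below.
```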

View file

@ -128,3 +128,30 @@ def test_mark_poverty_flag():
assert not test_data[~expected_low_income_filter][
field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED
].all()
def test_mark_grandfathered_dacs():
data = {
field_names.GEOID_TRACT_FIELD: [
"78010971500",
"78010970500",
"66010954400",
"66010953400",
],
field_names.FINAL_SCORE_N_BOOLEAN_V1_0: [False, False, True, True],
field_names.FINAL_SCORE_N_BOOLEAN: [False, True, False, True],
}
test_df = pd.DataFrame(data)
scorer = ScoreNarwhal(test_df)
scorer._mark_grandfathered_dacs()
result = scorer.df
assert field_names.GRANDFATHERED_N_COMMUNITIES_V1_0 in result.columns
assert not result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][0]
assert not result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][1]
assert result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][2]
assert not result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][3]
assert not result[field_names.FINAL_SCORE_N_BOOLEAN][0]
assert result[field_names.FINAL_SCORE_N_BOOLEAN][1]
assert result[field_names.FINAL_SCORE_N_BOOLEAN][2]
assert result[field_names.FINAL_SCORE_N_BOOLEAN][3]