mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 01:31:25 -08:00
Added grandfathering of v1.0 DACS
This commit is contained in:
parent
77e0996441
commit
e0bb33211a
13 changed files with 74271 additions and 15 deletions
|
@ -6,6 +6,7 @@ from pathlib import Path
|
|||
from data_pipeline.etl.score import constants
|
||||
from data_pipeline.utils import get_module_logger, download_file_from_url
|
||||
from data_pipeline.application import log_title, log_info, log_goodbye
|
||||
from data_pipeline.score import field_names
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
@ -176,6 +177,7 @@ def compare_score(
|
|||
production_row_count = len(production_score_df.index)
|
||||
local_row_count = len(local_score_df.index)
|
||||
|
||||
# Tract comparison
|
||||
_add_text(
|
||||
f"* The production score has {production_row_count:,} census tracts, and the freshly calculated score has {local_row_count:,}."
|
||||
)
|
||||
|
@ -191,8 +193,11 @@ def compare_score(
|
|||
"\n"
|
||||
)
|
||||
|
||||
production_total_population = production_score_df["Total population"].sum()
|
||||
local_total_population = local_score_df["Total population"].sum()
|
||||
# Population comparison
|
||||
production_total_population = production_score_df[
|
||||
field_names.TOTAL_POP_FIELD
|
||||
].sum()
|
||||
local_total_population = local_score_df[field_names.TOTAL_POP_FIELD].sum()
|
||||
|
||||
_add_text(
|
||||
f"* The total population in all census tracts in the production score is {production_total_population:,}. "
|
||||
|
@ -204,12 +209,9 @@ def compare_score(
|
|||
else f"The difference is {abs(production_total_population - local_total_population):,}.\n"
|
||||
)
|
||||
|
||||
production_disadvantaged_tracts_df = production_score_df.query(
|
||||
"`Definition N community, including adjacency index tracts` == True"
|
||||
)
|
||||
local_disadvantaged_tracts_df = local_score_df.query(
|
||||
"`Definition N community, including adjacency index tracts` == True"
|
||||
)
|
||||
dacs_query = f"`{field_names.FINAL_SCORE_N_BOOLEAN}` == True"
|
||||
production_disadvantaged_tracts_df = production_score_df.query(dacs_query)
|
||||
local_disadvantaged_tracts_df = local_score_df.query(dacs_query)
|
||||
|
||||
production_disadvantaged_tracts_set = set(
|
||||
production_disadvantaged_tracts_df.index.array
|
||||
|
@ -219,14 +221,15 @@ def compare_score(
|
|||
)
|
||||
|
||||
production_pct_of_population_represented = (
|
||||
production_disadvantaged_tracts_df["Total population"].sum()
|
||||
production_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
|
||||
/ production_total_population
|
||||
)
|
||||
local_pct_of_population_represented = (
|
||||
local_disadvantaged_tracts_df["Total population"].sum()
|
||||
local_disadvantaged_tracts_df[field_names.TOTAL_POP_FIELD].sum()
|
||||
/ local_total_population
|
||||
)
|
||||
|
||||
# DACS comparison
|
||||
_add_text(
|
||||
f"* There are {len(production_disadvantaged_tracts_set):,} disadvantaged tracts in the production score representing"
|
||||
f" {production_pct_of_population_represented:.1%} of the total population, and {len(local_disadvantaged_tracts_set):,}"
|
||||
|
@ -252,15 +255,27 @@ def compare_score(
|
|||
f" generated score (i.e. disadvantaged tracts that were removed by the new score). "
|
||||
)
|
||||
if len(removed_tracts) > 0:
|
||||
_add_text(f"Those tracts are:\n{removed_tracts}\n")
|
||||
_add_text(f"Those tracts are:\n{removed_tracts}")
|
||||
|
||||
_add_text(
|
||||
f"* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
|
||||
f"\n* There are {len(added_tracts):,} tract(s) marked as disadvantaged in the locally generated score that are not disadvantaged in the"
|
||||
f" production score (i.e. disadvantaged tracts that were added by the new score). "
|
||||
)
|
||||
if len(added_tracts) > 0:
|
||||
_add_text(f"Those tracts are:\n{added_tracts}\n")
|
||||
|
||||
# Grandfathered tracts from v1.0
|
||||
grandfathered_tracts = local_score_df.loc[
|
||||
local_score_df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0]
|
||||
].index
|
||||
if len(grandfathered_tracts) > 0:
|
||||
_add_text(
|
||||
f"* This includes {len(grandfathered_tracts)} grandfathered tract(s) from v1.0 scoring. They are:\n"
|
||||
f"{grandfathered_tracts.to_list()}\n"
|
||||
)
|
||||
else:
|
||||
_add_text("* There are NO grandfathered tracts from v1.0 scoring.\n")
|
||||
|
||||
################
|
||||
# Create a delta
|
||||
################
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
Static data is used in the computation of the score as stated.
|
||||
|
||||
# v1.0-score-results-usa.csv
|
||||
This is the v1.0 score results used for grandfathering computations. It is a
|
||||
stripped down version of the v1.0 score to include only the columns needed.
|
File diff suppressed because it is too large
Load diff
|
@ -9,6 +9,7 @@ from data_pipeline.score import field_names
|
|||
|
||||
# Base Paths
|
||||
DATA_PATH = Path(settings.APP_ROOT) / "data"
|
||||
STATIC_DATA_PATH = Path(settings.APP_ROOT) / "content" / "static_data"
|
||||
TMP_PATH = DATA_PATH / "tmp"
|
||||
FILES_PATH = Path(settings.APP_ROOT) / "files"
|
||||
|
||||
|
@ -275,6 +276,7 @@ TILES_SCORE_COLUMNS = {
|
|||
# temporarily update this so that it's the Narwhal score that gets visualized on the map
|
||||
# The NEW final score value INCLUDES the adjacency index.
|
||||
field_names.FINAL_SCORE_N_BOOLEAN: "SN_C",
|
||||
field_names.FINAL_SCORE_N_BOOLEAN_V1_0: "SN_C_V10",
|
||||
field_names.IS_TRIBAL_DAC: "SN_T",
|
||||
field_names.DIABETES_LOW_INCOME_FIELD: "DLI",
|
||||
field_names.ASTHMA_LOW_INCOME_FIELD: "ALI",
|
||||
|
|
|
@ -54,6 +54,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.eamlis_df: pd.DataFrame
|
||||
self.fuds_df: pd.DataFrame
|
||||
self.tribal_overlap_df: pd.DataFrame
|
||||
self.v1_0_score_results_df: pd.DataFrame
|
||||
|
||||
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
|
||||
|
||||
|
@ -205,6 +206,22 @@ class ScoreETL(ExtractTransformLoad):
|
|||
header=None,
|
||||
)
|
||||
|
||||
# Load v1.0 score results for grandfathering purposes
|
||||
score_v1_0_csv = (
|
||||
constants.STATIC_DATA_PATH / "v1.0-score-results-usa.csv"
|
||||
)
|
||||
self.v1_0_score_results_df = pd.read_csv(
|
||||
score_v1_0_csv,
|
||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||
low_memory=False,
|
||||
)
|
||||
self.v1_0_score_results_df.rename(
|
||||
columns={
|
||||
field_names.FINAL_SCORE_N_BOOLEAN: field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
|
||||
},
|
||||
inplace=True,
|
||||
)
|
||||
|
||||
def _join_tract_dfs(self, census_tract_dfs: list) -> pd.DataFrame:
|
||||
logger.debug("Joining Census Tract dataframes")
|
||||
|
||||
|
@ -364,6 +381,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
self.eamlis_df,
|
||||
self.fuds_df,
|
||||
self.tribal_overlap_df,
|
||||
self.v1_0_score_results_df,
|
||||
]
|
||||
|
||||
# Sanity check each data frame before merging.
|
||||
|
@ -514,6 +532,7 @@ class ScoreETL(ExtractTransformLoad):
|
|||
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
|
||||
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
|
||||
field_names.IS_TRIBAL_DAC,
|
||||
field_names.FINAL_SCORE_N_BOOLEAN_V1_0,
|
||||
]
|
||||
|
||||
# For some columns, high values are "good", so we want to reverse the percentile
|
||||
|
|
|
@ -129,6 +129,16 @@ def tile_data_expected():
|
|||
return pd.read_pickle(pytest.SNAPSHOT_DIR / "tile_data_expected.pkl")
|
||||
|
||||
|
||||
@pytest.fixture()
def create_tile_score_data_input():
    """Load the pickled score snapshot that is fed into the tile-data test."""
    snapshot_file = pytest.SNAPSHOT_DIR / "create_tile_score_data_input.pkl"
    return pd.read_pickle(snapshot_file)
|
||||
|
||||
|
||||
@pytest.fixture()
def create_tile_data_expected():
    """Load the pickled snapshot holding the expected tile-data output."""
    snapshot_file = pytest.SNAPSHOT_DIR / "create_tile_data_expected.pkl"
    return pd.read_pickle(snapshot_file)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def downloadable_data_expected():
|
||||
return pd.read_pickle(
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
These files are used as inputs to unit tests. Some notes on their creation are below.
|
||||
|
||||
### create_tile_data_expected.pkl
|
||||
1. Set a breakpoint in the `test_create_tile_data` method in `data_pipeline/etl/score/tests/test_score_post.py`
|
||||
after the call to `_create_tile_data` and debug the test.
|
||||
2. Extract a subset of the `output_tiles_df_actual` dataframe. Do not extract the whole score as the file
|
||||
will be too big and the test will run slowly. Also, you need to extract the same tracts that are in
|
||||
the `create_tile_score_data_input.pkl` input data. For example, use the following command once the breakpoint is reached
|
||||
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
|
||||
```python
|
||||
import pandas as pd
|
||||
pd.concat([output_tiles_df_actual.head(3), output_tiles_df_actual.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_data_expected.pkl')
|
||||
```
|
||||
|
||||
### create_tile_score_data_input.pkl
|
||||
1. Set a breakpoint in the transform method in `data_pipeline/etl/score/etl_score_post.py` before the call to
|
||||
`_create_tile_data` and run the post scoring.
|
||||
2. Extract a subset of the `output_score_county_state_merged_df` dataframe. Do not extract the whole score as the file
|
||||
will be too big and the test will run slowly. For example, use the following command once the breakpoint is reached
|
||||
to extract a few rows at the top and bottom of the score. This will capture some states and territories.
|
||||
```python
|
||||
pd.concat([output_score_county_state_merged_df.head(3), output_score_county_state_merged_df.tail(3)], ignore_index=True).to_pickle('data_pipeline/etl/score/tests/snapshots/create_tile_score_data_input.pkl')
|
||||
```
|
Binary file not shown.
Binary file not shown.
|
@ -80,11 +80,11 @@ def test_create_score_data(
|
|||
)
|
||||
|
||||
|
||||
def test_create_tile_data(etl, score_data_expected, tile_data_expected):
|
||||
output_tiles_df_actual = etl._create_tile_data(score_data_expected)
|
||||
def test_create_tile_data(etl, create_tile_score_data_input, create_tile_data_expected):
|
||||
output_tiles_df_actual = etl._create_tile_data(create_tile_score_data_input)
|
||||
pdt.assert_frame_equal(
|
||||
output_tiles_df_actual,
|
||||
tile_data_expected,
|
||||
create_tile_data_expected,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
|
|||
ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
|
||||
ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"
|
||||
ISLAND_AREA_BACKFILL_SUFFIX = " in 2009"
|
||||
V1_0_RESULTS_SUFFIX = " v1.0"
|
||||
|
||||
# Geographic field names
|
||||
GEOID_TRACT_FIELD = "GEOID10_TRACT"
|
||||
|
@ -27,6 +28,10 @@ N_NON_WORKFORCE = "Any Non-Workforce Factor (Definition N)"
|
|||
FINAL_SCORE_N_BOOLEAN = (
|
||||
"Definition N community, including adjacency index tracts"
|
||||
)
|
||||
FINAL_SCORE_N_BOOLEAN_V1_0 = f"{FINAL_SCORE_N_BOOLEAN}{V1_0_RESULTS_SUFFIX}"
|
||||
GRANDFATHERED_N_COMMUNITIES_V1_0 = (
|
||||
f"Grandfathered {SCORE_N_COMMUNITIES} from v1.0"
|
||||
)
|
||||
|
||||
PERCENTILE = 90
|
||||
MEDIAN_HOUSE_VALUE_PERCENTILE = 90
|
||||
|
|
|
@ -1024,6 +1024,20 @@ class ScoreNarwhal(Score):
|
|||
self.df[field_names.SCORE_N_COMMUNITIES],
|
||||
)
|
||||
|
||||
def _mark_grandfathered_dacs(self) -> None:
    """Carry v1.0 DAC designations forward into the current score.

    Writes two columns on ``self.df``:

    * ``GRANDFATHERED_N_COMMUNITIES_V1_0`` is True only where the v1.0
      flag is set and the current final-score flag is not.
    * ``FINAL_SCORE_N_BOOLEAN`` is forced to True wherever the v1.0 flag
      is set and is left as it was everywhere else.

    NOTE(review): no territory filter is applied here; every row of
    ``self.df`` is considered.
    """
    v1_0_column = field_names.FINAL_SCORE_N_BOOLEAN_V1_0
    current_column = field_names.FINAL_SCORE_N_BOOLEAN

    was_dac_in_v1_0 = self.df[v1_0_column]

    # Computed against the current flag *before* it is overwritten below.
    dropped_since_v1_0 = was_dac_in_v1_0 & ~self.df[current_column]
    self.df[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0] = np.where(
        dropped_since_v1_0, True, False
    )

    self.df[current_column] = np.where(
        was_dac_in_v1_0, True, self.df[current_column]
    )
|
||||
|
||||
def _mark_poverty_flag(self) -> None:
|
||||
"""Combine poverty less than 200% for territories and update the income flag."""
|
||||
# First we set the low income flag for non-territories by themselves, this
|
||||
|
@ -1111,6 +1125,7 @@ class ScoreNarwhal(Score):
|
|||
] = self.df[field_names.SCORE_N_COMMUNITIES].astype(int)
|
||||
|
||||
self._mark_donut_hole_tracts()
|
||||
self._mark_grandfathered_dacs()
|
||||
self.df[
|
||||
field_names.PERCENT_OF_TRACT_IS_DAC
|
||||
] = self._get_percent_of_tract_that_is_dac()
|
||||
|
|
|
@ -128,3 +128,30 @@ def test_mark_poverty_flag():
|
|||
assert not test_data[~expected_low_income_filter][
|
||||
field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED
|
||||
].all()
|
||||
|
||||
|
||||
def test_mark_grandfathered_dacs():
    """Cover all four combinations of the v1.0 flag and the current flag."""
    tract_ids = [
        "78010971500",
        "78010970500",
        "66010954400",
        "66010953400",
    ]
    v1_0_flags = [False, False, True, True]
    current_flags = [False, True, False, True]
    test_df = pd.DataFrame(
        {
            field_names.GEOID_TRACT_FIELD: tract_ids,
            field_names.FINAL_SCORE_N_BOOLEAN_V1_0: v1_0_flags,
            field_names.FINAL_SCORE_N_BOOLEAN: current_flags,
        }
    )

    scorer = ScoreNarwhal(test_df)
    scorer._mark_grandfathered_dacs()
    result = scorer.df

    assert field_names.GRANDFATHERED_N_COMMUNITIES_V1_0 in result.columns

    # Only the tract that was a DAC in v1.0 and is not one now is grandfathered.
    expected_grandfathered = [False, False, True, False]
    for row, expected in enumerate(expected_grandfathered):
        actual = result[field_names.GRANDFATHERED_N_COMMUNITIES_V1_0][row]
        assert bool(actual) == expected

    # The final flag ends up True wherever either flag was set.
    expected_final = [False, True, True, True]
    for row, expected in enumerate(expected_final):
        actual = result[field_names.FINAL_SCORE_N_BOOLEAN][row]
        assert bool(actual) == expected
|
||||
|
|
Loading…
Add table
Reference in a new issue