Add decennial 2020 territory imputations

Authored by Carlos Felix on 2024-11-21 15:08:15 -05:00, committed by Carlos Felix
commit cce91fb47b
10 changed files with 420 additions and 75 deletions


@@ -10,6 +10,11 @@ from data_pipeline.etl.sources.census_decennial.etl import CensusDecennialETL
 from data_pipeline.score import field_names


+def _check_fields_exist(df: pd.DataFrame, field_names: list):
+    for field in field_names:
+        assert field in df.columns
+
+
 @pytest.fixture
 def territory_params_fixture():
     return [
@@ -31,25 +36,39 @@ def territory_params_fixture():
 @pytest.fixture
-def extract_path_fixture():
+def extract_path_fixture() -> Path:
     return Path(__file__).parents[0] / "data/extract"


 @pytest.fixture
-def transform_path_fixture():
+def transform_path_fixture() -> Path:
     return Path(__file__).parents[0] / "data/transform"


 @pytest.fixture
-def transformed_data_fixture(transform_path_fixture):
-    """Load the test data and call the ETL transform"""
-    dec = CensusDecennialETL()
-    dec.df_all = pd.read_csv(
+def imputed_path_fixture() -> Path:
+    return Path(__file__).parents[0] / "data/imputation"
+
+
+@pytest.fixture
+def extracted_data_fixture(
+    transform_path_fixture: pd.DataFrame,
+) -> pd.DataFrame:
+    return pd.read_csv(
         transform_path_fixture / "usa.csv",
         # Make sure these columns are string as expected of the original
         dtype={"state": "object", "county": "object", "tract": "object"},
     )
-    dec.transform()
+
+
+@pytest.fixture
+def transformed_data_fixture(
+    extracted_data_fixture: pd.DataFrame, imputed_path_fixture: Path
+) -> pd.DataFrame:
+    """Load the test data and call the ETL transform"""
+    dec = CensusDecennialETL()
+    dec.df_all = extracted_data_fixture
+    dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
     return dec.df_all
@@ -67,7 +86,7 @@ def test_no_files_found(territory_params_fixture):
     )


-def test_load_data(extract_path_fixture, territory_params_fixture):
+def test_load_data(extract_path_fixture: Path, territory_params_fixture):
     """Test the ETL loads and translates the data"""
     dec = CensusDecennialETL()
     dec.extract(
@@ -103,10 +122,10 @@ def test_load_data(extract_path_fixture: Path, territory_params_fixture):
     ).any()


-###############
+#################
 # Transform tests
-###############
-def test_geo_tract_generation(transformed_data_fixture):
+#################
+def test_geo_tract_generation(transformed_data_fixture: pd.DataFrame):
     result = transformed_data_fixture
     assert field_names.GEOID_TRACT_FIELD in result.columns
     assert result[field_names.GEOID_TRACT_FIELD].notnull().all()
@@ -118,7 +137,7 @@ def test_geo_tract_generation(transformed_data_fixture):
     )


-def test_merge_tracts(transformed_data_fixture):
+def test_merge_tracts(transformed_data_fixture: pd.DataFrame):
     result = transformed_data_fixture
     # 69120950200 exists, but the tract split does not
     assert (
@@ -138,15 +157,103 @@ def test_merge_tracts(transformed_data_fixture):
     )


-def test_remove_invalid_values(transformed_data_fixture):
+def test_remove_invalid_values(transformed_data_fixture: pd.DataFrame):
     numeric_df = transformed_data_fixture.select_dtypes(include="number")
     assert not (numeric_df < -999).any().any()


-def test_race_fields(transformed_data_fixture):
+def test_race_fields(transformed_data_fixture: pd.DataFrame):
     for race_field_name in OUTPUT_RACE_FIELDS:
         assert race_field_name in transformed_data_fixture.columns
         assert any(
             col.startswith(field_names.PERCENT_PREFIX + race_field_name)
             for col in transformed_data_fixture.columns
         )
+
+
+def test_transformation_fields(transformed_data_fixture: pd.DataFrame):
+    _check_fields_exist(
+        transformed_data_fixture,
+        [
+            field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
+            field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
+            field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
+            field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
+            field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
+            DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
+            DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
+        ],
+    )
+
+
+##################
+# Imputation tests
+##################
+def test_merge_geojson(transformed_data_fixture: pd.DataFrame):
+    _check_fields_exist(transformed_data_fixture, ["STATEFP10", "COUNTYFP10"])
+
+
+def test_imputation_added(transformed_data_fixture: pd.DataFrame):
+    assert (
+        DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
+        in transformed_data_fixture.columns
+    )
+
+    # All rows with population > 0 need to have a value (real or imputed)
+    df_has_pop = transformed_data_fixture[
+        transformed_data_fixture[
+            field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
+        ]
+        > 0
+    ]
+    assert (
+        df_has_pop[
+            DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
+        ]
+        .notnull()
+        .all()
+    )
+
+    # The imputed value equals the real value when available
+    df_has_real_data = transformed_data_fixture[
+        transformed_data_fixture[
+            field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
+        ].notnull()
+    ]
+    assert (
+        df_has_real_data[
+            DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
+        ]
+        == df_has_real_data[
+            field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
+        ]
+    ).all()
+
+    # The imputed value exists when no real value exists
+    df_missing_data = transformed_data_fixture[
+        transformed_data_fixture[
+            field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
+        ].isnull()
+    ]
+    assert (
+        df_missing_data[
+            df_missing_data[
+                DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
+            ].notnull()
+        ][
+            DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
+        ]
+        .notnull()
+        .all()
+    )
+
+    # Test the imputation flag is set
+    df_missing_no_pop = df_missing_data[
+        df_missing_data[
+            field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
+        ]
+        > 0
+    ]
+    assert df_missing_no_pop[
+        field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD
+    ].all()
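
Taken together, test_imputation_added pins down the imputation contract: every populated territory tract ends up with a 200% FPL poverty value, tracts that already have a real value keep it unchanged, missing values are filled in, and the filled rows carry the imputation flag. Below is a minimal sketch of logic that would satisfy those assertions. The column names and the simple county-mean fill are assumptions for illustration only; the actual ETL transform reads a territory GeoJSON (census-us-territory-geojson.json) and may impute from geographic neighbors rather than a county average.

# Illustrative sketch only: column names and the county-mean strategy are
# assumptions, not the pipeline's actual constants or spatial method.
import pandas as pd

POVERTY_200_FPL = "poverty_below_200_fpl_pct"           # hypothetical name
IMPUTED_POVERTY_200_FPL = "imputed_" + POVERTY_200_FPL  # hypothetical name
IMPUTED_FLAG = "income_imputed_flag"                    # hypothetical name
TOTAL_POPULATION = "total_population"                   # hypothetical name


def impute_poverty(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing poverty shares with a county mean and flag the filled rows."""
    out = df.copy()
    county_mean = out.groupby("county")[POVERTY_200_FPL].transform("mean")
    # Keep the real value wherever it exists; fall back to the group mean.
    out[IMPUTED_POVERTY_200_FPL] = out[POVERTY_200_FPL].fillna(county_mean)
    # Flag only populated rows whose real value was missing.
    out[IMPUTED_FLAG] = out[POVERTY_200_FPL].isnull() & (out[TOTAL_POPULATION] > 0)
    return out

Under that sketch, rows with real data pass the equality assertion unchanged, populated rows never end up null, and the flag marks exactly the filled rows. The grouping key here is the "county" column the fixtures load as a string; presumably the real transform leans on the merged STATEFP10/COUNTYFP10 GeoJSON fields that test_merge_geojson checks for.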