mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-29 12:51:16 -07:00
Add decennial 2020 territory imputations
This commit is contained in:
parent
6436dfa683
commit
cce91fb47b
10 changed files with 420 additions and 75 deletions
File diff suppressed because one or more lines are too long
|
@ -10,6 +10,11 @@ from data_pipeline.etl.sources.census_decennial.etl import CensusDecennialETL
|
|||
from data_pipeline.score import field_names
|
||||
|
||||
|
||||
def _check_fields_exist(df: pd.DataFrame, field_names: list):
|
||||
for field in field_names:
|
||||
assert field in df.columns
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def territory_params_fixture():
|
||||
return [
|
||||
|
@ -31,25 +36,39 @@ def territory_params_fixture():
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def extract_path_fixture():
|
||||
def extract_path_fixture() -> Path:
|
||||
return Path(__file__).parents[0] / "data/extract"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def transform_path_fixture():
|
||||
def transform_path_fixture() -> Path:
|
||||
return Path(__file__).parents[0] / "data/transform"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def transformed_data_fixture(transform_path_fixture):
|
||||
"""Load the test data and call the ETL transform"""
|
||||
dec = CensusDecennialETL()
|
||||
dec.df_all = pd.read_csv(
|
||||
def imputed_path_fixture() -> Path:
|
||||
return Path(__file__).parents[0] / "data/imputation"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def extracted_data_fixture(
|
||||
transform_path_fixture: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
return pd.read_csv(
|
||||
transform_path_fixture / "usa.csv",
|
||||
# Make sure these columns are string as expected of the original
|
||||
dtype={"state": "object", "county": "object", "tract": "object"},
|
||||
)
|
||||
dec.transform()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def transformed_data_fixture(
|
||||
extracted_data_fixture: pd.DataFrame, imputed_path_fixture: Path
|
||||
) -> pd.DataFrame:
|
||||
"""Load the test data and call the ETL transform"""
|
||||
dec = CensusDecennialETL()
|
||||
dec.df_all = extracted_data_fixture
|
||||
dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
|
||||
return dec.df_all
|
||||
|
||||
|
||||
|
@ -67,7 +86,7 @@ def test_no_files_found(territory_params_fixture):
|
|||
)
|
||||
|
||||
|
||||
def test_load_data(extract_path_fixture, territory_params_fixture):
|
||||
def test_load_data(extract_path_fixture: Path, territory_params_fixture):
|
||||
"""Test the ETL loads and translates the data"""
|
||||
dec = CensusDecennialETL()
|
||||
dec.extract(
|
||||
|
@ -103,10 +122,10 @@ def test_load_data(extract_path_fixture, territory_params_fixture):
|
|||
).any()
|
||||
|
||||
|
||||
###############
|
||||
#################
|
||||
# Transform tests
|
||||
###############
|
||||
def test_geo_tract_generation(transformed_data_fixture):
|
||||
#################
|
||||
def test_geo_tract_generation(transformed_data_fixture: pd.DataFrame):
|
||||
result = transformed_data_fixture
|
||||
assert field_names.GEOID_TRACT_FIELD in result.columns
|
||||
assert result[field_names.GEOID_TRACT_FIELD].notnull().all()
|
||||
|
@ -118,7 +137,7 @@ def test_geo_tract_generation(transformed_data_fixture):
|
|||
)
|
||||
|
||||
|
||||
def test_merge_tracts(transformed_data_fixture):
|
||||
def test_merge_tracts(transformed_data_fixture: pd.DataFrame):
|
||||
result = transformed_data_fixture
|
||||
# 69120950200 exists, but the tract split does now
|
||||
assert (
|
||||
|
@ -138,15 +157,103 @@ def test_merge_tracts(transformed_data_fixture):
|
|||
)
|
||||
|
||||
|
||||
def test_remove_invalid_values(transformed_data_fixture):
|
||||
def test_remove_invalid_values(transformed_data_fixture: pd.DataFrame):
|
||||
numeric_df = transformed_data_fixture.select_dtypes(include="number")
|
||||
assert not (numeric_df < -999).any().any()
|
||||
|
||||
|
||||
def test_race_fields(transformed_data_fixture):
|
||||
def test_race_fields(transformed_data_fixture: pd.DataFrame):
|
||||
for race_field_name in OUTPUT_RACE_FIELDS:
|
||||
assert race_field_name in transformed_data_fixture.columns
|
||||
assert any(
|
||||
col.startswith(field_names.PERCENT_PREFIX + race_field_name)
|
||||
for col in transformed_data_fixture.columns
|
||||
)
|
||||
|
||||
|
||||
def test_transformation_fields(transformed_data_fixture: pd.DataFrame):
|
||||
_check_fields_exist(
|
||||
transformed_data_fixture,
|
||||
[
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
|
||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
|
||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
##################
|
||||
# Imputation tests
|
||||
##################
|
||||
def test_merge_geojson(transformed_data_fixture: pd.DataFrame):
|
||||
_check_fields_exist(transformed_data_fixture, ["STATEFP10", "COUNTYFP10"])
|
||||
|
||||
|
||||
def test_imputation_added(transformed_data_fixture: pd.DataFrame):
|
||||
assert (
|
||||
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||
in transformed_data_fixture.columns
|
||||
)
|
||||
|
||||
# All rows with population > 0 need to have an value (real or imputed)
|
||||
df_has_pop = transformed_data_fixture[
|
||||
transformed_data_fixture[
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
|
||||
]
|
||||
> 0
|
||||
]
|
||||
assert (
|
||||
df_has_pop[
|
||||
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||
]
|
||||
.notnull()
|
||||
.all()
|
||||
)
|
||||
|
||||
# The imputed value equals the real value when available
|
||||
df_has_real_data = transformed_data_fixture[
|
||||
transformed_data_fixture[
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
].notnull()
|
||||
]
|
||||
assert (
|
||||
df_has_real_data[
|
||||
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||
]
|
||||
== df_has_real_data[
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
]
|
||||
).all()
|
||||
|
||||
# The imputed value exists when no real value exists
|
||||
df_missing_data = transformed_data_fixture[
|
||||
transformed_data_fixture[
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
].isnull()
|
||||
]
|
||||
assert (
|
||||
df_missing_data[
|
||||
df_missing_data[
|
||||
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||
].notnull()
|
||||
][
|
||||
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||
]
|
||||
.notnull()
|
||||
.all()
|
||||
)
|
||||
|
||||
# Test the imputation flag is set
|
||||
df_missing_no_pop = df_missing_data[
|
||||
df_missing_data[
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
|
||||
]
|
||||
> 0
|
||||
]
|
||||
assert df_missing_no_pop[
|
||||
field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD
|
||||
].all()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue