Add decennial 2020 territory imputations

This commit is contained in:
Carlos Felix 2024-11-21 15:08:15 -05:00 committed by Carlos Felix
parent 6436dfa683
commit cce91fb47b
10 changed files with 420 additions and 75 deletions

View file

@ -2,9 +2,6 @@
name: Data Checks name: Data Checks
on: on:
pull_request: pull_request:
branches:
- main
- "**/release/**"
paths: paths:
- "data/**" - "data/**"
jobs: jobs:

View file

@ -473,6 +473,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019, field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019, field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019, field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019, field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
field_names.CENSUS_UNEMPLOYMENT_FIELD_2010, field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010, field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,

View file

@ -25,6 +25,9 @@ class CensusACSETL(ExtractTransformLoad):
NAME = "census_acs" NAME = "census_acs"
ACS_YEAR = 2019 ACS_YEAR = 2019
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1 MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
ImputeVariables = namedtuple(
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
)
def __init__(self): def __init__(self):
@ -284,7 +287,7 @@ class CensusACSETL(ExtractTransformLoad):
self.COLUMNS_TO_KEEP = ( self.COLUMNS_TO_KEEP = (
[ [
self.GEOID_TRACT_FIELD_NAME, field_names.GEOID_TRACT_FIELD,
field_names.TOTAL_POP_FIELD, field_names.TOTAL_POP_FIELD,
self.UNEMPLOYED_FIELD_NAME, self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME, self.LINGUISTIC_ISOLATION_FIELD_NAME,
@ -335,15 +338,15 @@ class CensusACSETL(ExtractTransformLoad):
destination=self.census_acs_source, destination=self.census_acs_source,
acs_year=self.ACS_YEAR, acs_year=self.ACS_YEAR,
variables=variables, variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, tract_output_field_name=field_names.GEOID_TRACT_FIELD,
data_path_for_fips_codes=self.DATA_PATH, data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5", acs_type="acs5",
) )
] ]
# pylint: disable=too-many-arguments # pylint: disable=too-many-arguments
def _merge_geojson( @staticmethod
self, def merge_geojson(
df: pd.DataFrame, df: pd.DataFrame,
usa_geo_df: gpd.GeoDataFrame, usa_geo_df: gpd.GeoDataFrame,
geoid_field: str = "GEOID10", geoid_field: str = "GEOID10",
@ -364,7 +367,7 @@ class CensusACSETL(ExtractTransformLoad):
county_code_field, county_code_field,
] ]
], ],
left_on=[self.GEOID_TRACT_FIELD_NAME], left_on=[field_names.GEOID_TRACT_FIELD],
right_on=[geoid_field], right_on=[geoid_field],
) )
) )
@ -377,7 +380,7 @@ class CensusACSETL(ExtractTransformLoad):
self.df = pd.read_csv( self.df = pd.read_csv(
self.census_acs_source, self.census_acs_source,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, dtype={field_names.GEOID_TRACT_FIELD: "string"},
) )
def transform(self) -> None: def transform(self) -> None:
@ -401,7 +404,7 @@ class CensusACSETL(ExtractTransformLoad):
self.DATA_PATH / "census" / "geojson" / "us.json", self.DATA_PATH / "census" / "geojson" / "us.json",
) )
df = self._merge_geojson( df = CensusACSETL.merge_geojson(
df=df, df=df,
usa_geo_df=geo_df, usa_geo_df=geo_df,
) )
@ -608,23 +611,19 @@ class CensusACSETL(ExtractTransformLoad):
# we impute income for both income measures # we impute income for both income measures
## TODO: Convert to pydantic for clarity ## TODO: Convert to pydantic for clarity
logger.debug("Imputing income information") logger.debug("Imputing income information")
ImputeVariables = namedtuple(
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
)
df = calculate_income_measures( df = calculate_income_measures(
impute_var_named_tup_list=[ impute_var_named_tup_list=[
ImputeVariables( CensusACSETL.ImputeVariables(
raw_field_name=self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME, raw_field_name=self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
imputed_field_name=self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME, imputed_field_name=self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
), ),
ImputeVariables( CensusACSETL.ImputeVariables(
raw_field_name=self.COLLEGE_ATTENDANCE_FIELD, raw_field_name=self.COLLEGE_ATTENDANCE_FIELD,
imputed_field_name=self.IMPUTED_COLLEGE_ATTENDANCE_FIELD, imputed_field_name=self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
), ),
], ],
geo_df=df, geo_df=df,
geoid_field=self.GEOID_TRACT_FIELD_NAME, geoid_field=field_names.GEOID_TRACT_FIELD,
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION, minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
) )

View file

@ -1,4 +1,5 @@
from enum import Enum from enum import Enum
from types import MappingProxyType
from data_pipeline.score import field_names from data_pipeline.score import field_names
@ -29,6 +30,7 @@ class DEC_FIELD_NAMES(str, Enum):
HOUSEHOLD_POVERTY_LEVEL_OVER_2_0 = ( HOUSEHOLD_POVERTY_LEVEL_OVER_2_0 = (
"Household poverty level Over 2.0 IN 2019" "Household poverty level Over 2.0 IN 2019"
) )
IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL = f"{field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019}, imputed"
TOTAL_HOUSEHOLD_POVERTY_LEVEL = "Total Household poverty level IN 2019" TOTAL_HOUSEHOLD_POVERTY_LEVEL = "Total Household poverty level IN 2019"
TERRITORY_MEDIAN_INCOME = "Territory Median Income" TERRITORY_MEDIAN_INCOME = "Territory Median Income"
EMPLOYMENT_MALE_UNEMPLOYED = "Total males not in labor force" EMPLOYMENT_MALE_UNEMPLOYED = "Total males not in labor force"
@ -45,6 +47,9 @@ class DEC_FIELD_NAMES(str, Enum):
COLLEGE_ATTENDANCE_PERCENT = ( COLLEGE_ATTENDANCE_PERCENT = (
"Percent enrollment in college, graduate or professional school" "Percent enrollment in college, graduate or professional school"
) )
IMPUTED_COLLEGE_ATTENDANCE_PERCENT = (
f"{COLLEGE_ATTENDANCE_PERCENT}, imputed"
)
COLLEGE_NON_ATTENDANCE_PERCENT = "Percent of population not currently enrolled in college, graduate or professional school" COLLEGE_NON_ATTENDANCE_PERCENT = "Percent of population not currently enrolled in college, graduate or professional school"
def __str__(self) -> str: def __str__(self) -> str:
@ -146,45 +151,61 @@ OUTPUT_RACE_FIELDS = [
"""Race fields to output in the results.""" """Race fields to output in the results."""
DEC_TERRITORY_PARAMS = [ DEC_TERRITORY_PARAMS = [
{ MappingProxyType(
"state_abbreviation": "as", {
"fips": "60", "state_abbreviation": "as",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st60_as_cou2020.txt "fips": "60",
"county_fips": ["010", "020", "030", "040", "050"], # https://www2.census.gov/geo/docs/reference/codes2020/cou/st60_as_cou2020.txt
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_AS_XWALK, "county_fips": ("010", "020", "030", "040", "050"),
# Note: we hardcode the median income for each territory in this dict, "xwalk": MappingProxyType(
# because that data is hard to programmatically access. __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_AS_XWALK
# https://www.ruralhealthinfo.org/states/american-samoa ),
"median_income": 26352, # Note: we hardcode the median income for each territory in this dict,
}, # because that data is hard to programmatically access.
{ # https://www.ruralhealthinfo.org/states/american-samoa
"state_abbreviation": "gu", "median_income": 26352,
"fips": "66", }
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st66_gu_cou2020.txt ),
"county_fips": ["010"], MappingProxyType(
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_GU_XWALK, {
# https://www.ruralhealthinfo.org/states/guam "state_abbreviation": "gu",
# https://data.census.gov/table/DECENNIALDPGU2020.DP3?g=040XX00US66&d=DECIA%20Guam%20Demographic%20Profile "fips": "66",
"median_income": 58289, # https://www2.census.gov/geo/docs/reference/codes2020/cou/st66_gu_cou2020.txt
}, "county_fips": ("010",),
{ "xwalk": MappingProxyType(
"state_abbreviation": "mp", __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_GU_XWALK
"fips": "69", ),
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st69_mp_cou2020.txt # https://www.ruralhealthinfo.org/states/guam
"county_fips": ["085", "100", "110", "120"], # https://data.census.gov/table/DECENNIALDPGU2020.DP3?g=040XX00US66&d=DECIA%20Guam%20Demographic%20Profile
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_MP_XWALK, "median_income": 58289,
# https://www.ruralhealthinfo.org/states/northern-mariana }
# https://data.census.gov/table/DECENNIALDPMP2020.DP3?d=DECIA%20Commonwealth%20of%20the%20Northern%20Mariana%20Islands%20Demographic%20Profile ),
"median_income": 31362, MappingProxyType(
}, {
{ "state_abbreviation": "mp",
"state_abbreviation": "vi", "fips": "69",
"fips": "78", # https://www2.census.gov/geo/docs/reference/codes2020/cou/st69_mp_cou2020.txt
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st78_vi_cou2020.txt "county_fips": ("085", "100", "110", "120"),
"county_fips": ["010", "020", "030"], "xwalk": MappingProxyType(
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_VI_XWALK, __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_MP_XWALK
# https://www.ruralhealthinfo.org/states/us-virgin-islands ),
"median_income": 40408, # https://www.ruralhealthinfo.org/states/northern-mariana
}, # https://data.census.gov/table/DECENNIALDPMP2020.DP3?d=DECIA%20Commonwealth%20of%20the%20Northern%20Mariana%20Islands%20Demographic%20Profile
"median_income": 31362,
}
),
MappingProxyType(
{
"state_abbreviation": "vi",
"fips": "78",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st78_vi_cou2020.txt
"county_fips": ("010", "020", "030"),
"xwalk": MappingProxyType(
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_VI_XWALK
),
# https://www.ruralhealthinfo.org/states/us-virgin-islands
"median_income": 40408,
}
),
] ]
"""List of territories to process.""" """Read-only list of territories to process."""

View file

@ -1,6 +1,7 @@
import os import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import geopandas as gpd
import json import json
from typing import List from typing import List
from pathlib import Path from pathlib import Path
@ -14,6 +15,10 @@ from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.sources.census_acs.etl import CensusACSETL
from data_pipeline.etl.sources.census_acs.etl_imputations import (
calculate_income_measures,
)
pd.options.mode.chained_assignment = "raise" pd.options.mode.chained_assignment = "raise"
@ -27,6 +32,9 @@ class CensusDecennialETL(ExtractTransformLoad):
/ "dataset" / "dataset"
/ f"census_decennial_{DECENNIAL_YEAR}" / f"census_decennial_{DECENNIAL_YEAR}"
) )
CENSUS_GEOJSON_PATH = (
ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
)
def __get_api_url( def __get_api_url(
self, self,
@ -136,7 +144,73 @@ class CensusDecennialETL(ExtractTransformLoad):
field_names.GEOID_TRACT_FIELD, field_names.GEOID_TRACT_FIELD,
] = "69120950200" ] = "69120950200"
def _impute_income(self, geojson_path: Path):
    """Impute the 200% federal-poverty-line measure for territory tracts.

    Merges the Census GeoJSON read from ``geojson_path`` into
    ``self.df_all``, runs ``calculate_income_measures`` to add the imputed
    column, then derives:

    * the adjusted column: the raw value where present, otherwise the
      imputed value, floored at zero;
    * a boolean flag that is True only for tracts that have an adjusted
      value but no raw value.

    Args:
        geojson_path: Location of the GeoJSON file loaded with geopandas.

    Raises:
        AssertionError: If any tract with a population of at least 1 still
            has no adjusted value after imputation.
    """
    raw_field = (
        field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
    )
    imputed_field = (
        DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
    )
    adjusted_field = (
        field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
    )
    population_field = (
        field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
    )

    # Merge the Census GeoJSON so there is geometry to impute values from.
    logger.debug(f"Reading GeoJSON from {geojson_path}")
    geo_df = gpd.read_file(geojson_path)
    self.df_all = CensusACSETL.merge_geojson(
        df=self.df_all,
        usa_geo_df=geo_df,
    )

    logger.debug("Imputing income information")
    impute_var_named_tup_list = [
        CensusACSETL.ImputeVariables(
            raw_field_name=raw_field,
            imputed_field_name=imputed_field,
        ),
    ]
    self.df_all = calculate_income_measures(
        impute_var_named_tup_list=impute_var_named_tup_list,
        geo_df=self.df_all,
        # Same tract-ID constant the rest of this ETL and merge_geojson use.
        geoid_field=field_names.GEOID_TRACT_FIELD,
        population_field=population_field,
    )

    logger.debug("Calculating with imputed values")
    # Prefer the raw value, fall back to the imputed one, and clip so the
    # result is never negative.
    self.df_all[adjusted_field] = (
        self.df_all[raw_field]
        .fillna(self.df_all[imputed_field])
        .clip(lower=0)
    )

    # Every tract with a population of at least 1 should have a value now.
    # Raised explicitly (rather than with an `assert` statement) so the
    # check still runs when Python is started with -O.
    populated = self.df_all[self.df_all[population_field] >= 1]
    if populated[adjusted_field].isna().any():
        raise AssertionError(
            "Error: not all values were filled with imputations..."
        )

    # TRUE when there is an imputed income but no baseline income, FALSE
    # otherwise; this shows which tracts have an imputed income.
    self.df_all[field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD] = (
        self.df_all[adjusted_field].notna()
        & self.df_all[raw_field].isna()
    )
def transform(self, geojson_path: Path = CENSUS_GEOJSON_PATH) -> None:
# Creating Geo ID (Census Block Group) Field Name # Creating Geo ID (Census Block Group) Field Name
self.df_all[field_names.GEOID_TRACT_FIELD] = ( self.df_all[field_names.GEOID_TRACT_FIELD] = (
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"] self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
@ -232,6 +306,8 @@ class CensusDecennialETL(ExtractTransformLoad):
f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows" f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
) )
self._impute_income(geojson_path)
def load(self) -> None: def load(self) -> None:
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
columns_to_include = [ columns_to_include = [
@ -242,11 +318,14 @@ class CensusDecennialETL(ExtractTransformLoad):
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019, field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019, field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019, field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL,
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019, field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019, field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT, DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE, DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE,
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION, DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD,
] + self.final_race_fields ] + self.final_race_fields
self.df_all[columns_to_include].to_csv( self.df_all[columns_to_include].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False

View file

@ -191,6 +191,7 @@ CENSUS_DECENNIAL_MEDIAN_INCOME_2019 = (
) )
CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019 = f"Percentage households below 100% of federal poverty line in {DEC_DATA_YEAR}" CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019 = f"Percentage households below 100% of federal poverty line in {DEC_DATA_YEAR}"
CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019 = f"Percentage households below 200% of federal poverty line in {DEC_DATA_YEAR}" CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019 = f"Percentage households below 200% of federal poverty line in {DEC_DATA_YEAR}"
CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019 = f"{CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019}, adjusted and imputed"
CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019 = f"Percent individuals age 25 or over with less than high school degree in {DEC_DATA_YEAR}" CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019 = f"Percent individuals age 25 or over with less than high school degree in {DEC_DATA_YEAR}"
CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019 = ( CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019 = (
f"Unemployment (percent) in {DEC_DATA_YEAR}" f"Unemployment (percent) in {DEC_DATA_YEAR}"
@ -707,6 +708,8 @@ ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD = (
) )
ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019} exceeds {PERCENTILE}th percentile" ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019} exceeds {PERCENTILE}th percentile"
ISLAND_POVERTY_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019} exceeds {PERCENTILE}th percentile" ISLAND_POVERTY_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019} exceeds {PERCENTILE}th percentile"
# Low Income Island Areas
ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD = f"Income data has been estimated based on neighbor income{ISLAND_AREAS_SUFFIX}"
# Not currently used in a factor # Not currently used in a factor
EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = ( EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = (

View file

@ -1044,7 +1044,7 @@ class ScoreNarwhal(Score):
island_areas_poverty_200_criteria_field_name, island_areas_poverty_200_criteria_field_name,
) = self._combine_island_areas_with_states_and_set_thresholds( ) = self._combine_island_areas_with_states_and_set_thresholds(
df=self.df, df=self.df,
column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019, column_from_island_areas=field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
column_from_decennial_census=field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD, column_from_decennial_census=field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_200_FPL_FIELD_2010, combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_200_FPL_FIELD_2010,
threshold_cutoff_for_island_areas=self.LOW_INCOME_THRESHOLD, threshold_cutoff_for_island_areas=self.LOW_INCOME_THRESHOLD,

View file

@ -1,4 +1,4 @@
GEOID10_TRACT,Percentage households below 200% of federal poverty line in 2009,"Percent of individuals below 200% Federal Poverty Line, imputed and adjusted","Percent of individuals below 200% Federal Poverty Line, imputed and adjusted (percentile)",Is low income (imputed and adjusted)? GEOID10_TRACT,"Percentage households below 200% of federal poverty line in 2009, adjusted and imputed","Percent of individuals below 200% Federal Poverty Line, imputed and adjusted","Percent of individuals below 200% Federal Poverty Line, imputed and adjusted (percentile)",Is low income (imputed and adjusted)?
01071950300,,0.1,0.1,False 01071950300,,0.1,0.1,False
36087011302,,0.7,0.7,False 36087011302,,0.7,0.7,False
72119130701,,0.5,0.5,False 72119130701,,0.5,0.5,False

1 GEOID10_TRACT Percentage households below 200% of federal poverty line in 2009 Percentage households below 200% of federal poverty line in 2009, adjusted and imputed Percent of individuals below 200% Federal Poverty Line, imputed and adjusted Percent of individuals below 200% Federal Poverty Line, imputed and adjusted (percentile) Is low income (imputed and adjusted)?
2 01071950300 0.1 0.1 False
3 36087011302 0.7 0.7 False
4 72119130701 0.5 0.5 False

View file

@ -10,6 +10,11 @@ from data_pipeline.etl.sources.census_decennial.etl import CensusDecennialETL
from data_pipeline.score import field_names from data_pipeline.score import field_names
def _check_fields_exist(df: pd.DataFrame, field_names: list):
for field in field_names:
assert field in df.columns
@pytest.fixture @pytest.fixture
def territory_params_fixture(): def territory_params_fixture():
return [ return [
@ -31,25 +36,39 @@ def territory_params_fixture():
@pytest.fixture @pytest.fixture
def extract_path_fixture(): def extract_path_fixture() -> Path:
return Path(__file__).parents[0] / "data/extract" return Path(__file__).parents[0] / "data/extract"
@pytest.fixture @pytest.fixture
def transform_path_fixture(): def transform_path_fixture() -> Path:
return Path(__file__).parents[0] / "data/transform" return Path(__file__).parents[0] / "data/transform"
@pytest.fixture @pytest.fixture
def transformed_data_fixture(transform_path_fixture): def imputed_path_fixture() -> Path:
"""Load the test data and call the ETL transform""" return Path(__file__).parents[0] / "data/imputation"
dec = CensusDecennialETL()
dec.df_all = pd.read_csv(
@pytest.fixture
def extracted_data_fixture(
transform_path_fixture: pd.DataFrame,
) -> pd.DataFrame:
return pd.read_csv(
transform_path_fixture / "usa.csv", transform_path_fixture / "usa.csv",
# Make sure these columns are string as expected of the original # Make sure these columns are string as expected of the original
dtype={"state": "object", "county": "object", "tract": "object"}, dtype={"state": "object", "county": "object", "tract": "object"},
) )
dec.transform()
@pytest.fixture
def transformed_data_fixture(
extracted_data_fixture: pd.DataFrame, imputed_path_fixture: Path
) -> pd.DataFrame:
"""Load the test data and call the ETL transform"""
dec = CensusDecennialETL()
dec.df_all = extracted_data_fixture
dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
return dec.df_all return dec.df_all
@ -67,7 +86,7 @@ def test_no_files_found(territory_params_fixture):
) )
def test_load_data(extract_path_fixture, territory_params_fixture): def test_load_data(extract_path_fixture: Path, territory_params_fixture):
"""Test the ETL loads and translates the data""" """Test the ETL loads and translates the data"""
dec = CensusDecennialETL() dec = CensusDecennialETL()
dec.extract( dec.extract(
@ -103,10 +122,10 @@ def test_load_data(extract_path_fixture, territory_params_fixture):
).any() ).any()
############### #################
# Transform tests # Transform tests
############### #################
def test_geo_tract_generation(transformed_data_fixture): def test_geo_tract_generation(transformed_data_fixture: pd.DataFrame):
result = transformed_data_fixture result = transformed_data_fixture
assert field_names.GEOID_TRACT_FIELD in result.columns assert field_names.GEOID_TRACT_FIELD in result.columns
assert result[field_names.GEOID_TRACT_FIELD].notnull().all() assert result[field_names.GEOID_TRACT_FIELD].notnull().all()
@ -118,7 +137,7 @@ def test_geo_tract_generation(transformed_data_fixture):
) )
def test_merge_tracts(transformed_data_fixture): def test_merge_tracts(transformed_data_fixture: pd.DataFrame):
result = transformed_data_fixture result = transformed_data_fixture
# 69120950200 exists, but the tract split does not # 69120950200 exists, but the tract split does not
assert ( assert (
@ -138,15 +157,103 @@ def test_merge_tracts(transformed_data_fixture):
) )
def test_remove_invalid_values(transformed_data_fixture): def test_remove_invalid_values(transformed_data_fixture: pd.DataFrame):
numeric_df = transformed_data_fixture.select_dtypes(include="number") numeric_df = transformed_data_fixture.select_dtypes(include="number")
assert not (numeric_df < -999).any().any() assert not (numeric_df < -999).any().any()
def test_race_fields(transformed_data_fixture): def test_race_fields(transformed_data_fixture: pd.DataFrame):
for race_field_name in OUTPUT_RACE_FIELDS: for race_field_name in OUTPUT_RACE_FIELDS:
assert race_field_name in transformed_data_fixture.columns assert race_field_name in transformed_data_fixture.columns
assert any( assert any(
col.startswith(field_names.PERCENT_PREFIX + race_field_name) col.startswith(field_names.PERCENT_PREFIX + race_field_name)
for col in transformed_data_fixture.columns for col in transformed_data_fixture.columns
) )
def test_transformation_fields(transformed_data_fixture: pd.DataFrame):
    """The transformed frame exposes each derived decennial column."""
    expected_columns = [
        field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
        field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
        field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
        field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
        field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
        DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
        DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
    ]
    _check_fields_exist(transformed_data_fixture, expected_columns)
##################
# Imputation tests
##################
def test_merge_geojson(transformed_data_fixture: pd.DataFrame):
    """The state and county FIPS columns are present after the transform."""
    geojson_columns = ["STATEFP10", "COUNTYFP10"]
    _check_fields_exist(transformed_data_fixture, geojson_columns)
def test_imputation_added(transformed_data_fixture: pd.DataFrame):
    """Check the imputed, adjusted and flag columns added by the transform."""
    df = transformed_data_fixture
    population = field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
    raw = field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
    imputed = (
        DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
    )
    adjusted = (
        field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
    )
    flag = field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD

    assert imputed in df.columns

    # All rows with population > 0 need to have a value (real or imputed)
    df_has_pop = df[df[population] > 0]
    assert df_has_pop[imputed].notnull().all()

    # The imputed value equals the real value when available
    df_has_real_data = df[df[raw].notnull()]
    assert (df_has_real_data[imputed] == df_has_real_data[raw]).all()

    # Populated rows with no real value must still end up with an imputed
    # value and an adjusted value. (Filtering on "imputed is not null"
    # before asserting "imputed is not null" would always pass, so the
    # selection is made on the raw value and population only.)
    df_missing_with_pop = df[df[raw].isnull() & (df[population] > 0)]
    assert df_missing_with_pop[imputed].notnull().all()
    assert df_missing_with_pop[adjusted].notnull().all()

    # Test the imputation flag is set for those same rows
    assert df_missing_with_pop[flag].all()