Add decennial 2020 territory imputations

This commit is contained in:
Carlos Felix 2024-11-21 15:08:15 -05:00 committed by Carlos Felix
commit cce91fb47b
10 changed files with 420 additions and 75 deletions

View file

@ -25,6 +25,9 @@ class CensusACSETL(ExtractTransformLoad):
NAME = "census_acs"
ACS_YEAR = 2019
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
ImputeVariables = namedtuple(
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
)
def __init__(self):
@ -284,7 +287,7 @@ class CensusACSETL(ExtractTransformLoad):
self.COLUMNS_TO_KEEP = (
[
self.GEOID_TRACT_FIELD_NAME,
field_names.GEOID_TRACT_FIELD,
field_names.TOTAL_POP_FIELD,
self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME,
@ -335,15 +338,15 @@ class CensusACSETL(ExtractTransformLoad):
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
tract_output_field_name=field_names.GEOID_TRACT_FIELD,
data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5",
)
]
# pylint: disable=too-many-arguments
def _merge_geojson(
self,
@staticmethod
def merge_geojson(
df: pd.DataFrame,
usa_geo_df: gpd.GeoDataFrame,
geoid_field: str = "GEOID10",
@ -364,7 +367,7 @@ class CensusACSETL(ExtractTransformLoad):
county_code_field,
]
],
left_on=[self.GEOID_TRACT_FIELD_NAME],
left_on=[field_names.GEOID_TRACT_FIELD],
right_on=[geoid_field],
)
)
@ -377,7 +380,7 @@ class CensusACSETL(ExtractTransformLoad):
self.df = pd.read_csv(
self.census_acs_source,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
dtype={field_names.GEOID_TRACT_FIELD: "string"},
)
def transform(self) -> None:
@ -401,7 +404,7 @@ class CensusACSETL(ExtractTransformLoad):
self.DATA_PATH / "census" / "geojson" / "us.json",
)
df = self._merge_geojson(
df = CensusACSETL.merge_geojson(
df=df,
usa_geo_df=geo_df,
)
@ -608,23 +611,19 @@ class CensusACSETL(ExtractTransformLoad):
# we impute income for both income measures
## TODO: Convert to pydantic for clarity
logger.debug("Imputing income information")
ImputeVariables = namedtuple(
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
)
df = calculate_income_measures(
impute_var_named_tup_list=[
ImputeVariables(
CensusACSETL.ImputeVariables(
raw_field_name=self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
imputed_field_name=self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
),
ImputeVariables(
CensusACSETL.ImputeVariables(
raw_field_name=self.COLLEGE_ATTENDANCE_FIELD,
imputed_field_name=self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
),
],
geo_df=df,
geoid_field=self.GEOID_TRACT_FIELD_NAME,
geoid_field=field_names.GEOID_TRACT_FIELD,
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
)

View file

@ -1,4 +1,5 @@
from enum import Enum
from types import MappingProxyType
from data_pipeline.score import field_names
@ -29,6 +30,7 @@ class DEC_FIELD_NAMES(str, Enum):
HOUSEHOLD_POVERTY_LEVEL_OVER_2_0 = (
"Household poverty level Over 2.0 IN 2019"
)
IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL = f"{field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019}, imputed"
TOTAL_HOUSEHOLD_POVERTY_LEVEL = "Total Household poverty level IN 2019"
TERRITORY_MEDIAN_INCOME = "Territory Median Income"
EMPLOYMENT_MALE_UNEMPLOYED = "Total males not in labor force"
@ -45,6 +47,9 @@ class DEC_FIELD_NAMES(str, Enum):
COLLEGE_ATTENDANCE_PERCENT = (
"Percent enrollment in college, graduate or professional school"
)
IMPUTED_COLLEGE_ATTENDANCE_PERCENT = (
f"{COLLEGE_ATTENDANCE_PERCENT}, imputed"
)
COLLEGE_NON_ATTENDANCE_PERCENT = "Percent of population not currently enrolled in college, graduate or professional school"
def __str__(self) -> str:
@ -146,45 +151,61 @@ OUTPUT_RACE_FIELDS = [
"""Race fields to output in the results."""
DEC_TERRITORY_PARAMS = [
{
"state_abbreviation": "as",
"fips": "60",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st60_as_cou2020.txt
"county_fips": ["010", "020", "030", "040", "050"],
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_AS_XWALK,
# Note: we hardcode the median income for each territory in this dict,
# because that data is hard to programmatically access.
# https://www.ruralhealthinfo.org/states/american-samoa
"median_income": 26352,
},
{
"state_abbreviation": "gu",
"fips": "66",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st66_gu_cou2020.txt
"county_fips": ["010"],
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_GU_XWALK,
# https://www.ruralhealthinfo.org/states/guam
# https://data.census.gov/table/DECENNIALDPGU2020.DP3?g=040XX00US66&d=DECIA%20Guam%20Demographic%20Profile
"median_income": 58289,
},
{
"state_abbreviation": "mp",
"fips": "69",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st69_mp_cou2020.txt
"county_fips": ["085", "100", "110", "120"],
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_MP_XWALK,
# https://www.ruralhealthinfo.org/states/northern-mariana
# https://data.census.gov/table/DECENNIALDPMP2020.DP3?d=DECIA%20Commonwealth%20of%20the%20Northern%20Mariana%20Islands%20Demographic%20Profile
"median_income": 31362,
},
{
"state_abbreviation": "vi",
"fips": "78",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st78_vi_cou2020.txt
"county_fips": ["010", "020", "030"],
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_VI_XWALK,
# https://www.ruralhealthinfo.org/states/us-virgin-islands
"median_income": 40408,
},
MappingProxyType(
{
"state_abbreviation": "as",
"fips": "60",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st60_as_cou2020.txt
"county_fips": ("010", "020", "030", "040", "050"),
"xwalk": MappingProxyType(
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_AS_XWALK
),
# Note: we hardcode the median income for each territory in this dict,
# because that data is hard to programmatically access.
# https://www.ruralhealthinfo.org/states/american-samoa
"median_income": 26352,
}
),
MappingProxyType(
{
"state_abbreviation": "gu",
"fips": "66",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st66_gu_cou2020.txt
"county_fips": ("010",),
"xwalk": MappingProxyType(
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_GU_XWALK
),
# https://www.ruralhealthinfo.org/states/guam
# https://data.census.gov/table/DECENNIALDPGU2020.DP3?g=040XX00US66&d=DECIA%20Guam%20Demographic%20Profile
"median_income": 58289,
}
),
MappingProxyType(
{
"state_abbreviation": "mp",
"fips": "69",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st69_mp_cou2020.txt
"county_fips": ("085", "100", "110", "120"),
"xwalk": MappingProxyType(
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_MP_XWALK
),
# https://www.ruralhealthinfo.org/states/northern-mariana
# https://data.census.gov/table/DECENNIALDPMP2020.DP3?d=DECIA%20Commonwealth%20of%20the%20Northern%20Mariana%20Islands%20Demographic%20Profile
"median_income": 31362,
}
),
MappingProxyType(
{
"state_abbreviation": "vi",
"fips": "78",
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st78_vi_cou2020.txt
"county_fips": ("010", "020", "030"),
"xwalk": MappingProxyType(
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_VI_XWALK
),
# https://www.ruralhealthinfo.org/states/us-virgin-islands
"median_income": 40408,
}
),
]
"""List of territories to process."""
"""Read-only list of territories to process."""

View file

@ -1,6 +1,7 @@
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import json
from typing import List
from pathlib import Path
@ -14,6 +15,10 @@ from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.sources.census_acs.etl import CensusACSETL
from data_pipeline.etl.sources.census_acs.etl_imputations import (
calculate_income_measures,
)
pd.options.mode.chained_assignment = "raise"
@ -27,6 +32,9 @@ class CensusDecennialETL(ExtractTransformLoad):
/ "dataset"
/ f"census_decennial_{DECENNIAL_YEAR}"
)
CENSUS_GEOJSON_PATH = (
ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
)
def __get_api_url(
self,
@ -136,7 +144,73 @@ class CensusDecennialETL(ExtractTransformLoad):
field_names.GEOID_TRACT_FIELD,
] = "69120950200"
def transform(self) -> None:
def _impute_income(self, geojson_path: Path) -> None:
    """Impute the below-200%-FPL share for tracts without a reported value.

    Merges the Census GeoJSON into ``self.df_all``, asks
    ``calculate_income_measures`` for an imputed value of the
    below-200%-of-poverty-level field, and then writes two derived columns
    on ``self.df_all``:

    * the adjusted field: the reported value where present, otherwise the
      imputed value, floored at zero;
    * a boolean flag that is True only where the adjusted value exists but
      the reported value does not (i.e. the value was imputed).

    Args:
        geojson_path: Path of the Census GeoJSON file to read with geopandas.

    Raises:
        AssertionError: If any tract with a population of at least 1 still
            has no adjusted value after imputation.
    """
    raw_poverty_field = (
        field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
    )
    imputed_poverty_field = (
        DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
    )
    adjusted_poverty_field = (
        field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
    )
    population_field = (
        field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
    )

    # Merge the Census GeoJSON so there is geometry to impute values from.
    logger.debug(f"Reading GeoJSON from {geojson_path}")
    geo_df = gpd.read_file(geojson_path)
    self.df_all = CensusACSETL.merge_geojson(
        df=self.df_all,
        usa_geo_df=geo_df,
    )

    logger.debug("Imputing income information")
    impute_var_named_tup_list = [
        CensusACSETL.ImputeVariables(
            raw_field_name=raw_poverty_field,
            imputed_field_name=imputed_poverty_field,
        ),
    ]
    self.df_all = calculate_income_measures(
        impute_var_named_tup_list=impute_var_named_tup_list,
        geo_df=self.df_all,
        # Use the same tract-ID column that transform() creates and that
        # merge_geojson() joins on, rather than the base-class alias.
        geoid_field=field_names.GEOID_TRACT_FIELD,
        population_field=population_field,
    )

    logger.debug("Calculating with imputed values")
    # Prefer the reported value, fall back to the imputed one, and clip so
    # that the result is never negative.
    self.df_all[adjusted_poverty_field] = (
        self.df_all[raw_poverty_field]
        .fillna(self.df_all[imputed_poverty_field])
        .clip(lower=0)
    )

    # Every tract with >0 population should have a value at this point.
    # Raised explicitly (instead of a bare `assert`) so the check still runs
    # when Python is started with -O; the exception type is unchanged.
    has_population = self.df_all[population_field] >= 1
    unfilled_count = (
        self.df_all.loc[has_population, adjusted_poverty_field].isna().sum()
    )
    if unfilled_count != 0:
        raise AssertionError(
            "Error: not all values were filled with imputations..."
        )

    # We generate a boolean that is TRUE when there is an imputed income but
    # not a baseline income, and FALSE otherwise. This allows us to see
    # which tracts have an imputed income.
    self.df_all[field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD] = (
        self.df_all[adjusted_poverty_field].notna()
        & self.df_all[raw_poverty_field].isna()
    )
def transform(self, geojson_path: Path = CENSUS_GEOJSON_PATH) -> None:
# Creating Geo ID (Census Tract) Field Name
self.df_all[field_names.GEOID_TRACT_FIELD] = (
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
@ -232,6 +306,8 @@ class CensusDecennialETL(ExtractTransformLoad):
f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
)
self._impute_income(geojson_path)
def load(self) -> None:
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
columns_to_include = [
@ -242,11 +318,14 @@ class CensusDecennialETL(ExtractTransformLoad):
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL,
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE,
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD,
] + self.final_race_fields
self.df_all[columns_to_include].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False