mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 09:41:26 -08:00
Add decennial 2020 territory imputations
This commit is contained in:
parent
6436dfa683
commit
cce91fb47b
10 changed files with 420 additions and 75 deletions
3
.github/workflows/data-checks.yml
vendored
3
.github/workflows/data-checks.yml
vendored
|
@ -2,9 +2,6 @@
|
||||||
name: Data Checks
|
name: Data Checks
|
||||||
on:
|
on:
|
||||||
pull_request:
|
pull_request:
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
- "**/release/**"
|
|
||||||
paths:
|
paths:
|
||||||
- "data/**"
|
- "data/**"
|
||||||
jobs:
|
jobs:
|
||||||
|
|
|
@ -473,6 +473,7 @@ class ScoreETL(ExtractTransformLoad):
|
||||||
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
||||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
||||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||||
|
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||||
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
||||||
field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
|
field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
|
||||||
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
|
||||||
|
|
|
@ -25,6 +25,9 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
NAME = "census_acs"
|
NAME = "census_acs"
|
||||||
ACS_YEAR = 2019
|
ACS_YEAR = 2019
|
||||||
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
|
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
|
||||||
|
ImputeVariables = namedtuple(
|
||||||
|
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
||||||
|
@ -284,7 +287,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
|
|
||||||
self.COLUMNS_TO_KEEP = (
|
self.COLUMNS_TO_KEEP = (
|
||||||
[
|
[
|
||||||
self.GEOID_TRACT_FIELD_NAME,
|
field_names.GEOID_TRACT_FIELD,
|
||||||
field_names.TOTAL_POP_FIELD,
|
field_names.TOTAL_POP_FIELD,
|
||||||
self.UNEMPLOYED_FIELD_NAME,
|
self.UNEMPLOYED_FIELD_NAME,
|
||||||
self.LINGUISTIC_ISOLATION_FIELD_NAME,
|
self.LINGUISTIC_ISOLATION_FIELD_NAME,
|
||||||
|
@ -335,15 +338,15 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
destination=self.census_acs_source,
|
destination=self.census_acs_source,
|
||||||
acs_year=self.ACS_YEAR,
|
acs_year=self.ACS_YEAR,
|
||||||
variables=variables,
|
variables=variables,
|
||||||
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
|
tract_output_field_name=field_names.GEOID_TRACT_FIELD,
|
||||||
data_path_for_fips_codes=self.DATA_PATH,
|
data_path_for_fips_codes=self.DATA_PATH,
|
||||||
acs_type="acs5",
|
acs_type="acs5",
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
# pylint: disable=too-many-arguments
|
# pylint: disable=too-many-arguments
|
||||||
def _merge_geojson(
|
@staticmethod
|
||||||
self,
|
def merge_geojson(
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
usa_geo_df: gpd.GeoDataFrame,
|
usa_geo_df: gpd.GeoDataFrame,
|
||||||
geoid_field: str = "GEOID10",
|
geoid_field: str = "GEOID10",
|
||||||
|
@ -364,7 +367,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
county_code_field,
|
county_code_field,
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
left_on=[self.GEOID_TRACT_FIELD_NAME],
|
left_on=[field_names.GEOID_TRACT_FIELD],
|
||||||
right_on=[geoid_field],
|
right_on=[geoid_field],
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -377,7 +380,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
|
|
||||||
self.df = pd.read_csv(
|
self.df = pd.read_csv(
|
||||||
self.census_acs_source,
|
self.census_acs_source,
|
||||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
dtype={field_names.GEOID_TRACT_FIELD: "string"},
|
||||||
)
|
)
|
||||||
|
|
||||||
def transform(self) -> None:
|
def transform(self) -> None:
|
||||||
|
@ -401,7 +404,7 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
self.DATA_PATH / "census" / "geojson" / "us.json",
|
self.DATA_PATH / "census" / "geojson" / "us.json",
|
||||||
)
|
)
|
||||||
|
|
||||||
df = self._merge_geojson(
|
df = CensusACSETL.merge_geojson(
|
||||||
df=df,
|
df=df,
|
||||||
usa_geo_df=geo_df,
|
usa_geo_df=geo_df,
|
||||||
)
|
)
|
||||||
|
@ -608,23 +611,19 @@ class CensusACSETL(ExtractTransformLoad):
|
||||||
# we impute income for both income measures
|
# we impute income for both income measures
|
||||||
## TODO: Convert to pydantic for clarity
|
## TODO: Convert to pydantic for clarity
|
||||||
logger.debug("Imputing income information")
|
logger.debug("Imputing income information")
|
||||||
ImputeVariables = namedtuple(
|
|
||||||
"ImputeVariables", ["raw_field_name", "imputed_field_name"]
|
|
||||||
)
|
|
||||||
|
|
||||||
df = calculate_income_measures(
|
df = calculate_income_measures(
|
||||||
impute_var_named_tup_list=[
|
impute_var_named_tup_list=[
|
||||||
ImputeVariables(
|
CensusACSETL.ImputeVariables(
|
||||||
raw_field_name=self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
raw_field_name=self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
||||||
imputed_field_name=self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
imputed_field_name=self.IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME,
|
||||||
),
|
),
|
||||||
ImputeVariables(
|
CensusACSETL.ImputeVariables(
|
||||||
raw_field_name=self.COLLEGE_ATTENDANCE_FIELD,
|
raw_field_name=self.COLLEGE_ATTENDANCE_FIELD,
|
||||||
imputed_field_name=self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
|
imputed_field_name=self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
geo_df=df,
|
geo_df=df,
|
||||||
geoid_field=self.GEOID_TRACT_FIELD_NAME,
|
geoid_field=field_names.GEOID_TRACT_FIELD,
|
||||||
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
|
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from types import MappingProxyType
|
||||||
from data_pipeline.score import field_names
|
from data_pipeline.score import field_names
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,6 +30,7 @@ class DEC_FIELD_NAMES(str, Enum):
|
||||||
HOUSEHOLD_POVERTY_LEVEL_OVER_2_0 = (
|
HOUSEHOLD_POVERTY_LEVEL_OVER_2_0 = (
|
||||||
"Household poverty level Over 2.0 IN 2019"
|
"Household poverty level Over 2.0 IN 2019"
|
||||||
)
|
)
|
||||||
|
IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL = f"{field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019}, imputed"
|
||||||
TOTAL_HOUSEHOLD_POVERTY_LEVEL = "Total Household poverty level IN 2019"
|
TOTAL_HOUSEHOLD_POVERTY_LEVEL = "Total Household poverty level IN 2019"
|
||||||
TERRITORY_MEDIAN_INCOME = "Territory Median Income"
|
TERRITORY_MEDIAN_INCOME = "Territory Median Income"
|
||||||
EMPLOYMENT_MALE_UNEMPLOYED = "Total males not in labor force"
|
EMPLOYMENT_MALE_UNEMPLOYED = "Total males not in labor force"
|
||||||
|
@ -45,6 +47,9 @@ class DEC_FIELD_NAMES(str, Enum):
|
||||||
COLLEGE_ATTENDANCE_PERCENT = (
|
COLLEGE_ATTENDANCE_PERCENT = (
|
||||||
"Percent enrollment in college, graduate or professional school"
|
"Percent enrollment in college, graduate or professional school"
|
||||||
)
|
)
|
||||||
|
IMPUTED_COLLEGE_ATTENDANCE_PERCENT = (
|
||||||
|
f"{COLLEGE_ATTENDANCE_PERCENT}, imputed"
|
||||||
|
)
|
||||||
COLLEGE_NON_ATTENDANCE_PERCENT = "Percent of population not currently enrolled in college, graduate or professional school"
|
COLLEGE_NON_ATTENDANCE_PERCENT = "Percent of population not currently enrolled in college, graduate or professional school"
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
|
@ -146,45 +151,61 @@ OUTPUT_RACE_FIELDS = [
|
||||||
"""Race fields to output in the results."""
|
"""Race fields to output in the results."""
|
||||||
|
|
||||||
DEC_TERRITORY_PARAMS = [
|
DEC_TERRITORY_PARAMS = [
|
||||||
{
|
MappingProxyType(
|
||||||
"state_abbreviation": "as",
|
{
|
||||||
"fips": "60",
|
"state_abbreviation": "as",
|
||||||
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st60_as_cou2020.txt
|
"fips": "60",
|
||||||
"county_fips": ["010", "020", "030", "040", "050"],
|
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st60_as_cou2020.txt
|
||||||
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_AS_XWALK,
|
"county_fips": ("010", "020", "030", "040", "050"),
|
||||||
# Note: we hardcode the median income for each territory in this dict,
|
"xwalk": MappingProxyType(
|
||||||
# because that data is hard to programmatically access.
|
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_AS_XWALK
|
||||||
# https://www.ruralhealthinfo.org/states/american-samoa
|
),
|
||||||
"median_income": 26352,
|
# Note: we hardcode the median income for each territory in this dict,
|
||||||
},
|
# because that data is hard to programmatically access.
|
||||||
{
|
# https://www.ruralhealthinfo.org/states/american-samoa
|
||||||
"state_abbreviation": "gu",
|
"median_income": 26352,
|
||||||
"fips": "66",
|
}
|
||||||
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st66_gu_cou2020.txt
|
),
|
||||||
"county_fips": ["010"],
|
MappingProxyType(
|
||||||
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_GU_XWALK,
|
{
|
||||||
# https://www.ruralhealthinfo.org/states/guam
|
"state_abbreviation": "gu",
|
||||||
# https://data.census.gov/table/DECENNIALDPGU2020.DP3?g=040XX00US66&d=DECIA%20Guam%20Demographic%20Profile
|
"fips": "66",
|
||||||
"median_income": 58289,
|
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st66_gu_cou2020.txt
|
||||||
},
|
"county_fips": ("010",),
|
||||||
{
|
"xwalk": MappingProxyType(
|
||||||
"state_abbreviation": "mp",
|
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_GU_XWALK
|
||||||
"fips": "69",
|
),
|
||||||
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st69_mp_cou2020.txt
|
# https://www.ruralhealthinfo.org/states/guam
|
||||||
"county_fips": ["085", "100", "110", "120"],
|
# https://data.census.gov/table/DECENNIALDPGU2020.DP3?g=040XX00US66&d=DECIA%20Guam%20Demographic%20Profile
|
||||||
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_MP_XWALK,
|
"median_income": 58289,
|
||||||
# https://www.ruralhealthinfo.org/states/northern-mariana
|
}
|
||||||
# https://data.census.gov/table/DECENNIALDPMP2020.DP3?d=DECIA%20Commonwealth%20of%20the%20Northern%20Mariana%20Islands%20Demographic%20Profile
|
),
|
||||||
"median_income": 31362,
|
MappingProxyType(
|
||||||
},
|
{
|
||||||
{
|
"state_abbreviation": "mp",
|
||||||
"state_abbreviation": "vi",
|
"fips": "69",
|
||||||
"fips": "78",
|
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st69_mp_cou2020.txt
|
||||||
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st78_vi_cou2020.txt
|
"county_fips": ("085", "100", "110", "120"),
|
||||||
"county_fips": ["010", "020", "030"],
|
"xwalk": MappingProxyType(
|
||||||
"xwalk": __FIELD_NAME_COMMON_XWALK | __FIELD_NAME_VI_XWALK,
|
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_MP_XWALK
|
||||||
# https://www.ruralhealthinfo.org/states/us-virgin-islands
|
),
|
||||||
"median_income": 40408,
|
# https://www.ruralhealthinfo.org/states/northern-mariana
|
||||||
},
|
# https://data.census.gov/table/DECENNIALDPMP2020.DP3?d=DECIA%20Commonwealth%20of%20the%20Northern%20Mariana%20Islands%20Demographic%20Profile
|
||||||
|
"median_income": 31362,
|
||||||
|
}
|
||||||
|
),
|
||||||
|
MappingProxyType(
|
||||||
|
{
|
||||||
|
"state_abbreviation": "vi",
|
||||||
|
"fips": "78",
|
||||||
|
# https://www2.census.gov/geo/docs/reference/codes2020/cou/st78_vi_cou2020.txt
|
||||||
|
"county_fips": ("010", "020", "030"),
|
||||||
|
"xwalk": MappingProxyType(
|
||||||
|
__FIELD_NAME_COMMON_XWALK | __FIELD_NAME_VI_XWALK
|
||||||
|
),
|
||||||
|
# https://www.ruralhealthinfo.org/states/us-virgin-islands
|
||||||
|
"median_income": 40408,
|
||||||
|
}
|
||||||
|
),
|
||||||
]
|
]
|
||||||
"""List of territories to process."""
|
"""Read-only list of territories to process."""
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import geopandas as gpd
|
||||||
import json
|
import json
|
||||||
from typing import List
|
from typing import List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -14,6 +15,10 @@ from data_pipeline.etl.datasource import DataSource
|
||||||
from data_pipeline.etl.datasource import FileDataSource
|
from data_pipeline.etl.datasource import FileDataSource
|
||||||
from data_pipeline.score import field_names
|
from data_pipeline.score import field_names
|
||||||
from data_pipeline.utils import get_module_logger
|
from data_pipeline.utils import get_module_logger
|
||||||
|
from data_pipeline.etl.sources.census_acs.etl import CensusACSETL
|
||||||
|
from data_pipeline.etl.sources.census_acs.etl_imputations import (
|
||||||
|
calculate_income_measures,
|
||||||
|
)
|
||||||
|
|
||||||
pd.options.mode.chained_assignment = "raise"
|
pd.options.mode.chained_assignment = "raise"
|
||||||
|
|
||||||
|
@ -27,6 +32,9 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
/ "dataset"
|
/ "dataset"
|
||||||
/ f"census_decennial_{DECENNIAL_YEAR}"
|
/ f"census_decennial_{DECENNIAL_YEAR}"
|
||||||
)
|
)
|
||||||
|
CENSUS_GEOJSON_PATH = (
|
||||||
|
ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
|
||||||
|
)
|
||||||
|
|
||||||
def __get_api_url(
|
def __get_api_url(
|
||||||
self,
|
self,
|
||||||
|
@ -136,7 +144,73 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
field_names.GEOID_TRACT_FIELD,
|
field_names.GEOID_TRACT_FIELD,
|
||||||
] = "69120950200"
|
] = "69120950200"
|
||||||
|
|
||||||
def transform(self) -> None:
|
def _impute_income(self, geojson_path: Path):
|
||||||
|
"""Impute income for both income measures."""
|
||||||
|
# Merge the Census GeoJSON so that missing values can be imputed from neighboring tracts.
|
||||||
|
logger.debug(f"Reading GeoJSON from {geojson_path}")
|
||||||
|
geo_df = gpd.read_file(geojson_path)
|
||||||
|
self.df_all = CensusACSETL.merge_geojson(
|
||||||
|
df=self.df_all,
|
||||||
|
usa_geo_df=geo_df,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug("Imputing income information")
|
||||||
|
impute_var_named_tup_list = [
|
||||||
|
CensusACSETL.ImputeVariables(
|
||||||
|
raw_field_name=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||||
|
imputed_field_name=DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
self.df_all = calculate_income_measures(
|
||||||
|
impute_var_named_tup_list=impute_var_named_tup_list,
|
||||||
|
geo_df=self.df_all,
|
||||||
|
geoid_field=self.GEOID_TRACT_FIELD_NAME,
|
||||||
|
population_field=field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug("Calculating with imputed values")
|
||||||
|
self.df_all[
|
||||||
|
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
] = (
|
||||||
|
self.df_all[
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
].fillna(
|
||||||
|
self.df_all[
|
||||||
|
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||||
|
]
|
||||||
|
)
|
||||||
|
# Use clip to ensure that the values are not negative
|
||||||
|
).clip(
|
||||||
|
lower=0
|
||||||
|
)
|
||||||
|
|
||||||
|
# All values should have a value at this point for tracts with >0 population
|
||||||
|
assert (
|
||||||
|
self.df_all[
|
||||||
|
self.df_all[
|
||||||
|
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
|
||||||
|
]
|
||||||
|
>= 1
|
||||||
|
][
|
||||||
|
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
]
|
||||||
|
.isna()
|
||||||
|
.sum()
|
||||||
|
== 0
|
||||||
|
), "Error: not all values were filled with imputations..."
|
||||||
|
|
||||||
|
# We generate a boolean that is TRUE when there is an imputed income but not a baseline income, and FALSE otherwise.
|
||||||
|
# This allows us to see which tracts have an imputed income.
|
||||||
|
self.df_all[field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD] = (
|
||||||
|
self.df_all[
|
||||||
|
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
].notna()
|
||||||
|
& self.df_all[
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
].isna()
|
||||||
|
)
|
||||||
|
|
||||||
|
def transform(self, geojson_path: Path = CENSUS_GEOJSON_PATH) -> None:
|
||||||
# Creating Geo ID (Census Tract) Field Name
|
# Creating Geo ID (Census Tract) Field Name
|
||||||
self.df_all[field_names.GEOID_TRACT_FIELD] = (
|
self.df_all[field_names.GEOID_TRACT_FIELD] = (
|
||||||
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
||||||
|
@ -232,6 +306,8 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
|
f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self._impute_income(geojson_path)
|
||||||
|
|
||||||
def load(self) -> None:
|
def load(self) -> None:
|
||||||
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
|
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
|
||||||
columns_to_include = [
|
columns_to_include = [
|
||||||
|
@ -242,11 +318,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
||||||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
|
||||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
||||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||||
|
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL,
|
||||||
|
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||||
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
||||||
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
||||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
|
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
|
||||||
DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE,
|
DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE,
|
||||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
|
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
|
||||||
|
field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD,
|
||||||
] + self.final_race_fields
|
] + self.final_race_fields
|
||||||
self.df_all[columns_to_include].to_csv(
|
self.df_all[columns_to_include].to_csv(
|
||||||
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
|
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
|
||||||
|
|
|
@ -191,6 +191,7 @@ CENSUS_DECENNIAL_MEDIAN_INCOME_2019 = (
|
||||||
)
|
)
|
||||||
CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019 = f"Percentage households below 100% of federal poverty line in {DEC_DATA_YEAR}"
|
CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019 = f"Percentage households below 100% of federal poverty line in {DEC_DATA_YEAR}"
|
||||||
CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019 = f"Percentage households below 200% of federal poverty line in {DEC_DATA_YEAR}"
|
CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019 = f"Percentage households below 200% of federal poverty line in {DEC_DATA_YEAR}"
|
||||||
|
CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019 = f"{CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019}, adjusted and imputed"
|
||||||
CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019 = f"Percent individuals age 25 or over with less than high school degree in {DEC_DATA_YEAR}"
|
CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019 = f"Percent individuals age 25 or over with less than high school degree in {DEC_DATA_YEAR}"
|
||||||
CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019 = (
|
CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019 = (
|
||||||
f"Unemployment (percent) in {DEC_DATA_YEAR}"
|
f"Unemployment (percent) in {DEC_DATA_YEAR}"
|
||||||
|
@ -707,6 +708,8 @@ ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD = (
|
||||||
)
|
)
|
||||||
ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019} exceeds {PERCENTILE}th percentile"
|
ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019} exceeds {PERCENTILE}th percentile"
|
||||||
ISLAND_POVERTY_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019} exceeds {PERCENTILE}th percentile"
|
ISLAND_POVERTY_PCTILE_THRESHOLD = f"{CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019} exceeds {PERCENTILE}th percentile"
|
||||||
|
# Low Income Island Areas
|
||||||
|
ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD = f"Income data has been estimated based on neighbor income{ISLAND_AREAS_SUFFIX}"
|
||||||
|
|
||||||
# Not currently used in a factor
|
# Not currently used in a factor
|
||||||
EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = (
|
EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = (
|
||||||
|
|
|
@ -1044,7 +1044,7 @@ class ScoreNarwhal(Score):
|
||||||
island_areas_poverty_200_criteria_field_name,
|
island_areas_poverty_200_criteria_field_name,
|
||||||
) = self._combine_island_areas_with_states_and_set_thresholds(
|
) = self._combine_island_areas_with_states_and_set_thresholds(
|
||||||
df=self.df,
|
df=self.df,
|
||||||
column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
column_from_island_areas=field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||||
column_from_decennial_census=field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
|
column_from_decennial_census=field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
|
||||||
combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_200_FPL_FIELD_2010,
|
combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_200_FPL_FIELD_2010,
|
||||||
threshold_cutoff_for_island_areas=self.LOW_INCOME_THRESHOLD,
|
threshold_cutoff_for_island_areas=self.LOW_INCOME_THRESHOLD,
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
GEOID10_TRACT,Percentage households below 200% of federal poverty line in 2009,"Percent of individuals below 200% Federal Poverty Line, imputed and adjusted","Percent of individuals below 200% Federal Poverty Line, imputed and adjusted (percentile)",Is low income (imputed and adjusted)?
|
GEOID10_TRACT,"Percentage households below 200% of federal poverty line in 2009, adjusted and imputed","Percent of individuals below 200% Federal Poverty Line, imputed and adjusted","Percent of individuals below 200% Federal Poverty Line, imputed and adjusted (percentile)",Is low income (imputed and adjusted)?
|
||||||
01071950300,,0.1,0.1,False
|
01071950300,,0.1,0.1,False
|
||||||
36087011302,,0.7,0.7,False
|
36087011302,,0.7,0.7,False
|
||||||
72119130701,,0.5,0.5,False
|
72119130701,,0.5,0.5,False
|
||||||
|
|
|
File diff suppressed because one or more lines are too long
|
@ -10,6 +10,11 @@ from data_pipeline.etl.sources.census_decennial.etl import CensusDecennialETL
|
||||||
from data_pipeline.score import field_names
|
from data_pipeline.score import field_names
|
||||||
|
|
||||||
|
|
||||||
|
def _check_fields_exist(df: pd.DataFrame, field_names: list):
|
||||||
|
for field in field_names:
|
||||||
|
assert field in df.columns
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def territory_params_fixture():
|
def territory_params_fixture():
|
||||||
return [
|
return [
|
||||||
|
@ -31,25 +36,39 @@ def territory_params_fixture():
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def extract_path_fixture():
|
def extract_path_fixture() -> Path:
|
||||||
return Path(__file__).parents[0] / "data/extract"
|
return Path(__file__).parents[0] / "data/extract"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def transform_path_fixture():
|
def transform_path_fixture() -> Path:
|
||||||
return Path(__file__).parents[0] / "data/transform"
|
return Path(__file__).parents[0] / "data/transform"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def transformed_data_fixture(transform_path_fixture):
|
def imputed_path_fixture() -> Path:
|
||||||
"""Load the test data and call the ETL transform"""
|
return Path(__file__).parents[0] / "data/imputation"
|
||||||
dec = CensusDecennialETL()
|
|
||||||
dec.df_all = pd.read_csv(
|
|
||||||
|
@pytest.fixture
|
||||||
|
def extracted_data_fixture(
|
||||||
|
transform_path_fixture: pd.DataFrame,
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
return pd.read_csv(
|
||||||
transform_path_fixture / "usa.csv",
|
transform_path_fixture / "usa.csv",
|
||||||
# Make sure these columns are string as expected of the original
|
# Make sure these columns are string as expected of the original
|
||||||
dtype={"state": "object", "county": "object", "tract": "object"},
|
dtype={"state": "object", "county": "object", "tract": "object"},
|
||||||
)
|
)
|
||||||
dec.transform()
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def transformed_data_fixture(
|
||||||
|
extracted_data_fixture: pd.DataFrame, imputed_path_fixture: Path
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""Load the test data and call the ETL transform"""
|
||||||
|
dec = CensusDecennialETL()
|
||||||
|
dec.df_all = extracted_data_fixture
|
||||||
|
dec.transform(imputed_path_fixture / "census-us-territory-geojson.json")
|
||||||
return dec.df_all
|
return dec.df_all
|
||||||
|
|
||||||
|
|
||||||
|
@ -67,7 +86,7 @@ def test_no_files_found(territory_params_fixture):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_load_data(extract_path_fixture, territory_params_fixture):
|
def test_load_data(extract_path_fixture: Path, territory_params_fixture):
|
||||||
"""Test the ETL loads and translates the data"""
|
"""Test the ETL loads and translates the data"""
|
||||||
dec = CensusDecennialETL()
|
dec = CensusDecennialETL()
|
||||||
dec.extract(
|
dec.extract(
|
||||||
|
@ -103,10 +122,10 @@ def test_load_data(extract_path_fixture, territory_params_fixture):
|
||||||
).any()
|
).any()
|
||||||
|
|
||||||
|
|
||||||
###############
|
#################
|
||||||
# Transform tests
|
# Transform tests
|
||||||
###############
|
#################
|
||||||
def test_geo_tract_generation(transformed_data_fixture):
|
def test_geo_tract_generation(transformed_data_fixture: pd.DataFrame):
|
||||||
result = transformed_data_fixture
|
result = transformed_data_fixture
|
||||||
assert field_names.GEOID_TRACT_FIELD in result.columns
|
assert field_names.GEOID_TRACT_FIELD in result.columns
|
||||||
assert result[field_names.GEOID_TRACT_FIELD].notnull().all()
|
assert result[field_names.GEOID_TRACT_FIELD].notnull().all()
|
||||||
|
@ -118,7 +137,7 @@ def test_geo_tract_generation(transformed_data_fixture):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_merge_tracts(transformed_data_fixture):
|
def test_merge_tracts(transformed_data_fixture: pd.DataFrame):
|
||||||
result = transformed_data_fixture
|
result = transformed_data_fixture
|
||||||
# 69120950200 exists, but the tract split does not
|
# 69120950200 exists, but the tract split does not
|
||||||
assert (
|
assert (
|
||||||
|
@ -138,15 +157,103 @@ def test_merge_tracts(transformed_data_fixture):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_remove_invalid_values(transformed_data_fixture):
|
def test_remove_invalid_values(transformed_data_fixture: pd.DataFrame):
|
||||||
numeric_df = transformed_data_fixture.select_dtypes(include="number")
|
numeric_df = transformed_data_fixture.select_dtypes(include="number")
|
||||||
assert not (numeric_df < -999).any().any()
|
assert not (numeric_df < -999).any().any()
|
||||||
|
|
||||||
|
|
||||||
def test_race_fields(transformed_data_fixture):
|
def test_race_fields(transformed_data_fixture: pd.DataFrame):
|
||||||
for race_field_name in OUTPUT_RACE_FIELDS:
|
for race_field_name in OUTPUT_RACE_FIELDS:
|
||||||
assert race_field_name in transformed_data_fixture.columns
|
assert race_field_name in transformed_data_fixture.columns
|
||||||
assert any(
|
assert any(
|
||||||
col.startswith(field_names.PERCENT_PREFIX + race_field_name)
|
col.startswith(field_names.PERCENT_PREFIX + race_field_name)
|
||||||
for col in transformed_data_fixture.columns
|
for col in transformed_data_fixture.columns
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_transformation_fields(transformed_data_fixture: pd.DataFrame):
|
||||||
|
_check_fields_exist(
|
||||||
|
transformed_data_fixture,
|
||||||
|
[
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||||
|
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
||||||
|
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
||||||
|
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
|
||||||
|
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
|
||||||
|
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
##################
|
||||||
|
# Imputation tests
|
||||||
|
##################
|
||||||
|
def test_merge_geojson(transformed_data_fixture: pd.DataFrame):
|
||||||
|
_check_fields_exist(transformed_data_fixture, ["STATEFP10", "COUNTYFP10"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_imputation_added(transformed_data_fixture: pd.DataFrame):
|
||||||
|
assert (
|
||||||
|
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||||
|
in transformed_data_fixture.columns
|
||||||
|
)
|
||||||
|
|
||||||
|
# All rows with population > 0 need to have a value (real or imputed)
|
||||||
|
df_has_pop = transformed_data_fixture[
|
||||||
|
transformed_data_fixture[
|
||||||
|
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
|
||||||
|
]
|
||||||
|
> 0
|
||||||
|
]
|
||||||
|
assert (
|
||||||
|
df_has_pop[
|
||||||
|
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||||
|
]
|
||||||
|
.notnull()
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
# The imputed value equals the real value when available
|
||||||
|
df_has_real_data = transformed_data_fixture[
|
||||||
|
transformed_data_fixture[
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
].notnull()
|
||||||
|
]
|
||||||
|
assert (
|
||||||
|
df_has_real_data[
|
||||||
|
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||||
|
]
|
||||||
|
== df_has_real_data[
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
]
|
||||||
|
).all()
|
||||||
|
|
||||||
|
# The imputed value exists when no real value exists
|
||||||
|
df_missing_data = transformed_data_fixture[
|
||||||
|
transformed_data_fixture[
|
||||||
|
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||||
|
].isnull()
|
||||||
|
]
|
||||||
|
assert (
|
||||||
|
df_missing_data[
|
||||||
|
df_missing_data[
|
||||||
|
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||||
|
].notnull()
|
||||||
|
][
|
||||||
|
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||||
|
]
|
||||||
|
.notnull()
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test the imputation flag is set
|
||||||
|
df_missing_no_pop = df_missing_data[
|
||||||
|
df_missing_data[
|
||||||
|
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
|
||||||
|
]
|
||||||
|
> 0
|
||||||
|
]
|
||||||
|
assert df_missing_no_pop[
|
||||||
|
field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD
|
||||||
|
].all()
|
||||||
|
|
Loading…
Add table
Reference in a new issue