mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-29 17:41:17 -07:00
Add decennial 2020 territory imputations
This commit is contained in:
parent
6436dfa683
commit
cce91fb47b
10 changed files with 420 additions and 75 deletions
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
import json
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
|
@ -14,6 +15,10 @@ from data_pipeline.etl.datasource import DataSource
|
|||
from data_pipeline.etl.datasource import FileDataSource
|
||||
from data_pipeline.score import field_names
|
||||
from data_pipeline.utils import get_module_logger
|
||||
from data_pipeline.etl.sources.census_acs.etl import CensusACSETL
|
||||
from data_pipeline.etl.sources.census_acs.etl_imputations import (
|
||||
calculate_income_measures,
|
||||
)
|
||||
|
||||
pd.options.mode.chained_assignment = "raise"
|
||||
|
||||
|
@ -27,6 +32,9 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
/ "dataset"
|
||||
/ f"census_decennial_{DECENNIAL_YEAR}"
|
||||
)
|
||||
CENSUS_GEOJSON_PATH = (
|
||||
ExtractTransformLoad.DATA_PATH / "census" / "geojson" / "us.json"
|
||||
)
|
||||
|
||||
def __get_api_url(
|
||||
self,
|
||||
|
@ -136,7 +144,73 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
field_names.GEOID_TRACT_FIELD,
|
||||
] = "69120950200"
|
||||
|
||||
def transform(self) -> None:
|
||||
def _impute_income(self, geojson_path: Path):
|
||||
"""Impute income for both income measures."""
|
||||
# Merges Census geojson to imput values from.
|
||||
logger.debug(f"Reading GeoJSON from {geojson_path}")
|
||||
geo_df = gpd.read_file(geojson_path)
|
||||
self.df_all = CensusACSETL.merge_geojson(
|
||||
df=self.df_all,
|
||||
usa_geo_df=geo_df,
|
||||
)
|
||||
|
||||
logger.debug("Imputing income information")
|
||||
impute_var_named_tup_list = [
|
||||
CensusACSETL.ImputeVariables(
|
||||
raw_field_name=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||
imputed_field_name=DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL,
|
||||
),
|
||||
]
|
||||
self.df_all = calculate_income_measures(
|
||||
impute_var_named_tup_list=impute_var_named_tup_list,
|
||||
geo_df=self.df_all,
|
||||
geoid_field=self.GEOID_TRACT_FIELD_NAME,
|
||||
population_field=field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019,
|
||||
)
|
||||
|
||||
logger.debug("Calculating with imputed values")
|
||||
self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
] = (
|
||||
self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
].fillna(
|
||||
self.df_all[
|
||||
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL
|
||||
]
|
||||
)
|
||||
# Use clip to ensure that the values are not negative
|
||||
).clip(
|
||||
lower=0
|
||||
)
|
||||
|
||||
# All values should have a value at this point for tracts with >0 population
|
||||
assert (
|
||||
self.df_all[
|
||||
self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2019
|
||||
]
|
||||
>= 1
|
||||
][
|
||||
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
]
|
||||
.isna()
|
||||
.sum()
|
||||
== 0
|
||||
), "Error: not all values were filled with imputations..."
|
||||
|
||||
# We generate a boolean that is TRUE when there is an imputed income but not a baseline income, and FALSE otherwise.
|
||||
# This allows us to see which tracts have an imputed income.
|
||||
self.df_all[field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD] = (
|
||||
self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
].notna()
|
||||
& self.df_all[
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019
|
||||
].isna()
|
||||
)
|
||||
|
||||
def transform(self, geojson_path: Path = CENSUS_GEOJSON_PATH) -> None:
|
||||
# Creating Geo ID (Census Block Group) Field Name
|
||||
self.df_all[field_names.GEOID_TRACT_FIELD] = (
|
||||
self.df_all["state"] + self.df_all["county"] + self.df_all["tract"]
|
||||
|
@ -232,6 +306,8 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
f"There are {missing_value_count} missing values in the field {col} out of a total of {self.df_all.shape[0]} rows"
|
||||
)
|
||||
|
||||
self._impute_income(geojson_path)
|
||||
|
||||
def load(self) -> None:
|
||||
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
|
||||
columns_to_include = [
|
||||
|
@ -242,11 +318,14 @@ class CensusDecennialETL(ExtractTransformLoad):
|
|||
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||
DEC_FIELD_NAMES.IMPUTED_PERCENTAGE_HOUSEHOLDS_BELOW_200_PERC_POVERTY_LEVEL,
|
||||
field_names.CENSUS_DECENNIAL_ADJUSTED_POVERTY_LESS_THAN_200_FPL_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2019,
|
||||
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2019,
|
||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_PERCENT,
|
||||
DEC_FIELD_NAMES.COLLEGE_NON_ATTENDANCE,
|
||||
DEC_FIELD_NAMES.COLLEGE_ATTENDANCE_POPULATION,
|
||||
field_names.ISLAND_AREAS_IMPUTED_INCOME_FLAG_FIELD,
|
||||
] + self.final_race_fields
|
||||
self.df_all[columns_to_include].to_csv(
|
||||
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue