Issue 1910: Do not impute income for 0 population tracts (#1918)

* should be working, has unnecessary loggers

* removing loggers and cleaning up

* updating ejscreen tests

* adding tests and responding to PR feedback

* fixing broken smoke test

* delete smoketest docs
This commit is contained in:
Lucas Merrill Brown 2022-09-26 11:00:21 -04:00 committed by GitHub
commit 9fb9874a15
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 150 additions and 75 deletions

View file

@ -23,12 +23,11 @@ CENSUS_DATA_S3_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/census.zip"
class CensusACSETL(ExtractTransformLoad):
def __init__(self):
self.ACS_YEAR = 2019
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
)
NAME = "census_acs"
ACS_YEAR = 2019
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
def __init__(self):
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
self.EMPLOYMENT_FIELDS = [
@ -216,8 +215,15 @@ class CensusACSETL(ExtractTransformLoad):
self.OTHER_RACE_FIELD_NAME,
]
# Note: this field does double-duty here. It's used as the total population
# within the age questions.
# It's also what EJScreen used as their variable for total population in the
# census tract, so we use it similarly.
# See p. 83 of https://www.epa.gov/sites/default/files/2021-04/documents/ejscreen_technical_document.pdf
self.TOTAL_POPULATION_FROM_AGE_TABLE = "B01001_001E" # Estimate!!Total:
self.AGE_INPUT_FIELDS = [
"B01001_001E", # Estimate!!Total:
self.TOTAL_POPULATION_FROM_AGE_TABLE,
"B01001_003E", # Estimate!!Total:!!Male:!!Under 5 years
"B01001_004E", # Estimate!!Total:!!Male:!!5 to 9 years
"B01001_005E", # Estimate!!Total:!!Male:!!10 to 14 years
@ -277,6 +283,7 @@ class CensusACSETL(ExtractTransformLoad):
self.COLUMNS_TO_KEEP = (
[
self.GEOID_TRACT_FIELD_NAME,
field_names.TOTAL_POP_FIELD,
self.UNEMPLOYED_FIELD_NAME,
self.LINGUISTIC_ISOLATION_FIELD_NAME,
self.MEDIAN_INCOME_FIELD_NAME,
@ -375,18 +382,22 @@ class CensusACSETL(ExtractTransformLoad):
)
geo_df = gpd.read_file(
self.DATA_PATH / "census" / "geojson" / "us.json"
self.DATA_PATH / "census" / "geojson" / "us.json",
)
df = self._merge_geojson(
df=df,
usa_geo_df=geo_df,
)
# Rename two fields.
# Rename some fields.
df = df.rename(
columns={
self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
}
self.TOTAL_POPULATION_FROM_AGE_TABLE: field_names.TOTAL_POP_FIELD,
},
errors="raise",
)
# Handle null values for various fields, which are `-666666666`.
@ -472,7 +483,6 @@ class CensusACSETL(ExtractTransformLoad):
)
# Calculate some demographic information.
df = df.rename(
columns={
"B02001_003E": self.BLACK_FIELD_NAME,
@ -560,14 +570,11 @@ class CensusACSETL(ExtractTransformLoad):
),
]
# Calculate age groups
total_population_age_series = df["B01001_001E"]
# For each age bucket, sum the relevant columns and calculate the total
# percentage.
for age_bucket, sum_columns in age_bucket_and_its_sum_columns:
df[age_bucket] = (
df[sum_columns].sum(axis=1) / total_population_age_series
df[sum_columns].sum(axis=1) / df[field_names.TOTAL_POP_FIELD]
)
# Calculate college attendance and adjust low income
@ -602,6 +609,7 @@ class CensusACSETL(ExtractTransformLoad):
],
geo_df=df,
geoid_field=self.GEOID_TRACT_FIELD_NAME,
minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
)
logger.info("Calculating with imputed values")
@ -615,13 +623,20 @@ class CensusACSETL(ExtractTransformLoad):
- df[self.COLLEGE_ATTENDANCE_FIELD].fillna(
df[self.IMPUTED_COLLEGE_ATTENDANCE_FIELD]
)
# Use clip to ensure that the values are not negative if college attendance
# is very high
).clip(
lower=0
)
# Every tract that meets the minimum population threshold should have a value at this point
assert (
# For tracts with >0 population
df[
df[field_names.TOTAL_POP_FIELD]
>= self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION
][
# Then the imputed field should have no nulls
self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
]
.isna()
@ -644,13 +659,5 @@ class CensusACSETL(ExtractTransformLoad):
& df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna()
)
# Strip columns and save results to self.
self.df = df[self.COLUMNS_TO_KEEP]
def load(self) -> None:
logger.info("Saving Census ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
# Save results to self.
self.output_df = df

View file

@ -2,6 +2,7 @@ from typing import Any, List, NamedTuple, Tuple
import pandas as pd
import geopandas as gpd
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
# pylint: disable=unsubscriptable-object
@ -23,6 +24,7 @@ def _get_fips_mask(
def _get_neighbor_mask(
    geo_df: gpd.GeoDataFrame, row: gpd.GeoSeries
) -> pd.Series:
    """Return a mask marking the tracts that neighbor the given tract.

    Args:
        geo_df: Frame whose "geometry" column is tested against the row.
        row: A single record whose "geometry" entry is the reference shape.

    Returns:
        Whatever `touches` yields when the "geometry" column of `geo_df` is
        compared with the row's geometry (presumably one boolean per row of
        `geo_df` -- confirm against the geopandas documentation).
    """
    # Look up the column first, then the reference shape, mirroring the
    # evaluation order of the single-expression form.
    candidate_geometries = geo_df["geometry"]
    reference_geometry = row["geometry"]
    return candidate_geometries.touches(reference_geometry)
@ -40,24 +42,47 @@ def _choose_best_mask(
def _prepare_dataframe_for_imputation(
impute_var_named_tup_list: List[NamedTuple],
geo_df: gpd.GeoDataFrame,
population_field: str,
minimum_population_required_for_imputation: int = 1,
geoid_field: str = "GEOID10_TRACT",
) -> Tuple[Any, gpd.GeoDataFrame]:
"""Helper for imputation.
Given the inputs of `ImputeVariables`, returns list of tracts that need to be
imputed, along with a GeoDataFrame that has a column with the imputed field
"primed", meaning it is a copy of the raw field.
Tracts whose population is non-null and less than
`minimum_population_required_for_imputation` are left out of the list of
tracts to impute; tracts with a null population remain eligible.
"""
imputing_cols = [
impute_var_pair.raw_field_name
for impute_var_pair in impute_var_named_tup_list
]
# prime column to exist
# Prime column to exist
for impute_var_pair in impute_var_named_tup_list:
geo_df[impute_var_pair.imputed_field_name] = geo_df[
impute_var_pair.raw_field_name
].copy()
# generate a list of tracts for which at least one of the imputation
# columns is null
tract_list = geo_df[geo_df[imputing_cols].isna().any(axis=1)][
geoid_field
].unique()
# Generate a list of tracts for which at least one of the imputation
# columns is null that also meets population criteria.
tract_list = geo_df[
(
# First, check whether any of the columns we want to impute contain null
# values
geo_df[imputing_cols].isna().any(axis=1)
# Second, ensure population is either null or >= the minimum population
& (
geo_df[population_field].isnull()
| (
geo_df[population_field]
>= minimum_population_required_for_imputation
)
)
)
][geoid_field].unique()
# Check that imputation is a valid choice for this set of fields
logger.info(f"Imputing values for {len(tract_list)} unique tracts.")
@ -70,6 +95,8 @@ def calculate_income_measures(
impute_var_named_tup_list: list,
geo_df: gpd.GeoDataFrame,
geoid_field: str,
population_field: str = field_names.TOTAL_POP_FIELD,
minimum_population_required_for_imputation: int = 1,
) -> pd.DataFrame:
"""Impute values based on geographic neighbors
@ -89,6 +116,8 @@ def calculate_income_measures(
impute_var_named_tup_list=impute_var_named_tup_list,
geo_df=geo_df,
geoid_field=geoid_field,
population_field=population_field,
minimum_population_required_for_imputation=minimum_population_required_for_imputation,
)
# Iterate through the dataframe to impute in place
@ -119,6 +148,7 @@ def calculate_income_measures(
],
column_to_impute=impute_var_pair.raw_field_name,
)
geo_df.loc[index, impute_var_pair.imputed_field_name] = geo_df[
mask_to_use
][impute_var_pair.raw_field_name].mean()