Issue 1910: Do not impute income for 0 population tracts (#1918)

* should be working, has unnecessary loggers

* removing loggers and cleaning up

* updating ejscreen tests

* adding tests and responding to PR feedback

* fixing broken smoke test

* delete smoketest docs
This commit is contained in:
Lucas Merrill Brown 2022-09-26 11:00:21 -04:00 committed by GitHub
commit 9fb9874a15
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 150 additions and 75 deletions

View file

@@ -5,6 +5,7 @@ import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.sources.census_acs.etl import CensusACSETL
 from data_pipeline.etl.sources.national_risk_index.etl import (
 NationalRiskIndexETL,
 )
@@ -35,7 +36,7 @@ class ScoreETL(ExtractTransformLoad):
 # dataframes
 self.df: pd.DataFrame
 self.ejscreen_df: pd.DataFrame
-self.census_df: pd.DataFrame
+self.census_acs_df: pd.DataFrame
 self.hud_housing_df: pd.DataFrame
 self.cdc_places_df: pd.DataFrame
 self.census_acs_median_incomes_df: pd.DataFrame
@@ -67,14 +68,7 @@ class ScoreETL(ExtractTransformLoad):
 )
 # Load census data
-census_csv = (
-constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
-)
-self.census_df = pd.read_csv(
-census_csv,
-dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
-low_memory=False,
-)
+self.census_acs_df = CensusACSETL.get_data_frame()
 # Load HUD housing data
 hud_housing_csv = (
@@ -346,7 +340,7 @@ class ScoreETL(ExtractTransformLoad):
 # Join all the data sources that use census tracts
 census_tract_dfs = [
-self.census_df,
+self.census_acs_df,
 self.hud_housing_df,
 self.cdc_places_df,
 self.cdc_life_expectancy_df,
@@ -364,7 +358,7 @@ class ScoreETL(ExtractTransformLoad):
 self.nature_deprived_df,
 self.eamlis_df,
 self.fuds_df,
-self.tribal_overlap_df
+self.tribal_overlap_df,
 ]
 # Sanity check each data frame before merging.