Issue 1910: Do not impute income for 0 population tracts (#1918)

* should be working, has unnecessary loggers * removing loggers and cleaning up * updating ejscreen tests * adding tests and responding to PR feedback * fixing broken smoke test * delete smoketest docs
2025-07-28 09:21:16 -07:00 · 2022-09-26 11:00:21 -04:00 · 2022-09-26 11:00:21 -04:00 · 9fb9874a15
commit 9fb9874a15
parent 9e85375d9b
13 changed files with 150 additions and 75 deletions
--- a/data/data-pipeline/README.md
+++ b/data/data-pipeline/README.md
@ -322,7 +322,9 @@ see [python-markdown docs](https://github.com/ipython-contrib/jupyter_contrib_nb
 ### Background
-For this project, we make use of [pytest](https://docs.pytest.org/en/latest/) for testing purposes. To run tests, simply run `poetry run pytest` in this directory (i.e., `justice40-tool/data/data-pipeline`).
+For this project, we make use of [pytest](https://docs.pytest.org/en/latest/) for testing purposes. 
 To run tests, simply run `poetry run pytest` in this directory (i.e., `justice40-tool/data/data-pipeline`).
 Test data is configured via [fixtures](https://docs.pytest.org/en/latest/explanation/fixtures.html).
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@ -365,6 +365,9 @@ class ExtractTransformLoad:
                f"No file found at `{output_file_path}`."
            )
        logger.info(
            f"Reading in CSV `{output_file_path}` for ETL of class `{cls}`."
        )
        output_df = pd.read_csv(
            output_file_path,
            dtype={
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -5,6 +5,7 @@ import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census_acs.etl import CensusACSETL
 from data_pipeline.etl.sources.national_risk_index.etl import (
    NationalRiskIndexETL,
 )
@ -35,7 +36,7 @@ class ScoreETL(ExtractTransformLoad):
        # dataframes
        self.df: pd.DataFrame
        self.ejscreen_df: pd.DataFrame
-        self.census_df: pd.DataFrame
+        self.census_acs_df: pd.DataFrame
        self.hud_housing_df: pd.DataFrame
        self.cdc_places_df: pd.DataFrame
        self.census_acs_median_incomes_df: pd.DataFrame
@ -67,14 +68,7 @@ class ScoreETL(ExtractTransformLoad):
        )
        # Load census data
-        census_csv = (
+        self.census_acs_df = CensusACSETL.get_data_frame()
            constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
        )
        self.census_df = pd.read_csv(
            census_csv,
            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
        )
        # Load HUD housing data
        hud_housing_csv = (
@ -346,7 +340,7 @@ class ScoreETL(ExtractTransformLoad):
        # Join all the data sources that use census tracts
        census_tract_dfs = [
-            self.census_df,
+            self.census_acs_df,
            self.hud_housing_df,
            self.cdc_places_df,
            self.cdc_life_expectancy_df,
@ -364,7 +358,7 @@ class ScoreETL(ExtractTransformLoad):
            self.nature_deprived_df,
            self.eamlis_df,
            self.fuds_df,
-            self.tribal_overlap_df
+            self.tribal_overlap_df,
        ]
        # Sanity check each data frame before merging.
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
@ -73,8 +73,7 @@ class CDCLifeExpectancy(ExtractTransformLoad):
        all_usa_raw_df = self._download_and_prep_data(
            file_url=self.USA_FILE_URL,
-            download_file_name=self.get_tmp_path()
+            download_file_name=self.get_tmp_path() / "US_A.CSV",
            / "US_A.CSV",
        )
        # Check which states are missing
@ -94,15 +93,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
        logger.info("Downloading data for Maine")
        maine_raw_df = self._download_and_prep_data(
            file_url=self.MAINE_FILE_URL,
-            download_file_name=self.get_tmp_path()
+            download_file_name=self.get_tmp_path() / "maine.csv",
            / "maine.csv",
        )
        logger.info("Downloading data for Wisconsin")
        wisconsin_raw_df = self._download_and_prep_data(
            file_url=self.WISCONSIN_FILE_URL,
-            download_file_name=self.get_tmp_path()
+            download_file_name=self.get_tmp_path() / "wisconsin.csv",
            / "wisconsin.csv",
        )
        combined_df = pd.concat(
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@ -23,12 +23,11 @@ CENSUS_DATA_S3_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/census.zip"
 class CensusACSETL(ExtractTransformLoad):
-    def __init__(self):
+    NAME = "census_acs"
-        self.ACS_YEAR = 2019
+    ACS_YEAR = 2019
-        self.OUTPUT_PATH = (
+    MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
            self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}"
        )
    def __init__(self):
        self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
        self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
        self.EMPLOYMENT_FIELDS = [
@ -216,8 +215,15 @@ class CensusACSETL(ExtractTransformLoad):
            self.OTHER_RACE_FIELD_NAME,
        ]
        # Note: this field does double-duty here. It's used as the total population
        # within the age questions.
        # It's also what EJScreen used as their variable for total population in the
        # census tract, so we use it similarly.
        # See p. 83 of https://www.epa.gov/sites/default/files/2021-04/documents/ejscreen_technical_document.pdf
        self.TOTAL_POPULATION_FROM_AGE_TABLE = "B01001_001E"  # Estimate!!Total:
        self.AGE_INPUT_FIELDS = [
-            "B01001_001E",  # Estimate!!Total:
+            self.TOTAL_POPULATION_FROM_AGE_TABLE,
            "B01001_003E",  # Estimate!!Total:!!Male:!!Under 5 years
            "B01001_004E",  # Estimate!!Total:!!Male:!!5 to 9 years
            "B01001_005E",  # Estimate!!Total:!!Male:!!10 to 14 years
@ -277,6 +283,7 @@ class CensusACSETL(ExtractTransformLoad):
        self.COLUMNS_TO_KEEP = (
            [
                self.GEOID_TRACT_FIELD_NAME,
                field_names.TOTAL_POP_FIELD,
                self.UNEMPLOYED_FIELD_NAME,
                self.LINGUISTIC_ISOLATION_FIELD_NAME,
                self.MEDIAN_INCOME_FIELD_NAME,
@ -375,18 +382,22 @@ class CensusACSETL(ExtractTransformLoad):
            )
        geo_df = gpd.read_file(
-            self.DATA_PATH / "census" / "geojson" / "us.json"
+            self.DATA_PATH / "census" / "geojson" / "us.json",
        )
        df = self._merge_geojson(
            df=df,
            usa_geo_df=geo_df,
        )
-        # Rename two fields.
+
        # Rename some fields.
        df = df.rename(
            columns={
                self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME,
                self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME,
-            }
+                self.TOTAL_POPULATION_FROM_AGE_TABLE: field_names.TOTAL_POP_FIELD,
            },
            errors="raise",
        )
        # Handle null values for various fields, which are `-666666666`.
@ -472,7 +483,6 @@ class CensusACSETL(ExtractTransformLoad):
        )
        # Calculate some demographic information.
        df = df.rename(
            columns={
                "B02001_003E": self.BLACK_FIELD_NAME,
@ -560,14 +570,11 @@ class CensusACSETL(ExtractTransformLoad):
            ),
        ]
        # Calculate age groups
        total_population_age_series = df["B01001_001E"]
        # For each age bucket, sum the relevant columns and calculate the total
        # percentage.
        for age_bucket, sum_columns in age_bucket_and_its_sum_columns:
            df[age_bucket] = (
-                df[sum_columns].sum(axis=1) / total_population_age_series
+                df[sum_columns].sum(axis=1) / df[field_names.TOTAL_POP_FIELD]
            )
        # Calculate college attendance and adjust low income
@ -602,6 +609,7 @@ class CensusACSETL(ExtractTransformLoad):
            ],
            geo_df=df,
            geoid_field=self.GEOID_TRACT_FIELD_NAME,
            minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION,
        )
        logger.info("Calculating with imputed values")
@ -615,13 +623,20 @@ class CensusACSETL(ExtractTransformLoad):
            - df[self.COLLEGE_ATTENDANCE_FIELD].fillna(
                df[self.IMPUTED_COLLEGE_ATTENDANCE_FIELD]
            )
            # Use clip to ensure that the values are not negative if college attendance
            # is very high
        ).clip(
            lower=0
        )
        # Every tract that meets the minimum population threshold should have a
        # non-null value at this point
        assert (
            # For tracts with >0 population
            df[
                df[field_names.TOTAL_POP_FIELD]
                >= self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION
            ][
                # Then the imputed field should have no nulls
                self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME
            ]
            .isna()
@ -644,13 +659,5 @@ class CensusACSETL(ExtractTransformLoad):
            & df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna()
        )
-        # Strip columns and save results to self.
+        # Save results to self.
-        self.df = df[self.COLUMNS_TO_KEEP]
+        self.output_df = df
    def load(self) -> None:
        logger.info("Saving Census ACS Data")
        # mkdir census
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False)
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py
@ -2,6 +2,7 @@ from typing import Any, List, NamedTuple, Tuple
 import pandas as pd
 import geopandas as gpd
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 # pylint: disable=unsubscriptable-object
@ -23,6 +24,7 @@ def _get_fips_mask(
 def _get_neighbor_mask(
    geo_df: gpd.GeoDataFrame, row: gpd.GeoSeries
 ) -> pd.Series:
    """Return a mask of the tracts in `geo_df` that neighbor `row`.

    A tract counts as a neighbor when its "geometry" value touches the
    "geometry" value of `row`; the result of that `touches` call is returned
    unchanged.
    """
    return geo_df["geometry"].touches(row["geometry"])
@ -40,24 +42,47 @@ def _choose_best_mask(
 def _prepare_dataframe_for_imputation(
    impute_var_named_tup_list: List[NamedTuple],
    geo_df: gpd.GeoDataFrame,
    population_field: str,
    minimum_population_required_for_imputation: int = 1,
    geoid_field: str = "GEOID10_TRACT",
 ) -> Tuple[Any, gpd.GeoDataFrame]:
    """Helper for imputation.
    Given the inputs of `ImputeVariables`, returns list of tracts that need to be
    imputed, along with a GeoDataFrame that has a column with the imputed field
    "primed", meaning it is a copy of the raw field.
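    Tracts whose population is non-null and less than
    `minimum_population_required_for_imputation` are left out of the list of
    tracts to impute; tracts with a null population are still included.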
    """
    imputing_cols = [
        impute_var_pair.raw_field_name
        for impute_var_pair in impute_var_named_tup_list
    ]
-    # prime column to exist
+    # Prime column to exist
    for impute_var_pair in impute_var_named_tup_list:
        geo_df[impute_var_pair.imputed_field_name] = geo_df[
            impute_var_pair.raw_field_name
        ].copy()
-    # generate a list of tracts for which at least one of the imputation
+    # Generate a list of tracts for which at least one of the imputation
-    # columns is null
+    # columns is null that also meets population criteria.
-    tract_list = geo_df[geo_df[imputing_cols].isna().any(axis=1)][
+    tract_list = geo_df[
-        geoid_field
+        (
-    ].unique()
+            # First, check whether any of the columns we want to impute contain null
            # values
            geo_df[imputing_cols].isna().any(axis=1)
            # Second, ensure population is either null or >= the minimum population
            & (
                geo_df[population_field].isnull()
                | (
                    geo_df[population_field]
                    >= minimum_population_required_for_imputation
                )
            )
        )
    ][geoid_field].unique()
    # Check that imputation is a valid choice for this set of fields
    logger.info(f"Imputing values for {len(tract_list)} unique tracts.")
@ -70,6 +95,8 @@ def calculate_income_measures(
    impute_var_named_tup_list: list,
    geo_df: gpd.GeoDataFrame,
    geoid_field: str,
    population_field: str = field_names.TOTAL_POP_FIELD,
    minimum_population_required_for_imputation: int = 1,
 ) -> pd.DataFrame:
    """Impute values based on geographic neighbors
@ -89,6 +116,8 @@ def calculate_income_measures(
        impute_var_named_tup_list=impute_var_named_tup_list,
        geo_df=geo_df,
        geoid_field=geoid_field,
        population_field=population_field,
        minimum_population_required_for_imputation=minimum_population_required_for_imputation,
    )
    # Iterate through the dataframe to impute in place
@ -119,6 +148,7 @@ def calculate_income_measures(
                    ],
                    column_to_impute=impute_var_pair.raw_field_name,
                )
                geo_df.loc[index, impute_var_pair.imputed_field_name] = geo_df[
                    mask_to_use
                ][impute_var_pair.raw_field_name].mean()
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
@ -24,7 +24,6 @@ class EJSCREENETL(ExtractTransformLoad):
        self.COLUMNS_TO_KEEP = [
            self.GEOID_TRACT_FIELD_NAME,
            field_names.TOTAL_POP_FIELD,
            # pylint: disable=duplicate-code
            field_names.AIR_TOXICS_CANCER_RISK_FIELD,
            field_names.RESPIRATORY_HAZARD_FIELD,
@ -66,7 +65,6 @@ class EJSCREENETL(ExtractTransformLoad):
        self.output_df = self.df.rename(
            columns={
                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                "ACSTOTPOP": field_names.TOTAL_POP_FIELD,
                "CANCER": field_names.AIR_TOXICS_CANCER_RISK_FIELD,
                "RESP": field_names.RESPIRATORY_HAZARD_FIELD,
                "DSLPM": field_names.DIESEL_FIELD,
--- a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py
@ -108,8 +108,12 @@ class TribalOverlapETL(ExtractTransformLoad):
        # Switch from geographic to projected CRSes
        # (presumably so the `.area` values computed below are measured in
        # projected units rather than degrees -- confirm)
-        self.census_tract_gdf = self.census_tract_gdf.to_crs(crs=self.CRS_INTEGER)
+        self.census_tract_gdf = self.census_tract_gdf.to_crs(
-        tribal_gdf_without_points = tribal_gdf_without_points.to_crs(crs=self.CRS_INTEGER)
+            crs=self.CRS_INTEGER
        )
        tribal_gdf_without_points = tribal_gdf_without_points.to_crs(
            crs=self.CRS_INTEGER
        )
        # Create a measure for the entire census tract area
        self.census_tract_gdf["area_tract"] = self.census_tract_gdf.area
--- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py
+++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py
@ -15,10 +15,10 @@ def final_score_df():
@pytest.fixture()
-def census_df():
+def census_acs_df():
-    census_csv = constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv"
+    census_acs_csv = constants.DATA_PATH / "dataset" / "census_acs" / "usa.csv"
    return pd.read_csv(
-        census_csv,
+        census_acs_csv,
        dtype={GEOID_TRACT_FIELD: "string"},
        low_memory=False,
    )
--- a/data/data-pipeline/data_pipeline/tests/score/test_output.py
+++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py
@ -11,7 +11,7 @@ from .fixtures import (
    final_score_df,
    ejscreen_df,
    hud_housing_df,
-    census_df,
+    census_acs_df,
    cdc_places_df,
    census_acs_median_incomes_df,
    cdc_life_expectancy_df,
@ -235,7 +235,7 @@ def test_data_sources(
    final_score_df,
    hud_housing_df,
    ejscreen_df,
-    census_df,
+    census_acs_df,
    cdc_places_df,
    census_acs_median_incomes_df,
    cdc_life_expectancy_df,
@ -337,3 +337,41 @@ def test_output_tracts(final_score_df, national_tract_df):
 def test_all_tracts_have_scores(final_score_df):
    """Check that no tract has a null `SCORE_N_COMMUNITIES` value."""
    assert not final_score_df[field_names.SCORE_N_COMMUNITIES].isna().any()
 def test_imputed_tracts(final_score_df):
    """Check imputed low-income values against each tract's population.

    Using `final_score_df`, asserts that the
    `POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD` column is null for every tract
    whose `TOTAL_POP_FIELD` is 0 or null, and non-null for every tract whose
    `TOTAL_POP_FIELD` is greater than 0.
    """
    # Make sure that any tracts with zero population have null imputed income
    tracts_with_zero_population_df = final_score_df[
        final_score_df[field_names.TOTAL_POP_FIELD] == 0
    ]
    assert (
        tracts_with_zero_population_df[
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
        ]
        .isna()
        .all()
    )
    # Make sure that any tracts with null population have null imputed income
    tracts_with_null_population_df = final_score_df[
        final_score_df[field_names.TOTAL_POP_FIELD].isnull()
    ]
    assert (
        tracts_with_null_population_df[
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
        ]
        .isna()
        .all()
    )
    # Make sure that no tracts with population have null imputed income
    tracts_with_some_population_df = final_score_df[
        final_score_df[field_names.TOTAL_POP_FIELD] > 0
    ]
    assert (
        not tracts_with_some_population_df[
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD
        ]
        .isna()
        .any()
    )
--- a/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/output.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/output.csv
@ -1,16 +1,16 @@
-GEOID10_TRACT,Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter exposure,PM2.5 in the air,Ozone,Traffic proximity and volume,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Proximity to NPL sites,Wastewater discharge,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Individuals over 64 years old,Individuals under 5 years old,Percent pre-1960s housing (lead paint indicator),Leaky underground storage tanks
+GEOID10_TRACT,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter exposure,PM2.5 in the air,Ozone,Traffic proximity and volume,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Proximity to NPL sites,Wastewater discharge,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Individuals over 64 years old,Individuals under 5 years old,Percent pre-1960s housing (lead paint indicator),Leaky underground storage tanks
-06027000800,3054,20.0000000000,0.2000000000,0.0162608457,5.9332945205,59.8143830065,134.3731709435,0.0161739005,0.0231458734,0.0088169702,0.0000000476,0.0943661972,0.4021269525,0.2445972495,0.0422396857,0.3691340106,0.0271801764
+06027000800,20.0000000000,0.2000000000,0.0162608457,5.9332945205,59.8143830065,134.3731709435,0.0161739005,0.0231458734,0.0088169702,0.0000000476,0.0943661972,0.4021269525,0.2445972495,0.0422396857,0.3691340106,0.0271801764
-06061021322,20899,30.0000000000,0.5000000000,0.1849562857,12.1102756164,52.7832287582,12.5173455346,0.4515663958,0.2027045525,0.0687928975,0.2667203153,0.0343563903,0.1859250743,0.1406287382,0.0683764773,0.0334588644,0.0258826940
+06061021322,30.0000000000,0.5000000000,0.1849562857,12.1102756164,52.7832287582,12.5173455346,0.4515663958,0.2027045525,0.0687928975,0.2667203153,0.0343563903,0.1859250743,0.1406287382,0.0683764773,0.0334588644,0.0258826940
-06069000802,3049,20.0000000000,0.2000000000,0.0375346206,7.4113546849,47.0434058824,15.7944927934,0.0811927061,0.1674220356,0.0396183204,,0.0324607330,0.2453201970,0.1534929485,0.0787143326,0.3485254692,0.0102735941
+06069000802,20.0000000000,0.2000000000,0.0375346206,7.4113546849,47.0434058824,15.7944927934,0.0811927061,0.1674220356,0.0396183204,,0.0324607330,0.2453201970,0.1534929485,0.0787143326,0.3485254692,0.0102735941
-15001021010,8606,10.0000000000,0.1000000000,0.0067389217,,,0.1074143214,0.0478749209,0.0931096253,0.0027318608,,0.0109090909,0.5159562078,0.1992795724,0.0366023704,0.0112496943,0.0259838494
+15001021010,10.0000000000,0.1000000000,0.0067389217,,,0.1074143214,0.0478749209,0.0931096253,0.0027318608,,0.0109090909,0.5159562078,0.1992795724,0.0366023704,0.0112496943,0.0259838494
-15001021101,3054,10.0000000000,0.1000000000,0.0033713587,,,1.7167679255,0.2484740667,0.2746856427,0.0025910486,,0.0194426442,0.4755657593,0.2976424361,0.0301244270,0.0168539326,0.0375389154
+15001021101,10.0000000000,0.1000000000,0.0033713587,,,1.7167679255,0.2484740667,0.2746856427,0.0025910486,,0.0194426442,0.4755657593,0.2976424361,0.0301244270,0.0168539326,0.0375389154
-15001021402,3778,10.0000000000,0.1000000000,0.0131608945,,,635.9981128640,0.0225482603,0.6278707343,0.0033357209,,0.0407569141,0.1877496671,0.2469560614,0.0751720487,0.1743524953,0.5088713177
+15001021402,10.0000000000,0.1000000000,0.0131608945,,,635.9981128640,0.0225482603,0.6278707343,0.0033357209,,0.0407569141,0.1877496671,0.2469560614,0.0751720487,0.1743524953,0.5088713177
-15001021800,5998,10.0000000000,0.1000000000,0.0049503455,,,0.0743045071,0.0402733327,0.0410968274,0.0038298946,,0.0359848485,0.2698678267,0.2352450817,0.0586862287,0.1676168757,0.1071290552
+15001021800,10.0000000000,0.1000000000,0.0049503455,,,0.0743045071,0.0402733327,0.0410968274,0.0038298946,,0.0359848485,0.2698678267,0.2352450817,0.0586862287,0.1676168757,0.1071290552
-15003010201,4936,10.0000000000,0.1000000000,0.0171119880,,,1493.8870892160,0.0548137804,0.4080845621,0.0694550700,,0.0340041638,0.2999166319,0.1318881686,0.0964343598,0.2131062951,0.0995447326
+15003010201,10.0000000000,0.1000000000,0.0171119880,,,1493.8870892160,0.0548137804,0.4080845621,0.0694550700,,0.0340041638,0.2999166319,0.1318881686,0.0964343598,0.2131062951,0.0995447326
-15007040603,2984,10.0000000000,0.1000000000,0.0225796264,,,255.5966484444,0.1042895043,0.5200441984,0.0065810172,,0.0311909263,0.2676292814,0.2533512064,0.0563002681,0.0935077519,0.1610354485
+15007040603,10.0000000000,0.1000000000,0.0225796264,,,255.5966484444,0.1042895043,0.5200441984,0.0065810172,,0.0311909263,0.2676292814,0.2533512064,0.0563002681,0.0935077519,0.1610354485
-15007040604,3529,10.0000000000,0.1000000000,0.0297040750,,,464.0468169721,0.1282189641,0.3810520320,0.0064334940,,0.0353833193,0.3687102371,0.1790875602,0.0943610088,0.1981538462,0.2277699060
+15007040604,10.0000000000,0.1000000000,0.0297040750,,,464.0468169721,0.1282189641,0.3810520320,0.0064334940,,0.0353833193,0.3687102371,0.1790875602,0.0943610088,0.1981538462,0.2277699060
-15007040700,9552,10.0000000000,0.1000000000,0.0120486502,,,829.6297843840,0.2776903565,0.5315584393,0.0062317499,,0.0328151986,0.2079176730,0.1920016750,0.0808207705,0.1049120679,0.8605507426
+15007040700,10.0000000000,0.1000000000,0.0120486502,,,829.6297843840,0.2776903565,0.5315584393,0.0062317499,,0.0328151986,0.2079176730,0.1920016750,0.0808207705,0.1049120679,0.8605507426
-15009030100,1405,10.0000000000,0.1000000000,0.0026846006,,,,0.0398066625,0.0329594792,0.0046765532,,0.0000000000,0.2911208151,0.2434163701,0.0882562278,0.2135678392,0.0973247551
+15009030100,10.0000000000,0.1000000000,0.0026846006,,,,0.0398066625,0.0329594792,0.0046765532,,0.0000000000,0.2911208151,0.2434163701,0.0882562278,0.2135678392,0.0973247551
-15009030201,2340,10.0000000000,0.1000000000,0.0063521816,,,7.0868595222,0.1292001112,0.0908033666,0.0053511202,,0.0000000000,0.2677266867,0.2367521368,0.0641025641,0.0928229665,0.0098923140
+15009030201,10.0000000000,0.1000000000,0.0063521816,,,7.0868595222,0.1292001112,0.0908033666,0.0053511202,,0.0000000000,0.2677266867,0.2367521368,0.0641025641,0.0928229665,0.0098923140
-15009030402,8562,10.0000000000,0.1000000000,0.0153866969,,,233.6880574427,0.6633705951,0.5914191729,0.0055146115,,0.0122641509,0.1792805419,0.1810324690,0.0463676711,0.0760149726,0.4432670413
+15009030402,10.0000000000,0.1000000000,0.0153866969,,,233.6880574427,0.6633705951,0.5914191729,0.0055146115,,0.0122641509,0.1792805419,0.1810324690,0.0463676711,0.0760149726,0.4432670413
-15009030800,7879,10.0000000000,0.1000000000,0.0169064550,,,575.9991000531,1.0347888110,0.5999348163,0.0061499864,0.0008675195,0.0013422819,0.1386100877,0.1303464907,0.0753902780,0.1220556745,0.0263640121
+15009030800,10.0000000000,0.1000000000,0.0169064550,,,575.9991000531,1.0347888110,0.5999348163,0.0061499864,0.0008675195,0.0013422819,0.1386100877,0.1303464907,0.0753902780,0.1220556745,0.0263640121
--- a/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/transform.csv
+++ b/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/transform.csv
@ -1,4 +1,4 @@
-OBJECTID,GEOID10_TRACT,Total population,ACSIPOVBAS,ACSEDUCBAS,ACSTOTHH,ACSTOTHU,ACSUNEMPBAS,MINORPOP,MINORPCT,LOWINCOME,Poverty (Less than 200% of federal poverty line),LESSHS,LESSHSPCT,LINGISO,Percent of households in linguistic isolation,UNDER5,Individuals under 5 years old,OVER64,Individuals over 64 years old,UNEMP,UNEMPPCT,PRE1960,Percent pre-1960s housing (lead paint indicator),VULEOPCT,VULSVI6PCT,VULEO,VULSVI6,DISPEO,DISPSVI6,Diesel particulate matter exposure,Air toxics cancer risk,Respiratory hazard index,Traffic proximity and volume,Wastewater discharge,Proximity to NPL sites,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Ozone,PM2.5 in the air,Leaky underground storage tanks,D_LDPNT_2,D_DSLPM_2,D_CANCR_2,D_RESP_2,D_PTRAF_2,D_PWDIS_2,D_PNPL_2,D_PRMP_2,D_PTSDF_2,D_OZONE_2,D_PM25_2,D_UST_2,STATE_NAME,ST_ABBREV,REGION,P_MINORPCT,P_LWINCPCT,P_LESHSPCT,P_LNGISPCT,P_UNDR5PCT,P_OVR64PCT,P_UNEMP,P_UNEMPPCT,P_LDPNT,P_VULEOPCT,P_VULSVI6PCT,P_VULSVI6,P_DISPSVI6,P_DSLPM,P_CANCR,P_RESP,P_PTRAF,P_PWDIS,P_PNPL,P_PRMP,P_PTSDF,P_OZONE,P_PM25,P_UST,P_LDPNT_D2,P_DSLPM_D2,P_CANCR_D2,P_RESP_D2,P_PTRAF_D2,P_PWDIS_D2,P_PNPL_D2,P_PRMP_D2,P_PTSDF_D2,P_OZONE_D2,P_PM25_D2,P_UST_D2,B_MINORPCT,B_LWINCPCT,B_LESHSPCT,B_LNGISPCT,B_UNDR5PCT,B_OVR64PCT,B_UNEMP,B_UNEMPPCT,B_LDPNT,B_VULEOPCT,B_VULSVI6PCT,B_VULSVI6,B_DISPSVI6,B_DSLPM,B_CANCR,B_RESP,B_PTRAF,B_PWDIS,B_PNPL,B_PRMP,B_PTSDF,B_OZONE,B_PM25,B_UST,B_LDPNT_D2,B_DSLPM_D2,B_CANCR_D2,B_RESP_D2,B_PTRAF_D2,B_PWDIS_D2,B_PNPL_D2,B_PRMP_D2,B_PTSDF_D2,B_OZONE_D2,B_PM25_D2,B_UST_D2,T_MINORPCT,T_LWINCPCT,T_LESHSPCT,T_LNGISPCT,T_UNDR5PCT,T_OVR64PCT,T_UNEMPPCT,T_VULEOPCT,T_LDPNT,T_LDPNT_D2,T_DSLPM,T_DSLPM_D2,T_CANCR,T_CANCR_D2,T_RESP,T_RESP_D2,T_PTRAF,T_PTRAF_D2,T_PWDIS,T_PWDIS_D2,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_OZONE,T_OZONE_D2,T_PM25,T_PM25_D2,T_UST,T_UST_D2,AREALAND,AREAWATER,NPL_CNT,TSDF_CNT,Shape_Length,Shape_Area
+OBJECTID,GEOID10_TRACT,ACSTOTPOP,ACSIPOVBAS,ACSEDUCBAS,ACSTOTHH,ACSTOTHU,ACSUNEMPBAS,MINORPOP,MINORPCT,LOWINCOME,Poverty (Less than 200% of federal poverty line),LESSHS,LESSHSPCT,LINGISO,Percent of households in linguistic isolation,UNDER5,Individuals under 5 years old,OVER64,Individuals over 64 years old,UNEMP,UNEMPPCT,PRE1960,Percent pre-1960s housing (lead paint indicator),VULEOPCT,VULSVI6PCT,VULEO,VULSVI6,DISPEO,DISPSVI6,Diesel particulate matter exposure,Air toxics cancer risk,Respiratory hazard index,Traffic proximity and volume,Wastewater discharge,Proximity to NPL sites,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Ozone,PM2.5 in the air,Leaky underground storage tanks,D_LDPNT_2,D_DSLPM_2,D_CANCR_2,D_RESP_2,D_PTRAF_2,D_PWDIS_2,D_PNPL_2,D_PRMP_2,D_PTSDF_2,D_OZONE_2,D_PM25_2,D_UST_2,STATE_NAME,ST_ABBREV,REGION,P_MINORPCT,P_LWINCPCT,P_LESHSPCT,P_LNGISPCT,P_UNDR5PCT,P_OVR64PCT,P_UNEMP,P_UNEMPPCT,P_LDPNT,P_VULEOPCT,P_VULSVI6PCT,P_VULSVI6,P_DISPSVI6,P_DSLPM,P_CANCR,P_RESP,P_PTRAF,P_PWDIS,P_PNPL,P_PRMP,P_PTSDF,P_OZONE,P_PM25,P_UST,P_LDPNT_D2,P_DSLPM_D2,P_CANCR_D2,P_RESP_D2,P_PTRAF_D2,P_PWDIS_D2,P_PNPL_D2,P_PRMP_D2,P_PTSDF_D2,P_OZONE_D2,P_PM25_D2,P_UST_D2,B_MINORPCT,B_LWINCPCT,B_LESHSPCT,B_LNGISPCT,B_UNDR5PCT,B_OVR64PCT,B_UNEMP,B_UNEMPPCT,B_LDPNT,B_VULEOPCT,B_VULSVI6PCT,B_VULSVI6,B_DISPSVI6,B_DSLPM,B_CANCR,B_RESP,B_PTRAF,B_PWDIS,B_PNPL,B_PRMP,B_PTSDF,B_OZONE,B_PM25,B_UST,B_LDPNT_D2,B_DSLPM_D2,B_CANCR_D2,B_RESP_D2,B_PTRAF_D2,B_PWDIS_D2,B_PNPL_D2,B_PRMP_D2,B_PTSDF_D2,B_OZONE_D2,B_PM25_D2,B_UST_D2,T_MINORPCT,T_LWINCPCT,T_LESHSPCT,T_LNGISPCT,T_UNDR5PCT,T_OVR64PCT,T_UNEMPPCT,T_VULEOPCT,T_LDPNT,T_LDPNT_D2,T_DSLPM,T_DSLPM_D2,T_CANCR,T_CANCR_D2,T_RESP,T_RESP_D2,T_PTRAF,T_PTRAF_D2,T_PWDIS,T_PWDIS_D2,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_OZONE,T_OZONE_D2,T_PM25,T_PM25_D2,T_UST,T_UST_D2,AREALAND,AREAWATER,NPL_CNT,TSDF_CNT,Shape_Length,Shape_Area
 4529,06027000800,3054,3009,2337,1420,2067,1443,1218,0.3988212181,1210,0.4021269525,475,0.2032520325,134,0.0943661972,129,0.0422396857,747,0.2445972495,62,0.0429660430,763,0.3691340106,0.4004740853,0.2309005559,1223.0478564307,705.1702977293,135.9429095904,144.8520486255,0.0162608457,20.0000000000,0.2000000000,134.3731709435,0.0000000476,0.0088169702,0.0161739005,0.0231458734,59.8143830065,5.9332945205,0.0271801764,50.1811514356,2.2105466749,2718.8581918080,27.1885819181,18267.0798289539,0.0000064773,1.1986045786,2.1987270931,3.1465173743,8131.3412612630,806.5893205801,3.6949522625,California,CA,9,58.2565807824,70.8357682483,82.0300855712,83.4211514441,22.4791060804,91.4310072487,20.6342392033,44.8003303446,69.4492207493,64.4805710566,73.9747591523,41.2001973366,69.9936559849,0.4881982980,32.2031638835,14.4688811492,33.6358789383,2.7793036790,3.1380644255,0.3541522801,2.0598614138,97.6642425963,3.6388096802,6.3535808084,71.4956721564,59.1319320934,61.5316181718,60.9745786385,62.4689837463,62.0864910202,59.8317854029,59.0710337447,59.2599060994,64.9284478117,62.2619591744,60.9702180540,6,8,9,9,3,10,3,5,7,7,8,5,7,1,4,2,4,1,1,1,1,11,1,1,8,6,7,7,7,7,6,6,6,7,7,7,40% (58%ile),40% (70%ile),20% (82%ile),9% (83%ile),4% (22%ile),24% (91%ile),4% (44%ile),40% (64%ile),0.37 = fraction pre-1960 (69%ile),71%ile,0.0163 ug/m3 (0%ile),59%ile,20 lifetime risk per million (32%ile),61%ile,0.2  (14%ile),60%ile,130 daily vehicles/meters distance (33%ile),62%ile,0.000000048 toxicity-weighted concentration/meters distance (2%ile),62%ile,0.0088 sites/km distance (3%ile),59%ile,0.016 facilities/km distance (0%ile),59%ile,0.023 facilities/km distance (2%ile),59%ile,59.8 ppb (97%ile),64%ile,5.93 ug/m3 (3%ile),62%ile,0.027 facilities/sq km area (6%ile),60%ile,17743852489.0000000000,41257887.0000000000,0,1,969231.5231135677,27404749177.8422279358
 8028,06061021322,20899,20874,13290,6549,6904,9172,9199,0.4401646012,3881,0.1859250743,825,0.0620767494,225,0.0343563903,1429,0.0683764773,2939,0.1406287382,312,0.0340165722,231,0.0334588644,0.3130448377,0.1552546718,6542.3240634282,3244.6673856589,-896.9052371663,-589.6780917541,0.1849562857,30.0000000000,0.5000000000,12.5173455346,0.2667203153,0.0687928975,0.4515663958,0.2027045525,52.7832287582,12.1102756164,0.0258826940,-30.0094307337,-165.8882612555,-26907.1571149896,-448.4526185832,-11226.8727654026,-239.2228476257,-61.7007100657,-405.0122653138,-181.8067747336,-47341.5543077505,-10861.7696239112,-23.2143238368,California,CA,9,61.7694531724,28.3124099080,32.2625612545,63.3138029183,65.9392366308,44.1611446180,92.1063805127,31.2336817151,19.3531578232,52.0599864076,48.1147912182,98.1253263672,8.5598852754,35.4160437794,83.7767623034,95.2520218071,6.7786023570,88.6613290583,53.5138135020,56.0049245976,28.8270859466,89.7745222973,94.2035706464,6.2511191138,43.0185694890,24.7769097248,17.2770098374,9.5647689629,49.9350307593,5.0850465016,20.5837755437,15.4478896201,34.6338200533,14.8104044330,10.3206402564,53.0011626680,7,3,4,7,7,5,10,4,2,6,5,11,1,4,9,11,1,9,6,6,3,9,10,1,5,3,2,1,5,1,3,2,4,2,2,6,44% (61%ile),19% (28%ile),6% (32%ile),3% (63%ile),7% (65%ile),14% (44%ile),3% (31%ile),31% (52%ile),0.033 = fraction pre-1960 (19%ile),43%ile,0.185 ug/m3 (35%ile),24%ile,30 lifetime risk per million (83%ile),17%ile,0.5  (95%ile),9%ile,13 daily vehicles/meters distance (6%ile),49%ile,0.27 toxicity-weighted concentration/meters distance (88%ile),5%ile,0.069 sites/km distance (53%ile),20%ile,0.45 facilities/km distance (56%ile),15%ile,0.2 facilities/km distance (28%ile),34%ile,52.8 ppb (89%ile),14%ile,12.1 ug/m3 (94%ile),10%ile,0.026 facilities/sq km area (6%ile),53%ile,258653359.0000000000,119890.0000000000,0,0,124755.3452199987,427225089.6229769588
 8849,06069000802,3049,3045,2076,955,1119,1493,1247,0.4089865530,747,0.2453201970,307,0.1478805395,31,0.0324607330,240,0.0787143326,468,0.1534929485,93,0.0622906899,390,0.3485254692,0.3271533750,0.1778092173,997.4906403941,542.1403034316,-87.8345013597,-17.2605942492,0.0375346206,20.0000000000,0.2000000000,15.7944927934,,0.0396183204,0.0811927061,0.1674220356,47.0434058824,7.4113546849,0.0102735941,-30.6125607956,-3.2968346872,-1756.6900271942,-17.5669002719,-1387.3013987358,,-3.4798554127,-7.1315208575,-14.7054310128,-4132.0340979390,-650.9726431509,-0.9023760119,California,CA,9,59.1858457424,41.3904741949,69.9513617378,62.0187896062,79.0518001240,52.1216510370,37.3180569516,68.3483551403,67.5701406274,54.3994266601,57.9926859232,26.1831217492,58.7612911558,2.0014414700,32.2031638835,14.4688811492,8.1570460385,,34.5749415665,10.3739430074,25.1131375379,84.5333172848,19.2864164585,4.9410824602,42.8621394303,58.0471933934,56.5430390950,57.0023528116,55.7266348497,,54.6373148803,57.1359685902,54.8116596007,56.2167239668,56.9568759225,56.2801621878,6,5,7,7,8,6,4,7,7,6,6,3,6,1,4,2,1,0,4,2,3,9,2,1,5,6,6,6,6,0,6,6,6,6,6,6,41% (59%ile),25% (41%ile),15% (69%ile),3% (62%ile),8% (79%ile),15% (52%ile),6% (68%ile),33% (54%ile),0.35 = fraction pre-1960 (67%ile),42%ile,0.0375 ug/m3 (2%ile),58%ile,20 lifetime risk per million (32%ile),56%ile,0.2  (14%ile),57%ile,16 daily vehicles/meters distance (8%ile),55%ile,,,0.04 sites/km distance (34%ile),54%ile,0.081 facilities/km distance (10%ile),57%ile,0.17 facilities/km distance (25%ile),54%ile,47 ppb (84%ile),56%ile,7.41 ug/m3 (19%ile),56%ile,0.01 facilities/sq km area (4%ile),56%ile,2987635876.0000000000,3272257.0000000000,1,0,422237.6856758550,4643687820.1565904617
--- a/data/data-pipeline/data_pipeline/tests/sources/persistent_poverty/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/persistent_poverty/test_etl.py
@ -1,6 +1,8 @@
 import pathlib
 from data_pipeline.tests.sources.example.test_etl import TestETL
-from data_pipeline.etl.sources.persistent_poverty.etl import PersistentPovertyETL
+from data_pipeline.etl.sources.persistent_poverty.etl import (
    PersistentPovertyETL,
 )
 class TestPersistentPovertyETL(TestETL):