From 9fb9874a15110c8ffee7a71e1b7c9985b5010b04 Mon Sep 17 00:00:00 2001 From: Lucas Merrill Brown Date: Mon, 26 Sep 2022 11:00:21 -0400 Subject: [PATCH] Issue 1910: Do not impute income for 0 population tracts (#1918) * should be working, has unnecessary loggers * removing loggers and cleaning up * updating ejscreen tests * adding tests and responding to PR feedback * fixing broken smoke test * delete smoketest docs --- data/data-pipeline/README.md | 4 +- data/data-pipeline/data_pipeline/etl/base.py | 3 + .../data_pipeline/etl/score/etl_score.py | 16 ++---- .../etl/sources/cdc_life_expectancy/etl.py | 9 +-- .../etl/sources/census_acs/etl.py | 55 +++++++++++-------- .../etl/sources/census_acs/etl_imputations.py | 42 ++++++++++++-- .../data_pipeline/etl/sources/ejscreen/etl.py | 2 - .../etl/sources/tribal_overlap/etl.py | 8 ++- .../data_pipeline/tests/score/fixtures.py | 6 +- .../data_pipeline/tests/score/test_output.py | 42 +++++++++++++- .../tests/sources/ejscreen/data/output.csv | 32 +++++------ .../tests/sources/ejscreen/data/transform.csv | 2 +- .../sources/persistent_poverty/test_etl.py | 4 +- 13 files changed, 150 insertions(+), 75 deletions(-) diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md index d553f3b9..2a259894 100644 --- a/data/data-pipeline/README.md +++ b/data/data-pipeline/README.md @@ -322,7 +322,9 @@ see [python-markdown docs](https://github.com/ipython-contrib/jupyter_contrib_nb ### Background -For this project, we make use of [pytest](https://docs.pytest.org/en/latest/) for testing purposes. To run tests, simply run `poetry run pytest` in this directory (i.e., `justice40-tool/data/data-pipeline`). +For this project, we make use of [pytest](https://docs.pytest.org/en/latest/) for testing purposes. + +To run tests, simply run `poetry run pytest` in this directory (i.e., `justice40-tool/data/data-pipeline`). Test data is configured via [fixtures](https://docs.pytest.org/en/latest/explanation/fixtures.html). diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 9dee3915..7026c36f 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -365,6 +365,9 @@ class ExtractTransformLoad: f"No file found at `{output_file_path}`." ) + logger.info( + f"Reading in CSV `{output_file_path}` for ETL of class `{cls}`." + ) output_df = pd.read_csv( output_file_path, dtype={ diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 53f7d260..b0fc3b4d 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.sources.census_acs.etl import CensusACSETL from data_pipeline.etl.sources.national_risk_index.etl import ( NationalRiskIndexETL, ) @@ -35,7 +36,7 @@ class ScoreETL(ExtractTransformLoad): # dataframes self.df: pd.DataFrame self.ejscreen_df: pd.DataFrame - self.census_df: pd.DataFrame + self.census_acs_df: pd.DataFrame self.hud_housing_df: pd.DataFrame self.cdc_places_df: pd.DataFrame self.census_acs_median_incomes_df: pd.DataFrame @@ -67,14 +68,7 @@ class ScoreETL(ExtractTransformLoad): ) # Load census data - census_csv = ( - constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv" - ) - self.census_df = pd.read_csv( - census_csv, - dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, - low_memory=False, - ) + self.census_acs_df = CensusACSETL.get_data_frame() # Load HUD housing data hud_housing_csv = ( @@ -346,7 +340,7 @@ class ScoreETL(ExtractTransformLoad): # Join all the data sources that use census tracts census_tract_dfs = [ - self.census_df, + self.census_acs_df, self.hud_housing_df, self.cdc_places_df, self.cdc_life_expectancy_df, @@ -364,7 +358,7 @@ class ScoreETL(ExtractTransformLoad): self.nature_deprived_df, self.eamlis_df, self.fuds_df, - self.tribal_overlap_df + self.tribal_overlap_df, ] # Sanity check each data frame before merging. diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 73a4959c..f656cea9 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -73,8 +73,7 @@ class CDCLifeExpectancy(ExtractTransformLoad): all_usa_raw_df = self._download_and_prep_data( file_url=self.USA_FILE_URL, - download_file_name=self.get_tmp_path() - / "US_A.CSV", + download_file_name=self.get_tmp_path() / "US_A.CSV", ) # Check which states are missing @@ -94,15 +93,13 @@ class CDCLifeExpectancy(ExtractTransformLoad): logger.info("Downloading data for Maine") maine_raw_df = self._download_and_prep_data( file_url=self.MAINE_FILE_URL, - download_file_name=self.get_tmp_path() - / "maine.csv", + download_file_name=self.get_tmp_path() / "maine.csv", ) logger.info("Downloading data for Wisconsin") wisconsin_raw_df = self._download_and_prep_data( file_url=self.WISCONSIN_FILE_URL, - download_file_name=self.get_tmp_path() - / "wisconsin.csv", + download_file_name=self.get_tmp_path() / "wisconsin.csv", ) combined_df = pd.concat( diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index dba9d06b..60e43391 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -23,12 +23,11 @@ CENSUS_DATA_S3_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/census.zip" class CensusACSETL(ExtractTransformLoad): - def __init__(self): - self.ACS_YEAR = 2019 - self.OUTPUT_PATH = ( - self.DATA_PATH / "dataset" / f"census_acs_{self.ACS_YEAR}" - ) + NAME = "census_acs" + ACS_YEAR = 2019 + MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1 + def __init__(self): self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E" self.TOTAL_IN_LABOR_FORCE = "B23025_003E" self.EMPLOYMENT_FIELDS = [ @@ -216,8 +215,15 @@ class CensusACSETL(ExtractTransformLoad): self.OTHER_RACE_FIELD_NAME, ] + # Note: this field does double-duty here. It's used as the total population + # within the age questions. + # It's also what EJScreen used as their variable for total population in the + # census tract, so we use it similarly. + # See p. 83 of https://www.epa.gov/sites/default/files/2021-04/documents/ejscreen_technical_document.pdf + self.TOTAL_POPULATION_FROM_AGE_TABLE = "B01001_001E" # Estimate!!Total: + self.AGE_INPUT_FIELDS = [ - "B01001_001E", # Estimate!!Total: + self.TOTAL_POPULATION_FROM_AGE_TABLE, "B01001_003E", # Estimate!!Total:!!Male:!!Under 5 years "B01001_004E", # Estimate!!Total:!!Male:!!5 to 9 years "B01001_005E", # Estimate!!Total:!!Male:!!10 to 14 years @@ -277,6 +283,7 @@ class CensusACSETL(ExtractTransformLoad): self.COLUMNS_TO_KEEP = ( [ self.GEOID_TRACT_FIELD_NAME, + field_names.TOTAL_POP_FIELD, self.UNEMPLOYED_FIELD_NAME, self.LINGUISTIC_ISOLATION_FIELD_NAME, self.MEDIAN_INCOME_FIELD_NAME, @@ -375,18 +382,22 @@ class CensusACSETL(ExtractTransformLoad): ) geo_df = gpd.read_file( - self.DATA_PATH / "census" / "geojson" / "us.json" + self.DATA_PATH / "census" / "geojson" / "us.json", ) + df = self._merge_geojson( df=df, usa_geo_df=geo_df, ) - # Rename two fields. + + # Rename some fields. df = df.rename( columns={ self.MEDIAN_HOUSE_VALUE_FIELD: self.MEDIAN_HOUSE_VALUE_FIELD_NAME, self.MEDIAN_INCOME_FIELD: self.MEDIAN_INCOME_FIELD_NAME, - } + self.TOTAL_POPULATION_FROM_AGE_TABLE: field_names.TOTAL_POP_FIELD, + }, + errors="raise", ) # Handle null values for various fields, which are `-666666666`. @@ -472,7 +483,6 @@ class CensusACSETL(ExtractTransformLoad): ) # Calculate some demographic information. - df = df.rename( columns={ "B02001_003E": self.BLACK_FIELD_NAME, @@ -560,14 +570,11 @@ class CensusACSETL(ExtractTransformLoad): ), ] - # Calculate age groups - total_population_age_series = df["B01001_001E"] - # For each age bucket, sum the relevant columns and calculate the total # percentage. for age_bucket, sum_columns in age_bucket_and_its_sum_columns: df[age_bucket] = ( - df[sum_columns].sum(axis=1) / total_population_age_series + df[sum_columns].sum(axis=1) / df[field_names.TOTAL_POP_FIELD] ) # Calculate college attendance and adjust low income @@ -602,6 +609,7 @@ class CensusACSETL(ExtractTransformLoad): ], geo_df=df, geoid_field=self.GEOID_TRACT_FIELD_NAME, + minimum_population_required_for_imputation=self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION, ) logger.info("Calculating with imputed values") @@ -615,13 +623,20 @@ class CensusACSETL(ExtractTransformLoad): - df[self.COLLEGE_ATTENDANCE_FIELD].fillna( df[self.IMPUTED_COLLEGE_ATTENDANCE_FIELD] ) + # Use clip to ensure that the values are not negative if college attendance + # is very high ).clip( lower=0 ) # All values should have a value at this point assert ( + # For tracts with >0 population df[ + df[field_names.TOTAL_POP_FIELD] + >= self.MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION + ][ + # Then the imputed field should have no nulls self.ADJUSTED_AND_IMPUTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME ] .isna() @@ -644,13 +659,5 @@ class CensusACSETL(ExtractTransformLoad): & df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna() ) - # Strip columns and save results to self. - self.df = df[self.COLUMNS_TO_KEEP] - - def load(self) -> None: - logger.info("Saving Census ACS Data") - - # mkdir census - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) - - self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False) + # Save results to self. + self.output_df = df diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py index 17180026..9805af2c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py @@ -2,6 +2,7 @@ from typing import Any, List, NamedTuple, Tuple import pandas as pd import geopandas as gpd +from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger # pylint: disable=unsubscriptable-object @@ -23,6 +24,7 @@ def _get_fips_mask( def _get_neighbor_mask( geo_df: gpd.GeoDataFrame, row: gpd.GeoSeries ) -> pd.Series: + """Returns neighboring tracts.""" return geo_df["geometry"].touches(row["geometry"]) @@ -40,24 +42,47 @@ def _choose_best_mask( def _prepare_dataframe_for_imputation( impute_var_named_tup_list: List[NamedTuple], geo_df: gpd.GeoDataFrame, + population_field: str, + minimum_population_required_for_imputation: int = 1, geoid_field: str = "GEOID10_TRACT", ) -> Tuple[Any, gpd.GeoDataFrame]: + """Helper for imputation. + + Given the inputs of `ImputeVariables`, returns list of tracts that need to be + imputed, along with a GeoDataFrame that has a column with the imputed field + "primed", meaning it is a copy of the raw field. + + Will drop any rows with population less than + `minimum_population_required_for_imputation`. + """ imputing_cols = [ impute_var_pair.raw_field_name for impute_var_pair in impute_var_named_tup_list ] - # prime column to exist + # Prime column to exist for impute_var_pair in impute_var_named_tup_list: geo_df[impute_var_pair.imputed_field_name] = geo_df[ impute_var_pair.raw_field_name ].copy() - # generate a list of tracts for which at least one of the imputation - # columns is null - tract_list = geo_df[geo_df[imputing_cols].isna().any(axis=1)][ - geoid_field - ].unique() + # Generate a list of tracts for which at least one of the imputation + # columns is null that also meets population criteria. + tract_list = geo_df[ + ( + # First, check whether any of the columns we want to impute contain null + # values + geo_df[imputing_cols].isna().any(axis=1) + # Second, ensure population is either null or >= the minimum population + & ( + geo_df[population_field].isnull() + | ( + geo_df[population_field] + >= minimum_population_required_for_imputation + ) + ) + ) + ][geoid_field].unique() # Check that imputation is a valid choice for this set of fields logger.info(f"Imputing values for {len(tract_list)} unique tracts.") @@ -70,6 +95,8 @@ def calculate_income_measures( impute_var_named_tup_list: list, geo_df: gpd.GeoDataFrame, geoid_field: str, + population_field: str = field_names.TOTAL_POP_FIELD, + minimum_population_required_for_imputation: int = 1, ) -> pd.DataFrame: """Impute values based on geographic neighbors @@ -89,6 +116,8 @@ def calculate_income_measures( impute_var_named_tup_list=impute_var_named_tup_list, geo_df=geo_df, geoid_field=geoid_field, + population_field=population_field, + minimum_population_required_for_imputation=minimum_population_required_for_imputation, ) # Iterate through the dataframe to impute in place @@ -119,6 +148,7 @@ def calculate_income_measures( ], column_to_impute=impute_var_pair.raw_field_name, ) + geo_df.loc[index, impute_var_pair.imputed_field_name] = geo_df[ mask_to_use ][impute_var_pair.raw_field_name].mean() diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py index 1c10551c..fa52690a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py @@ -24,7 +24,6 @@ class EJSCREENETL(ExtractTransformLoad): self.COLUMNS_TO_KEEP = [ self.GEOID_TRACT_FIELD_NAME, - field_names.TOTAL_POP_FIELD, # pylint: disable=duplicate-code field_names.AIR_TOXICS_CANCER_RISK_FIELD, field_names.RESPIRATORY_HAZARD_FIELD, @@ -66,7 +65,6 @@ class EJSCREENETL(ExtractTransformLoad): self.output_df = self.df.rename( columns={ self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, - "ACSTOTPOP": field_names.TOTAL_POP_FIELD, "CANCER": field_names.AIR_TOXICS_CANCER_RISK_FIELD, "RESP": field_names.RESPIRATORY_HAZARD_FIELD, "DSLPM": field_names.DIESEL_FIELD, diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py index cf7ec805..e3bc0fc8 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py @@ -108,8 +108,12 @@ class TribalOverlapETL(ExtractTransformLoad): # Switch from geographic to projected CRSes # because logically that's right - self.census_tract_gdf = self.census_tract_gdf.to_crs(crs=self.CRS_INTEGER) - tribal_gdf_without_points = tribal_gdf_without_points.to_crs(crs=self.CRS_INTEGER) + self.census_tract_gdf = self.census_tract_gdf.to_crs( + crs=self.CRS_INTEGER + ) + tribal_gdf_without_points = tribal_gdf_without_points.to_crs( + crs=self.CRS_INTEGER + ) # Create a measure for the entire census tract area self.census_tract_gdf["area_tract"] = self.census_tract_gdf.area diff --git a/data/data-pipeline/data_pipeline/tests/score/fixtures.py b/data/data-pipeline/data_pipeline/tests/score/fixtures.py index 54d788ce..744ebfa6 100644 --- a/data/data-pipeline/data_pipeline/tests/score/fixtures.py +++ b/data/data-pipeline/data_pipeline/tests/score/fixtures.py @@ -15,10 +15,10 @@ def final_score_df(): @pytest.fixture() -def census_df(): - census_csv = constants.DATA_PATH / "dataset" / "census_acs_2019" / "usa.csv" +def census_acs_df(): + census_acs_csv = constants.DATA_PATH / "dataset" / "census_acs" / "usa.csv" return pd.read_csv( - census_csv, + census_acs_csv, dtype={GEOID_TRACT_FIELD: "string"}, low_memory=False, ) diff --git a/data/data-pipeline/data_pipeline/tests/score/test_output.py b/data/data-pipeline/data_pipeline/tests/score/test_output.py index 026f64fd..f10e6f71 100644 --- a/data/data-pipeline/data_pipeline/tests/score/test_output.py +++ b/data/data-pipeline/data_pipeline/tests/score/test_output.py @@ -11,7 +11,7 @@ from .fixtures import ( final_score_df, ejscreen_df, hud_housing_df, - census_df, + census_acs_df, cdc_places_df, census_acs_median_incomes_df, cdc_life_expectancy_df, @@ -235,7 +235,7 @@ def test_data_sources( final_score_df, hud_housing_df, ejscreen_df, - census_df, + census_acs_df, cdc_places_df, census_acs_median_incomes_df, cdc_life_expectancy_df, @@ -337,3 +337,41 @@ def test_output_tracts(final_score_df, national_tract_df): def test_all_tracts_have_scores(final_score_df): assert not final_score_df[field_names.SCORE_N_COMMUNITIES].isna().any() + + +def test_imputed_tracts(final_score_df): + # Make sure that any tracts with zero population have null imputed income + tracts_with_zero_population_df = final_score_df[ + final_score_df[field_names.TOTAL_POP_FIELD] == 0 + ] + assert ( + tracts_with_zero_population_df[ + field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD + ] + .isna() + .all() + ) + + # Make sure that any tracts with null population have null imputed income + tracts_with_null_population_df = final_score_df[ + final_score_df[field_names.TOTAL_POP_FIELD].isnull() + ] + assert ( + tracts_with_null_population_df[ + field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD + ] + .isna() + .all() + ) + + # Make sure that no tracts with population have null imputed income + tracts_with_some_population_df = final_score_df[ + final_score_df[field_names.TOTAL_POP_FIELD] > 0 + ] + assert ( + not tracts_with_some_population_df[ + field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD + ] + .isna() + .any() + ) diff --git a/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/output.csv b/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/output.csv index 71421615..d6657647 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/output.csv +++ b/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/output.csv @@ -1,16 +1,16 @@ -GEOID10_TRACT,Total population,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter exposure,PM2.5 in the air,Ozone,Traffic proximity and volume,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Proximity to NPL sites,Wastewater discharge,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Individuals over 64 years old,Individuals under 5 years old,Percent pre-1960s housing (lead paint indicator),Leaky underground storage tanks -06027000800,3054,20.0000000000,0.2000000000,0.0162608457,5.9332945205,59.8143830065,134.3731709435,0.0161739005,0.0231458734,0.0088169702,0.0000000476,0.0943661972,0.4021269525,0.2445972495,0.0422396857,0.3691340106,0.0271801764 -06061021322,20899,30.0000000000,0.5000000000,0.1849562857,12.1102756164,52.7832287582,12.5173455346,0.4515663958,0.2027045525,0.0687928975,0.2667203153,0.0343563903,0.1859250743,0.1406287382,0.0683764773,0.0334588644,0.0258826940 -06069000802,3049,20.0000000000,0.2000000000,0.0375346206,7.4113546849,47.0434058824,15.7944927934,0.0811927061,0.1674220356,0.0396183204,,0.0324607330,0.2453201970,0.1534929485,0.0787143326,0.3485254692,0.0102735941 -15001021010,8606,10.0000000000,0.1000000000,0.0067389217,,,0.1074143214,0.0478749209,0.0931096253,0.0027318608,,0.0109090909,0.5159562078,0.1992795724,0.0366023704,0.0112496943,0.0259838494 -15001021101,3054,10.0000000000,0.1000000000,0.0033713587,,,1.7167679255,0.2484740667,0.2746856427,0.0025910486,,0.0194426442,0.4755657593,0.2976424361,0.0301244270,0.0168539326,0.0375389154 -15001021402,3778,10.0000000000,0.1000000000,0.0131608945,,,635.9981128640,0.0225482603,0.6278707343,0.0033357209,,0.0407569141,0.1877496671,0.2469560614,0.0751720487,0.1743524953,0.5088713177 -15001021800,5998,10.0000000000,0.1000000000,0.0049503455,,,0.0743045071,0.0402733327,0.0410968274,0.0038298946,,0.0359848485,0.2698678267,0.2352450817,0.0586862287,0.1676168757,0.1071290552 -15003010201,4936,10.0000000000,0.1000000000,0.0171119880,,,1493.8870892160,0.0548137804,0.4080845621,0.0694550700,,0.0340041638,0.2999166319,0.1318881686,0.0964343598,0.2131062951,0.0995447326 -15007040603,2984,10.0000000000,0.1000000000,0.0225796264,,,255.5966484444,0.1042895043,0.5200441984,0.0065810172,,0.0311909263,0.2676292814,0.2533512064,0.0563002681,0.0935077519,0.1610354485 -15007040604,3529,10.0000000000,0.1000000000,0.0297040750,,,464.0468169721,0.1282189641,0.3810520320,0.0064334940,,0.0353833193,0.3687102371,0.1790875602,0.0943610088,0.1981538462,0.2277699060 -15007040700,9552,10.0000000000,0.1000000000,0.0120486502,,,829.6297843840,0.2776903565,0.5315584393,0.0062317499,,0.0328151986,0.2079176730,0.1920016750,0.0808207705,0.1049120679,0.8605507426 -15009030100,1405,10.0000000000,0.1000000000,0.0026846006,,,,0.0398066625,0.0329594792,0.0046765532,,0.0000000000,0.2911208151,0.2434163701,0.0882562278,0.2135678392,0.0973247551 -15009030201,2340,10.0000000000,0.1000000000,0.0063521816,,,7.0868595222,0.1292001112,0.0908033666,0.0053511202,,0.0000000000,0.2677266867,0.2367521368,0.0641025641,0.0928229665,0.0098923140 -15009030402,8562,10.0000000000,0.1000000000,0.0153866969,,,233.6880574427,0.6633705951,0.5914191729,0.0055146115,,0.0122641509,0.1792805419,0.1810324690,0.0463676711,0.0760149726,0.4432670413 -15009030800,7879,10.0000000000,0.1000000000,0.0169064550,,,575.9991000531,1.0347888110,0.5999348163,0.0061499864,0.0008675195,0.0013422819,0.1386100877,0.1303464907,0.0753902780,0.1220556745,0.0263640121 +GEOID10_TRACT,Air toxics cancer risk,Respiratory hazard index,Diesel particulate matter exposure,PM2.5 in the air,Ozone,Traffic proximity and volume,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Proximity to NPL sites,Wastewater discharge,Percent of households in linguistic isolation,Poverty (Less than 200% of federal poverty line),Individuals over 64 years old,Individuals under 5 years old,Percent pre-1960s housing (lead paint indicator),Leaky underground storage tanks +06027000800,20.0000000000,0.2000000000,0.0162608457,5.9332945205,59.8143830065,134.3731709435,0.0161739005,0.0231458734,0.0088169702,0.0000000476,0.0943661972,0.4021269525,0.2445972495,0.0422396857,0.3691340106,0.0271801764 +06061021322,30.0000000000,0.5000000000,0.1849562857,12.1102756164,52.7832287582,12.5173455346,0.4515663958,0.2027045525,0.0687928975,0.2667203153,0.0343563903,0.1859250743,0.1406287382,0.0683764773,0.0334588644,0.0258826940 +06069000802,20.0000000000,0.2000000000,0.0375346206,7.4113546849,47.0434058824,15.7944927934,0.0811927061,0.1674220356,0.0396183204,,0.0324607330,0.2453201970,0.1534929485,0.0787143326,0.3485254692,0.0102735941 +15001021010,10.0000000000,0.1000000000,0.0067389217,,,0.1074143214,0.0478749209,0.0931096253,0.0027318608,,0.0109090909,0.5159562078,0.1992795724,0.0366023704,0.0112496943,0.0259838494 +15001021101,10.0000000000,0.1000000000,0.0033713587,,,1.7167679255,0.2484740667,0.2746856427,0.0025910486,,0.0194426442,0.4755657593,0.2976424361,0.0301244270,0.0168539326,0.0375389154 +15001021402,10.0000000000,0.1000000000,0.0131608945,,,635.9981128640,0.0225482603,0.6278707343,0.0033357209,,0.0407569141,0.1877496671,0.2469560614,0.0751720487,0.1743524953,0.5088713177 +15001021800,10.0000000000,0.1000000000,0.0049503455,,,0.0743045071,0.0402733327,0.0410968274,0.0038298946,,0.0359848485,0.2698678267,0.2352450817,0.0586862287,0.1676168757,0.1071290552 +15003010201,10.0000000000,0.1000000000,0.0171119880,,,1493.8870892160,0.0548137804,0.4080845621,0.0694550700,,0.0340041638,0.2999166319,0.1318881686,0.0964343598,0.2131062951,0.0995447326 +15007040603,10.0000000000,0.1000000000,0.0225796264,,,255.5966484444,0.1042895043,0.5200441984,0.0065810172,,0.0311909263,0.2676292814,0.2533512064,0.0563002681,0.0935077519,0.1610354485 +15007040604,10.0000000000,0.1000000000,0.0297040750,,,464.0468169721,0.1282189641,0.3810520320,0.0064334940,,0.0353833193,0.3687102371,0.1790875602,0.0943610088,0.1981538462,0.2277699060 +15007040700,10.0000000000,0.1000000000,0.0120486502,,,829.6297843840,0.2776903565,0.5315584393,0.0062317499,,0.0328151986,0.2079176730,0.1920016750,0.0808207705,0.1049120679,0.8605507426 +15009030100,10.0000000000,0.1000000000,0.0026846006,,,,0.0398066625,0.0329594792,0.0046765532,,0.0000000000,0.2911208151,0.2434163701,0.0882562278,0.2135678392,0.0973247551 +15009030201,10.0000000000,0.1000000000,0.0063521816,,,7.0868595222,0.1292001112,0.0908033666,0.0053511202,,0.0000000000,0.2677266867,0.2367521368,0.0641025641,0.0928229665,0.0098923140 +15009030402,10.0000000000,0.1000000000,0.0153866969,,,233.6880574427,0.6633705951,0.5914191729,0.0055146115,,0.0122641509,0.1792805419,0.1810324690,0.0463676711,0.0760149726,0.4432670413 +15009030800,10.0000000000,0.1000000000,0.0169064550,,,575.9991000531,1.0347888110,0.5999348163,0.0061499864,0.0008675195,0.0013422819,0.1386100877,0.1303464907,0.0753902780,0.1220556745,0.0263640121 diff --git a/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/transform.csv b/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/transform.csv index 9247c547..9c353ca4 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/transform.csv +++ b/data/data-pipeline/data_pipeline/tests/sources/ejscreen/data/transform.csv @@ -1,4 +1,4 @@ -OBJECTID,GEOID10_TRACT,Total population,ACSIPOVBAS,ACSEDUCBAS,ACSTOTHH,ACSTOTHU,ACSUNEMPBAS,MINORPOP,MINORPCT,LOWINCOME,Poverty (Less than 200% of federal poverty line),LESSHS,LESSHSPCT,LINGISO,Percent of households in linguistic isolation,UNDER5,Individuals under 5 years old,OVER64,Individuals over 64 years old,UNEMP,UNEMPPCT,PRE1960,Percent pre-1960s housing (lead paint indicator),VULEOPCT,VULSVI6PCT,VULEO,VULSVI6,DISPEO,DISPSVI6,Diesel particulate matter exposure,Air toxics cancer risk,Respiratory hazard index,Traffic proximity and volume,Wastewater discharge,Proximity to NPL sites,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Ozone,PM2.5 in the air,Leaky underground storage tanks,D_LDPNT_2,D_DSLPM_2,D_CANCR_2,D_RESP_2,D_PTRAF_2,D_PWDIS_2,D_PNPL_2,D_PRMP_2,D_PTSDF_2,D_OZONE_2,D_PM25_2,D_UST_2,STATE_NAME,ST_ABBREV,REGION,P_MINORPCT,P_LWINCPCT,P_LESHSPCT,P_LNGISPCT,P_UNDR5PCT,P_OVR64PCT,P_UNEMP,P_UNEMPPCT,P_LDPNT,P_VULEOPCT,P_VULSVI6PCT,P_VULSVI6,P_DISPSVI6,P_DSLPM,P_CANCR,P_RESP,P_PTRAF,P_PWDIS,P_PNPL,P_PRMP,P_PTSDF,P_OZONE,P_PM25,P_UST,P_LDPNT_D2,P_DSLPM_D2,P_CANCR_D2,P_RESP_D2,P_PTRAF_D2,P_PWDIS_D2,P_PNPL_D2,P_PRMP_D2,P_PTSDF_D2,P_OZONE_D2,P_PM25_D2,P_UST_D2,B_MINORPCT,B_LWINCPCT,B_LESHSPCT,B_LNGISPCT,B_UNDR5PCT,B_OVR64PCT,B_UNEMP,B_UNEMPPCT,B_LDPNT,B_VULEOPCT,B_VULSVI6PCT,B_VULSVI6,B_DISPSVI6,B_DSLPM,B_CANCR,B_RESP,B_PTRAF,B_PWDIS,B_PNPL,B_PRMP,B_PTSDF,B_OZONE,B_PM25,B_UST,B_LDPNT_D2,B_DSLPM_D2,B_CANCR_D2,B_RESP_D2,B_PTRAF_D2,B_PWDIS_D2,B_PNPL_D2,B_PRMP_D2,B_PTSDF_D2,B_OZONE_D2,B_PM25_D2,B_UST_D2,T_MINORPCT,T_LWINCPCT,T_LESHSPCT,T_LNGISPCT,T_UNDR5PCT,T_OVR64PCT,T_UNEMPPCT,T_VULEOPCT,T_LDPNT,T_LDPNT_D2,T_DSLPM,T_DSLPM_D2,T_CANCR,T_CANCR_D2,T_RESP,T_RESP_D2,T_PTRAF,T_PTRAF_D2,T_PWDIS,T_PWDIS_D2,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_OZONE,T_OZONE_D2,T_PM25,T_PM25_D2,T_UST,T_UST_D2,AREALAND,AREAWATER,NPL_CNT,TSDF_CNT,Shape_Length,Shape_Area +OBJECTID,GEOID10_TRACT,ACSTOTPOP,ACSIPOVBAS,ACSEDUCBAS,ACSTOTHH,ACSTOTHU,ACSUNEMPBAS,MINORPOP,MINORPCT,LOWINCOME,Poverty (Less than 200% of federal poverty line),LESSHS,LESSHSPCT,LINGISO,Percent of households in linguistic isolation,UNDER5,Individuals under 5 years old,OVER64,Individuals over 64 years old,UNEMP,UNEMPPCT,PRE1960,Percent pre-1960s housing (lead paint indicator),VULEOPCT,VULSVI6PCT,VULEO,VULSVI6,DISPEO,DISPSVI6,Diesel particulate matter exposure,Air toxics cancer risk,Respiratory hazard index,Traffic proximity and volume,Wastewater discharge,Proximity to NPL sites,Proximity to Risk Management Plan (RMP) facilities,Proximity to hazardous waste sites,Ozone,PM2.5 in the air,Leaky underground storage tanks,D_LDPNT_2,D_DSLPM_2,D_CANCR_2,D_RESP_2,D_PTRAF_2,D_PWDIS_2,D_PNPL_2,D_PRMP_2,D_PTSDF_2,D_OZONE_2,D_PM25_2,D_UST_2,STATE_NAME,ST_ABBREV,REGION,P_MINORPCT,P_LWINCPCT,P_LESHSPCT,P_LNGISPCT,P_UNDR5PCT,P_OVR64PCT,P_UNEMP,P_UNEMPPCT,P_LDPNT,P_VULEOPCT,P_VULSVI6PCT,P_VULSVI6,P_DISPSVI6,P_DSLPM,P_CANCR,P_RESP,P_PTRAF,P_PWDIS,P_PNPL,P_PRMP,P_PTSDF,P_OZONE,P_PM25,P_UST,P_LDPNT_D2,P_DSLPM_D2,P_CANCR_D2,P_RESP_D2,P_PTRAF_D2,P_PWDIS_D2,P_PNPL_D2,P_PRMP_D2,P_PTSDF_D2,P_OZONE_D2,P_PM25_D2,P_UST_D2,B_MINORPCT,B_LWINCPCT,B_LESHSPCT,B_LNGISPCT,B_UNDR5PCT,B_OVR64PCT,B_UNEMP,B_UNEMPPCT,B_LDPNT,B_VULEOPCT,B_VULSVI6PCT,B_VULSVI6,B_DISPSVI6,B_DSLPM,B_CANCR,B_RESP,B_PTRAF,B_PWDIS,B_PNPL,B_PRMP,B_PTSDF,B_OZONE,B_PM25,B_UST,B_LDPNT_D2,B_DSLPM_D2,B_CANCR_D2,B_RESP_D2,B_PTRAF_D2,B_PWDIS_D2,B_PNPL_D2,B_PRMP_D2,B_PTSDF_D2,B_OZONE_D2,B_PM25_D2,B_UST_D2,T_MINORPCT,T_LWINCPCT,T_LESHSPCT,T_LNGISPCT,T_UNDR5PCT,T_OVR64PCT,T_UNEMPPCT,T_VULEOPCT,T_LDPNT,T_LDPNT_D2,T_DSLPM,T_DSLPM_D2,T_CANCR,T_CANCR_D2,T_RESP,T_RESP_D2,T_PTRAF,T_PTRAF_D2,T_PWDIS,T_PWDIS_D2,T_PNPL,T_PNPL_D2,T_PRMP,T_PRMP_D2,T_PTSDF,T_PTSDF_D2,T_OZONE,T_OZONE_D2,T_PM25,T_PM25_D2,T_UST,T_UST_D2,AREALAND,AREAWATER,NPL_CNT,TSDF_CNT,Shape_Length,Shape_Area 4529,06027000800,3054,3009,2337,1420,2067,1443,1218,0.3988212181,1210,0.4021269525,475,0.2032520325,134,0.0943661972,129,0.0422396857,747,0.2445972495,62,0.0429660430,763,0.3691340106,0.4004740853,0.2309005559,1223.0478564307,705.1702977293,135.9429095904,144.8520486255,0.0162608457,20.0000000000,0.2000000000,134.3731709435,0.0000000476,0.0088169702,0.0161739005,0.0231458734,59.8143830065,5.9332945205,0.0271801764,50.1811514356,2.2105466749,2718.8581918080,27.1885819181,18267.0798289539,0.0000064773,1.1986045786,2.1987270931,3.1465173743,8131.3412612630,806.5893205801,3.6949522625,California,CA,9,58.2565807824,70.8357682483,82.0300855712,83.4211514441,22.4791060804,91.4310072487,20.6342392033,44.8003303446,69.4492207493,64.4805710566,73.9747591523,41.2001973366,69.9936559849,0.4881982980,32.2031638835,14.4688811492,33.6358789383,2.7793036790,3.1380644255,0.3541522801,2.0598614138,97.6642425963,3.6388096802,6.3535808084,71.4956721564,59.1319320934,61.5316181718,60.9745786385,62.4689837463,62.0864910202,59.8317854029,59.0710337447,59.2599060994,64.9284478117,62.2619591744,60.9702180540,6,8,9,9,3,10,3,5,7,7,8,5,7,1,4,2,4,1,1,1,1,11,1,1,8,6,7,7,7,7,6,6,6,7,7,7,40% (58%ile),40% (70%ile),20% (82%ile),9% (83%ile),4% (22%ile),24% (91%ile),4% (44%ile),40% (64%ile),0.37 = fraction pre-1960 (69%ile),71%ile,0.0163 ug/m3 (0%ile),59%ile,20 lifetime risk per million (32%ile),61%ile,0.2 (14%ile),60%ile,130 daily vehicles/meters distance (33%ile),62%ile,0.000000048 toxicity-weighted concentration/meters distance (2%ile),62%ile,0.0088 sites/km distance (3%ile),59%ile,0.016 facilities/km distance (0%ile),59%ile,0.023 facilities/km distance (2%ile),59%ile,59.8 ppb (97%ile),64%ile,5.93 ug/m3 (3%ile),62%ile,0.027 facilities/sq km area (6%ile),60%ile,17743852489.0000000000,41257887.0000000000,0,1,969231.5231135677,27404749177.8422279358 8028,06061021322,20899,20874,13290,6549,6904,9172,9199,0.4401646012,3881,0.1859250743,825,0.0620767494,225,0.0343563903,1429,0.0683764773,2939,0.1406287382,312,0.0340165722,231,0.0334588644,0.3130448377,0.1552546718,6542.3240634282,3244.6673856589,-896.9052371663,-589.6780917541,0.1849562857,30.0000000000,0.5000000000,12.5173455346,0.2667203153,0.0687928975,0.4515663958,0.2027045525,52.7832287582,12.1102756164,0.0258826940,-30.0094307337,-165.8882612555,-26907.1571149896,-448.4526185832,-11226.8727654026,-239.2228476257,-61.7007100657,-405.0122653138,-181.8067747336,-47341.5543077505,-10861.7696239112,-23.2143238368,California,CA,9,61.7694531724,28.3124099080,32.2625612545,63.3138029183,65.9392366308,44.1611446180,92.1063805127,31.2336817151,19.3531578232,52.0599864076,48.1147912182,98.1253263672,8.5598852754,35.4160437794,83.7767623034,95.2520218071,6.7786023570,88.6613290583,53.5138135020,56.0049245976,28.8270859466,89.7745222973,94.2035706464,6.2511191138,43.0185694890,24.7769097248,17.2770098374,9.5647689629,49.9350307593,5.0850465016,20.5837755437,15.4478896201,34.6338200533,14.8104044330,10.3206402564,53.0011626680,7,3,4,7,7,5,10,4,2,6,5,11,1,4,9,11,1,9,6,6,3,9,10,1,5,3,2,1,5,1,3,2,4,2,2,6,44% (61%ile),19% (28%ile),6% (32%ile),3% (63%ile),7% (65%ile),14% (44%ile),3% (31%ile),31% (52%ile),0.033 = fraction pre-1960 (19%ile),43%ile,0.185 ug/m3 (35%ile),24%ile,30 lifetime risk per million (83%ile),17%ile,0.5 (95%ile),9%ile,13 daily vehicles/meters distance (6%ile),49%ile,0.27 toxicity-weighted concentration/meters distance (88%ile),5%ile,0.069 sites/km distance (53%ile),20%ile,0.45 facilities/km distance (56%ile),15%ile,0.2 facilities/km distance (28%ile),34%ile,52.8 ppb (89%ile),14%ile,12.1 ug/m3 (94%ile),10%ile,0.026 facilities/sq km area (6%ile),53%ile,258653359.0000000000,119890.0000000000,0,0,124755.3452199987,427225089.6229769588 8849,06069000802,3049,3045,2076,955,1119,1493,1247,0.4089865530,747,0.2453201970,307,0.1478805395,31,0.0324607330,240,0.0787143326,468,0.1534929485,93,0.0622906899,390,0.3485254692,0.3271533750,0.1778092173,997.4906403941,542.1403034316,-87.8345013597,-17.2605942492,0.0375346206,20.0000000000,0.2000000000,15.7944927934,,0.0396183204,0.0811927061,0.1674220356,47.0434058824,7.4113546849,0.0102735941,-30.6125607956,-3.2968346872,-1756.6900271942,-17.5669002719,-1387.3013987358,,-3.4798554127,-7.1315208575,-14.7054310128,-4132.0340979390,-650.9726431509,-0.9023760119,California,CA,9,59.1858457424,41.3904741949,69.9513617378,62.0187896062,79.0518001240,52.1216510370,37.3180569516,68.3483551403,67.5701406274,54.3994266601,57.9926859232,26.1831217492,58.7612911558,2.0014414700,32.2031638835,14.4688811492,8.1570460385,,34.5749415665,10.3739430074,25.1131375379,84.5333172848,19.2864164585,4.9410824602,42.8621394303,58.0471933934,56.5430390950,57.0023528116,55.7266348497,,54.6373148803,57.1359685902,54.8116596007,56.2167239668,56.9568759225,56.2801621878,6,5,7,7,8,6,4,7,7,6,6,3,6,1,4,2,1,0,4,2,3,9,2,1,5,6,6,6,6,0,6,6,6,6,6,6,41% (59%ile),25% (41%ile),15% (69%ile),3% (62%ile),8% (79%ile),15% (52%ile),6% (68%ile),33% (54%ile),0.35 = fraction pre-1960 (67%ile),42%ile,0.0375 ug/m3 (2%ile),58%ile,20 lifetime risk per million (32%ile),56%ile,0.2 (14%ile),57%ile,16 daily vehicles/meters distance (8%ile),55%ile,,,0.04 sites/km distance (34%ile),54%ile,0.081 facilities/km distance (10%ile),57%ile,0.17 facilities/km distance (25%ile),54%ile,47 ppb (84%ile),56%ile,7.41 ug/m3 (19%ile),56%ile,0.01 facilities/sq km area (4%ile),56%ile,2987635876.0000000000,3272257.0000000000,1,0,422237.6856758550,4643687820.1565904617 diff --git a/data/data-pipeline/data_pipeline/tests/sources/persistent_poverty/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/persistent_poverty/test_etl.py index f5331ff0..441683c4 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/persistent_poverty/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/persistent_poverty/test_etl.py @@ -1,6 +1,8 @@ import pathlib from data_pipeline.tests.sources.example.test_etl import TestETL -from data_pipeline.etl.sources.persistent_poverty.etl import PersistentPovertyETL +from data_pipeline.etl.sources.persistent_poverty.etl import ( + PersistentPovertyETL, +) class TestPersistentPovertyETL(TestETL):