From d49c28ca25eae5a2e1b237d1c95e73c5989cf9b6 Mon Sep 17 00:00:00 2001 From: VincentLaUSDS Date: Wed, 22 Sep 2021 14:59:22 -0400 Subject: [PATCH] Adding etl files for ej screen areas of concern --- .../data_pipeline/etl/constants.py | 5 + .../data_pipeline/etl/score/etl_score.py | 110 ++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py index de9e3443..84652b1a 100644 --- a/data/data-pipeline/data_pipeline/etl/constants.py +++ b/data/data-pipeline/data_pipeline/etl/constants.py @@ -69,6 +69,11 @@ DATASET_LIST = [ "module_dir": "persistent_poverty", "class_name": "PersistentPovertyETL", }, + { + "name": "ejscreen_areas_of_concern", + "module_dir": "ejscreen_areas_of_concern", + "class_name": "EJScreenAreasOfConcernETL", + }, ] CENSUS_INFO = { "name": "census", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 8e4b2c9c..c9cc2498 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -86,6 +86,44 @@ class ScoreETL(ExtractTransformLoad): # Persistent poverty self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract" + # EJ Areas of Concern + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 70th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 75th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 80th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 85th percentile (communities)" + ) + 
self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 90th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, National, 95th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 70th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 75th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 80th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 85th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 90th percentile (communities)" + ) + self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = ( + "EJSCREEN Areas of Concern, State, 95th percentile (communities)" + ) + # dataframes self.df: pd.DataFrame self.ejscreen_df: pd.DataFrame @@ -99,6 +137,7 @@ class ScoreETL(ExtractTransformLoad): self.national_risk_index_df: pd.DataFrame self.geocorr_urban_rural_df: pd.DataFrame self.persistent_poverty_df: pd.DataFrame + self.ejscreen_areas_of_concern_df: pd.DataFrame def data_sets(self) -> list: # Define a named tuple that will be used for each data set input. 
@@ -215,6 +254,66 @@ class ScoreETL(ExtractTransformLoad): renamed_field=self.PERSISTENT_POVERTY_FIELD, bucket=None, ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + 
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), + DataSet( + input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME, + bucket=None, + ), # The following data sets have buckets, because they're used in Score C DataSet( input_field="CANCER", @@ -424,6 +523,16 @@ class ScoreETL(ExtractTransformLoad): low_memory=False, ) + # Load EJ Screen Areas of Concern + ejscreen_areas_of_concern_csv = ( + self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern" / "usa.csv" + ) + self.ejscreen_areas_of_concern_df = pd.read_csv( + ejscreen_areas_of_concern_csv, + dtype={self.GEOID_FIELD_NAME: "string"}, + low_memory=False, + ) + def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame: logger.info("Joining Census Block Group dataframes") census_block_group_df = functools.reduce( @@ -701,6 +810,7 @@ class ScoreETL(ExtractTransformLoad): self.housing_and_transportation_df, self.census_acs_median_incomes_df, self.national_risk_index_df, + self.ejscreen_areas_of_concern_df, ] census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)