Adding ETL files for EJSCREEN Areas of Concern

This commit is contained in:
VincentLaUSDS 2021-09-22 14:59:22 -04:00 committed by lucasmbrown-usds
commit d49c28ca25
2 changed files with 115 additions and 0 deletions

View file

@ -69,6 +69,11 @@ DATASET_LIST = [
"module_dir": "persistent_poverty", "module_dir": "persistent_poverty",
"class_name": "PersistentPovertyETL", "class_name": "PersistentPovertyETL",
}, },
{
"name": "ejscreen_areas_of_concern",
"module_dir": "ejscreen_areas_of_concern",
"class_name": "EJScreenAreasOfConcernETL",
},
] ]
CENSUS_INFO = { CENSUS_INFO = {
"name": "census", "name": "census",

View file

@ -86,6 +86,44 @@ class ScoreETL(ExtractTransformLoad):
# Persistent poverty # Persistent poverty
self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract" self.PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
# EJ Areas of Concern
#
# Column names for the EJSCREEN "Areas of Concern" flags. Each constant is
# used as both the `input_field` and the `renamed_field` of a `DataSet`
# entry in `data_sets`, so the column keeps its name when it is selected.
# NOTE(review): these strings presumably have to match the column headers
# written by the `ejscreen_areas_of_concern` ETL exactly -- confirm against
# that module's output.
#
# National thresholds (70th through 95th percentile).
self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, National, 70th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, National, 75th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, National, 80th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, National, 85th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, National, 90th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, National, 95th percentile (communities)"
)
# State thresholds (70th through 95th percentile).
# These must differ from the national strings above: with identical values
# the state `DataSet` entries would select the national columns a second
# time and the state columns would never be picked up.
self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, State, 70th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, State, 75th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, State, 80th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, State, 85th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, State, 90th percentile (communities)"
)
self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
    "EJSCREEN Areas of Concern, State, 95th percentile (communities)"
)
# dataframes # dataframes
self.df: pd.DataFrame self.df: pd.DataFrame
self.ejscreen_df: pd.DataFrame self.ejscreen_df: pd.DataFrame
@ -99,6 +137,7 @@ class ScoreETL(ExtractTransformLoad):
self.national_risk_index_df: pd.DataFrame self.national_risk_index_df: pd.DataFrame
self.geocorr_urban_rural_df: pd.DataFrame self.geocorr_urban_rural_df: pd.DataFrame
self.persistent_poverty_df: pd.DataFrame self.persistent_poverty_df: pd.DataFrame
self.ejscreen_areas_of_concern_df: pd.DataFrame
def data_sets(self) -> list: def data_sets(self) -> list:
# Define a named tuple that will be used for each data set input. # Define a named tuple that will be used for each data set input.
@ -215,6 +254,66 @@ class ScoreETL(ExtractTransformLoad):
renamed_field=self.PERSISTENT_POVERTY_FIELD, renamed_field=self.PERSISTENT_POVERTY_FIELD,
bucket=None, bucket=None,
), ),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
DataSet(
input_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
renamed_field=self.EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME,
bucket=None,
),
# The following data sets have buckets, because they're used in Score C # The following data sets have buckets, because they're used in Score C
DataSet( DataSet(
input_field="CANCER", input_field="CANCER",
@ -424,6 +523,16 @@ class ScoreETL(ExtractTransformLoad):
low_memory=False, low_memory=False,
) )
# Load EJ Screen Areas of Concern
# Reads `usa.csv` from the `ejscreen_areas_of_concern` dataset directory under
# `self.DATA_PATH` and stores the result on `self.ejscreen_areas_of_concern_df`.
# NOTE(review): the file is presumably produced by the
# `ejscreen_areas_of_concern` ETL registered in DATASET_LIST -- confirm that
# ETL has run before this step.
ejscreen_areas_of_concern_csv = (
self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern" / "usa.csv"
)
self.ejscreen_areas_of_concern_df = pd.read_csv(
ejscreen_areas_of_concern_csv,
# Read the GEOID column as a string instead of letting pandas infer a
# numeric type -- presumably to preserve leading zeros; TODO confirm.
dtype={self.GEOID_FIELD_NAME: "string"},
# Parse the whole file in one pass rather than in chunks.
low_memory=False,
)
def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame: def _join_cbg_dfs(self, census_block_group_dfs: list) -> pd.DataFrame:
logger.info("Joining Census Block Group dataframes") logger.info("Joining Census Block Group dataframes")
census_block_group_df = functools.reduce( census_block_group_df = functools.reduce(
@ -701,6 +810,7 @@ class ScoreETL(ExtractTransformLoad):
self.housing_and_transportation_df, self.housing_and_transportation_df,
self.census_acs_median_incomes_df, self.census_acs_median_incomes_df,
self.national_risk_index_df, self.national_risk_index_df,
self.ejscreen_areas_of_concern_df,
] ]
census_block_group_df = self._join_cbg_dfs(census_block_group_dfs) census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)