From 95a14adb3541caf15ab91da3b3d5abab87089218 Mon Sep 17 00:00:00 2001
From: Saran Ahluwalia <94847739+saran-ahluwalia@users.noreply.github.com>
Date: Fri, 14 Jan 2022 13:50:49 -0500
Subject: [PATCH] Added Census Tract Aggregated Micro-data from EPA
 Risk-Screening Environmental Indicators (RSEI) model (#1101)

* added initial source code - todo is comparison tool

* added values

* rename fields

* check geoid

* added black

* added revisions

* added clean up to comments

* more comments

* formatting

* cleanup and address PR feedback

* fix changes

* final path changes

* style

* PR feedback

* added final PR comment

* fix flake 8

* add revisions
---
 .../data_pipeline/etl/constants.py            |   5 +
 .../etl/sources/epa_rsei_aggregate/README.md  |   0
 .../sources/epa_rsei_aggregate/__init__.py    |   0
 .../etl/sources/epa_rsei_aggregate/etl.py     | 163 ++++++++++++++++++
 .../ipython/scoring_comparison.ipynb          |  24 +++
 .../data_pipeline/score/field_names.py        |  15 ++
 6 files changed, 207 insertions(+)
 create mode 100644 data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/README.md
 create mode 100644 data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/__init__.py
 create mode 100644 data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/etl.py

diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py
index e3cae4db..197a28c9 100644
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@@ -89,6 +89,11 @@ DATASET_LIST = [
         "module_dir": "hud_recap",
         "class_name": "HudRecapETL",
     },
+    {
+        "name": "epa_rsei_aggregate",
+        "module_dir": "epa_rsei_aggregate",
+        "class_name": "EPARiskScreeningEnvironmentalIndicatorsETL",
+    },
     {
         "name": "energy_definition_alternative_draft",
         "module_dir": "energy_definition_alternative_draft",
diff --git a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/README.md b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/etl.py b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/etl.py
new file mode 100644
index 00000000..9afb67b4
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei_aggregate/etl.py
@@ -0,0 +1,163 @@
+from pathlib import Path
+import pandas as pd
+
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.score import field_names
+from data_pipeline.utils import get_module_logger, unzip_file_from_url
+
+logger = get_module_logger(__name__)
+
+
+class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
+    """Class for 2019 Census Tract RSEI Aggregated micro-data
+
+    Data source overview: Page 20 in this document:
+    https://www.epa.gov/sites/default/files/2017-01/documents/rsei-documentation-geographic-microdata-v235.pdf
+
+    Disaggregated and aggregated datasets for 2019 are documented here:
+    https://github.com/usds/justice40-tool/issues/1070#issuecomment-1005604014
+
+    """
+
+    def __init__(self):
+        self.AGGREGATED_RSEI_SCORE_FILE_URL = "https://abt-rsei.s3.amazonaws.com/microdata2019/census_agg/CensusMicroTracts2019_2019_aggregated.zip"
+
+        self.OUTPUT_PATH: Path = (
+            self.DATA_PATH / "dataset" / "epa_rsei_aggregated"
+        )
+        self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
+        self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
+        self.NUMBER_FACILITIES_INPUT_FIELD = "NUMFACS"
+        self.NUMBER_RELEASES_INPUT_FIELD = "NUMRELEASES"
+        self.NUMBER_CHEMICALS_INPUT_FIELD = "NUMCHEMS"
+        self.AVERAGE_TOXICITY_INPUT_FIELD = "TOXCONC"
+        self.SCORE_INPUT_FIELD = "SCORE"
+        self.POPULATION_INPUT_FIELD = "POP"
+        self.CSCORE_INPUT_FIELD = "CSCORE"
+        self.NCSCORE_INPUT_FIELD = "NSCORE"
+
+        # References to the columns that will be output
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.EPA_RSEI_NUMBER_FACILITIES_FIELD,
+            field_names.EPA_RSEI_NUMBER_RELEASES_FIELD,
+            field_names.EPA_RSEI_NUMBER_CHEMICALS_FIELD,
+            field_names.EPA_RSEI_AVERAGE_TOXICITY_FIELD,
+            field_names.EPA_RSEI_SCORE_FIELD,
+            field_names.EPA_RSEI_CSCORE_FIELD,
+            field_names.EPA_RSEI_NCSCORE_FIELD,
+            field_names.EPA_RSEI_POPULATION_FIELD,
+            field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD,
+            field_names.EPA_RSEI_SCORE_FIELD
+            + field_names.PERCENTILE_FIELD_SUFFIX,
+        ]
+
+        self.df: pd.DataFrame
+
+    def extract(self) -> None:
+        """Download and unzip the aggregated RSEI archive, then read its CSV into self.df."""
+        logger.info("Starting 2.5 MB data download.")
+
+        # The CSV ships without a header row (its first line is already tract data),
+        # so the column names are supplied explicitly when the file is read.
+        input_columns = [
+            self.TRACT_INPUT_COLUMN_NAME,
+            self.NUMBER_FACILITIES_INPUT_FIELD,
+            self.NUMBER_RELEASES_INPUT_FIELD,
+            self.NUMBER_CHEMICALS_INPUT_FIELD,
+            self.AVERAGE_TOXICITY_INPUT_FIELD,
+            self.SCORE_INPUT_FIELD,
+            self.POPULATION_INPUT_FIELD,
+            self.CSCORE_INPUT_FIELD,
+            self.NCSCORE_INPUT_FIELD,
+        ]
+
+        unzip_file_from_url(
+            file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
+            download_path=self.TMP_PATH,
+            unzipped_file_path=self.TMP_PATH / "epa_rsei_aggregated",
+        )
+
+        self.df = pd.read_csv(
+            filepath_or_buffer=self.TMP_PATH
+            / "epa_rsei_aggregated"
+            / "CensusMicroTracts2019_2019_aggregated.csv",
+            # No dtype is given; transform() zero-pads the tract ID afterwards.
+            low_memory=False,
+            names=input_columns,
+        )
+
+    def transform(self) -> None:
+        """Rename columns, add the score percentile and threshold flag, and pad tract IDs."""
+        logger.info("Starting transforms.")
+
+        score_columns = [x for x in self.df.columns if "SCORE" in x]
+
+        # coerce dataframe type to perform correct next steps
+        self.df[score_columns] = self.df[score_columns].astype(float)
+
+        self.df.rename(
+            columns={
+                self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
+                self.NUMBER_FACILITIES_INPUT_FIELD: field_names.EPA_RSEI_NUMBER_FACILITIES_FIELD,
+                self.NUMBER_RELEASES_INPUT_FIELD: field_names.EPA_RSEI_NUMBER_RELEASES_FIELD,
+                self.NUMBER_CHEMICALS_INPUT_FIELD: field_names.EPA_RSEI_NUMBER_CHEMICALS_FIELD,
+                self.AVERAGE_TOXICITY_INPUT_FIELD: field_names.EPA_RSEI_AVERAGE_TOXICITY_FIELD,
+                self.SCORE_INPUT_FIELD: field_names.EPA_RSEI_SCORE_FIELD,
+                self.CSCORE_INPUT_FIELD: field_names.EPA_RSEI_CSCORE_FIELD,
+                self.NCSCORE_INPUT_FIELD: field_names.EPA_RSEI_NCSCORE_FIELD,
+                self.POPULATION_INPUT_FIELD: field_names.EPA_RSEI_POPULATION_FIELD,
+            },
+            inplace=True,
+        )
+
+        # Please note this: https://www.epa.gov/rsei/understanding-rsei-results#what
+        # Section: "What does a high RSEI Score mean?"
+        # This was created for the sole purpose to be used in the current
+        # iteration of Score L
+        self.df[
+            field_names.EPA_RSEI_SCORE_FIELD
+            + field_names.PERCENTILE_FIELD_SUFFIX
+        ] = self.df[field_names.EPA_RSEI_SCORE_FIELD].rank(
+            ascending=True,
+            pct=True,
+        )
+
+        # This threshold was arbitrarily chosen.
+        # It would make sense to enrich this with facilities, industries, or chemical
+        # that would enable some additional form of sub-stratification when examining
+        # different percentile ranges that are derived above.
+        self.df[field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD] = (
+            self.df[
+                field_names.EPA_RSEI_SCORE_FIELD
+                + field_names.PERCENTILE_FIELD_SUFFIX
+            ]
+            >= self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF
+        )
+
+        expected_census_tract_field_length = 11
+        self.df[self.GEOID_TRACT_FIELD_NAME] = (
+            self.df[self.GEOID_TRACT_FIELD_NAME]
+            .astype(str)
+            .str.zfill(expected_census_tract_field_length)
+        )
+
+        # Require exactly 11 characters; uniform-but-wrong lengths must fail too.
+        tract_lengths = self.df[self.GEOID_TRACT_FIELD_NAME].str.len()
+        if (tract_lengths != expected_census_tract_field_length).any():
+            raise ValueError(
+                f"GEOID Tract must be length of {expected_census_tract_field_length}"
+            )
+
+    def validate(self) -> None:
+        """Log only; no dataset-specific validation is implemented here."""
+        logger.info("Validating data.")
+
+    def load(self) -> None:
+        """Write the COLUMNS_TO_KEEP columns of self.df to usa.csv under OUTPUT_PATH."""
+        logger.info("Saving CSV")
+
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
+        )
diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
index e8930f59..f00f4331 100644
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -333,6 +333,25 @@
     "michigan_ejscreen_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b39342aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load EPA RSEI aggregated micro-data\n",
+    "epa_rsei_aggregate_data_path = (\n",
+    "    DATA_DIR / \"dataset\" / \"epa_rsei_aggregated\" / \"usa.csv\"\n",
+    ")\n",
+    "epa_rsei_aggregate_df = pd.read_csv(\n",
+    "    epa_rsei_aggregate_data_path,\n",
+    "    dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
+    ")\n",
+    "\n",
+    "epa_rsei_aggregate_df.head()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -348,6 +367,7 @@
     "    calenviroscreen_df,\n",
     "    persistent_poverty_df,\n",
     "    mapping_inequality_df,\n",
+    "    epa_rsei_aggregate_df,\n",
     "    maryland_ejscreen_df,\n",
     "    energy_definition_alternative_draft_df,\n",
     "    michigan_ejscreen_df\n",
@@ -472,6 +492,10 @@
     "        priority_communities_field=\"calenviroscreen_priority_community\",\n",
     "    ),\n",
     "    Index(\n",
+    "        method_name=\"EPA RSEI Aggregate Microdata\",\n",
+    "        priority_communities_field=field_names.EPA_RSEI_SCORE_THRESHOLD_FIELD\n",
+    "    ), \n",
+    "    Index(\n",
     "        method_name=\"Persistent Poverty\",\n",
     "        priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n",
     "    ),\n",
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index 4d74f4f5..612fafc0 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -155,6 +155,21 @@ CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
     "Percent of individuals less than 100% Federal Poverty Line in 2010"
 )
 
+# RSEI Aggregated Micro-data
+EPA_RSEI_NUMBER_FACILITIES_FIELD = "Number of facilities affecting the tract"
+EPA_RSEI_NUMBER_RELEASES_FIELD = "Number of releases affecting the tract"
+EPA_RSEI_NUMBER_CHEMICALS_FIELD = "Number of chemicals affecting the tract"
+EPA_RSEI_AVERAGE_TOXICITY_FIELD = (
+    "Average toxicity-weighted concentration of the cells in the tract"
+)
+EPA_RSEI_SCORE_FIELD = "RSEI Risk Score"
+EPA_RSEI_CSCORE_FIELD = "RSEI Risk Score (Cancer toxicity weights)"
+EPA_RSEI_NCSCORE_FIELD = "RSEI Risk Score (Noncancer toxicity weights)"
+EPA_RSEI_POPULATION_FIELD = "Sum of the population of the cells in the tract"
+EPA_RSEI_SCORE_THRESHOLD_FIELD = (
+    "At or above 75 for overall percentile for the RSEI score"
+)
+
 # Combined fields that merge island areas and states data
 COMBINED_CENSUS_TOTAL_POPULATION_2010 = (
     "Total population in 2009 (island areas) and 2019 (states and PR)"