Add EJSCREEN Areas of Concern (#843)

* Adding ej screen areas of concern * Uses it where user has local files, but not otherwise Co-authored-by: VincentLaUSDS <vincent.la@omb.eop.gov>
2025-07-28 05:21:17 -07:00 · 2021-11-02 15:38:42 -04:00 · 2021-11-02 15:38:42 -04:00 · 1d541be447
commit 1d541be447
parent 1795be6cb4
10 changed files with 2546 additions and 18 deletions
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@ -12,7 +12,7 @@ DATASET_LIST = [
    {
        "name": "ejscreen",
        "module_dir": "ejscreen",
-        "class_name": "EJScreenETL",
+        "class_name": "EJSCREENETL",
    },
    {
        "name": "housing_and_transportation",
@ -69,6 +69,11 @@ DATASET_LIST = [
        "module_dir": "persistent_poverty",
        "class_name": "PersistentPovertyETL",
    },
+    {
+        "name": "ejscreen_areas_of_concern",
+        "module_dir": "ejscreen_areas_of_concern",
+        "class_name": "EJSCREENAreasOfConcernETL",
+    },
 ]
 CENSUS_INFO = {
    "name": "census",
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -8,7 +8,6 @@ from data_pipeline.etl.score import constants

 from data_pipeline.utils import get_module_logger

-
 logger = get_module_logger(__name__)


@ -231,6 +230,7 @@ class ScoreETL(ExtractTransformLoad):
            self.census_acs_median_incomes_df,
            self.national_risk_index_df,
        ]
+
        census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)

        # Join all the data sources that use census tracts
@ -312,10 +312,12 @@ class ScoreETL(ExtractTransformLoad):
            field_names.UNEMPLOYMENT_FIELD,
            field_names.HT_INDEX_FIELD,
        ]
+
        non_numeric_columns = [
            self.GEOID_FIELD_NAME,
            field_names.PERSISTENT_POVERTY_FIELD,
        ]
+
        columns_to_keep = non_numeric_columns + numeric_columns
        df = df[columns_to_keep]

--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
@ -6,7 +6,7 @@ from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)


-class EJScreenETL(ExtractTransformLoad):
+class EJSCREENETL(ExtractTransformLoad):
    def __init__(self):
        self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
        self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/README.md
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/README.md
@ -0,0 +1,3 @@
+# EJSCREEN Areas of Concern Data
+
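+Note: this dataset is not publicly available, so the source data file must be stored locally for this ETL to run. The ETL looks for `ejscreen_areas_of_concerns_indicators.csv` in the `local` directory under the pipeline's data path; if the file is absent, the ETL logs a message and skips loading and saving.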
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/init.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/init.py
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
@ -0,0 +1,73 @@
+import pandas as pd
+
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
+    # Note: while we normally set these properties in `__init__`,
+    # we are setting them as class properties here so they can be accessed by the
+    # class method `ejscreen_areas_of_concern_data_exists`.
+    LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
+    EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
+        LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
+    )
+
+    def __init__(self):
+        self.OUTPUT_PATH = (
+            self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
+        )
+
+        # TO DO: Load from actual source; the issue is that this dataset is not public for now
+        self.df: pd.DataFrame
+
+    @classmethod
+    def ejscreen_areas_of_concern_data_exists(cls):
+        """Check whether or not the EJSCREEN areas of concern data exists.
+
+        Note: this data is provided privately and is not currently publicly available.
+
+        To enable the ETL code for EJSCREEN AoCs to run appropriately whether or not the person
+        running it has access to that data, this method checks whether the source file exists.
+
+        If it does exist, code can and should include this data. If it does not exist, code should
+        not reference this data.
+
+        """
+        return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file()
+
+    def extract(self) -> None:
+        if self.ejscreen_areas_of_concern_data_exists():
+            logger.info("Loading EJSCREEN Areas of Concern Data Locally")
+            self.df = pd.read_csv(
+                filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
+                dtype={
+                    self.GEOID_FIELD_NAME: "string",
+                },
+                low_memory=False,
+            )
+        else:
+            logger.info(
+                "EJSCREEN areas of concern data does not exist locally. Not loading the data."
+            )
+
+    def transform(self) -> None:
+        logger.info("Transforming EJSCREEN Areas of Concern Data")
+
+        # TO DO: As a one-off, all the processing was done in a separate notebook.
+        # It can be added here in a future PR.
+        pass
+
+    def load(self) -> None:
+        if self.ejscreen_areas_of_concern_data_exists():
+            logger.info("Saving EJSCREEN Areas of Concern Data")
+            # write nationwide csv
+            self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+            self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False)
+
+        else:
+            logger.info(
+                "EJSCREEN areas of concern data does not exist locally. Not saving the data."
+            )
--- a/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/ejscreen_load.ipynb
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@ -34,7 +34,9 @@
    "\n",
    "from data_pipeline.utils import remove_all_from_dir, get_excel_column_name\n",
    "from data_pipeline.etl.sources.census.etl_utils import get_state_information\n",
-    "\n",
+    "from data_pipeline.etl.sources.ejscreen_areas_of_concern.etl import (\n",
+    "    EJSCREENAreasOfConcernETL,\n",
+    ")\n",
    "\n",
    "# Turn on TQDM for pandas so that we can have progress bars when running `apply`.\n",
    "tqdm_notebook.pandas()"
@ -77,6 +79,14 @@
    "CEJST_PERCENTILE_FIELD = \"cejst_percentile\"\n",
    "CEJST_PRIORITY_COMMUNITY_FIELD = \"cejst_priority_community\"\n",
    "\n",
+    "LIFE_EXPECTANCY_FIELD = \"Life expectancy (years)\"\n",
+    "HEALTH_INSURANCE_FIELD = (\n",
+    "    \"Current lack of health insurance among adults aged 18-64 years\"\n",
+    ")\n",
+    "BAD_HEALTH_FIELD = (\n",
+    "    \"Physical health not good for >=14 days among adults aged >=18 years\"\n",
+    ")\n",
+    "\n",
    "# Define some suffixes\n",
    "POPULATION_SUFFIX = \" (priority population)\""
   ]
@ -108,6 +118,55 @@
    "cejst_df.head()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b1083e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load EJSCREEN Areas of Concern data.\n",
+    "\n",
+    "# Load EJ Screen Areas of Concern\n",
+    "# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n",
+    "ejscreen_areas_of_concern_df: pd.DataFrame = None\n",
+    "\n",
+    "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
+    "    print(\"Loading EJSCREEN Areas of Concern data for score pipeline.\")\n",
+    "    ejscreen_areas_of_concern_csv = (\n",
+    "        DATA_DIR / \"dataset\" / \"ejscreen_areas_of_concern\" / \"usa.csv\"\n",
+    "    )\n",
+    "    ejscreen_areas_of_concern_df = pd.read_csv(\n",
+    "        ejscreen_areas_of_concern_csv,\n",
+    "        dtype={GEOID_FIELD_NAME: \"string\"},\n",
+    "        low_memory=False,\n",
+    "    )\n",
+    "else:\n",
+    "    print(\n",
+    "        \"EJSCREEN areas of concern data does not exist locally. Not attempting to load data into comparison tool.\"\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fec0ed63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Merge EJSCREEN AoCs into CEJST data.\n",
+    "# Before attempting, check whether or not the EJSCREEN AoC data is available locally.\n",
+    "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
+    "    # If available, merge EJSCREEN AoC data into CBG dfs.\n",
+    "    cejst_df = cejst_df.merge(\n",
+    "        ejscreen_areas_of_concern_df, on=GEOID_FIELD_NAME, how=\"outer\"\n",
+    "    )\n",
+    "else:\n",
+    "    pass\n",
+    "\n",
+    "cejst_df.head()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -343,11 +402,6 @@
    "            other_census_tract_fields_to_keep=[],\n",
    "        ),\n",
    "        Index(\n",
-    "            method_name=\"Poverty\",\n",
-    "            priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
-    "            other_census_tract_fields_to_keep=[],\n",
-    "        ),\n",
-    "        Index(\n",
    "            method_name=\"Persistent Poverty (CBG)\",\n",
    "            priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
    "            other_census_tract_fields_to_keep=[],\n",
@ -355,6 +409,34 @@
    "    ]\n",
    ")\n",
    "\n",
+    "\n",
+    "ejscreen_areas_of_concern_census_block_group_indices = [\n",
+    "    Index(\n",
+    "        method_name=\"EJSCREEN Areas of Concern, National, 80th percentile\",\n",
+    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 80th percentile (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"EJSCREEN Areas of Concern, National, 90th percentile\",\n",
+    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 90th percentile (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"EJSCREEN Areas of Concern, National, 95th percentile\",\n",
+    "        priority_communities_field=\"EJSCREEN Areas of Concern, National, 95th percentile (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "# Before EJSCREEN AoC indicators are included, check whether or not the EJSCREEN AoC data is available locally.\n",
+    "if EJSCREENAreasOfConcernETL.ejscreen_areas_of_concern_data_exists():\n",
+    "    # Add EJSCREEN AoCs to all of the CBG indices.\n",
+    "    census_block_group_indices.extend(\n",
+    "        ejscreen_areas_of_concern_census_block_group_indices\n",
+    "    )\n",
+    "else:\n",
+    "    pass\n",
+    "\n",
    "census_tract_indices = [\n",
    "    Index(\n",
    "        method_name=\"Persistent Poverty\",\n",
@ -620,6 +702,17 @@
    "    for index in census_block_group_indices + census_tract_indices\n",
    "]\n",
    "\n",
+    "# Convert the EJSCREEN Areas of Concern index fields to boolean (missing values become False).\n",
+    "for field_to_analyze in fields_to_analyze:\n",
+    "    if \"Areas of Concern\" in field_to_analyze:\n",
+    "        print(f\"Converting {field_to_analyze} to boolean.\")\n",
+    "\n",
+    "        merged_df[field_to_analyze] = merged_df[field_to_analyze].fillna(\n",
+    "            value=0\n",
+    "        )\n",
+    "        merged_df[field_to_analyze] = merged_df[field_to_analyze].astype(bool)\n",
+    "\n",
+    "\n",
    "state_fips_codes = get_state_information(DATA_DIR)\n",
    "\n",
    "merged_with_state_information_df = merged_df.merge(\n",
@ -835,6 +928,9 @@
    "    \"Unemployed civilians (percent)\",\n",
    "    \"Median household income in the past 12 months\",\n",
    "    URBAN_HEURISTIC_FIELD,\n",
+    "    LIFE_EXPECTANCY_FIELD,\n",
+    "    HEALTH_INSURANCE_FIELD,\n",
+    "    BAD_HEALTH_FIELD,\n",
    "]\n",
    "\n",
    "for (index_a, index_b) in itertools.combinations(census_block_group_indices, 2):\n",
@ -1495,7 +1591,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -1509,7 +1605,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.5"
+   "version": "3.9.6"
  }
 },
 "nbformat": 4,
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@ -56,7 +56,9 @@ POVERTY_LESS_THAN_100_FPL_PERCENTILE_FIELD = (
    "Percent of individuals < 100% Federal Poverty Line (percentile)"
 )
 MEDIAN_INCOME_PERCENT_AMI_FIELD = "Median household income (% of AMI)"
-MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = "Median household income (% of AMI) (percentile)"
+MEDIAN_INCOME_PERCENT_AMI_PERCENTILE_FIELD = (
+    "Median household income (% of AMI) (percentile)"
+)
 STATE_MEDIAN_INCOME_FIELD = (
    "Median household income (State; 2019 inflation-adjusted dollars)"
 )
@ -153,3 +155,42 @@ OVER_64_FIELD = "Individuals over 64 years old"

 # Urban Rural Map
 URBAN_HERUISTIC_FIELD = "Urban Heuristic Flag"
+
+
+# EJSCREEN Areas of Concern
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, National, 70th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, National, 75th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, National, 80th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, National, 85th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, National, 90th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, National, 95th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, State, 70th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, State, 75th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, State, 80th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, State, 85th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, State, 90th percentile (communities)"
+)
+EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+    "EJSCREEN Areas of Concern, State, 95th percentile (communities)"
+)
--- a/data/data-pipeline/data_pipeline/score/score_c.py
+++ b/data/data-pipeline/data_pipeline/score/score_c.py
@ -10,7 +10,7 @@ logger = get_module_logger(__name__)

 class ScoreC(Score):
    def __init__(self, df: pd.DataFrame) -> None:
-        Bucket = namedtuple('Bucket', ['name', 'fields'])
+        Bucket = namedtuple(typename="Bucket", field_names=["name", "fields"])

        self.BUCKET_SOCIOECONOMIC = Bucket(
            field_names.C_SOCIOECONOMIC,
@ -20,15 +20,15 @@ class ScoreC(Score):
                field_names.HIGH_SCHOOL_ED_FIELD,
                field_names.UNEMPLOYMENT_FIELD,
                field_names.HT_INDEX_FIELD,
-            ]
-        ) 
+            ],
+        )
        self.BUCKET_SENSITIVE = Bucket(
            field_names.C_SENSITIVE,
            [
                field_names.UNDER_5_FIELD,
                field_names.OVER_64_FIELD,
                field_names.LINGUISTIC_ISO_FIELD,
-            ]
+            ],
        )
        self.BUCKET_ENVIRONMENTAL = Bucket(
            field_names.C_ENVIRONMENTAL,
@ -38,7 +38,7 @@ class ScoreC(Score):
                field_names.NPL_FIELD,
                field_names.WASTEWATER_FIELD,
                field_names.LEAD_PAINT_FIELD,
-            ]
+            ],
        )
        self.BUCKET_EXPOSURES = Bucket(
            field_names.C_EXPOSURES,
@ -63,7 +63,7 @@ class ScoreC(Score):
    def add_columns(self) -> pd.DataFrame:
        logger.info("Adding Score C")
        # Average all the percentile values in each bucket into a single score for each of the four buckets.
-        
+
        # TODO just use the percentile fields in the list instead
        for bucket in self.BUCKETS:
            fields_to_average = []