Add EJSCREEN Areas of Concern (#843)

* Adding ej screen areas of concern * Uses it where user has local files, but not otherwise Co-authored-by: VincentLaUSDS <vincent.la@omb.eop.gov>
2025-07-26 19:11:16 -07:00 · 2021-11-02 15:38:42 -04:00 · 2021-11-02 15:38:42 -04:00 · 1d541be447
commit 1d541be447
parent 1795be6cb4
10 changed files with 2546 additions and 18 deletions
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@ -12,7 +12,7 @@ DATASET_LIST = [
    {
        "name": "ejscreen",
        "module_dir": "ejscreen",
-        "class_name": "EJScreenETL",
+        "class_name": "EJSCREENETL",
    },
    {
        "name": "housing_and_transportation",
@ -69,6 +69,11 @@ DATASET_LIST = [
        "module_dir": "persistent_poverty",
        "class_name": "PersistentPovertyETL",
    },
+    {
+        "name": "ejscreen_areas_of_concern",
+        "module_dir": "ejscreen_areas_of_concern",
+        "class_name": "EJSCREENAreasOfConcernETL",
+    },
 ]
 CENSUS_INFO = {
    "name": "census",
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -8,7 +8,6 @@ from data_pipeline.etl.score import constants

 from data_pipeline.utils import get_module_logger

-
 logger = get_module_logger(__name__)


@ -231,6 +230,7 @@ class ScoreETL(ExtractTransformLoad):
            self.census_acs_median_incomes_df,
            self.national_risk_index_df,
        ]
+
        census_block_group_df = self._join_cbg_dfs(census_block_group_dfs)

        # Join all the data sources that use census tracts
@ -312,10 +312,12 @@ class ScoreETL(ExtractTransformLoad):
            field_names.UNEMPLOYMENT_FIELD,
            field_names.HT_INDEX_FIELD,
        ]
+
        non_numeric_columns = [
            self.GEOID_FIELD_NAME,
            field_names.PERSISTENT_POVERTY_FIELD,
        ]
+
        columns_to_keep = non_numeric_columns + numeric_columns
        df = df[columns_to_keep]

--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
@ -6,7 +6,7 @@ from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)


-class EJScreenETL(ExtractTransformLoad):
+class EJSCREENETL(ExtractTransformLoad):
    def __init__(self):
        self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2019/EJSCREEN_2019_StatePctile.csv.zip"
        self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_2019_StatePctiles.csv"
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/README.md
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/README.md
@ -0,0 +1,3 @@
+# EJSCREEN Areas of Concern Data
+
+Note: this dataset is not public, so the source data file must be stored locally for this ETL to run.
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/init.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/init.py
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py
@ -0,0 +1,73 @@
+import pandas as pd
+
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
+    # Note: while we normally set these properties in `__init__`,
+    # we are setting them as class properties here so they can be accessed by the
+    # class method `ejscreen_areas_of_concern_data_exists`.
+    LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
+    EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
+        LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
+    )
+
+    def __init__(self):
+        self.OUTPUT_PATH = (
+            self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
+        )
+
+        # TODO: Load from the actual source; this dataset is not public for now.
+        self.df: pd.DataFrame
+
+    @classmethod
+    def ejscreen_areas_of_concern_data_exists(cls):
+        """Check whether or not the EJSCREEN areas of concern data exists.
+
+        Note: this data is provided privately and is not currently publicly available.
+
+        To enable the ETL code for EJSCREEN AoCs to run appropriately whether or not the person
+        running it has access to that data, the following method checks whether the source file exists.
+
+        If it does exist, code can and should include this data. If it does not exist, code should
+        not reference this data.
+
+        """
+        return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file()
+
+    def extract(self) -> None:
+        if self.ejscreen_areas_of_concern_data_exists():
+            logger.info("Loading EJSCREEN Areas of Concern Data Locally")
+            self.df = pd.read_csv(
+                filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
+                dtype={
+                    self.GEOID_FIELD_NAME: "string",
+                },
+                low_memory=False,
+            )
+        else:
+            logger.info(
+                "EJSCREEN areas of concern data does not exist locally. Not loading the data."
+            )
+
+    def transform(self) -> None:
+        logger.info("Transforming EJSCREEN Areas of Concern Data")
+
+        # TODO: As a one-off, all the processing was done in a separate notebook.
+        # It can be added here in a future PR.
+        pass
+
+    def load(self) -> None:
+        if self.ejscreen_areas_of_concern_data_exists():
+            logger.info("Saving EJSCREEN Areas of Concern Data")
+            # write nationwide csv
+            self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+            self.df.to_csv(self.OUTPUT_PATH / "usa.csv", index=False)
+
+        else:
+            logger.info(
+                "EJSCREEN areas of concern data does not exist locally. Not saving the data."
+            )