From a98ea35f745f1b647d2649287fd76ca19880201d Mon Sep 17 00:00:00 2001
From: Saran Ahluwalia <94847739+saran-ahluwalia@users.noreply.github.com>
Date: Fri, 14 Jan 2022 13:26:48 -0500
Subject: [PATCH] Maryland EJSCREEN Addition to comparison tool (#1143)

* finalized

* cleanup notebook

* cleanup

* run black
---
 .../data_pipeline/etl/constants.py            |   5 +
 .../etl/sources/maryland_ejscreen/README.md   |  23 ++++
 .../etl/sources/maryland_ejscreen/__init__.py |   0
 .../etl/sources/maryland_ejscreen/etl.py      | 113 ++++++++++++++++++
 .../ipython/scoring_comparison.ipynb          |  28 ++++-
 .../data_pipeline/score/field_names.py        |   7 ++
 6 files changed, 174 insertions(+), 2 deletions(-)
 create mode 100644 data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/README.md
 create mode 100644 data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/__init__.py
 create mode 100644 data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py

diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py
index 8e5a832d..e3cae4db 100644
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@@ -104,6 +104,11 @@ DATASET_LIST = [
         "module_dir": "michigan_ejscreen",
         "class_name": "MichiganEnviroScreenETL",
     },
+    {
+        "name": "maryland_ejscreen",
+        "module_dir": "maryland_ejscreen",
+        "class_name": "MarylandEJScreenETL",
+    },
 ]
 CENSUS_INFO = {
     "name": "census",
diff --git a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/README.md b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/README.md
new file mode 100644
index 00000000..b64fd996
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/README.md
@@ -0,0 +1,23 @@
+# Maryland EJSCREEN
+
+The Maryland EJSCREEN application and tool can be found [here](https://p1.cgis.umd.edu/mdejscreen/).
+
+### Methodology Summary
+
+According to the [documentation](https://p1.cgis.umd.edu/mdejscreen/help.html):
+
+There exist two data categories: Pollution Burden and Population Characteristics.
+
+There are two indicators within Pollution Burden: Exposure and Socioeconomic. Within Population Characteristics, there are two indicators: Sensitive and Environmental Effects. Each indicator contains several relevant covariates and an averaged score.
+
+The two "Pollution Burden" average scores are then averaged together and the result is multiplied by the average of the "Population Characteristics" categories to get the total EJ Score for each tract.
+
+For each indicator, the percentile is given. For example, an indicator value of 0.9 for "Asthma Emergency Discharges" places a tract in the 90th percentile, which means only 10% of tracts in Maryland have higher values. EJ Scores near 1 represent areas of the greatest environmental justice concern.
+
+A study of Bladensburg, MD - located in Prince George’s County - demonstrated the application of the MD EJSCREEN (Driver et al., 2019). According to the study, 20.1% of Bladensburg community members live below the federal poverty line. Using the MD EJSCREEN, the researchers found that Bladensburg has an EJ score higher than 99% of the census tracts in Prince George’s County, indicating a higher prevalence of environmental hazards in the region.
+
+Furthermore, it was determined that Bladensburg residents are at a higher risk of developing cancer due to air pollution than 90–100% of the census tracts in the state or county.
+
+Source:
+
+Driver, A.; Mehdizadeh, C.; Bara-Garcia, S.; Bodenreider, C.; Lewis, J.; Wilson, S. Utilization of the Maryland Environmental Justice Screening Tool: A Bladensburg, Maryland Case Study. Int. J. Environ. Res. Public Health 2019, 16, 348. 
\ No newline at end of file
diff --git a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/__init__.py b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py
new file mode 100644
index 00000000..b59e3209
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py
@@ -0,0 +1,113 @@
+from glob import glob
+import geopandas as gpd
+import pandas as pd
+
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.utils import get_module_logger
+from data_pipeline.score import field_names
+from data_pipeline.config import settings
+
+logger = get_module_logger(__name__)
+
+
+class MarylandEJScreenETL(ExtractTransformLoad):
+    """Maryland EJSCREEN class that ingests dataset represented
+    here: https://p1.cgis.umd.edu/mdejscreen/help.html
+    Please see the README in this module for further details.
+    """
+
+    def __init__(self):
+        self.MARYLAND_EJSCREEN_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
+        )
+
+        self.SHAPE_FILES_PATH = self.TMP_PATH / "mdejscreen"
+        self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"
+
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.MARYLAND_EJSCREEN_SCORE_FIELD,
+            field_names.MARYLAND_EJSCREEN_BURDENED_THRESHOLD_FIELD,
+        ]
+
+        self.df: pd.DataFrame
+
+    def extract(self) -> None:
+        logger.info("Downloading Maryland EJSCREEN Data")
+        super().extract(
+            self.MARYLAND_EJSCREEN_URL,
+            self.TMP_PATH,
+        )
+
+    def transform(self) -> None:
+        logger.info("Transforming Maryland EJSCREEN Data")
+
+        list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
+
+        # Ignore counties because this is not the level of measurement
+        # that is consistent with our current scoring and ranking methodology.
+        dfs_list = [
+            gpd.read_file(f)
+            for f in list_of_files
+            if not f.endswith("CountiesEJScore.shp")
+        ]
+
+        # Set the Census tract as the index and drop the geometry column
+        # that produces the census tract boundaries.
+        # The latter is because Geopandas raises an exception if there
+        # are duplicate geometry columns.
+        # Moreover, since the unit of measurement is at the tract level
+        # we can consistently merge this with other datasets
+        dfs_list = [
+            df.set_index("Census_Tra").drop("geometry", axis=1)
+            for df in dfs_list
+        ]
+        # pylint: disable=unsubscriptable-object
+        self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
+
+        # Reset index so that we no longer have the tract as our index
+        self.df = self.df.reset_index()
+        # Coerce GEOID into integer.
+        # The only reason why this is done is because Maryland's GEOIDs start with
+        # "24". This is NOT standard practice and should never be done, as rightly pointed
+        # out by Lucas: "converting to int would lose the leading 0 and make this geoid invalid".
+        # pylint: disable=unsupported-assignment-operation, unsubscriptable-object
+        self.df["Census_Tra"] = (self.df["Census_Tra"]).astype(int)
+
+        # Drop the 10 census tracts that are zero: please see here:
+        # https://github.com/usds/justice40-tool/issues/239#issuecomment-995821572
+        self.df = self.df[self.df["Census_Tra"] != 0]
+        # Rename columns
+        self.df.rename(
+            columns={
+                "Census_Tra": self.GEOID_TRACT_FIELD_NAME,
+                "EJScore": field_names.MARYLAND_EJSCREEN_SCORE_FIELD,
+            },
+            inplace=True,
+        )
+
+        # This computational step will be used to establish a
+        # threshold for burden (see the >= 0.75 comparison below)
+        self.df[
+            field_names.MARYLAND_EJSCREEN_SCORE_FIELD
+            + field_names.PERCENTILE_FIELD_SUFFIX
+        ] = self.df[field_names.MARYLAND_EJSCREEN_SCORE_FIELD].rank(
+            pct=True, ascending=True
+        )
+
+        # An arbitrarily chosen threshold (the 75th percentile) is used in the comparison tool output
+        self.df[field_names.MARYLAND_EJSCREEN_BURDENED_THRESHOLD_FIELD] = (
+            self.df[
+                field_names.MARYLAND_EJSCREEN_SCORE_FIELD
+                + field_names.PERCENTILE_FIELD_SUFFIX
+            ]
+            >= 0.75
+        )
+
+    def load(self) -> None:
+        logger.info("Saving Maryland EJSCREEN CSV")
+        # write maryland tracts to csv
+        self.OUTPUT_CSV_PATH.mkdir(parents=True, exist_ok=True)
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            self.OUTPUT_CSV_PATH / "maryland.csv", index=False
+        )
diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
index 8211cff0..e8930f59 100644
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -276,6 +276,25 @@
     "mapping_inequality_df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c290efa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load Maryland EJScreen\n",
+    "maryland_ejscreen_data_path = (\n",
+    "    DATA_DIR / \"dataset\" / \"maryland_ejscreen\" / \"maryland.csv\"\n",
+    ")\n",
+    "maryland_ejscreen_df = pd.read_csv(\n",
+    "    maryland_ejscreen_data_path,\n",
+    "    dtype={ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: \"string\"},\n",
+    ")\n",
+    "\n",
+    "maryland_ejscreen_df.tail()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -329,6 +348,7 @@
     "    calenviroscreen_df,\n",
     "    persistent_poverty_df,\n",
     "    mapping_inequality_df,\n",
+    "    maryland_ejscreen_df,\n",
     "    energy_definition_alternative_draft_df,\n",
     "    michigan_ejscreen_df\n",
     "]\n",
@@ -456,6 +476,10 @@
     "            priority_communities_field=PERSISTENT_POVERTY_TRACT_LEVEL_FIELD,\n",
     "        ),\n",
     "        Index(\n",
+    "            method_name=\"Maryland EJSCREEN\",\n",
+    "            priority_communities_field=field_names.MARYLAND_EJSCREEN_BURDENED_THRESHOLD_FIELD,\n",
+    "        ),        \n",
+    "        Index(\n",
     "            method_name=field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,\n",
     "            priority_communities_field=field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,\n",
     "        ),\n",
@@ -1301,7 +1325,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -1315,7 +1339,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.6.2"
   }
  },
  "nbformat": 4,
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index 75090667..4d74f4f5 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -223,6 +223,13 @@ MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_FIELD: str = (
     "Michigan EJSCREEN Priority Community"
 )
 
+# Maryland EJSCREEN Data.
+MARYLAND_EJSCREEN_SCORE_FIELD: str = "Maryland Environmental Justice Score"
+
+MARYLAND_EJSCREEN_BURDENED_THRESHOLD_FIELD: str = (
+    "Maryland EJSCREEN Priority Community"
+)
+
 # Child Opportunity Index data
 # Summer days with maximum temperature above 90F.
 EXTREME_HEAT_FIELD = "Summer days above 90F"