Merge branch 'main' into esfoobar-usds/1062-implement-changes-export-files

2025-07-28 08:51:16 -07:00 · 2022-01-03 15:53:41 -05:00 · 2022-01-03 15:53:41 -05:00 · 006493ab24
commit 006493ab24
parent 45b33ea55f a4137fdc98
18 changed files with 577 additions and 172 deletions
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@ -89,11 +89,21 @@ DATASET_LIST = [
        "module_dir": "hud_recap",
        "class_name": "HudRecapETL",
    },
+    {
+        "name": "energy_definition_alternative_draft",
+        "module_dir": "energy_definition_alternative_draft",
+        "class_name": "EnergyDefinitionAlternativeDraft",
+    },
    {
        "name": "tree_equity_score",
        "module_dir": "tree_equity_score",
        "class_name": "TreeEquityScoreETL",
    },
+    {
+        "name": "michigan_ejscreen",
+        "module_dir": "michigan_ejscreen",
+        "class_name": "MichiganEnviroScreenETL",
+    },
 ]
 CENSUS_INFO = {
    "name": "census",
--- a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/README.md
+++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/README.md
--- a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/init.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/init.py
--- a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py
@ -0,0 +1,113 @@
+from pathlib import Path
+import pandas as pd
+
+from data_pipeline.config import settings
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.score import field_names
+from data_pipeline.utils import get_module_logger, unzip_file_from_url
+
+logger = get_module_logger(__name__)
+
+
+class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):  # ETL: draft alternative definition of energy-related communities
+    def __init__(self):
+        self.DEFINITION_ALTERNATIVE_FILE_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/alternative DAC definition.csv.zip"  # zip archive; extract() reads "J40 alternative DAC definition.csv" from it
+        )
+
+        self.OUTPUT_PATH: Path = (  # load() writes usa.csv into this directory
+            self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
+        )
+
+        self.TRACT_INPUT_COLUMN_NAME = "GEOID"  # source column; renamed to GEOID_TRACT_FIELD_NAME in transform()
+        self.ALTERNATIVE_DEFINITION_INPUT_COLUMN_NAME = "J40_DAC"  # source column; renamed and cast to bool in transform()
+
+        # Columns written by load(), in output order
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,
+            field_names.COAL_EMPLOYMENT,
+            field_names.OUTAGE_EVENTS,
+            field_names.HOMELESSNESS,
+            field_names.DISABLED_POPULATION,
+            field_names.OUTAGE_DURATION,
+            field_names.JOB_ACCESS,
+            field_names.FOSSIL_ENERGY_EMPLOYMENT,
+            field_names.FOOD_DESERT,
+            field_names.INCOMPLETE_PLUMBING,
+            field_names.NON_GRID_CONNECTED_HEATING_FUEL,
+            field_names.PARKS,
+            field_names.GREATER_THAN_30_MIN_COMMUTE,
+            field_names.INTERNET_ACCESS,
+            field_names.MOBILE_HOME,
+            field_names.SINGLE_PARENT,
+            field_names.TRANSPORTATION_COSTS,
+        ]
+
+        self.df: pd.DataFrame  # annotation only; the attribute is first assigned in extract()
+
+    def extract(self) -> None:
+        logger.info("Starting data download.")
+        # Unpack the archive under TMP_PATH; the CSV is then read from that same subdirectory.
+        unzip_file_from_url(
+            file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
+            download_path=self.TMP_PATH,
+            unzipped_file_path=self.TMP_PATH
+            / "energy_definition_alternative_draft",
+        )
+
+        self.df = pd.read_csv(
+            filepath_or_buffer=self.TMP_PATH
+            / "energy_definition_alternative_draft"
+            / "J40 alternative DAC definition.csv",
+            # Read the tract ID as a string so every digit is preserved (numeric parsing would drop leading zeros).
+            dtype={
+                self.TRACT_INPUT_COLUMN_NAME: "string",
+            },
+            low_memory=False,
+        )
+
+    def transform(self) -> None:
+        logger.info("Starting transforms.")
+        # Map the source column names onto the shared field_names constants.
+        self.df = self.df.rename(
+            columns={
+                self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
+                self.ALTERNATIVE_DEFINITION_INPUT_COLUMN_NAME: field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE,
+                "Coal_Emp_Ratio": field_names.COAL_EMPLOYMENT,
+                "COUNT_Outage_Events": field_names.OUTAGE_EVENTS,
+                "den_hmls_pop": field_names.HOMELESSNESS,
+                "disability_pct": field_names.DISABLED_POPULATION,
+                "Duration_in_Minutes": field_names.OUTAGE_DURATION,
+                "emp_ovrll_ndx": field_names.JOB_ACCESS,
+                "FE_Emp_Ratio": field_names.FOSSIL_ENERGY_EMPLOYMENT,
+                "Food_LAhalfand10": field_names.FOOD_DESERT,
+                "incomplete_plumbing_pct": field_names.INCOMPLETE_PLUMBING,
+                "nongrid_heat_pct": field_names.NON_GRID_CONNECTED_HEATING_FUEL,
+                "num_parks": field_names.PARKS,
+                "Per_MoT_Dur_gte30": field_names.GREATER_THAN_30_MIN_COMMUTE,
+                "Per_NoInt": field_names.INTERNET_ACCESS,
+                "population_mobile_home_pct": field_names.MOBILE_HOME,
+                "single_parent_pct": field_names.SINGLE_PARENT,
+                "t_ami": field_names.TRANSPORTATION_COSTS,
+            }
+        )
+
+        # Cast the definition flag to bool. NOTE(review): astype('bool') turns NaN into True - confirm the source column has no missing values.
+        self.df[field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE] = \
+            self.df[field_names.ENERGY_RELATED_COMMUNITIES_DEFINITION_ALTERNATIVE
+            ].astype('bool')
+
+    def validate(self) -> None:
+        logger.info("Validating data")
+        # No checks are implemented yet; this method only logs.
+        pass
+
+    def load(self) -> None:
+        logger.info("Saving CSV")
+        # Create the output directory if needed, then write only COLUMNS_TO_KEEP.
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
+        )
--- a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/README.md
+++ b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/README.md
@ -0,0 +1,32 @@
+# Michigan EJSCREEN
+
+The Michigan EJSCREEN description and publication can be found [here](https://deepblue.lib.umich.edu/bitstream/handle/2027.42/149105/AssessingtheStateofEnvironmentalJusticeinMichigan_344.pdf).
+
+
+#### Some notes about the input source data column fields:
+
+There are two pertinent columns used - `EJ_Score_Cal_Min` and `Pct_CalMin` - that are referenced in the source codebase. To our knowledge, these columns reflect the adoption and the comparative quantitative analysis from two different approaches. The "Cal" prefix reflects CalEPA's CalEnviroScreen, which omits racial and ethnic data. The "Min" abbreviation reflects the Minnesota Pollution Control Agency’s (MPCA) approach to including this data. Please see pages 37-39 in the above reference for further details. Briefly, the authors adopted a combination of both the CalEnviroScreen's methodology and the MPCA's methodology. The scores and percentile rankings in the input data source sheet are the same as those reflected in the cited report, included in Appendix I and in the latest version of the mapping [tool](https://www.arcgis.com/apps/webappviewer/index.html?id=dc4f0647dda34959963488d3f519fd24).
+
+#### Additional information on the adoption of the methodology from CalEnviroScreen and MPCA
+
+Both CalEPA's CalEnviroScreen and the Minnesota Pollution Control Agency’s (MPCA) methodologies are adopted and used both for comparative purposes and for the identification of areas of concern. The latter, in particular, is used to identify tribal areas. According to the authors, to make permitting decisions, MPCA assesses whether the community, measured at the census tract level, fits at least one of the following criteria:
+
+* Percent of the non-white population is at least 50%
+* “More than 40% of the households have a household income of less than 185% of the federal
+poverty level (FPL)”
+* If the facility is within the boundaries of a “tribal community” (MPCA 2015).
+
+Furthermore, the authors state that the MPCA methodology included data on tribal community boundaries, as defined by the US Census Bureau, and data on poverty, race, and ethnicity. However, the authors also note that the MPCA's methodology does not rank any census tracts.
+
+In addition, although the CalEPA does not analyze data on race and ethnicity in CalEnviroScreen, the researchers incorporated race and ethnicity data in their assessment of environmental justice in Michigan. To justify the incorporation of race and ethnic data, the team compared the tract rankings with and without the data.
+
+A Spearman's rank-order correlation was calculated for the 2,741 census tracts within Michigan with the two variables being environmental justice scores using the CalEPA methodology 1) without racial and ethnic data and 2) with racial and ethnic data. These scores were then ranked and the Spearman rank-order correlation was calculated. These statistics are not included in the output of this ETL process. Please see Chapter 5 and Chapter 6 for further details.
+
+Finally, please see pages 104-106 for details on the justification and details for the applicability of the upper quartile as a means to identify communities in Michigan with the potential for environmental justice concerns. It should also be noted that, according to the authors, CalEPA also designates the top 25% scoring tracts as “disadvantaged communities”.
+
+Sources:
+
+* Minnesota Pollution Control Agency. (2015, December 15). Environmental Justice Framework Report.
+Retrieved from https://www.pca.state.mn.us/sites/default/files/p-gen5-05.pdf.
+
+* Faust, J., L. August, K. Bangia, V. Galaviz, J. Leichty, S. Prasad… and L. Zeise. (2017, January). Update to the California Communities Environmental Health Screening Tool CalEnviroScreen 3.0. Retrieved from OEHHA website: https://oehha.ca.gov/media/downloads/calenviroscreen/report/ces3report.pdf
--- a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/init.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/init.py
--- a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py
@ -0,0 +1,69 @@
+import pandas as pd
+
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.utils import get_module_logger
+from data_pipeline.score import field_names
+from data_pipeline.config import settings
+
+logger = get_module_logger(__name__)
+
+
+class MichiganEnviroScreenETL(ExtractTransformLoad):
+    """Michigan EJ Screen class that ingests dataset represented
+    here: https://www.arcgis.com/apps/webappviewer/index.html?id=dc4f0647dda34959963488d3f519fd24
+    This class ingests the data presented in "Assessing the State of Environmental
+    Justice in Michigan." Please see the README in this module for further details.
+    """
+
+    def __init__(self):
+        self.MICHIGAN_EJSCREEN_S3_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/michigan_ejscore_12212021.csv"
+        )
+
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"  # load() writes michigan_ejscreen.csv into this directory
+        self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75  # percentile cutoff (>=) used for the priority flag in transform()
+
+        self.COLUMNS_TO_KEEP = [
+            self.GEOID_TRACT_FIELD_NAME,
+            field_names.MICHIGAN_EJSCREEN_SCORE_FIELD,
+            field_names.MICHIGAN_EJSCREEN_PERCENTILE_FIELD,
+            field_names.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_FIELD,
+        ]
+
+        self.df: pd.DataFrame  # annotation only; the attribute is first assigned in extract()
+
+    def extract(self) -> None:
+        logger.info("Downloading Michigan EJSCREEN Data")
+        self.df = pd.read_csv(
+            filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,  # the CSV is read straight from the URL
+            dtype={"GEO_ID": "string"},  # keep the tract ID as text rather than a number
+            low_memory=False,
+        )
+
+    def transform(self) -> None:
+        logger.info("Transforming Michigan EJSCREEN Data")
+        # Rename the source columns in place to the shared field names.
+        self.df.rename(
+            columns={
+                "GEO_ID": self.GEOID_TRACT_FIELD_NAME,
+                "EJ_Score_Cal_Min": field_names.MICHIGAN_EJSCREEN_SCORE_FIELD,
+                "Pct_CalMin": field_names.MICHIGAN_EJSCREEN_PERCENTILE_FIELD,
+            },
+            inplace=True,
+        )
+        # Flag tracts whose percentile is at or above the threshold (the top quartile) as priority communities.
+        # See pp. 104-109 of the source report:
+        # https://deepblue.lib.umich.edu/bitstream/handle/2027.42/149105/AssessingtheStateofEnvironmentalJusticeinMichigan_344.pdf
+        self.df[field_names.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_FIELD] = (
+            self.df[field_names.MICHIGAN_EJSCREEN_PERCENTILE_FIELD]
+            >= self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD  # assumes the percentile column is on a 0-1 scale - TODO confirm
+        )
+
+    def load(self) -> None:
+        logger.info("Saving Michigan Environmental Screening Tool to CSV")
+        # Create the output directory if needed, then write only COLUMNS_TO_KEEP as one CSV.
+        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
+        self.df[self.COLUMNS_TO_KEEP].to_csv(
+            self.CSV_PATH / "michigan_ejscreen.csv", index=False
+        )
--- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
@ -7,6 +7,18 @@ logger = get_module_logger(__name__)


 class TreeEquityScoreETL(ExtractTransformLoad):
+    """Tree equity score methodology: https://www.treeequityscore.org/methodology/
+    A lower Tree Equity Score indicates a greater priority for closing the tree canopy gap
+    In order to estimate a general number of trees associated with an increase in tree
+    canopy, the authors utilize a basic multiplier of 600 sq-ft (55.74 sq-m) of canopy area
+    per urban tree assuming a medium-size urban tree crown width of 25-30 ft.
+    Sources:
+        1. Tree canopy cover. High resolution tree canopy where available.
+        In the event high-resolution tree canopy is not available, defer to the National Land Cover Database.
+        2. Census American Community Survey (ACS) 2018 5-year Block Group population estimates.
+        3. Census ACS 2018 5-year city and block group Median Income estimates.
+    """
+
    def __init__(self):
        self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
        self.TES_CSV = self.TMP_PATH / "tes_2021_data.csv"
@ -83,8 +95,42 @@ class TreeEquityScoreETL(ExtractTransformLoad):
            pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
        )

+        # rename the source "geoid" column to the block group ID field
+        self.df.rename(
+            # Block group ID delegated to attribute in superclass
+            columns={"geoid": ExtractTransformLoad.GEOID_FIELD_NAME},
+            inplace=True,
+        )
+
    def load(self) -> None:
-        logger.info("Saving Tree Equity Score GeoJSON")
+        logger.info("Saving Tree Equity Score CSV")
        # write nationwide csv
        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
-        self.df.to_file(self.CSV_PATH / "tes_conus.geojson", driver="GeoJSON")
+        self.df = self.df[
+            [
+                ExtractTransformLoad.GEOID_FIELD_NAME,
+                "total_pop",  # Total Population according to ACS Estimates
+                "state",
+                "county",
+                "dep_ratio",  # Dependent ratio
+                "child_perc",  # Children (Age 0 -17) (ACS 2014 - 2018)
+                "seniorperc",  # Seniors (Age 65+) (ACS 2014 - 2018)
+                "treecanopy",  # Tree canopy cover
+                "area",  # Source: https://www.fs.fed.us/nrs/pubs/gtr/gtr_nrs200.pdf
+                "source",
+                "avg_temp",  # Average Temperature from USGS Earth Explorer
+                "ua_name",
+                "incorpname",  # Incorporated place name
+                "congressio",  # Congressional District
+                "biome",
+                "bgpopdense",
+                "popadjust",  # Adjusted population estimate
+                "tc_gap",  # Tree canopy gap
+                "tc_goal",  # Tree canopy goal
+                "priority",  # Priority community according to the index
+                "tes",  # Tree equity score
+                "tesctyscor",  # Tree equity score for the county
+                "geometry",  # Block group geometry coordinates
+            ]
+        ]
+        self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)