Adding eamlis and fuds data to legacy pollution in score (#1832)

Update to add EAMLIS and FUDS data to score
Emma Nechamkin 2022-08-18 13:32:29 -04:00 committed by GitHub
commit cb4866b93f
14 changed files with 93 additions and 24 deletions


@@ -322,4 +322,16 @@ fields:
     format: percentage
   - score_name: Does the tract have at least 35 acres in it?
     label: Does the tract have at least 35 acres in it?
     format: bool
+  - score_name: Is there at least one Formerly Used Defense Site (FUDS) in the tract?
+    label: Is there at least one Formerly Used Defense Site (FUDS) in the tract?
+    format: bool
+  - score_name: Is there at least one abandoned mine in this census tract?
+    label: Is there at least one abandoned mine in this census tract?
+    format: bool
+  - score_name: There is at least one abandoned mine in this census tract and the tract is low income.
+    label: There is at least one abandoned mine in this census tract and the tract is low income.
+    format: bool
+  - score_name: There is at least one Formerly Used Defense Site (FUDS) in the tract and the tract is low income.
+    label: There is at least one Formerly Used Defense Site (FUDS) in the tract and the tract is low income.
+    format: bool


@@ -326,4 +326,16 @@ sheets:
     format: percentage
   - score_name: Does the tract have at least 35 acres in it?
     label: Does the tract have at least 35 acres in it?
     format: bool
+  - score_name: Is there at least one Formerly Used Defense Site (FUDS) in the tract?
+    label: Is there at least one Formerly Used Defense Site (FUDS) in the tract?
+    format: bool
+  - score_name: Is there at least one abandoned mine in this census tract?
+    label: Is there at least one abandoned mine in this census tract?
+    format: bool
+  - score_name: There is at least one abandoned mine in this census tract and the tract is low income.
+    label: There is at least one abandoned mine in this census tract and the tract is low income.
+    format: bool
+  - score_name: There is at least one Formerly Used Defense Site (FUDS) in the tract and the tract is low income.
+    label: There is at least one Formerly Used Defense Site (FUDS) in the tract and the tract is low income.
+    format: bool
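
The same four boolean entries are added to both the CSV and Excel codebook configurations above. As a rough sketch of how entries like these can drive output formatting, the snippet below parses one entry and casts the matching column; the loader, toy dataframe, and cast are illustrative assumptions, not the pipeline's actual downloadable-file code.

import pandas as pd
import yaml

# Assumed YAML structure, mirroring one of the entries added above.
snippet = """
fields:
  - score_name: Is there at least one abandoned mine in this census tract?
    label: Is there at least one abandoned mine in this census tract?
    format: bool
"""
config = yaml.safe_load(snippet)

# Toy score data keyed by the same long field name.
df = pd.DataFrame(
    {"Is there at least one abandoned mine in this census tract?": [1, 0, 1]}
)

# Render any column whose codebook entry declares format: bool as a true boolean.
for field in config["fields"]:
    if field["format"] == "bool":
        df[field["score_name"]] = df[field["score_name"]].astype(bool)

print(df.dtypes)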


@@ -93,21 +93,23 @@ def etl_runner(dataset_to_run: str = None) -> None:
         dataset for dataset in dataset_list if dataset["is_memory_intensive"]
     ]
 
-    logger.info("Running concurrent jobs")
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = {
-            executor.submit(_run_one_dataset, dataset=dataset)
-            for dataset in concurrent_datasets
-        }
+    if concurrent_datasets:
+        logger.info("Running concurrent jobs")
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = {
+                executor.submit(_run_one_dataset, dataset=dataset)
+                for dataset in concurrent_datasets
+            }
 
-        for fut in concurrent.futures.as_completed(futures):
-            # Calling result will raise an exception if one occurred.
-            # Otherwise, the exceptions are silently ignored.
-            fut.result()
+            for fut in concurrent.futures.as_completed(futures):
+                # Calling result will raise an exception if one occurred.
+                # Otherwise, the exceptions are silently ignored.
+                fut.result()
 
-    logger.info("Running high-memory jobs")
-    for dataset in high_memory_datasets:
-        _run_one_dataset(dataset=dataset)
+    if high_memory_datasets:
+        logger.info("Running high-memory jobs")
+        for dataset in high_memory_datasets:
+            _run_one_dataset(dataset=dataset)
 
 
 def score_generate() -> None:
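
The runner change wraps each phase in a guard so that a filtered run with nothing in one bucket skips that phase entirely. A standalone sketch of the same pattern, with a stub in place of the real per-dataset ETL call and made-up dataset dicts:

import concurrent.futures

def _run_one_dataset(dataset: dict) -> None:
    # Stand-in for the real per-dataset ETL call.
    print(f"running {dataset['name']}")

def run(datasets: list) -> None:
    concurrent_datasets = [d for d in datasets if not d["is_memory_intensive"]]
    high_memory_datasets = [d for d in datasets if d["is_memory_intensive"]]

    # Guarding each phase means a run with nothing in a bucket neither
    # spins up an executor nor logs a phase it is not actually running.
    if concurrent_datasets:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(_run_one_dataset, dataset=d)
                for d in concurrent_datasets
            }
            for fut in concurrent.futures.as_completed(futures):
                fut.result()  # re-raise any exception from a worker

    if high_memory_datasets:
        for d in high_memory_datasets:
            _run_one_dataset(dataset=d)

run([{"name": "us_army_fuds", "is_memory_intensive": True}])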


@@ -312,6 +312,8 @@ TILES_SCORE_COLUMNS = {
     field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
     + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
     field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
+    field_names.AML_BOOLEAN: "AML_ET",
+    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET"
     ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
     ## FPL_200 (there is no higher ed in narwhal)
 }
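
The two new entries map the long eAMLIS and FUDS field names to the short tile codes AML_ET and FUDS_ET. A sketch of how a select-and-rename step could apply such a mapping when building the tiles frame; the inline strings and dataframe are illustrative, not the actual tile-generation code:

import pandas as pd

# Illustrative stand-in for the mapping above; the real dict keys are
# field_names constants, spelled out here as plain strings.
TILES_SCORE_COLUMNS = {
    "Is there at least one abandoned mine in this census tract?": "AML_ET",
    "Is there at least one Formerly Used Defense Site (FUDS) in the tract?": "FUDS_ET",
}

df = pd.DataFrame(
    {
        "Is there at least one abandoned mine in this census tract?": [True, False],
        "Is there at least one Formerly Used Defense Site (FUDS) in the tract?": [False, True],
    }
)

# Keep only the columns the tiles need, renamed to their short codes.
tiles_df = df[list(TILES_SCORE_COLUMNS)].rename(columns=TILES_SCORE_COLUMNS)
print(tiles_df.columns.tolist())  # ['AML_ET', 'FUDS_ET']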


@@ -14,6 +14,8 @@ from data_pipeline.etl.sources.dot_travel_composite.etl import (
 from data_pipeline.etl.sources.fsf_flood_risk.etl import (
     FloodRiskETL,
 )
+from data_pipeline.etl.sources.eamlis.etl import AbandonedMineETL
+from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
 from data_pipeline.etl.sources.nlcd_nature_deprived.etl import NatureDeprivedETL
 from data_pipeline.etl.sources.fsf_wildfire_risk.etl import WildfireRiskETL
 from data_pipeline.score.score_runner import ScoreRunner
@@ -49,6 +51,8 @@ class ScoreETL(ExtractTransformLoad):
         self.fsf_flood_df: pd.DataFrame
         self.fsf_fire_df: pd.DataFrame
         self.nature_deprived_df: pd.DataFrame
+        self.eamlis_df: pd.DataFrame
+        self.fuds_df: pd.DataFrame
 
     def extract(self) -> None:
         logger.info("Loading data sets from disk.")
@@ -139,6 +143,12 @@ class ScoreETL(ExtractTransformLoad):
         # Load NLCD Nature-Deprived Communities data
         self.nature_deprived_df = NatureDeprivedETL.get_data_frame()
 
+        # Load eAMLIS dataset
+        self.eamlis_df = AbandonedMineETL.get_data_frame()
+
+        # Load FUDS dataset
+        self.fuds_df = USArmyFUDS.get_data_frame()
+
         # Load GeoCorr Urban Rural Map
         geocorr_urban_rural_csv = (
             constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@@ -362,6 +372,8 @@ class ScoreETL(ExtractTransformLoad):
             self.fsf_flood_df,
             self.fsf_fire_df,
             self.nature_deprived_df,
+            self.eamlis_df,
+            self.fuds_df,
         ]
 
         # Sanity check each data frame before merging.
@@ -457,6 +469,8 @@ class ScoreETL(ExtractTransformLoad):
             field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
             field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD,
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
+            field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
+            field_names.AML_BOOLEAN,
         ]
 
         # For some columns, high values are "good", so we want to reverse the percentile
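
Adding eamlis_df and fuds_df to the census tract dataframe list means they get merged tract-by-tract alongside every other source. A toy sketch of that style of merge; the GEOID10_TRACT key and the left-join strategy are assumptions for illustration, not the exact ScoreETL logic:

import functools
import pandas as pd

GEOID = "GEOID10_TRACT"  # assumed tract join key for this sketch

census_df = pd.DataFrame({GEOID: ["01001020100", "01001020200"]})
eamlis_df = pd.DataFrame({GEOID: ["01001020100"], "has abandoned mine": [True]})
fuds_df = pd.DataFrame({GEOID: ["01001020100"], "has eligible FUDS": [True]})

# Each source ETL contributes a tract-level frame; folding them together with
# left joins keeps every census tract and leaves NaN where a source has no record.
census_tract_dfs = [census_df, eamlis_df, fuds_df]
merged = functools.reduce(
    lambda left, right: pd.merge(left, right, on=GEOID, how="left"),
    census_tract_dfs,
)
print(merged)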

File diff suppressed because one or more lines are too long


@@ -55,7 +55,7 @@ class USArmyFUDS(ExtractTransformLoad):
         # before we try to do any transformation, get the tract data
         # so it's loaded and the census ETL is out of scope
 
-        logger.info("Loading FUDs data as GeoDataFrame for transform")
+        logger.info("Loading FUDS data as GeoDataFrame for transform")
         raw_df = gpd.read_file(
             filename=self.DOWNLOAD_FILE_NAME,
             low_memory=False,
@@ -88,7 +88,7 @@ class USArmyFUDS(ExtractTransformLoad):
             .size()
         )
         self.output_df = (
-            self.output_df.fillna(0).astype("int64").sort_index().reset_index()
+            self.output_df.fillna(0).astype(np.int64).sort_index().reset_index()
         )
 
         self.output_df[self.ELIGIBLE_FUDS_BINARY_FIELD_NAME] = np.where(
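
The FUDS transform fills missing tract counts with zero, casts to an explicit integer dtype (np.int64 rather than the "int64" string alias), and derives a binary eligibility flag with np.where. A minimal sketch of that pattern on toy data, with placeholder column names:

import numpy as np
import pandas as pd

# Toy per-tract counts, mimicking the groupby().size() result; column names
# here are placeholders rather than the ETL's exact field names.
output_df = pd.DataFrame(
    {"tract": ["01001020100", "01001020200"], "eligible count": [2.0, np.nan]}
).set_index("tract")

# Missing tracts become 0; np.int64 names the dtype object explicitly instead
# of the "int64" string alias, with identical results.
output_df = output_df.fillna(0).astype(np.int64).sort_index().reset_index()

# Binary flag: any tract with at least one eligible site is True.
output_df["has eligible site"] = np.where(output_df["eligible count"] > 0, True, False)
print(output_df)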


@@ -340,6 +340,12 @@ MOBILE_HOME = "Mobile Home"
 SINGLE_PARENT = "Single Parent"
 TRANSPORTATION_COSTS = "Transportation Costs"
 
+# eAMLIS and FUDS variables
+AML_BOOLEAN = "Is there at least one abandoned mine in this census tract?"
+ELIGIBLE_FUDS_BINARY_FIELD_NAME = (
+    "Is there at least one Formerly Used Defense Site (FUDS) in the tract?"
+)
+
 #####
 # Names for individual factors being exceeded
@@ -399,6 +405,10 @@ HAZARDOUS_WASTE_LOW_INCOME_FIELD = (
     f" for proximity to hazardous waste facilities and is low income?"
 )
 
+AML_LOW_INCOME_FIELD = "There is at least one abandoned mine in this census tract and the tract is low income."
+ELIGIBLE_FUDS_LOW_INCOME_FIELD = "There is at least one Formerly Used Defense Site (FUDS) in the tract and the tract is low income."
+
 # Critical Clean Water and Waste Infrastructure
 WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income?"
 UST_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for leaky underground storage tanks and is low income?"


@@ -464,6 +464,8 @@ class ScoreNarwhal(Score):
             field_names.RMP_LOW_INCOME_FIELD,
             field_names.SUPERFUND_LOW_INCOME_FIELD,
             field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
+            field_names.AML_LOW_INCOME_FIELD,
+            field_names.ELIGIBLE_FUDS_LOW_INCOME_FIELD,
         ]
 
         self.df[field_names.RMP_PCTILE_THRESHOLD] = (
@@ -483,10 +485,15 @@ class ScoreNarwhal(Score):
             >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
         )
 
-        self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = (
-            self.df[field_names.RMP_PCTILE_THRESHOLD]
-            | self.df[field_names.NPL_PCTILE_THRESHOLD]
-        ) | self.df[field_names.TSDF_PCTILE_THRESHOLD]
+        self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
+            [
+                field_names.RMP_PCTILE_THRESHOLD,
+                field_names.NPL_PCTILE_THRESHOLD,
+                field_names.TSDF_PCTILE_THRESHOLD,
+                field_names.AML_BOOLEAN,
+                field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
+            ]
+        ].any(axis="columns")
 
         # individual series-by-series
         self.df[field_names.RMP_LOW_INCOME_FIELD] = (
@@ -502,6 +509,16 @@ class ScoreNarwhal(Score):
             & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
         )
 
+        self.df[field_names.AML_LOW_INCOME_FIELD] = (
+            self.df[field_names.AML_BOOLEAN]
+            & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
+        )
+
+        self.df[field_names.ELIGIBLE_FUDS_LOW_INCOME_FIELD] = (
+            self.df[field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME]
+            & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
+        )
+
         self._increment_total_eligibility_exceeded(
             pollution_eligibility_columns,
             skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
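
The pollution threshold now ORs five boolean columns at once with .any(axis="columns") instead of chaining | across three series, which keeps the expression flat as new burdens (abandoned mines, FUDS) are added. A toy demonstration of the equivalence, using made-up column names:

import pandas as pd

df = pd.DataFrame(
    {
        "rmp": [True, False, False],
        "npl": [False, False, False],
        "tsdf": [False, False, False],
        "aml": [False, True, False],
        "fuds": [False, False, False],
    }
)

# Row-wise OR across the listed columns; equivalent to chaining the five
# boolean series with `|`, but easier to extend as burdens are added.
exceeded = df[["rmp", "npl", "tsdf", "aml", "fuds"]].any(axis="columns")
print(exceeded.tolist())  # [True, True, False]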


@@ -61,9 +61,9 @@ class TestAbandondedLandMineETL(TestETL):
         super().setup_method(_method=_method, filename=filename)
 
     def test_init(self, mock_etl, mock_paths):
-        """Tests that the mock NationalRiskIndexETL class instance was
+        """Tests that the mock class instance was
         initiliazed correctly.
         """
         # setup
         etl = self._ETL_CLASS()
         # validation