updated to show T/F/null vs T/F for AML and FUDS (#1866)

2025-07-28 09:51:16 -07:00 · 2022-08-24 20:22:59 -04:00 · 2022-08-24 20:22:59 -04:00 · 637b8c305c
commit 637b8c305c
parent 6418335219
11 changed files with 40 additions and 10 deletions
--- a/data/data-pipeline/data_pipeline/content/config/csv.yml
+++ b/data/data-pipeline/data_pipeline/content/config/csv.yml
@ -359,6 +359,12 @@ fields:
 - score_name: Is there at least one abandoned mine in this census tract?
  label: Is there at least one abandoned mine in this census tract?
  format: bool
+- score_name: Is there at least one Formerly Used Defense Site (FUDS) in the tract, where missing data is treated as False?
+  label: Is there at least one Formerly Used Defense Site (FUDS) in the tract, where missing data is treated as False?
+  format: bool 
+- score_name: Is there at least one abandoned mine in this census tract, where missing data is treated as False?
+  label: Is there at least one abandoned mine in this census tract, where missing data is treated as False?
+  format: bool
 - score_name: There is at least one abandoned mine in this census tract and the tract is low income.
  label: There is at least one abandoned mine in this census tract and the tract is low income.
  format: bool
--- a/data/data-pipeline/data_pipeline/content/config/excel.yml
+++ b/data/data-pipeline/data_pipeline/content/config/excel.yml
@ -369,6 +369,12 @@ sheets:
      - score_name: There is at least one Formerly Used Defense Site (FUDS) in the tract and the tract is low income.
        label: There is at least one Formerly Used Defense Site (FUDS) in the tract and the tract is low income.
        format: bool
+      - score_name: Is there at least one Formerly Used Defense Site (FUDS) in the tract, where missing data is treated as False?
+        label: Is there at least one Formerly Used Defense Site (FUDS) in the tract, where missing data is treated as False?
+        format: bool 
+      - score_name: Is there at least one abandoned mine in this census tract, where missing data is treated as False?
+        label: Is there at least one abandoned mine in this census tract, where missing data is treated as False?
+        format: bool
      - score_name:  Tract-level redlining score meets or exceeds 3.25 and is low income
        label: Tract experienced historic underinvestment and remains low income
        format: bool
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -309,8 +309,10 @@ TILES_SCORE_COLUMNS = {
    field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
    + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
    field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
-    field_names.AML_BOOLEAN: "AML_ET",
-    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET",
+    field_names.AML_BOOLEAN: "AML_RAW",
+    field_names.AML_BOOLEAN_FILLED_IN: "AML_ET",
+    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_RAW",
+    field_names.ELIGIBLE_FUDS_FILLED_IN_FIELD_NAME: "FUDS_ET",
    field_names.IMPUTED_INCOME_FLAG_FIELD_NAME: "IMP_FLG",
    ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
    ## FPL_200 (there is no higher ed in narwhal)
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -322,6 +322,8 @@ class ScoreETL(ExtractTransformLoad):
        # which are now deprecated.
        if not drop_tracts:
            # Create the "basic" percentile.
+            ## NOTE: assigning each PFS column one at a time is probably less performant than
+            ## building all of them first and concatenating once. Revisit during the refactor.
            df[
                f"{output_column_name_root}"
                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
@ -538,9 +540,12 @@ class ScoreETL(ExtractTransformLoad):

        df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)

-        # coerce all booleans to bools
+        # Coerce boolean columns to bool while preserving missing values:
+        # non-null entries are cast to bool, and null entries are kept as `None`.
        for col in boolean_columns:
-            df_copy[col] = df_copy[col].astype(bool)
+            tmp = df_copy[col].copy()
+            df_copy[col] = np.where(tmp.notna(), tmp.astype(bool), None)
+            logger.info(f"{col} contains {df_copy[col].isna().sum()} nulls.")

        # Convert all columns to numeric and do math
        # Note that we have a few special conditions here and we handle them explicitly.
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@ -355,9 +355,12 @@ TRANSPORTATION_COSTS = "Transportation Costs"

 # eAMLIS and FUDS variables
 AML_BOOLEAN = "Is there at least one abandoned mine in this census tract?"
+AML_BOOLEAN_FILLED_IN = "Is there at least one abandoned mine in this census tract, where missing data is treated as False?"
+
 ELIGIBLE_FUDS_BINARY_FIELD_NAME = (
    "Is there at least one Formerly Used Defense Site (FUDS) in the tract?"
 )
+ELIGIBLE_FUDS_FILLED_IN_FIELD_NAME = "Is there at least one Formerly Used Defense Site (FUDS) in the tract, where missing data is treated as False?"

 #####
 # Names for individual factors being exceeded
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@ -488,13 +488,21 @@ class ScoreNarwhal(Score):
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

+        self.df[field_names.ELIGIBLE_FUDS_FILLED_IN_FIELD_NAME] = self.df[
+            field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME
+        ].fillna(False)
+
+        self.df[field_names.AML_BOOLEAN_FILLED_IN] = self.df[
+            field_names.AML_BOOLEAN
+        ].fillna(False)
+
        self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = self.df[
            [
                field_names.RMP_PCTILE_THRESHOLD,
                field_names.NPL_PCTILE_THRESHOLD,
                field_names.TSDF_PCTILE_THRESHOLD,
-                field_names.AML_BOOLEAN,
-                field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
+                field_names.AML_BOOLEAN_FILLED_IN,
+                field_names.ELIGIBLE_FUDS_FILLED_IN_FIELD_NAME,
            ]
        ].any(axis="columns")

@ -513,7 +521,7 @@ class ScoreNarwhal(Score):
        )

        self.df[field_names.AML_LOW_INCOME_FIELD] = (
-            self.df[field_names.AML_BOOLEAN]
+            self.df[field_names.AML_BOOLEAN_FILLED_IN]
            & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
        )