From bbb5bbc60a7ef601a21c785c06d515024ad0a963 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com>
Date: Wed, 3 Aug 2022 13:55:58 -0400
Subject: [PATCH] Changing LHE in tiles to a boolean (#1767)

Also includes merging and cleanup of the release.
---
 .../data_pipeline/etl/score/constants.py      |  2 -
 .../data_pipeline/etl/score/etl_score.py      | 48 ++++---------------
 .../data_pipeline/score/field_names.py        |  1 -
 .../data_pipeline/score/score_narwhal.py      | 26 ++++------
 .../data_pipeline/score/score_runner.py       | 30 ------------
 5 files changed, 18 insertions(+), 89 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
index 8e7d17b1..5db119e3 100644
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -342,8 +342,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
     + field_names.PERCENTILE_FIELD_SUFFIX,
     # Island areas HS degree attainment rate
     field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
-    field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD,
-    field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
     field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
     field_names.COLLEGE_NON_ATTENDANCE_FIELD,
     field_names.COLLEGE_ATTENDANCE_FIELD,
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index b073ce5a..d43cc325 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -274,26 +274,6 @@ class ScoreETL(ExtractTransformLoad):
         E.g., "PM2.5 exposure (percentile)".
         This will be for the entire country.
 
-        For an "apples-to-apples" comparison of urban tracts to other urban tracts,
-        and compare rural tracts to other rural tracts.
-
-        This percentile will be created and returned as
-        f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}".
-        E.g., "PM2.5 exposure (percentile urban/rural)".
-        This field exists for every tract, but for urban tracts this value will be the
-        percentile compared to other urban tracts, and for rural tracts this value
-        will be the percentile compared to other rural tracts.
-
-        Specific methdology:
-            1. Decide a methodology for confirming whether a tract counts as urban or
-            rural. Currently in the codebase, we use Geocorr to identify the % rural of
-            a tract, and mark the tract as rural if the percentage is >50% and urban
-            otherwise. This may or may not be the right methodology.
-            2. Once tracts are marked as urban or rural, create one percentile rank
-            that only ranks urban tracts, and one percentile rank that only ranks rural
-            tracts.
-            3. Combine into a single field.
-
         `output_column_name_root` is different from `input_column_name` to enable the
         reverse percentile use case. In that use case, `input_column_name` may be
         something like "3rd grade reading proficiency" and `output_column_name_root`
@@ -413,7 +393,6 @@ class ScoreETL(ExtractTransformLoad):
             field_names.NPL_FIELD,
             field_names.WASTEWATER_FIELD,
             field_names.LEAD_PAINT_FIELD,
-            field_names.UST_FIELD,
             field_names.UNDER_5_FIELD,
             field_names.OVER_64_FIELD,
             field_names.LINGUISTIC_ISO_FIELD,
@@ -436,6 +415,7 @@ class ScoreETL(ExtractTransformLoad):
             field_names.EXTREME_HEAT_FIELD,
             field_names.HEALTHY_FOOD_FIELD,
             field_names.IMPENETRABLE_SURFACES_FIELD,
+            field_names.UST_FIELD,
             # We have to pass this boolean here in order to include it in ag value loss percentiles.
             field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
             field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
@@ -488,7 +468,13 @@ class ScoreETL(ExtractTransformLoad):
 
         df_copy = df[columns_to_keep].copy()
 
-        df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)
+        assert len(numeric_columns) == len(
+            set(numeric_columns)
+        ), "You have a double-entered column in the numeric columns list"
+
+        df_copy[numeric_columns] = df_copy[numeric_columns].apply(
+            pd.to_numeric
+        )
 
         # Convert all columns to numeric and do math
         # Note that we have a few special conditions here, that we handle explicitly.
@@ -534,24 +520,6 @@ class ScoreETL(ExtractTransformLoad):
                 drop_tracts=drop_tracts,
             )
 
-            # Min-max normalization:
-            # (
-            #     Observed value
-            #     - minimum of all values
-            # )
-            # divided by
-            # (
-            #    Maximum of all values
-            #     - minimum of all values
-            # )
-            min_value = df_copy[numeric_column].min(skipna=True)
-
-            max_value = df_copy[numeric_column].max(skipna=True)
-
-            df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
-                df_copy[numeric_column] - min_value
-            ) / (max_value - min_value)
-
         # Create reversed percentiles for these fields
         for reverse_percentile in reverse_percentiles:
             # Calculate reverse percentiles
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index 5b7a88af..895c52b1 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -432,7 +432,6 @@ HAZARDOUS_WASTE_LOW_INCOME_FIELD = (
 WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income?"
 UST_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for leaky underground storage tanks and is low income?"
 
-
 # Health Burdens
 DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income?"
 ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income?"
diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py
index 1299acaa..0480d93a 100644
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -435,6 +435,11 @@ class ScoreNarwhal(Score):
         # poverty level and has a low percent of higher ed students
         # Source: Census's American Community Survey
 
+        eligibility_columns = [
+            field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
+            field_names.UST_LOW_INCOME_FIELD,
+        ]
+
         self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = (
             self.df[
                 field_names.WASTEWATER_FIELD
@@ -457,28 +462,17 @@ class ScoreNarwhal(Score):
             & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
         )
 
-        self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
-            [
-                field_names.WASTEWATER_PCTILE_THRESHOLD,
-                field_names.UST_PCTILE_THRESHOLD,
-            ]
-        ].max(axis=1)
-
         self._increment_total_eligibility_exceeded(
-            [
-                field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
-                field_names.UST_LOW_INCOME_FIELD,
-            ],
+            eligibility_columns,
             skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
         )
 
-        return self.df[
-            [
-                field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
-                field_names.UST_LOW_INCOME_FIELD,
-            ]
+        self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
+            eligibility_columns
         ].any(axis=1)
 
+        return self.df[field_names.WATER_THRESHOLD_EXCEEDED]
+
     def _health_factor(self) -> bool:
         # In Xth percentile or above for diabetes (Source: CDC Places)
         # or
diff --git a/data/data-pipeline/data_pipeline/score/score_runner.py b/data/data-pipeline/data_pipeline/score/score_runner.py
index 64b192f4..c1838ca1 100644
--- a/data/data-pipeline/data_pipeline/score/score_runner.py
+++ b/data/data-pipeline/data_pipeline/score/score_runner.py
@@ -28,7 +28,6 @@ class ScoreRunner:
         self.df = ScoreA(df=self.df).add_columns()
         self.df = ScoreB(df=self.df).add_columns()
         self.df = ScoreC(df=self.df).add_columns()
-        self.df = ScoreD(df=self.df).add_columns()
         self.df = ScoreF(df=self.df).add_columns()
         self.df = ScoreG(df=self.df).add_columns()
         self.df = ScoreH(df=self.df).add_columns()
@@ -38,33 +37,4 @@ class ScoreRunner:
         self.df = ScoreM(df=self.df).add_columns()
         self.df = ScoreNarwhal(df=self.df).add_columns()
 
-        # TODO do this with each score instead of in a bundle
-        # Create percentiles for these index scores
-        self.df = self._add_score_percentiles()
-
-        return self.df
-
-    def _add_score_percentiles(self) -> pd.DataFrame:
-        logger.info("Adding Score Percentiles")
-        for score_field in [
-            field_names.SCORE_A,
-            field_names.SCORE_B,
-            field_names.SCORE_C,
-            field_names.SCORE_D,
-            field_names.SCORE_E,
-        ]:
-            self.df[
-                f"{score_field}{field_names.PERCENTILE_FIELD_SUFFIX}"
-            ] = self.df[score_field].rank(pct=True)
-
-            for threshold in [0.25, 0.3, 0.35, 0.4]:
-                fraction_converted_to_percent = int(100 * threshold)
-                self.df[
-                    f"{score_field} (top {fraction_converted_to_percent}th percentile)"
-                ] = (
-                    self.df[
-                        f"{score_field}{field_names.PERCENTILE_FIELD_SUFFIX}"
-                    ]
-                    >= 1 - threshold
-                )
         return self.df