From 29419dd2fd77d2213b9ba0cc4f5777f04a4edf83 Mon Sep 17 00:00:00 2001
From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com>
Date: Tue, 2 Aug 2022 16:28:05 -0400
Subject: [PATCH] Rescaling linguistic isolation (#1750)

Rescales linguistic isolation to drop Puerto Rico
---
 .../data_pipeline/etl/score/etl_score.py      | 113 +++++++++---------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index c736e3fc..5d528dd6 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -264,6 +264,7 @@ class ScoreETL(ExtractTransformLoad):
         df: pd.DataFrame,
         input_column_name: str,
         output_column_name_root: str,
+        drop_tracts: list = None,
         ascending: bool = True,
     ) -> pd.DataFrame:
         """Creates percentiles.
@@ -298,10 +299,15 @@ class ScoreETL(ExtractTransformLoad):
         something like "3rd grade reading proficiency" and `output_column_name_root`
         may be something like "Low 3rd grade reading proficiency".
         """
-        if (
-            output_column_name_root
-            != field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
-        ):
+
+        # There are two ways to calculate percentiles.
+        # For the vast majority of columns, we simply calculate percentiles over all tracts.
+        # However, for Linguistic Isolation and Agricultural Value Loss, certain tracts must
+        # be dropped from consideration in the percentile calculation. For those columns, the
+        # caller supplies the list of tracts to exclude via `drop_tracts` (see the call site).
+        # These special cases supersede the urban / rural percentiles, which are now
+        # deprecated and have been removed.
+        if not drop_tracts:
             # Create the "basic" percentile.
             df[
                 f"{output_column_name_root}"
@@ -309,62 +315,23 @@ class ScoreETL(ExtractTransformLoad):
                 f"{field_names.PERCENTILE_FIELD_SUFFIX}"
             ] = df[input_column_name].rank(pct=True, ascending=ascending)

         else:
-            # For agricultural loss, we are using whether there is value at all to determine percentile and then
-            # filling places where the value is False with 0
+            tmp_series = df[input_column_name].where(
+                ~df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts),
+                np.nan,
+            )
+            logger.info(
+                f"Creating special case column for percentiles from {input_column_name}"
+            )
             df[
                 f"{output_column_name_root}"
                 f"{field_names.PERCENTILE_FIELD_SUFFIX}"
-            ] = (
-                df.where(
-                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
-                    == 1.0
-                )[input_column_name]
-                .rank(ascending=ascending, pct=True)
-                .fillna(
-                    df[field_names.AGRICULTURAL_VALUE_BOOL_FIELD].astype(float)
-                )
-            )
+            ] = tmp_series.rank(ascending=ascending, pct=True)

-            # Create the urban/rural percentiles.
-            urban_rural_percentile_fields_to_combine = []
-            for (urban_or_rural_string, urban_heuristic_bool) in [
-                ("urban", True),
-                ("rural", False),
-            ]:
-                # Create a field with only those values
-                this_category_only_value_field = (
-                    f"{input_column_name} (value {urban_or_rural_string} only)"
-                )
-                df[this_category_only_value_field] = np.where(
-                    df[field_names.URBAN_HEURISTIC_FIELD] == urban_heuristic_bool,
-                    df[input_column_name],
-                    None,
-                )
-
-                # Calculate the percentile for only this category
-                this_category_only_percentile_field = (
-                    f"{output_column_name_root} "
-                    f"(percentile {urban_or_rural_string} only)"
-                )
-                df[this_category_only_percentile_field] = df[
-                    this_category_only_value_field
-                ].rank(
-                    pct=True,
-                    # Set ascending to the parameter value.
-                    ascending=ascending,
-                )
-
-                # Add the field name to this list. Later, we'll combine this list.
-                urban_rural_percentile_fields_to_combine.append(
-                    this_category_only_percentile_field
-                )
-
-            # Combine both urban and rural into one field:
-            df[
-                f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}"
-            ] = df[urban_rural_percentile_fields_to_combine].mean(
-                axis=1, skipna=True
-            )
+            # Sanity check that every tract in `drop_tracts` was excluded from the percentile.
+            assert df[df[field_names.GEOID_TRACT_FIELD].isin(drop_tracts)][
+                f"{output_column_name_root}"
+                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
+            ].isna().sum() == len(drop_tracts), "Not all tracts were dropped"

         return df

@@ -523,13 +490,47 @@ class ScoreETL(ExtractTransformLoad):
         df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)

         # Convert all columns to numeric and do math
+        # Note that we have a few special conditions here that we handle explicitly.
+        # For *Linguistic Isolation*, we do NOT want to include Puerto Rico in the
+        # percentile calculation, because linguistic isolation is not a meaningful
+        # category in Puerto Rico, where Spanish is a recognized language. Thus, we
+        # construct a list of tracts to drop from the percentile calculation.
+        #
+        # For *Expected Agricultural Loss*, we only want the percentile to include
+        # tracts in which there is some agricultural value. This lets us discern which
+        # tracts are truly at the 90th percentile, since many tracts have zero value.
         for numeric_column in numeric_columns:
+            drop_tracts = []
+            if (
+                numeric_column
+                == field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
+            ):
+                drop_tracts = df_copy[
+                    ~df_copy[field_names.AGRICULTURAL_VALUE_BOOL_FIELD]
+                    .astype(bool)
+                    .fillna(False)
+                ][field_names.GEOID_TRACT_FIELD].to_list()
+                logger.info(
+                    f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
+                )
+
+            elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
+                drop_tracts = df_copy[
+                    # 72 is the FIPS code for Puerto Rico
+                    df_copy[field_names.GEOID_TRACT_FIELD].str.startswith("72")
+                ][field_names.GEOID_TRACT_FIELD].to_list()
+                logger.info(
+                    f"Dropping {len(drop_tracts)} tracts from Linguistic Isolation"
+                )
+
             df_copy = self._add_percentiles_to_df(
                 df=df_copy,
                 input_column_name=numeric_column,
                 # For this use case, the input name and output name root are the same.
                 output_column_name_root=numeric_column,
                 ascending=True,
+                drop_tracts=drop_tracts,
             )

         # Min-max normalization:
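
Reviewer note: below is a minimal, standalone sketch of the masking-and-ranking
approach this patch introduces. The column names ("tract_id",
"linguistic_isolation") and data are toy placeholders, not the pipeline's
field_names constants; only the where/isin/rank pattern mirrors the patch.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            # GEOIDs beginning with "72" are Puerto Rico tracts.
            "tract_id": ["01001", "01002", "36001", "72001", "72002"],
            "linguistic_isolation": [0.10, 0.25, 0.05, 0.90, 0.80],
        }
    )

    # Build the list of tracts to exclude, mirroring the str.startswith("72")
    # filter in the patch.
    drop_tracts = df[df["tract_id"].str.startswith("72")]["tract_id"].to_list()

    # Mask excluded tracts to NaN so that .rank() skips them entirely, then
    # compute percentiles over the remaining tracts only.
    tmp_series = df["linguistic_isolation"].where(
        ~df["tract_id"].isin(drop_tracts), np.nan
    )
    df["linguistic_isolation (percentile)"] = tmp_series.rank(
        ascending=True, pct=True
    )

    # Dropped tracts receive NaN percentiles; all others are ranked among
    # themselves, as the assert in _add_percentiles_to_df verifies.
    assert (
        df[df["tract_id"].isin(drop_tracts)]["linguistic_isolation (percentile)"]
        .isna()
        .all()
    )
    print(df)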