From 1a61026ecfe9909102048fd63204940c0125f13c Mon Sep 17 00:00:00 2001
From: Lucas Merrill Brown <lucas.m.brown@omb.eop.gov>
Date: Tue, 7 Dec 2021 17:28:36 -0500
Subject: [PATCH] Issue 967: Calculate urban/rural percentiles (#1006)

---
 .../data_pipeline/etl/score/etl_score.py      | 122 +++++++++++++++---
 .../data_pipeline/score/field_names.py        |   1 +
 2 files changed, 108 insertions(+), 15 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index 064d1670..38ce2ace 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -1,6 +1,7 @@
 import functools
 from collections import namedtuple
 
+import numpy as np
 import pandas as pd
 
 from data_pipeline.etl.base import ExtractTransformLoad
@@ -253,6 +254,94 @@ class ScoreETL(ExtractTransformLoad):
                 f"Too many rows in the join: {len(df_to_check)} in {dataframe_descriptor}"
             )
 
+    @staticmethod
+    def _add_percentiles_to_df(
+        df: pd.DataFrame,
+        input_column_name: str,
+        output_column_name_root: str,
+        ascending: bool = True,
+    ) -> pd.DataFrame:
+        """Creates percentiles.
+
+        One percentile will be created and returned as
+        f"{output_column_name_root}{field_names.PERCENTILE_FIELD_SUFFIX}".
+        E.g., "PM2.5 exposure (percentile)".
+        This will be for the entire country.
+
+        For an "apples-to-apples" comparison of urban tracts to other urban tracts,
+        and compare rural tracts to other rural tracts.
+
+        This percentile will be created and returned as
+        f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}".
+        E.g., "PM2.5 exposure (percentile urban/rural)".
+        This field exists for every tract, but for urban tracts this value will be the
+        percentile compared to other urban tracts, and for rural tracts this value
+        will be the percentile compared to other rural tracts.
+
+        Specific methdology:
+            1. Decide a methodology for confirming whether a tract counts as urban or
+            rural. Currently in the codebase, we use Geocorr to identify the % rural of
+            a tract, and mark the tract as rural if the percentage is >50% and urban
+            otherwise. This may or may not be the right methodology.
+            2. Once tracts are marked as urban or rural, create one percentile rank
+            that only ranks urban tracts, and one percentile rank that only ranks rural
+            tracts.
+            3. Combine into a single field.
+
+        `output_column_name_root` is different from `input_column_name` to enable the
+        reverse percentile use case. In that use case, `input_column_name` may be
+        something like "3rd grade reading proficiency" and `output_column_name_root`
+        may be something like "Low 3rd grade reading proficiency".
+        """
+        # Create the "basic" percentile.
+        df[
+            f"{output_column_name_root}"
+            f"{field_names.PERCENTILE_FIELD_SUFFIX}"
+        ] = df[input_column_name].rank(pct=True, ascending=ascending)
+
+        # Create the urban/rural percentiles.
+        urban_rural_percentile_fields_to_combine = []
+        for (urban_or_rural_string, urban_heuristic_bool) in [
+            ("urban", True),
+            ("rural", False),
+        ]:
+            # Create a field with only those values
+            this_category_only_value_field = (
+                f"{input_column_name} (value {urban_or_rural_string} only)"
+            )
+            df[this_category_only_value_field] = np.where(
+                df[field_names.URBAN_HEURISTIC_FIELD] == urban_heuristic_bool,
+                df[input_column_name],
+                None,
+            )
+
+            # Calculate the percentile for only this category
+            this_category_only_percentile_field = (
+                f"{output_column_name_root} "
+                f"(percentile {urban_or_rural_string} only)"
+            )
+            df[this_category_only_percentile_field] = df[
+                this_category_only_value_field
+            ].rank(
+                pct=True,
+                # Set ascending to the parameter value.
+                ascending=ascending,
+            )
+
+            # Add the field name to this list. Later, we'll combine this list.
+            urban_rural_percentile_fields_to_combine.append(
+                this_category_only_percentile_field
+            )
+
+        # Combine both urban and rural into one field:
+        df[
+            f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}"
+        ] = df[urban_rural_percentile_fields_to_combine].mean(
+            axis=1, skipna=True
+        )
+
+        return df
+
     # TODO Move a lot of this to the ETL part of the pipeline
     def _prepare_initial_df(self) -> pd.DataFrame:
         logger.info("Preparing initial dataframe")
@@ -393,11 +482,14 @@ class ScoreETL(ExtractTransformLoad):
         df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)
 
         # Convert all columns to numeric and do math
-        for col in numeric_columns:
-            # Calculate percentiles
-            df_copy[f"{col}{field_names.PERCENTILE_FIELD_SUFFIX}"] = df_copy[
-                col
-            ].rank(pct=True)
+        for numeric_column in numeric_columns:
+            df_copy = self._add_percentiles_to_df(
+                df=df_copy,
+                input_column_name=numeric_column,
+                # For this use case, the input name and output name root are the same.
+                output_column_name_root=numeric_column,
+                ascending=True,
+            )
 
             # Min-max normalization:
             # (
@@ -409,16 +501,16 @@ class ScoreETL(ExtractTransformLoad):
             #    Maximum of all values
             #     - minimum of all values
             # )
-            min_value = df_copy[col].min(skipna=True)
+            min_value = df_copy[numeric_column].min(skipna=True)
 
-            max_value = df_copy[col].max(skipna=True)
+            max_value = df_copy[numeric_column].max(skipna=True)
 
             logger.info(
-                f"For data set {col}, the min value is {min_value} and the max value is {max_value}."
+                f"For data set {numeric_column}, the min value is {min_value} and the max value is {max_value}."
             )
 
-            df_copy[f"{col}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
-                df_copy[col] - min_value
+            df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
+                df_copy[numeric_column] - min_value
             ) / (max_value - min_value)
 
         # Create reversed percentiles for these fields
@@ -427,11 +519,11 @@ class ScoreETL(ExtractTransformLoad):
             # For instance, for 3rd grade reading level (score from 0-500),
             # calculate reversed percentiles and give the result the name
             # `Low 3rd grade reading level (percentile)`.
-            df_copy[
-                f"{reverse_percentile.low_field_name}"
-                f"{field_names.PERCENTILE_FIELD_SUFFIX}"
-            ] = df_copy[reverse_percentile.field_name].rank(
-                pct=True, ascending=False
+            df_copy = self._add_percentiles_to_df(
+                df=df_copy,
+                input_column_name=reverse_percentile.field_name,
+                output_column_name_root=reverse_percentile.low_field_name,
+                ascending=False,
             )
 
         # Special logic: create a combined population field.
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index c323e8f2..a71feed2 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -1,5 +1,6 @@
 # Suffixes
 PERCENTILE_FIELD_SUFFIX = " (percentile)"
+PERCENTILE_URBAN_RURAL_FIELD_SUFFIX = " (percentile urban/rural)"
 MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
 TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)"