From 1083e953da638a490dda3787339e6da687822caa Mon Sep 17 00:00:00 2001
From: Lucas Merrill Brown
Date: Tue, 14 Sep 2021 09:48:11 -0500
Subject: [PATCH] Prototype G (#672)

* wip
* cleanup
* cleanup 2
* fixing import ordering linter error
* updating backend to use score G
* adding percentile to score output
* update tippecanoe compression

Co-authored-by: Jorge Escobar
---
 .../data_pipeline/etl/score/constants.py     |  2 +
 .../data_pipeline/etl/score/etl_score.py     | 92 ++++++++++++++-----
 .../data_pipeline/etl/score/etl_score_geo.py |  4 +-
 .../etl/sources/census_acs/etl.py            | 10 ++
 .../ipython/scoring_comparison.ipynb         | 52 ++++++++---
 .../data_pipeline/tile/generate.py           |  2 +
 6 files changed, 123 insertions(+), 39 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
index 3d262f98..9eac120f 100644
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -57,6 +57,8 @@ TILES_SCORE_COLUMNS = [
     "Score D (top 25th percentile)",
     "Score E (percentile)",
     "Score E (top 25th percentile)",
+    "Score G (communities)",
+    "Score G",
     "Poverty (Less than 200% of federal poverty line) (percentile)",
     "Percent individuals age 25 or over with less than high school degree (percentile)",
     "Linguistic isolation (percent) (percentile)",
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index 1ede5a5a..14f181b7 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -1,7 +1,8 @@
 import collections
 import functools
-
+from pathlib import Path
 import pandas as pd
+
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
 
@@ -11,11 +12,11 @@ logger = get_module_logger(__name__)
 class ScoreETL(ExtractTransformLoad):
     def __init__(self):
         # Define some global parameters
-        self.BUCKET_SOCIOECONOMIC = "Socioeconomic Factors"
-        self.BUCKET_SENSITIVE = "Sensitive populations"
-        self.BUCKET_ENVIRONMENTAL = "Environmental effects"
-        self.BUCKET_EXPOSURES = "Exposures"
-        self.BUCKETS = [
+        self.BUCKET_SOCIOECONOMIC: str = "Socioeconomic Factors"
+        self.BUCKET_SENSITIVE: str = "Sensitive populations"
+        self.BUCKET_ENVIRONMENTAL: str = "Environmental effects"
+        self.BUCKET_EXPOSURES: str = "Exposures"
+        self.BUCKETS: list = [
             self.BUCKET_SOCIOECONOMIC,
             self.BUCKET_SENSITIVE,
             self.BUCKET_ENVIRONMENTAL,
@@ -24,43 +25,47 @@ class ScoreETL(ExtractTransformLoad):
 
         # A few specific field names
         # TODO: clean this up, I name some fields but not others.
-        self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)"
-        self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)"
-        self.HOUSING_BURDEN_FIELD_NAME = "Housing burden (percent)"
-        self.POVERTY_FIELD_NAME = (
+        self.UNEMPLOYED_FIELD_NAME: str = "Unemployed civilians (percent)"
+        self.LINGUISTIC_ISOLATION_FIELD_NAME: str = "Linguistic isolation (percent)"
+        self.HOUSING_BURDEN_FIELD_NAME: str = "Housing burden (percent)"
+        self.POVERTY_FIELD_NAME: str = (
             "Poverty (Less than 200% of federal poverty line)"
         )
-        self.HIGH_SCHOOL_FIELD_NAME = "Percent individuals age 25 or over with less than high school degree"
+        self.HIGH_SCHOOL_FIELD_NAME: str = "Percent individuals age 25 or over with less than high school degree"
         self.STATE_MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income (State; 2019 inflation-adjusted dollars)"
         )
-        self.MEDIAN_INCOME_FIELD_NAME = (
+        self.MEDIAN_INCOME_FIELD_NAME: str = (
             "Median household income in the past 12 months"
         )
-        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME = (
+        self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME: str = (
             "Median household income (% of state median household income)"
         )
+        self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME: str = (
+            "Median household income (% of AMI)"
+        )
+        self.AMI_FIELD_NAME: str = "Area Median Income (State or metropolitan)"
 
         # Note: these variable names are slightly different (missing the word `PERCENT`) than those in the source ETL to avoid pylint's duplicate
         # code error. - LMB
-        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME: str = (
             "Percent of individuals < 100% Federal Poverty Line"
         )
-        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_150_FPL_FIELD_NAME: str = (
             "Percent of individuals < 150% Federal Poverty Line"
        )
-        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME = (
+        self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME: str = (
             "Percent of individuals < 200% Federal Poverty Line"
         )
 
         # There's another aggregation level (a second level of "buckets").
-        self.AGGREGATION_POLLUTION = "Pollution Burden"
-        self.AGGREGATION_POPULATION = "Population Characteristics"
+        self.AGGREGATION_POLLUTION: str = "Pollution Burden"
+        self.AGGREGATION_POPULATION: str = "Population Characteristics"
 
-        self.PERCENTILE_FIELD_SUFFIX = " (percentile)"
-        self.MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
+        self.PERCENTILE_FIELD_SUFFIX: str = " (percentile)"
+        self.MIN_MAX_FIELD_SUFFIX: str = " (min-max normalized)"
 
-        self.SCORE_CSV_PATH = self.DATA_PATH / "score" / "csv" / "full"
+        self.SCORE_CSV_PATH: Path = self.DATA_PATH / "score" / "csv" / "full"
 
         # dataframes
         self.df: pd.DataFrame
@@ -146,6 +151,16 @@ class ScoreETL(ExtractTransformLoad):
                 renamed_field=self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME,
                 bucket=None,
             ),
+            DataSet(
+                input_field=self.AMI_FIELD_NAME,
+                renamed_field=self.AMI_FIELD_NAME,
+                bucket=None,
+            ),
+            DataSet(
+                input_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                renamed_field=self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME,
+                bucket=None,
+            ),
             # The following data sets have buckets, because they're used in Score C
             DataSet(
                 input_field="CANCER",
@@ -523,7 +538,33 @@ class ScoreETL(ExtractTransformLoad):
 
     def _add_score_g(self, df: pd.DataFrame) -> pd.DataFrame:
         logger.info("Adding Score G")
-        # TODO: add scoring
+
+        high_school_cutoff_threshold = 0.05
+
+        df["Score G (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.7)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_200_FPL_FIELD_NAME] > 0.50)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+        df["Score G"] = df["Score G (communities)"].astype(int)
+        df["Score G (percentile)"] = df["Score G"]
+
+        df["NMTC (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
+        )
+
+        df["NMTC modified (communities)"] = (
+            (df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] < 0.8)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        ) | (
+            (df[self.POVERTY_LESS_THAN_100_FPL_FIELD_NAME] > 0.20)
+            & (df[self.HIGH_SCHOOL_FIELD_NAME] > high_school_cutoff_threshold)
+        )
+
         return df
 
     # TODO Move a lot of this to the ETL part of the pipeline
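Reviewer note on the Score G hunk above: a block group is flagged when it is low-income (median household income below 70% of AMI, or more than half of individuals below 200% of the federal poverty line) and, in either case, more than 5% of adults 25 and over lack a high school degree. The snippet below is a minimal standalone sketch of that boolean logic for sanity-checking outside the pipeline; the three rows of data are invented for illustration.

    import pandas as pd

    # Three hypothetical block groups (values invented for illustration).
    df = pd.DataFrame(
        {
            "Median household income (% of AMI)": [0.65, 0.90, 0.75],
            "Percent of individuals < 200% Federal Poverty Line": [0.30, 0.60, 0.10],
            "Percent individuals age 25 or over with less than high school degree": [
                0.10,
                0.20,
                0.01,
            ],
        }
    )

    high_school_cutoff_threshold = 0.05
    low_hs_attainment = (
        df["Percent individuals age 25 or over with less than high school degree"]
        > high_school_cutoff_threshold
    )

    # Either low-income criterion qualifies, but only in combination with
    # the high-school attainment condition.
    df["Score G (communities)"] = (
        (df["Median household income (% of AMI)"] < 0.7) & low_hs_attainment
    ) | (
        (df["Percent of individuals < 200% Federal Poverty Line"] > 0.50)
        & low_hs_attainment
    )

    print(df["Score G (communities)"].tolist())  # [True, True, False]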
@@ -564,13 +605,15 @@ class ScoreETL(ExtractTransformLoad):
 
         # Calculate median income variables.
         # First, calculate the income of the block group as a fraction of the state income.
-        # TODO: handle null values for CBG median income, which are `-666666666`.
         df[self.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD_NAME] = (
             df[self.MEDIAN_INCOME_FIELD_NAME]
             / df[self.STATE_MEDIAN_INCOME_FIELD_NAME]
         )
 
-        # TODO: Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        # Calculate the income of the block group as a fraction of the AMI (either state or metropolitan, depending on reference).
+        df[self.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD_NAME] = (
+            df[self.MEDIAN_INCOME_FIELD_NAME] / df[self.AMI_FIELD_NAME]
+        )
 
         # TODO Refactor to no longer use the data_sets list and do all renaming in ETL step
         # Rename columns:
@@ -669,4 +712,5 @@ class ScoreETL(ExtractTransformLoad):
     def load(self) -> None:
         logger.info("Saving Score CSV")
         self.SCORE_CSV_PATH.mkdir(parents=True, exist_ok=True)
+
         self.df.to_csv(self.SCORE_CSV_PATH / "usa.csv", index=False)
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
index 50172186..5ded2956 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py
@@ -26,8 +26,8 @@ class GeoScoreETL(ExtractTransformLoad):
             self.DATA_PATH / "census" / "geojson" / "us.json"
         )
 
-        self.TARGET_SCORE_NAME = "Score D (percentile)"
-        self.TARGET_SCORE_RENAME_TO = "D_SCORE"
+        self.TARGET_SCORE_NAME = "Score G"
+        self.TARGET_SCORE_RENAME_TO = "G_SCORE"
 
         self.NUMBER_OF_BUCKETS = 10
 
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
index 052a6f36..29a6c9ca 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -99,6 +99,16 @@ class CensusACSETL(ExtractTransformLoad):
             self.MEDIAN_INCOME_FIELD
         ]
 
+        # Handle null values for CBG median income, which are `-666666666`.
+        missing_value_count = (self.df[self.MEDIAN_INCOME_FIELD_NAME] == -666666666).sum()
+        logger.info(
+            f"There are {missing_value_count} ({int(100*missing_value_count/self.df[self.MEDIAN_INCOME_FIELD_NAME].count())}%) values of "
+            + f"`{self.MEDIAN_INCOME_FIELD_NAME}` that will be marked as null values."
+        )
+        self.df[self.MEDIAN_INCOME_FIELD_NAME] = self.df[
+            self.MEDIAN_INCOME_FIELD_NAME
+        ].replace(to_replace=-666666666, value=float("nan"))
+
         # Calculate percent unemployment.
         # TODO: remove small-sample data that should be `None` instead of a high-variance fraction.
         self.df[self.UNEMPLOYED_FIELD_NAME] = (
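A pandas subtlety worth flagging for the null-handling hunk above: in the pandas releases current when this patch was written, Series.replace(to_replace=<scalar>, value=None) does not insert nulls. When value is None and to_replace is a scalar, replace falls back to its method parameter (default "pad") and forward-fills matches from the previous row, which is why the hunk passes an explicit NaN instead. A minimal repro of the difference:

    import pandas as pd

    incomes = pd.Series([55000, -666666666, 72000])

    # Pitfall: value=None with a scalar to_replace triggers method="pad",
    # so the sentinel is forward-filled rather than nulled out.
    print(incomes.replace(to_replace=-666666666, value=None).tolist())
    # -> [55000, 55000, 72000]

    # Intended behavior: replace the Census sentinel with an explicit NaN.
    print(incomes.replace(to_replace=-666666666, value=float("nan")).tolist())
    # -> [55000.0, nan, 72000.0]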
diff --git a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
index 22d7b865..01538091 100644
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@@ -85,7 +85,7 @@
    "execution_count": null,
    "id": "3b1b5ccf",
    "metadata": {
-    "scrolled": false
+    "scrolled": true
    },
    "outputs": [],
    "source": [
@@ -107,6 +107,21 @@
     "cejst_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9968187",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analyze one field at a time (useful for setting thresholds)\n",
+    "field = \"Percent of individuals < 200% Federal Poverty Line\"\n",
+    "print(cejst_df[field].describe())\n",
+    "quantile = .8\n",
+    "print(f\"Quantile at {quantile} is {np.nanquantile(a=cejst_df[field], q=quantile)}\")\n",
+    "cejst_df[field].hist()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -201,8 +216,8 @@
     ")\n",
     "\n",
     "\n",
-    "if len(merged_df) > 220333:\n",
-    "    raise ValueError(\"Too many rows in the join.\")\n",
+    "if len(merged_df) > 220335:\n",
+    "    raise ValueError(f\"Too many rows in the join: {len(merged_df)}\")\n",
     "\n",
     "merged_df.head()\n",
     "\n",
@@ -232,22 +247,33 @@
     "\n",
     "# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n",
     "# (`census_tract_indices`).\n",
+    "\n",
     "census_block_group_indices = [\n",
     "    Index(\n",
+    "        method_name=\"Score G\",\n",
+    "        priority_communities_field=\"Score G (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
+    "        method_name=\"NMTC\",\n",
+    "        priority_communities_field=\"NMTC (communities)\",\n",
+    "        other_census_tract_fields_to_keep=[],\n",
+    "    ),\n",
+    "    Index(\n",
     "        method_name=\"Score F\",\n",
     "        priority_communities_field=\"Score F (communities)\",\n",
     "        other_census_tract_fields_to_keep=[],\n",
     "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (socioeconomic only)\",\n",
-    "        priority_communities_field=\"Meets socioeconomic criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F (burden only)\",\n",
-    "        priority_communities_field=\"Meets burden criteria\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (socioeconomic only)\",\n",
+    "#         priority_communities_field=\"Meets socioeconomic criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
+    "#     Index(\n",
+    "#         method_name=\"Score F (burden only)\",\n",
+    "#         priority_communities_field=\"Meets burden criteria\",\n",
+    "#         other_census_tract_fields_to_keep=[],\n",
+    "#     ),\n",
     "    Index(\n",
     "        method_name=\"Score A\",\n",
     "        priority_communities_field=\"Score A (top 25th percentile)\",\n",
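Side note on the join guard in the notebook diff above: the hard-coded row ceiling (220335) catches join fan-out only after the fact. An alternative worth considering, assuming the geographic side of the merge is expected to be unique per block group, is pandas' validate argument, which raises pandas.errors.MergeError as soon as a key is unexpectedly duplicated. A sketch with hypothetical stand-in frames and a hypothetical GEOID10 key:

    import pandas as pd

    # Hypothetical stand-ins for the notebook's score and geography frames.
    scores = pd.DataFrame(
        {"GEOID10": ["010010201001", "010010201002"], "Score G": [1, 0]}
    )
    geo = pd.DataFrame(
        {"GEOID10": ["010010201001", "010010201002"], "state": ["AL", "AL"]}
    )

    # validate="m:1" asserts that GEOID10 is unique on the right-hand side;
    # a duplicate there would raise pandas.errors.MergeError instead of
    # silently inflating the row count past a magic number.
    merged_df = scores.merge(geo, on="GEOID10", how="left", validate="m:1")
    print(len(merged_df))  # -> 2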
diff --git a/data/data-pipeline/data_pipeline/tile/generate.py b/data/data-pipeline/data_pipeline/tile/generate.py
index 17c18922..328555b0 100644
--- a/data/data-pipeline/data_pipeline/tile/generate.py
+++ b/data/data-pipeline/data_pipeline/tile/generate.py
@@ -38,6 +38,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA High mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
     cmd += f"--output-to-directory={high_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-high.json")
     call(cmd, shell=True)
@@ -54,6 +55,7 @@ def generate_tiles(data_path: Path) -> None:
     logger.info("Generating USA Low mvt folders and files")
     cmd = "tippecanoe "
     cmd += f"--minimum-zoom={USA_LOW_MIN_ZOOM} --maximum-zoom={USA_LOW_MAX_ZOOM} --no-tile-compression "
+    cmd += "--drop-densest-as-needed "
    cmd += f"--output-to-directory={low_tile_path} --layer=blocks "
     cmd += str(score_geojson_dir / "usa-low.json")
     call(cmd, shell=True)
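For context on the tippecanoe change: --drop-densest-as-needed is a stock tippecanoe option that, when a tile would exceed the size limit, thins the most densely packed features first, which helps keep a nationwide block-group tileset within limits without changing the zoom range. After this patch each assembled command looks roughly like the sketch below; the zoom values and paths here are placeholders, not the module's real constants.

    # Illustrative reconstruction only; zooms and paths are placeholders.
    USA_HIGH_MIN_ZOOM, USA_HIGH_MAX_ZOOM = 5, 11
    high_tile_path = "data/score/tiles/high"
    score_geojson = "data/score/geojson/usa-high.json"

    cmd = "tippecanoe "
    cmd += f"--minimum-zoom={USA_HIGH_MIN_ZOOM} --maximum-zoom={USA_HIGH_MAX_ZOOM} --no-tile-compression "
    cmd += "--drop-densest-as-needed "
    cmd += f"--output-to-directory={high_tile_path} --layer=blocks "
    cmd += score_geojson
    print(cmd)
    # tippecanoe --minimum-zoom=5 --maximum-zoom=11 --no-tile-compression --drop-densest-as-needed --output-to-directory=data/score/tiles/high --layer=blocks data/score/geojson/usa-high.json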