Add Score L (#812)

* Create ScoreCalculator: This calculates all the factors for score L for now (with placeholder formulae because this is a WIP). I think ideally we'll want to refactor all the score code to be extracted into this or similar classes. * Add factor logic for score L: Updated factor logic to match the score L factors methodology. Still need to get the Score L field itself working. Cleanup needed: pull field names into a constants file, and extract all score calculation into the score calculator. Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov> Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
2025-02-23 01:54:18 -08:00 · 2021-10-28 16:07:41 -04:00 · 2021-10-28 16:07:41 -04:00 · 7b87e0ec99
commit 7b87e0ec99
parent b1adc1f69f
3 changed files with 385 additions and 73 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -5,6 +5,7 @@ import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.score.score_calculator import ScoreCalculator

 logger = get_module_logger(__name__)

@ -522,7 +523,7 @@ class ScoreETL(ExtractTransformLoad):
        )
        return df

-    def _add_scores_d_and_e(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _add_scores_d_e(self, df: pd.DataFrame) -> pd.DataFrame:
        logger.info("Adding Scores D and E")
        fields_to_use_in_score = [
            self.UNEMPLOYED_FIELD_NAME,
@ -641,8 +642,8 @@ class ScoreETL(ExtractTransformLoad):
        )
        return df

-    def _add_score_g(self, df: pd.DataFrame) -> pd.DataFrame:
-        logger.info("Adding Score G")
+    def _add_score_g_k(self, df: pd.DataFrame) -> pd.DataFrame:
+        logger.info("Adding Score G through K")

        high_school_cutoff_threshold = 0.05
        high_school_cutoff_threshold_2 = 0.06
@ -691,6 +692,12 @@ class ScoreETL(ExtractTransformLoad):

        return df

+    def _add_definition_l_factors(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Add the Definition L factor columns to `df` and return the result.
+
+        Builds a `ScoreCalculator` around `df` and delegates to its
+        `add_definition_l_factors` method. The calculator stores `df` without
+        copying it, so the new columns are added to the passed-in dataframe
+        itself.
+        """
+        logger.info("Adding Definition L and factors")
+        calc = ScoreCalculator(df=df)
+        df = calc.add_definition_l_factors()
+        return df
+
    # TODO Move a lot of this to the ETL part of the pipeline
    def _prepare_initial_df(self, data_sets: list) -> pd.DataFrame:
        logger.info("Preparing initial dataframe")
@ -831,7 +838,7 @@ class ScoreETL(ExtractTransformLoad):
        self.df = self._add_score_c(self.df, data_sets)

        # Calculate scores "D" and "E"
-        self.df = self._add_scores_d_and_e(self.df)
+        self.df = self._add_scores_d_e(self.df)

        # Create percentiles for the scores
        self.df = self._add_score_percentiles(self.df)
@ -840,8 +847,11 @@ class ScoreETL(ExtractTransformLoad):
        # Calculate "Score F", which uses "either/or" thresholds.
        self.df = self._add_score_f(self.df)

-        # Calculate "Score G", which uses AMI and poverty.
-        self.df = self._add_score_g(self.df)
+        # Calculate "Score G through K", which uses AMI and poverty.
+        self.df = self._add_score_g_k(self.df)
+
+        # Calculate Definition L and its factors
+        self.df = self._add_definition_l_factors(self.df)

    def load(self) -> None:
        logger.info("Saving Score CSV")
--- a/data/data-pipeline/data_pipeline/etl/score/score_calculator.py
+++ b/data/data-pipeline/data_pipeline/etl/score/score_calculator.py
@ -0,0 +1,284 @@
+import pandas as pd
+
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+class ScoreCalculator:
+    """Compute the boolean "Definition L" factor columns for a dataframe.
+
+    The calculator keeps a reference to the dataframe it is given (no copy is
+    made), reads the columns named by the `*_FIELD` attributes, and writes one
+    column per factor plus two combined columns in `add_definition_l_factors`.
+
+    NOTE(review): the factor comments below say "percentile or above", but
+    every comparison in this class is strict (`>` or `<`), so a value exactly
+    equal to a threshold does not qualify. Confirm this is intended.
+    """
+
+    def __init__(self, df: pd.DataFrame):
+        """Store `df` and define the column names and thresholds used below.
+
+        Args:
+            df: Dataframe that the factor methods index with the `*_FIELD`
+                column names set here. It is stored by reference, so later
+                column assignments also modify the caller's dataframe.
+        """
+        # Define some global parameters
+        self.df = df
+
+        self.POVERTY_LESS_THAN_200_FPL_FIELD: str = (
+            "Percent of individuals < 200% Federal Poverty Line (percentile)"
+        )
+
+        self.POVERTY_LESS_THAN_100_FPL_FIELD: str = (
+            "Percent of individuals < 100% Federal Poverty Line (percentile)"
+        )
+
+        # FEMA Risk Index
+        self.NATIONAL_RISK_FIELD: str = (
+            "FEMA Risk Index Expected Annual Loss Score (percentile)"
+        )
+
+        # DOE energy burden
+        self.ENERGY_BURDEN_FIELD: str = "Energy burden (percentile)"
+
+        # Diesel particulate matter
+        self.DIESEL_FIELD: str = "Diesel particulate matter (percentile)"
+
+        # PM2.5
+        self.PM25_FIELD: str = "Particulate matter (PM2.5) (percentile)"
+
+        # Traffic proximity and volume
+        self.TRAFFIC_FIELD: str = "Traffic proximity and volume (percentile)"
+
+        # Lead paint
+        self.LEAD_PAINT_FIELD: str = (
+            "Percent pre-1960s housing (lead paint indicator) (percentile)"
+        )
+
+        # Housing cost burden
+        self.HOUSING_BURDEN_FIELD: str = "Housing burden (percent) (percentile)"
+
+        # Wastewater discharge
+        self.WASTEWATER_FIELD: str = "Wastewater discharge (percentile)"
+
+        # Diabetes
+        self.DIABETES_FIELD: str = (
+            "Diagnosed diabetes among adults aged >=18 years (percentile)"
+        )
+
+        # Asthma
+        self.ASTHMA_FIELD: str = (
+            "Current asthma among adults aged >=18 years (percentile)"
+        )
+
+        # Heart disease
+        self.HEART_DISEASE_FIELD: str = (
+            "Coronary heart disease among adults aged >=18 years (percentile)"
+        )
+
+        # Life expectancy
+        self.LIFE_EXPECTANCY_FIELD: str = "Life expectancy (years) (percentile)"
+
+        # Unemployment
+        self.UNEMPLOYMENT_FIELD: str = (
+            "Unemployed civilians (percent) (percentile)"
+        )
+
+        # Median income as % of AMI
+        self.MEDIAN_INCOME_FIELD: str = (
+            "Median household income (% of AMI) (percentile)"
+        )
+
+        # Linguistic isolation
+        self.LINGUISTIC_ISO_FIELD: str = (
+            "Linguistic isolation (percent) (percentile)"
+        )
+
+        # Less than high school education
+        self.HIGH_SCHOOL_ED_FIELD: str = "Percent individuals age 25 or over with less than high school degree (percentile)"
+
+        # Set thresholds for score L
+        # Both thresholds are compared directly against the "(percentile)"
+        # columns; presumably those columns hold fractions in [0, 1] rather
+        # than values in [0, 100] -- TODO confirm against the upstream ETL.
+        self.LOW_INCOME_THRESHOLD: float = 0.60
+        self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
+
+    def add_definition_l_factors(self):
+        """Add every Definition L factor column to `self.df` and return it.
+
+        Writes the eight "<Name> Factor (Definition L)" columns, then
+        "Definition L (communities)" (row-wise `any` over all eight factors)
+        and "Any Non-Workforce Factor (Definition L)" (row-wise `any` over
+        every factor except the workforce factor).
+
+        Returns:
+            The dataframe object held in `self.df`, with the new columns.
+        """
+        self.df["Climate Factor (Definition L)"] = self.climate_factor()
+        self.df["Energy Factor (Definition L)"] = self.energy_factor()
+        self.df[
+            "Transportation Factor (Definition L)"
+        ] = self.transportation_factor()
+        self.df["Housing Factor (Definition L)"] = self.housing_factor()
+        # pollution_factor() currently returns the scalar False, which is
+        # assigned here as the value of the whole column.
+        self.df["Pollution Factor (Definition L)"] = self.pollution_factor()
+        self.df["Water Factor (Definition L)"] = self.water_factor()
+        self.df["Health Factor (Definition L)"] = self.health_factor()
+        self.df["Workforce Factor (Definition L)"] = self.workforce_factor()
+
+        # Column names must match the assignments above exactly.
+        factors = [
+            "Climate Factor (Definition L)",
+            "Energy Factor (Definition L)",
+            "Transportation Factor (Definition L)",
+            "Housing Factor (Definition L)",
+            "Pollution Factor (Definition L)",
+            "Water Factor (Definition L)",
+            "Health Factor (Definition L)",
+            "Workforce Factor (Definition L)",
+        ]
+        self.df["Definition L (communities)"] = self.df[factors].any(axis=1)
+
+        # Note: this is purely used for comparison tool analysis, and can be removed at a later date. - LMB.
+        # Same list as `factors` above, minus the workforce factor.
+        non_workforce_factors = [
+            "Climate Factor (Definition L)",
+            "Energy Factor (Definition L)",
+            "Transportation Factor (Definition L)",
+            "Housing Factor (Definition L)",
+            "Pollution Factor (Definition L)",
+            "Water Factor (Definition L)",
+            "Health Factor (Definition L)",
+        ]
+        self.df["Any Non-Workforce Factor (Definition L)"] = self.df[
+            non_workforce_factors
+        ].any(axis=1)
+
+        return self.df
+
+    def climate_factor(self) -> bool:
+        """Flag rows above both the low-income and FEMA risk thresholds.
+
+        NOTE(review): annotated `-> bool`, but the expression returned is an
+        element-wise comparison of dataframe columns, i.e. one boolean per row.
+        """
+        # In Xth percentile or above for FEMA’s Risk Index (Source: FEMA)
+        # AND
+        # Low income: In 60th percentile or above for percent of block group population
+        # of households where household income is less than or equal to twice the federal
+        # poverty level. (Source: Census's American Community Survey)
+        return (
+            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
+            > self.LOW_INCOME_THRESHOLD
+        ) & (
+            self.df[self.NATIONAL_RISK_FIELD]
+            > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+        )
+
+    def energy_factor(self) -> bool:
+        """Flag rows above both the low-income and energy burden thresholds.
+
+        NOTE(review): annotated `-> bool`, but the expression returned is an
+        element-wise comparison of dataframe columns, i.e. one boolean per row.
+        """
+        # In Xth percentile or above for DOE’s energy cost burden score (Source: LEAD Score)
+        # AND
+        # Low income: In 60th percentile or above for percent of block group population
+        # of households where household income is less than or equal to twice the federal
+        # poverty level. (Source: Census's American Community Survey)
+        return (
+            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
+            > self.LOW_INCOME_THRESHOLD
+        ) & (
+            self.df[self.ENERGY_BURDEN_FIELD]
+            > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+        )
+
+    def transportation_factor(self) -> bool:
+        """Flag low-income rows above the burden threshold for diesel, PM2.5 or traffic.
+
+        NOTE(review): annotated `-> bool`, but the expression returned is an
+        element-wise comparison of dataframe columns, i.e. one boolean per row.
+        """
+        # In Xth percentile or above for diesel particulate matter (Source: EPA National Air Toxics Assessment (NATA))
+        # or
+        # In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)
+        # or
+        # In Xth percentile or above for traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data)
+        # AND
+        # Low income: In 60th percentile or above for percent of block group population
+        # of households where household income is less than or equal to twice the federal
+        # poverty level. (Source: Census's American Community Survey)
+        transportation_criteria = (
+            (self.df[self.DIESEL_FIELD] > self.ENVIRONMENTAL_BURDEN_THRESHOLD)
+            | (self.df[self.PM25_FIELD] > self.ENVIRONMENTAL_BURDEN_THRESHOLD)
+            | (
+                self.df[self.TRAFFIC_FIELD]
+                > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+        )
+
+        return (
+            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
+            > self.LOW_INCOME_THRESHOLD
+        ) & transportation_criteria
+
+    def housing_factor(self) -> bool:
+        """Flag low-income rows above the burden threshold for lead paint or housing burden.
+
+        NOTE(review): annotated `-> bool`, but the expression returned is an
+        element-wise comparison of dataframe columns, i.e. one boolean per row.
+        """
+        # In Xth percentile or above for lead paint (Source: Census's American Community Survey’s
+        # percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
+        # or
+        # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset)
+        # AND
+        # Low income: In 60th percentile or above for percent of block group population
+        # of households where household income is less than or equal to twice the federal
+        # poverty level. (Source: Census's American Community Survey)
+        housing_criteria = (
+            self.df[self.LEAD_PAINT_FIELD] > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+        ) | (
+            self.df[self.HOUSING_BURDEN_FIELD]
+            > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+        )
+        return (
+            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
+            > self.LOW_INCOME_THRESHOLD
+        ) & housing_criteria
+
+    def pollution_factor(self) -> bool:
+        """Placeholder: the criteria are still TBD, so this always returns False."""
+        # TBD
+        # AND
+        # Low income: In 60th percentile or above for percent of block group population
+        # of households where household income is less than or equal to twice the federal
+        # poverty level. (Source: Census's American Community Survey)
+        return False
+
+    def water_factor(self) -> bool:
+        """Flag rows above both the low-income and wastewater discharge thresholds.
+
+        NOTE(review): annotated `-> bool`, but the expression returned is an
+        element-wise comparison of dataframe columns, i.e. one boolean per row.
+        """
+        # In Xth percentile or above for wastewater discharge (Source: EPA Risk-Screening Environmental Indicators (RSEI) Model)
+        # AND
+        # Low income: In 60th percentile or above for percent of block group population
+        # of households where household income is less than or equal to twice the federal
+        # poverty level. (Source: Census's American Community Survey)
+        return (
+            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
+            > self.LOW_INCOME_THRESHOLD
+        ) & (
+            self.df[self.WASTEWATER_FIELD] > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+        )
+
+    def health_factor(self) -> bool:
+        """Flag low-income rows that meet any of the four health criteria.
+
+        Diabetes, asthma and heart disease qualify when above the burden
+        threshold; life expectancy qualifies when below one minus that
+        threshold.
+
+        NOTE(review): annotated `-> bool`, but the expression returned is an
+        element-wise comparison of dataframe columns, i.e. one boolean per row.
+        """
+        # In Xth percentile or above for diabetes (Source: CDC Places)
+        # or
+        # In Xth percentile or above for asthma (Source: CDC Places)
+        # or
+        # In Xth percentile or above for heart disease
+        # or
+        # In Xth percentile or above for low life expectancy (Source: CDC Places)
+        # AND
+        # Low income: In 60th percentile or above for percent of block group population
+        # of households where household income is less than or equal to twice the federal
+        # poverty level. (Source: Census's American Community Survey)
+
+        health_criteria = (
+            (self.df[self.DIABETES_FIELD] > self.ENVIRONMENTAL_BURDEN_THRESHOLD)
+            | (self.df[self.ASTHMA_FIELD] > self.ENVIRONMENTAL_BURDEN_THRESHOLD)
+            | (
+                self.df[self.HEART_DISEASE_FIELD]
+                > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[self.LIFE_EXPECTANCY_FIELD]
+                # Note: a high life expectancy is good, so take 1 minus the threshold to invert it,
+                # and then look for life expectancies lower than that (not greater than).
+                < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+        )
+        return (
+            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
+            > self.LOW_INCOME_THRESHOLD
+        ) & health_criteria
+
+    def workforce_factor(self) -> bool:
+        """Flag rows that pass the high-school screen and any workforce criterion.
+
+        Unlike the other factors, this one does not apply the 200% FPL
+        low-income test; it is gated on the high-school education column.
+
+        NOTE(review): annotated `-> bool`, but the expression returned is an
+        element-wise comparison of dataframe columns, i.e. one boolean per row.
+        """
+        # Where unemployment is above X%
+        # or
+        # Where median income is less than Y% of the area median income
+        # or
+        # Where the percent of households at or below 100% of the federal poverty level is greater than Z%
+        # or
+        # Where linguistic isolation is greater than Y%
+        # AND
+        # Where the high school degree achievement rates for adults 25 years and older is less than 95%
+        # (necessary to screen out university block groups)
+        workforce_criteria = (
+            (
+                self.df[self.UNEMPLOYMENT_FIELD]
+                > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[self.MEDIAN_INCOME_FIELD]
+                # Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it.
+                # and then look for median income lower than that (not greater than).
+                < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[self.POVERTY_LESS_THAN_100_FPL_FIELD]
+                > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[self.LINGUISTIC_ISO_FIELD]
+                > self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+        )
+        # NOTE(review): HIGH_SCHOOL_ED_FIELD names a "(percentile)" column, but
+        # 0.05 reads like a raw rate (5% without a degree == the 95% in the
+        # comment above). Confirm whether the non-percentile column was meant.
+        return (self.df[self.HIGH_SCHOOL_ED_FIELD] > 0.05) & workforce_criteria
--- a/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/scoring_comparison.ipynb
@ -119,13 +119,14 @@
   "source": [
    "# Analyze one field at a time (useful for setting thresholds)\n",
    "\n",
-    "quantile = 0.8\n",
+    "quantile = 0.9\n",
    "\n",
    "for field in [\n",
-    "    \"Percent of individuals < 200% Federal Poverty Line\",\n",
-    "    \"Life expectancy (years)\",\n",
-    "    \"Energy burden\",\n",
-    "    URBAN_HEURISTIC_FIELD,\n",
+    "    \"Linguistic isolation (percent)\",\n",
+    "    \"Diesel particulate matter (percentile)\",\n",
+    "    \"Particulate matter (PM2.5) (percentile)\",\n",
+    "    \"Traffic proximity and volume (percentile)\",\n",
+    "    \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n",
    "]:\n",
    "    print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
    "    print(cejst_df[field].describe())\n",
@ -234,7 +235,7 @@
   "execution_count": null,
   "id": "8da016db",
   "metadata": {
-    "scrolled": false
+    "scrolled": true
   },
   "outputs": [],
   "source": [
@ -278,64 +279,83 @@
    "\n",
    "# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n",
    "# (`census_tract_indices`).\n",
-    "census_block_group_indices = [\n",
-    "    Index(\n",
-    "        method_name=\"Score G\",\n",
-    "        priority_communities_field=\"Score G (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score H\",\n",
-    "        priority_communities_field=\"Score H (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score I\",\n",
-    "        priority_communities_field=\"Score I (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"NMTC\",\n",
-    "        priority_communities_field=\"NMTC (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score A\",\n",
-    "        priority_communities_field=\"Score A (top 25th percentile)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score B\",\n",
-    "        priority_communities_field=\"Score B (top 25th percentile)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score C\",\n",
-    "        priority_communities_field=\"Score C (top 25th percentile)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score D (25th percentile)\",\n",
-    "        priority_communities_field=\"Score D (top 25th percentile)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Score F\",\n",
-    "        priority_communities_field=\"Score F (communities)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Poverty\",\n",
-    "        priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
-    "    Index(\n",
-    "        method_name=\"Persistent Poverty (CBG)\",\n",
-    "        priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
-    "        other_census_tract_fields_to_keep=[],\n",
-    "    ),\n",
+    "\n",
+    "definition_l_factors = [\n",
+    "    \"Climate Factor (Definition L)\",\n",
+    "    \"Energy Factor (Definition L)\",\n",
+    "    \"Transportation Factor (Definition L)\",\n",
+    "    \"Housing Factor (Definition L)\",\n",
+    "    \"Pollution Factor (Definition L)\",\n",
+    "    \"Water Factor (Definition L)\",\n",
+    "    \"Health Factor (Definition L)\",\n",
+    "    \"Workforce Factor (Definition L)\",\n",
+    "    # Also include a combined factor for all the non-workforce elements.\n",
+    "    \"Any Non-Workforce Factor (Definition L)\",\n",
    "]\n",
    "\n",
+    "census_block_group_indices = (\n",
+    "    [\n",
+    "        Index(\n",
+    "            method_name=\"Definition L\",\n",
+    "            priority_communities_field=\"Definition L (communities)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "    ]\n",
+    "    # Insert indices for each of the factors from Definition L.\n",
+    "    # Note: since these involve no renaming, we write them using list comprehension.\n",
+    "    + [\n",
+    "        Index(\n",
+    "            method_name=factor,\n",
+    "            priority_communities_field=factor,\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        )\n",
+    "        for factor in definition_l_factors\n",
+    "    ]\n",
+    "    + [\n",
+    "        Index(\n",
+    "            # Note: we're renaming Score G as NMTC Modified for clarity, since that's what Score G is under the hood.\n",
+    "            method_name=\"NMTC Modified\",\n",
+    "            priority_communities_field=\"Score G (communities)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "        Index(\n",
+    "            method_name=\"NMTC\",\n",
+    "            priority_communities_field=\"NMTC (communities)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "        Index(\n",
+    "            method_name=\"Score C\",\n",
+    "            priority_communities_field=\"Score C (top 25th percentile)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "        Index(\n",
+    "            method_name=\"Score D (30th percentile)\",\n",
+    "            priority_communities_field=\"Score D (top 30th percentile)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "        Index(\n",
+    "            method_name=\"Score D (25th percentile)\",\n",
+    "            priority_communities_field=\"Score D (top 25th percentile)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "        Index(\n",
+    "            method_name=\"Score F\",\n",
+    "            priority_communities_field=\"Score F (communities)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "        Index(\n",
+    "            method_name=\"Poverty\",\n",
+    "            priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "        Index(\n",
+    "            method_name=\"Persistent Poverty (CBG)\",\n",
+    "            priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
+    "            other_census_tract_fields_to_keep=[],\n",
+    "        ),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
    "census_tract_indices = [\n",
    "    Index(\n",
    "        method_name=\"Persistent Poverty\",\n",
@ -623,9 +643,7 @@
    "write_state_distribution_excel(\n",
    "    state_distribution_df=state_distribution_df,\n",
    "    file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n",
-    ")\n",
-    "\n",
-    "state_distribution_df.head()"
+    ")"
   ]
  },
  {
@ -633,7 +651,7 @@
   "execution_count": null,
   "id": "8790cd64",
   "metadata": {
-    "scrolled": false
+    "scrolled": true
   },
   "outputs": [],
   "source": [
@ -1461,7 +1479,7 @@
   "execution_count": null,
   "id": "908e0ad4",
   "metadata": {
-    "scrolled": true
+    "scrolled": false
   },
   "outputs": [],
   "source": [