Issue 844: Add island areas to Definition L (#957)

This ended up being a pretty large task. Here's what this PR does: 1. Pulls in Vincent's data from island areas into the score ETL. This is from the 2010 decennial census, the last census of any kind in the island areas. 2. Grabs a few new fields from 2010 island areas decennial census. 3. Calculates area median income for island areas. 4. Stops using EJSCREEN as the source of our high school education data and directly pulls that from census (this was related to this project so I went ahead and fixed it). 5. Grabs a bunch of data from the 2010 ACS in the states/Puerto Rico/DC, so that we can create percentiles comparing apples-to-apples (ish) from 2010 island areas decennial census data to 2010 ACS data. This required creating a new class because all the ACS fields are different between 2010 and 2019, so it wasn't as simple as looping over a year parameter. 6. Creates a combined population field of island areas and mainland so we can use those stats in our comparison tool, and updates the comparison tool accordingly.
2025-07-28 08:41:16 -07:00 · 2021-12-03 15:46:10 -05:00 · 2021-12-03 15:46:10 -05:00 · 1d101c93d2
commit 1d101c93d2
parent 8cb9d197df
15 changed files with 882 additions and 153 deletions
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@ -57,13 +57,13 @@ AMI_FIELD = "Area Median Income (State or metropolitan)"

 # Climate
 FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score"
-EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
+EXPECTED_BUILDING_LOSS_RATE_FIELD = (
    "Expected building loss rate (Natural Hazards Risk Index)"
 )
-EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = (
+EXPECTED_AGRICULTURE_LOSS_RATE_FIELD = (
    "Expected agricultural loss rate (Natural Hazards Risk Index)"
 )
-EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = (
+EXPECTED_POPULATION_LOSS_RATE_FIELD = (
    "Expected population loss rate (Natural Hazards Risk Index)"
 )

@ -117,6 +117,34 @@ AGGREGATION_POPULATION_FIELD = "Population Characteristics"
 UNDER_5_FIELD = "Individuals under 5 years old"
 OVER_64_FIELD = "Individuals over 64 years old"

+# Fields from 2010 decennial census (generally only loaded for the territories)
+CENSUS_DECENNIAL_MEDIAN_INCOME_2009 = "Median household income in 2009 ($)"
+CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = (
+    "Median household income as a percent of territory median income in 2009"
+)
+CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = (
+    "Percentage households below 100% of federal poverty line in 2009"
+)
+CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009"
+CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployed civilians (percent) in 2009"
+CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009"
+
+# Fields from 2010 ACS (loaded for comparison with the territories)
+CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010"
+CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
+    "Percent of individuals < 100% Federal Poverty Line in 2010"
+)
+
+# Combined fields that merge island areas and states data
+COMBINED_CENSUS_TOTAL_POPULATION_2010 = (
+    "Total population in 2009 (island areas) and 2019 (states and PR)"
+)
+COMBINED_UNEMPLOYMENT_2010 = "Unemployed civilians (percent) in 2009 (island areas) and 2010 (states and PR)"
+COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
+    "Percentage households below 100% of federal poverty line in 2009 (island areas) "
+    "and 2010 (states and PR)"
+)
+
 # Urban Rural Map
 URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"

@ -124,39 +152,39 @@ URBAN_HEURISTIC_FIELD = "Urban Heuristic Flag"
 MEDIAN_HOUSE_VALUE_FIELD = "Median value ($) of owner-occupied housing units"

 # EJSCREEN Areas of Concern
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_70TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, National, 70th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_75TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, National, 75th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_80TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, National, 80th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_85TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, National, 85th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_90TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, National, 90th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_NATIONAL_95TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, National, 95th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_70TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, State, 70th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_75TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, State, 75th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_80TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, State, 80th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_85TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, State, 85th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_90TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, State, 90th percentile (communities)"
 )
-EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD_NAME = (
+EJSCREEN_AREAS_OF_CONCERN_STATE_95TH_PERCENTILE_COMMUNITIES_FIELD = (
    "EJSCREEN Areas of Concern, State, 95th percentile (communities)"
 )
--- a/data/data-pipeline/data_pipeline/score/score_l.py
+++ b/data/data-pipeline/data_pipeline/score/score_l.py
@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd

 from data_pipeline.score.score import Score
@ -12,8 +13,86 @@ class ScoreL(Score):
        self.LOW_INCOME_THRESHOLD: float = 0.65
        self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
        self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
+        self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
+
        super().__init__(df)

+    def _combine_island_areas_with_states_and_set_thresholds(
+        self,
+        df: pd.DataFrame,
+        column_from_island_areas: str,
+        column_from_decennial_census: str,
+        combined_column_name: str,
+        threshold_cutoff_for_island_areas: float,
+    ) -> (pd.DataFrame, str):
+        """Steps to set thresholds for island areas.
+
+        This function is fairly logically complicated. It takes the following steps:
+
+            1. Combine the two different fields into a single field.
+            2. Calculate the 90th percentile cutoff raw value for the combined field.
+            3. Create a boolean series that is true for any census tract in the island
+                areas (and only the island areas) that exceeds this cutoff.
+
+        For step one, it combines data that is either the island area's Decennial Census
+        value in 2009 or the state's value in 5-year ACS ending in 2010.
+
+        This will be used to generate the percentile cutoff for the 90th percentile.
+
+        The stateside decennial census stopped asking economic questions,
+        so this is as close to apples-to-apples as we get. We use 5-year ACS for data
+        robustness over 1-year ACS.
+        """
+        # Create the combined field.
+        # There should only be one entry in either 2009 or 2010 fields, not one in both.
+        # But just to be safe, we take the mean and ignore null values so if there
+        # *were* entries in both, this result would make sense.
+        df[combined_column_name] = df[
+            [column_from_island_areas, column_from_decennial_census]
+        ].mean(axis=1, skipna=True)
+
+        logger.info(
+            f"Combined field `{combined_column_name}` has "
+            f"{df[combined_column_name].isnull().sum()} "
+            f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) "
+            f"missing values for census tracts. "
+        )
+
+        # Calculate the percentile threshold raw value.
+        raw_threshold = np.nanquantile(
+            a=df[combined_column_name], q=threshold_cutoff_for_island_areas
+        )
+
+        logger.info(
+            f"For combined field `{combined_column_name}`, "
+            f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a "
+            f"raw value of {raw_threshold:.3f}."
+        )
+
+        threshold_column_name = (
+            f"{column_from_island_areas} exceeds "
+            f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
+        )
+
+        df[threshold_column_name] = (
+            df[column_from_island_areas] >= raw_threshold
+        )
+
+        percent_of_tracts_highlighted = (
+            100
+            * df[threshold_column_name].sum()
+            / df[column_from_island_areas].notnull().sum()
+        )
+
+        logger.info(
+            f"For `{threshold_column_name}`, "
+            f"{df[threshold_column_name].sum()} ("
+            f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data "
+            f"in the column) have a value of TRUE."
+        )
+
+        return df, threshold_column_name
+
    def add_columns(self) -> pd.DataFrame:
        logger.info("Adding Score L")

@ -67,21 +146,21 @@ class ScoreL(Score):
        climate_criteria = (
            (
                self.df[
-                    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME
+                    field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
                    + field_names.PERCENTILE_FIELD_SUFFIX
                ]
                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
            )
            | (
                self.df[
-                    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
+                    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
                    + field_names.PERCENTILE_FIELD_SUFFIX
                ]
                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
            )
            | (
                self.df[
-                    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME
+                    field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
                    + field_names.PERCENTILE_FIELD_SUFFIX
                ]
                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
@ -204,14 +283,24 @@ class ScoreL(Score):
        # poverty level. Source: Census's American Community Survey]

        pollution_criteria = (
-            self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
-            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
-        ) | (
-            self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
-            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
-        ) | (
-            self.df[field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
-            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            (
+                self.df[
+                    field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[
+                    field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+            | (
+                self.df[
+                    field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
        )

        return pollution_criteria & (
@ -306,7 +395,7 @@ class ScoreL(Score):
        # AND
        # Where the high school degree achievement rates for adults 25 years and older is less than 95%
        # (necessary to screen out university block groups)
-        workforce_criteria = (
+        workforce_criteria_for_states = (
            (
                self.df[
                    field_names.UNEMPLOYMENT_FIELD
@ -338,6 +427,76 @@ class ScoreL(Score):
                >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
            )
        )
+        workforce_combined_criteria_for_states = (
+            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
+            >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
+        ) & workforce_criteria_for_states
+
+        # Now, calculate workforce criteria for island territories.
+
+        # For a couple of values, create a combined field and criteria field.
+        # First, combine unemployment.
+        (
+            self.df,
+            unemployment_island_areas_criteria_field_name,
+        ) = self._combine_island_areas_with_states_and_set_thresholds(
+            df=self.df,
+            column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
+            column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
+            combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
+            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
+        )
+
+        # Next, combine poverty.
+        (
+            self.df,
+            poverty_island_areas_criteria_field_name,
+        ) = self._combine_island_areas_with_states_and_set_thresholds(
+            df=self.df,
+            column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
+            column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
+            combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
+            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
+        )
+
+        workforce_combined_criteria_for_island_areas = (
+            self.df[unemployment_island_areas_criteria_field_name]
+            | self.df[poverty_island_areas_criteria_field_name]
+            # Also check whether area median income is 10th percentile or lower
+            # within the islands.
+            | (
+                self.df[
+                    field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
+                    + field_names.PERCENTILE_FIELD_SUFFIX
+                ]
+                # Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it,
+                # and then look for median income lower than that (not greater than).
+                < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            )
+        ) & (
+            self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
+            > self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
+        )
+
+        percent_of_island_tracts_highlighted = (
+            100
+            * workforce_combined_criteria_for_island_areas.sum()
+            # Choosing an arbitrary column from island areas to calculate the denominator.
+            / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
+            .notnull()
+            .sum()
+        )
+
+        logger.info(
+            f"For workforce criteria in island areas, "
+            f"{workforce_combined_criteria_for_island_areas.sum()} ("
+            f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
+            f"in the column) have a value of TRUE."
+        )
+
+        # A tract is included if it meets either the states tract criteria or the
+        # island areas tract criteria.
        return (
-            self.df[field_names.HIGH_SCHOOL_ED_FIELD] >= 0.10
-        ) & workforce_criteria
+            workforce_combined_criteria_for_states
+            | workforce_combined_criteria_for_island_areas
+        )