Issue 970: reverse percentiles for AMI and life expectancy (#1018)

* switching to low * fixing score-etl-post * updating comments * fixing comparison * create separate field for clarity * comment fix * removing healthy food * fixing bug in score post * running black and adding comment * Update pickles and add helpful notes to README Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
2025-09-30 03:03:17 -07:00 · 2021-12-10 10:16:22 -05:00 · 2021-12-10 10:16:22 -05:00 · 7fcecaee42
commit 7fcecaee42
parent 24bac56d9e
11 changed files with 144 additions and 100 deletions
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -60,11 +60,15 @@ MEDIAN_INCOME_FIELD = "Median household income in the past 12 months"
 MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD = (
    "Median household income (% of state median household income)"
 )
-MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD = "Median household income (% of AMI)"
 PERSISTENT_POVERTY_FIELD = "Persistent Poverty Census Tract"
 AMI_FIELD = "Area Median Income (State or metropolitan)"
-
 COLLEGE_ATTENDANCE_FIELD = "Percent enrollment in college or graduate school"
+MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD = (
+    "Median household income as a percent of area median income"
+)
+LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD = (
+    "Low median household income as a percent of area median income"
+)

 # Climate
 FEMA_RISK_FIELD = "FEMA Risk Index Expected Annual Loss Score"
@@ -105,7 +109,6 @@ ENERGY_BURDEN_FIELD = "Energy burden"
 DIABETES_FIELD = "Diagnosed diabetes among adults aged >=18 years"
 ASTHMA_FIELD = "Current asthma among adults aged >=18 years"
 HEART_DISEASE_FIELD = "Coronary heart disease among adults aged >=18 years"
-LIFE_EXPECTANCY_FIELD = "Life expectancy (years)"
 CANCER_FIELD = "Cancer (excluding skin cancer) among adults aged >=18 years"
 HEALTH_INSURANCE_FIELD = (
    "Current lack of health insurance among adults aged 18-64 years"
@@ -113,6 +116,8 @@ HEALTH_INSURANCE_FIELD = (
 PHYS_HEALTH_NOT_GOOD_FIELD = (
    "Physical health not good for >=14 days among adults aged >=18 years"
 )
+LIFE_EXPECTANCY_FIELD = "Life expectancy (years)"
+LOW_LIFE_EXPECTANCY_FIELD = "Low life expectancy"

 # Other Demographics
 TOTAL_POP_FIELD = "Total population"
@@ -130,9 +135,6 @@ OVER_64_FIELD = "Individuals over 64 years old"

 # Fields from 2010 decennial census (generally only loaded for the territories)
 CENSUS_DECENNIAL_MEDIAN_INCOME_2009 = "Median household income in 2009 ($)"
-CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = (
-    "Median household income as a percent of territory median income in 2009"
-)
 CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = (
    "Percentage households below 100% of federal poverty line in 2009"
 )
@@ -141,7 +143,10 @@ CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = (
    "Unemployed civilians (percent) in 2009"
 )
 CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009"
-
+CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = (
+    "Median household income as a percent of territory median income in 2009"
+)
+LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = "Low median household income as a percent of territory median income in 2009"
 # Fields from 2010 ACS (loaded for comparison with the territories)
 CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010"
 CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = (
@@ -265,7 +270,10 @@ ASTHMA_LOW_INCOME_FIELD = (
 )
 HEART_DISEASE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for heart disease and is low income"

-LIFE_EXPECTANCY_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for life expectancy and is low income"
+LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD = (
+    f"At or above the {PERCENTILE}th percentile "
+    f"for low life expectancy and is low income"
+)

 # Workforce
 UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD = (
@@ -288,9 +296,9 @@ LOW_READING_LOW_HS_EDUCATION_FIELD = (
    " and has low HS education"
 )

-MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
-    f"At or below the {PERCENTILE}th percentile for median income"
-    "  and has low HS education"
+LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = (
+    f"At or below the {PERCENTILE}th percentile for low median household income as a "
+    f"percent of area median income and has low HS education"
 )

 # Not currently used in a factor
--- a/data/data-pipeline/data_pipeline/score/score_l.py
+++ b/data/data-pipeline/data_pipeline/score/score_l.py
@@ -44,6 +44,8 @@ class ScoreL(Score):
        robustness over 1-year ACS.
        """
        # Create the combined field.
+        # TODO: move this combined field percentile calculation to `etl_score`,
+        #  since most other percentile logic is there.
        # There should only be one entry in either 2009 or 2019 fields, not one in both.
        # But just to be safe, we take the mean and ignore null values so if there
        # *were* entries in both, this result would make sense.
@@ -169,7 +171,7 @@ class ScoreL(Score):
    def _climate_factor(self) -> bool:
        # In Xth percentile or above for FEMA’s Risk Index (Source: FEMA
        # AND
-        # Low income: In 60th percentile or above for percent of block group population
+        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]

@@ -225,7 +227,7 @@ class ScoreL(Score):
    def _energy_factor(self) -> bool:
        # In Xth percentile or above for DOE’s energy cost burden score (Source: LEAD Score)
        # AND
-        # Low income: In 60th percentile or above for percent of block group population
+        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]

@@ -268,7 +270,7 @@ class ScoreL(Score):
        # or
        # In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data
        # AND
-        # Low income: In 60th percentile or above for percent of block group population
+        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]

@@ -315,7 +317,7 @@ class ScoreL(Score):
        # or
        # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset
        # AND
-        # Low income: In 60th percentile or above for percent of block group population
+        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]

@@ -363,7 +365,7 @@ class ScoreL(Score):
    def _pollution_factor(self) -> bool:
        # Proximity to Risk Management Plan sites is > X
        # AND
-        # Low income: In 60th percentile or above for percent of block group population
+        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]

@@ -410,7 +412,7 @@ class ScoreL(Score):
    def _water_factor(self) -> bool:
        # In Xth percentile or above for wastewater discharge (Source: EPA Risk-Screening Environmental Indicators (RSEI) Model)
        # AND
-        # Low income: In 60th percentile or above for percent of block group population
+        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]

@@ -441,7 +443,7 @@ class ScoreL(Score):
        # or
        # In Xth percentile or above for low life expectancy (Source: CDC Places)
        # AND
-        # Low income: In 60th percentile or above for percent of block group population
+        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]

@@ -449,8 +451,7 @@ class ScoreL(Score):
            field_names.DIABETES_LOW_INCOME_FIELD,
            field_names.ASTHMA_LOW_INCOME_FIELD,
            field_names.HEART_DISEASE_LOW_INCOME_FIELD,
-            field_names.HEALTHY_FOOD_LOW_INCOME_FIELD,
-            field_names.LIFE_EXPECTANCY_LOW_INCOME_FIELD,
+            field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
        ]

        diabetes_threshold = (
@@ -475,24 +476,14 @@ class ScoreL(Score):
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

-        healthy_food_threshold = (
+        low_life_expectancy_threshold = (
            self.df[
-                field_names.HEALTHY_FOOD_FIELD
+                field_names.LOW_LIFE_EXPECTANCY_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

-        life_expectancy_threshold = (
-            self.df[
-                field_names.LIFE_EXPECTANCY_FIELD
-                + field_names.PERCENTILE_FIELD_SUFFIX
-            ]
-            # Note: a high life expectancy is good, so take 1 minus the threshold to invert it,
-            # and then look for life expenctancies lower than that (not greater than).
-            <= 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
-        )
-
        self.df[field_names.DIABETES_LOW_INCOME_FIELD] = (
            diabetes_threshold & self.df[field_names.FPL_200_SERIES]
        )
@@ -502,11 +493,8 @@ class ScoreL(Score):
        self.df[field_names.HEART_DISEASE_LOW_INCOME_FIELD] = (
            heart_disease_threshold & self.df[field_names.FPL_200_SERIES]
        )
-        self.df[field_names.LIFE_EXPECTANCY_LOW_INCOME_FIELD] = (
-            life_expectancy_threshold & self.df[field_names.FPL_200_SERIES]
-        )
-        self.df[field_names.HEALTHY_FOOD_LOW_INCOME_FIELD] = (
-            healthy_food_threshold & self.df[field_names.FPL_200_SERIES]
+        self.df[field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD] = (
+            low_life_expectancy_threshold & self.df[field_names.FPL_200_SERIES]
        )

        self._increment_total_eligibility_exceeded(health_eligibility_columns)
@@ -514,23 +502,25 @@ class ScoreL(Score):
        return self.df[health_eligibility_columns].any(axis="columns")

    def _workforce_factor(self) -> bool:
-        # Where unemployment is above X%
+        # Where unemployment is above Xth percentile
        # or
-        # Where median income is less than Y% of the area median income
+        # Where low median income as a percent of area median income is above Xth percentile
        # or
-        # Where the percent of households at or below 100% of the federal poverty level is greater than Z%
+        # Where the percent of households at or below 100% of the federal poverty level
+        # is above Xth percentile
        # or
-        # Where linguistic isolation is greater than Y%
+        # Where linguistic isolation is above Xth percentile
        # AND
-        # Where the high school degree achievement rates for adults 25 years and older is less than 95%
-        # (necessary to screen out university block groups)
+        # Where the high school degree achievement rates for adults 25 years and older
+        # is less than Y%
+        # (necessary to screen out university tracts)

        # Workforce criteria for states fields.
        workforce_eligibility_columns = [
            field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
            field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
            field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
-            field_names.MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
+            field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
        ]

        high_scool_achievement_rate_threshold = (
@@ -546,14 +536,12 @@ class ScoreL(Score):
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

-        median_income_threshold = (
+        low_median_income_threshold = (
            self.df[
-                field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+                field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
-            # Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it.
-            # and then look for median income lower than that (not greater than).
-            <= 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
+            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

        linguistic_isolation_threshold = (
@@ -581,8 +569,8 @@ class ScoreL(Score):
            poverty_threshold & high_scool_achievement_rate_threshold
        )

-        self.df[field_names.MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD] = (
-            median_income_threshold & high_scool_achievement_rate_threshold
+        self.df[field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD] = (
+            low_median_income_threshold & high_scool_achievement_rate_threshold
        )

        self.df[field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD] = (
@@ -624,23 +612,31 @@ class ScoreL(Score):
            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
        )

+        # Also check whether low median household income as a percent of territory
+        # median income is at or above the Xth percentile within the islands.
+        low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name = (
+            f"{field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009} exceeds "
+            f"{field_names.PERCENTILE}th percentile"
+        )
+        self.df[
+            low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name
+        ] = (
+            self.df[
+                field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
+                + field_names.PERCENTILE_FIELD_SUFFIX
+            ]
+            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+        )
+
        workforce_combined_criteria_for_island_areas = (
            self.df[unemployment_island_areas_criteria_field_name]
            | self.df[poverty_island_areas_criteria_field_name]
-            # Also check whether area median income is 10th percentile or lower
-            # within the islands.
-            | (
-                self.df[
-                    field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
-                    + field_names.PERCENTILE_FIELD_SUFFIX
-                ]
-                # Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it.
-                # and then look for median income lower than that (not greater than).
-                < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD
-            )
+            | self.df[
+                low_median_income_as_a_percent_of_ami_island_areas_criteria_field_name
+            ]
        ) & (
            self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
-            > self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
+            >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
        )

        percent_of_island_tracts_highlighted = (