Add Score L (#812)

* Create ScoreCalculator

This calculates all the factors for score L for now (with placeholder
formulae because this is a WIP). I think ideally we'll want to
refactor all the score code to be extracted into this or similar
classes.

* Add factor logic for score L

Updated factor logic to match score L factors methodology.
Still need to get the Score L field itself working.

Cleanup needed: Pull field names into constants file, extract all score
calculation into score calculator

Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
This commit is contained in:
Shelby Switzer 2021-10-28 16:07:41 -04:00 committed by GitHub
parent b1adc1f69f
commit 7b87e0ec99
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 385 additions and 73 deletions

View file

@ -5,6 +5,7 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.score.score_calculator import ScoreCalculator
logger = get_module_logger(__name__)
@ -522,7 +523,7 @@ class ScoreETL(ExtractTransformLoad):
)
return df
def _add_scores_d_and_e(self, df: pd.DataFrame) -> pd.DataFrame:
def _add_scores_d_e(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Adding Scores D and E")
fields_to_use_in_score = [
self.UNEMPLOYED_FIELD_NAME,
@ -641,8 +642,8 @@ class ScoreETL(ExtractTransformLoad):
)
return df
def _add_score_g(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Adding Score G")
def _add_score_g_k(self, df: pd.DataFrame) -> pd.DataFrame:
logger.info("Adding Score G through K")
high_school_cutoff_threshold = 0.05
high_school_cutoff_threshold_2 = 0.06
@ -691,6 +692,12 @@ class ScoreETL(ExtractTransformLoad):
return df
def _add_definition_l_factors(self, df: pd.DataFrame) -> pd.DataFrame:
    """Add the Definition L factor columns to ``df``.

    Delegates the work to ``ScoreCalculator.add_definition_l_factors`` and
    returns the dataframe that call hands back.
    """
    logger.info("Adding Definition L and factors")
    return ScoreCalculator(df=df).add_definition_l_factors()
# TODO Move a lot of this to the ETL part of the pipeline
def _prepare_initial_df(self, data_sets: list) -> pd.DataFrame:
logger.info("Preparing initial dataframe")
@ -831,7 +838,7 @@ class ScoreETL(ExtractTransformLoad):
self.df = self._add_score_c(self.df, data_sets)
# Calculate scores "D" and "E"
self.df = self._add_scores_d_and_e(self.df)
self.df = self._add_scores_d_e(self.df)
# Create percentiles for the scores
self.df = self._add_score_percentiles(self.df)
@ -840,8 +847,11 @@ class ScoreETL(ExtractTransformLoad):
# Calculate "Score F", which uses "either/or" thresholds.
self.df = self._add_score_f(self.df)
# Calculate "Score G", which uses AMI and poverty.
self.df = self._add_score_g(self.df)
# Calculate "Score G through K", which uses AMI and poverty.
self.df = self._add_score_g_k(self.df)
# Calculate Definition L and its factors
self.df = self._add_definition_l_factors(self.df)
def load(self) -> None:
logger.info("Saving Score CSV")

View file

@ -0,0 +1,284 @@
import pandas as pd
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreCalculator:
    """Compute the boolean "Definition L" factor columns for a dataframe.

    The calculator reads percentile columns (named by the ``*_FIELD``
    constants below) from ``df`` and writes one boolean column per factor,
    plus two combined columns, back onto that same dataframe.

    Every comparison is strict (``>`` / ``<``), so a value exactly equal to
    a threshold does NOT qualify.
    NOTE(review): the methodology text says "in Xth percentile or above",
    which suggests ``>=`` -- confirm which is intended before changing it.
    """

    # --- Input column names -------------------------------------------
    POVERTY_LESS_THAN_200_FPL_FIELD: str = (
        "Percent of individuals < 200% Federal Poverty Line (percentile)"
    )
    POVERTY_LESS_THAN_100_FPL_FIELD: str = (
        "Percent of individuals < 100% Federal Poverty Line (percentile)"
    )
    # FEMA Risk Index
    NATIONAL_RISK_FIELD: str = (
        "FEMA Risk Index Expected Annual Loss Score (percentile)"
    )
    # DOE energy burden
    ENERGY_BURDEN_FIELD: str = "Energy burden (percentile)"
    # Diesel particulate matter
    DIESEL_FIELD: str = "Diesel particulate matter (percentile)"
    # PM2.5
    PM25_FIELD: str = "Particulate matter (PM2.5) (percentile)"
    # Traffic proximity and volume
    TRAFFIC_FIELD: str = "Traffic proximity and volume (percentile)"
    # Lead paint
    LEAD_PAINT_FIELD: str = (
        "Percent pre-1960s housing (lead paint indicator) (percentile)"
    )
    # Housing cost burden
    HOUSING_BURDEN_FIELD: str = "Housing burden (percent) (percentile)"
    # Wastewater discharge
    WASTEWATER_FIELD: str = "Wastewater discharge (percentile)"
    # Diabetes
    DIABETES_FIELD: str = (
        "Diagnosed diabetes among adults aged >=18 years (percentile)"
    )
    # Asthma
    ASTHMA_FIELD: str = (
        "Current asthma among adults aged >=18 years (percentile)"
    )
    # Heart disease
    HEART_DISEASE_FIELD: str = (
        "Coronary heart disease among adults aged >=18 years (percentile)"
    )
    # Life expectancy
    LIFE_EXPECTANCY_FIELD: str = "Life expectancy (years) (percentile)"
    # Unemployment
    UNEMPLOYMENT_FIELD: str = "Unemployed civilians (percent) (percentile)"
    # Median income as % of AMI
    MEDIAN_INCOME_FIELD: str = (
        "Median household income (% of AMI) (percentile)"
    )
    # Linguistic isolation
    LINGUISTIC_ISO_FIELD: str = "Linguistic isolation (percent) (percentile)"
    # Less than high school education
    HIGH_SCHOOL_ED_FIELD: str = (
        "Percent individuals age 25 or over with less than high school "
        "degree (percentile)"
    )

    # --- Thresholds for score L ----------------------------------------
    LOW_INCOME_THRESHOLD: float = 0.60
    ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
    # Screen used by the workforce factor (meant to exclude university
    # block groups).
    # NOTE(review): the methodology describes this as "high school degree
    # achievement rate below 95%", i.e. a raw rate, but it is compared
    # against a "(percentile)" column -- confirm the intended column.
    LOW_HIGH_SCHOOL_THRESHOLD: float = 0.05

    # --- Output column names -------------------------------------------
    WORKFORCE_FACTOR_COLUMN: str = "Workforce Factor (Definition L)"

    def __init__(self, df: pd.DataFrame):
        """Store the dataframe the factor columns are read from and written to."""
        self.df = df

    def add_definition_l_factors(self) -> pd.DataFrame:
        """Add every Definition L factor column and return the dataframe.

        Writes the eight per-factor columns, then
        ``"Definition L (communities)"`` (true when any factor is true) and
        ``"Any Non-Workforce Factor (Definition L)"`` (true when any factor
        other than workforce is true). ``self.df`` is modified in place and
        also returned.
        """
        factor_methods = [
            ("Climate Factor (Definition L)", self.climate_factor),
            ("Energy Factor (Definition L)", self.energy_factor),
            ("Transportation Factor (Definition L)", self.transportation_factor),
            ("Housing Factor (Definition L)", self.housing_factor),
            ("Pollution Factor (Definition L)", self.pollution_factor),
            ("Water Factor (Definition L)", self.water_factor),
            ("Health Factor (Definition L)", self.health_factor),
            (self.WORKFORCE_FACTOR_COLUMN, self.workforce_factor),
        ]
        for column_name, compute_factor in factor_methods:
            self.df[column_name] = compute_factor()

        factors = [column_name for column_name, _ in factor_methods]
        self.df["Definition L (communities)"] = self.df[factors].any(axis=1)

        # Note: this is purely used for comparison tool analysis, and can be
        # removed at a later date. - LMB.
        non_workforce_factors = [
            column_name
            for column_name in factors
            if column_name != self.WORKFORCE_FACTOR_COLUMN
        ]
        self.df["Any Non-Workforce Factor (Definition L)"] = self.df[
            non_workforce_factors
        ].any(axis=1)

        return self.df

    def _is_low_income(self) -> pd.Series:
        """Return a mask of rows above ``LOW_INCOME_THRESHOLD`` on the <200% FPL field."""
        return (
            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
            > self.LOW_INCOME_THRESHOLD
        )

    def _exceeds_burden_threshold(self, field: str) -> pd.Series:
        """Return a mask of rows where ``field`` is above ``ENVIRONMENTAL_BURDEN_THRESHOLD``."""
        return self.df[field] > self.ENVIRONMENTAL_BURDEN_THRESHOLD

    def _below_inverted_burden_threshold(self, field: str) -> pd.Series:
        """Return a mask of rows where ``field`` is below ``1 - ENVIRONMENTAL_BURDEN_THRESHOLD``.

        Used for fields where a high value is good, so the threshold is
        inverted and the comparison looks for low values instead.
        """
        return self.df[field] < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD

    def climate_factor(self) -> pd.Series:
        """Low income AND above the burden threshold on the FEMA Risk Index field."""
        return self._is_low_income() & self._exceeds_burden_threshold(
            self.NATIONAL_RISK_FIELD
        )

    def energy_factor(self) -> pd.Series:
        """Low income AND above the burden threshold on the energy burden field."""
        return self._is_low_income() & self._exceeds_burden_threshold(
            self.ENERGY_BURDEN_FIELD
        )

    def transportation_factor(self) -> pd.Series:
        """Low income AND above the burden threshold on diesel PM, PM2.5, or traffic."""
        transportation_criteria = (
            self._exceeds_burden_threshold(self.DIESEL_FIELD)
            | self._exceeds_burden_threshold(self.PM25_FIELD)
            | self._exceeds_burden_threshold(self.TRAFFIC_FIELD)
        )
        return self._is_low_income() & transportation_criteria

    def housing_factor(self) -> pd.Series:
        """Low income AND above the burden threshold on lead paint or housing burden."""
        housing_criteria = self._exceeds_burden_threshold(
            self.LEAD_PAINT_FIELD
        ) | self._exceeds_burden_threshold(self.HOUSING_BURDEN_FIELD)
        return self._is_low_income() & housing_criteria

    def pollution_factor(self) -> pd.Series:
        """Placeholder: criteria are TBD, so no row qualifies.

        Returns an all-False Series aligned to ``self.df`` so the return
        type matches the other factor methods.
        """
        return pd.Series(False, index=self.df.index, dtype=bool)

    def water_factor(self) -> pd.Series:
        """Low income AND above the burden threshold on the wastewater discharge field."""
        return self._is_low_income() & self._exceeds_burden_threshold(
            self.WASTEWATER_FIELD
        )

    def health_factor(self) -> pd.Series:
        """Low income AND any one health criterion.

        The criteria are: above the burden threshold on diabetes, asthma,
        or heart disease, or below the inverted threshold on life
        expectancy.
        """
        health_criteria = (
            self._exceeds_burden_threshold(self.DIABETES_FIELD)
            | self._exceeds_burden_threshold(self.ASTHMA_FIELD)
            | self._exceeds_burden_threshold(self.HEART_DISEASE_FIELD)
            # A high life expectancy is good, so look for life expectancies
            # lower than (1 - threshold) rather than greater than it.
            | self._below_inverted_burden_threshold(self.LIFE_EXPECTANCY_FIELD)
        )
        return self._is_low_income() & health_criteria

    def workforce_factor(self) -> pd.Series:
        """High-school screen AND any one workforce criterion.

        The criteria are: above the burden threshold on unemployment,
        <100% FPL poverty, or linguistic isolation, or below the inverted
        threshold on median income as a % of AMI. Unlike the other factors
        this one does not use the low-income (<200% FPL) condition.
        """
        workforce_criteria = (
            self._exceeds_burden_threshold(self.UNEMPLOYMENT_FIELD)
            # A high median income as a % of AMI is good, so look for values
            # lower than (1 - threshold) rather than greater than it.
            | self._below_inverted_burden_threshold(self.MEDIAN_INCOME_FIELD)
            | self._exceeds_burden_threshold(
                self.POVERTY_LESS_THAN_100_FPL_FIELD
            )
            | self._exceeds_burden_threshold(self.LINGUISTIC_ISO_FIELD)
        )
        return (
            self.df[self.HIGH_SCHOOL_ED_FIELD] > self.LOW_HIGH_SCHOOL_THRESHOLD
        ) & workforce_criteria

View file

@ -119,13 +119,14 @@
"source": [
"# Analyze one field at a time (useful for setting thresholds)\n",
"\n",
"quantile = 0.8\n",
"quantile = 0.9\n",
"\n",
"for field in [\n",
" \"Percent of individuals < 200% Federal Poverty Line\",\n",
" \"Life expectancy (years)\",\n",
" \"Energy burden\",\n",
" URBAN_HEURISTIC_FIELD,\n",
" \"Linguistic isolation (percent)\",\n",
" \"Diesel particulate matter (percentile)\",\n",
" \"Particulate matter (PM2.5) (percentile)\",\n",
" \"Traffic proximity and volume (percentile)\",\n",
" \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n",
"]:\n",
" print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n",
" print(cejst_df[field].describe())\n",
@ -234,7 +235,7 @@
"execution_count": null,
"id": "8da016db",
"metadata": {
"scrolled": false
"scrolled": true
},
"outputs": [],
"source": [
@ -278,64 +279,83 @@
"\n",
"# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n",
"# (`census_tract_indices`).\n",
"census_block_group_indices = [\n",
" Index(\n",
" method_name=\"Score G\",\n",
" priority_communities_field=\"Score G (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score H\",\n",
" priority_communities_field=\"Score H (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score I\",\n",
" priority_communities_field=\"Score I (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"NMTC\",\n",
" priority_communities_field=\"NMTC (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score A\",\n",
" priority_communities_field=\"Score A (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score B\",\n",
" priority_communities_field=\"Score B (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score C\",\n",
" priority_communities_field=\"Score C (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (25th percentile)\",\n",
" priority_communities_field=\"Score D (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F\",\n",
" priority_communities_field=\"Score F (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Poverty\",\n",
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Persistent Poverty (CBG)\",\n",
" priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
"\n",
"definition_l_factors = [\n",
" \"Climate Factor (Definition L)\",\n",
" \"Energy Factor (Definition L)\",\n",
" \"Transportation Factor (Definition L)\",\n",
" \"Housing Factor (Definition L)\",\n",
" \"Pollution Factor (Definition L)\",\n",
" \"Water Factor (Definition L)\",\n",
" \"Health Factor (Definition L)\",\n",
" \"Workforce Factor (Definition L)\",\n",
" # Also include a combined factor for all the non-workforce elements.\n",
" \"Any Non-Workforce Factor (Definition L)\",\n",
"]\n",
"\n",
"census_block_group_indices = (\n",
" [\n",
" Index(\n",
" method_name=\"Definition L\",\n",
" priority_communities_field=\"Definition L (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" ]\n",
" # Insert indices for each of the factors from Definition L.\n",
" # Note: since these involve no renaming, we write them using list comprehension.\n",
" + [\n",
" Index(\n",
" method_name=factor,\n",
" priority_communities_field=factor,\n",
" other_census_tract_fields_to_keep=[],\n",
" )\n",
" for factor in definition_l_factors\n",
" ]\n",
" + [\n",
" Index(\n",
" # Note: we're renaming Score G as NMTC Modified for clarity, since that's what Score G is under the hood.\n",
" method_name=\"NMTC Modified\",\n",
" priority_communities_field=\"Score G (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"NMTC\",\n",
" priority_communities_field=\"NMTC (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score C\",\n",
" priority_communities_field=\"Score C (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (30th percentile)\",\n",
" priority_communities_field=\"Score D (top 30th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score D (25th percentile)\",\n",
" priority_communities_field=\"Score D (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Score F\",\n",
" priority_communities_field=\"Score F (communities)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Poverty\",\n",
" priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" Index(\n",
" method_name=\"Persistent Poverty (CBG)\",\n",
" priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n",
" other_census_tract_fields_to_keep=[],\n",
" ),\n",
" ]\n",
")\n",
"\n",
"census_tract_indices = [\n",
" Index(\n",
" method_name=\"Persistent Poverty\",\n",
@ -623,9 +643,7 @@
"write_state_distribution_excel(\n",
" state_distribution_df=state_distribution_df,\n",
" file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n",
")\n",
"\n",
"state_distribution_df.head()"
")"
]
},
{
@ -633,7 +651,7 @@
"execution_count": null,
"id": "8790cd64",
"metadata": {
"scrolled": false
"scrolled": true
},
"outputs": [],
"source": [
@ -1461,7 +1479,7 @@
"execution_count": null,
"id": "908e0ad4",
"metadata": {
"scrolled": true
"scrolled": false
},
"outputs": [],
"source": [