From 7b87e0ec99f624175efef6f0f2619f4b5bc3b5ea Mon Sep 17 00:00:00 2001 From: Shelby Switzer Date: Thu, 28 Oct 2021 16:07:41 -0400 Subject: [PATCH] Add Score L (#812) * Create ScoreCalculator This calculates all the factors for score L for now (with placeholder formulae because this is a WIP). I think ideallly we'll want to refactor all the score code to be extracted into this or similar classes. * Add factor logic for score L Updated factor logic to match score L factors methodology. Still need to get the Score L field itself working. Cleanup needed: Pull field names into constants file, extract all score calculation into score calculator Co-authored-by: Shelby Switzer Co-authored-by: lucasmbrown-usds --- .../data_pipeline/etl/score/etl_score.py | 22 +- .../etl/score/score_calculator.py | 284 ++++++++++++++++++ .../ipython/scoring_comparison.ipynb | 152 +++++----- 3 files changed, 385 insertions(+), 73 deletions(-) create mode 100644 data/data-pipeline/data_pipeline/etl/score/score_calculator.py diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index 2cf684fc..88a5a763 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -5,6 +5,7 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.utils import get_module_logger +from data_pipeline.etl.score.score_calculator import ScoreCalculator logger = get_module_logger(__name__) @@ -522,7 +523,7 @@ class ScoreETL(ExtractTransformLoad): ) return df - def _add_scores_d_and_e(self, df: pd.DataFrame) -> pd.DataFrame: + def _add_scores_d_e(self, df: pd.DataFrame) -> pd.DataFrame: logger.info("Adding Scores D and E") fields_to_use_in_score = [ self.UNEMPLOYED_FIELD_NAME, @@ -641,8 +642,8 @@ class ScoreETL(ExtractTransformLoad): ) return df - def _add_score_g(self, df: pd.DataFrame) -> pd.DataFrame: - logger.info("Adding Score G") + 
def _add_score_g_k(self, df: pd.DataFrame) -> pd.DataFrame: + logger.info("Adding Score G through K") high_school_cutoff_threshold = 0.05 high_school_cutoff_threshold_2 = 0.06 @@ -691,6 +692,12 @@ class ScoreETL(ExtractTransformLoad): return df + def _add_definition_l_factors(self, df: pd.DataFrame) -> pd.DataFrame: + logger.info("Adding Definition L and factors") + calc = ScoreCalculator(df=df) + df = calc.add_definition_l_factors() + return df + # TODO Move a lot of this to the ETL part of the pipeline def _prepare_initial_df(self, data_sets: list) -> pd.DataFrame: logger.info("Preparing initial dataframe") @@ -831,7 +838,7 @@ class ScoreETL(ExtractTransformLoad): self.df = self._add_score_c(self.df, data_sets) # Calculate scores "D" and "E" - self.df = self._add_scores_d_and_e(self.df) + self.df = self._add_scores_d_e(self.df) # Create percentiles for the scores self.df = self._add_score_percentiles(self.df) @@ -840,8 +847,11 @@ class ScoreETL(ExtractTransformLoad): # Calculate "Score F", which uses "either/or" thresholds. self.df = self._add_score_f(self.df) - # Calculate "Score G", which uses AMI and poverty. - self.df = self._add_score_g(self.df) + # Calculate "Score G through K", which uses AMI and poverty. 
class ScoreCalculator:
    """Derive the boolean "Definition L" factor columns from a score DataFrame.

    Each ``*_factor`` method returns a boolean ``pd.Series`` aligned with
    ``self.df``: ``True`` where a row satisfies that factor's criteria.
    ``add_definition_l_factors`` writes all of them into the frame together
    with two roll-up columns.

    Notes:
        * The frame handed to the constructor is modified in place (and also
          returned) by ``add_definition_l_factors``; pass a copy if the
          original must stay untouched.
        * Comparisons against a missing value (NaN) evaluate to ``False``, so
          a row with missing inputs never qualifies through that input.
        * Field names and thresholds are class attributes; they can still be
          read (or overridden) through an instance as before.
    """

    # ------------------------------------------------------------------
    # Input columns read from ``self.df``.
    # ------------------------------------------------------------------
    POVERTY_LESS_THAN_200_FPL_FIELD: str = (
        "Percent of individuals < 200% Federal Poverty Line (percentile)"
    )
    POVERTY_LESS_THAN_100_FPL_FIELD: str = (
        "Percent of individuals < 100% Federal Poverty Line (percentile)"
    )
    # FEMA Risk Index
    NATIONAL_RISK_FIELD: str = (
        "FEMA Risk Index Expected Annual Loss Score (percentile)"
    )
    # DOE energy burden
    ENERGY_BURDEN_FIELD: str = "Energy burden (percentile)"
    # Diesel particulate matter
    DIESEL_FIELD: str = "Diesel particulate matter (percentile)"
    # PM2.5
    PM25_FIELD: str = "Particulate matter (PM2.5) (percentile)"
    # Traffic proximity and volume
    TRAFFIC_FIELD: str = "Traffic proximity and volume (percentile)"
    # Lead paint
    LEAD_PAINT_FIELD: str = (
        "Percent pre-1960s housing (lead paint indicator) (percentile)"
    )
    # Housing cost burden
    HOUSING_BURDEN_FIELD: str = "Housing burden (percent) (percentile)"
    # Wastewater discharge
    WASTEWATER_FIELD: str = "Wastewater discharge (percentile)"
    # Diabetes
    DIABETES_FIELD: str = (
        "Diagnosed diabetes among adults aged >=18 years (percentile)"
    )
    # Asthma
    ASTHMA_FIELD: str = (
        "Current asthma among adults aged >=18 years (percentile)"
    )
    # Heart disease
    HEART_DISEASE_FIELD: str = (
        "Coronary heart disease among adults aged >=18 years (percentile)"
    )
    # Life expectancy
    LIFE_EXPECTANCY_FIELD: str = "Life expectancy (years) (percentile)"
    # Unemployment
    UNEMPLOYMENT_FIELD: str = "Unemployed civilians (percent) (percentile)"
    # Median income as % of AMI
    MEDIAN_INCOME_FIELD: str = (
        "Median household income (% of AMI) (percentile)"
    )
    # Linguistic isolation
    LINGUISTIC_ISO_FIELD: str = "Linguistic isolation (percent) (percentile)"
    # Less than high school education
    HIGH_SCHOOL_ED_FIELD: str = (
        "Percent individuals age 25 or over with less than high school "
        "degree (percentile)"
    )

    # ------------------------------------------------------------------
    # Output columns written by ``add_definition_l_factors``.
    # ------------------------------------------------------------------
    CLIMATE_FACTOR_FIELD: str = "Climate Factor (Definition L)"
    ENERGY_FACTOR_FIELD: str = "Energy Factor (Definition L)"
    TRANSPORTATION_FACTOR_FIELD: str = "Transportation Factor (Definition L)"
    HOUSING_FACTOR_FIELD: str = "Housing Factor (Definition L)"
    POLLUTION_FACTOR_FIELD: str = "Pollution Factor (Definition L)"
    WATER_FACTOR_FIELD: str = "Water Factor (Definition L)"
    HEALTH_FACTOR_FIELD: str = "Health Factor (Definition L)"
    WORKFORCE_FACTOR_FIELD: str = "Workforce Factor (Definition L)"
    DEFINITION_L_COMMUNITIES_FIELD: str = "Definition L (communities)"
    ANY_NON_WORKFORCE_FACTOR_FIELD: str = (
        "Any Non-Workforce Factor (Definition L)"
    )

    # ------------------------------------------------------------------
    # Thresholds for score L.
    # ------------------------------------------------------------------
    LOW_INCOME_THRESHOLD: float = 0.60
    ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
    # Cutoff applied to HIGH_SCHOOL_ED_FIELD in the workforce factor.
    # NOTE(review): the methodology text describes a raw rate ("high school
    # degree achievement ... less than 95%", i.e. more than 5% without a
    # degree), but the column compared against this value is the
    # percentile-ranked one. Confirm which column is intended.
    LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.05

    # Names of the attributes holding every input column the factors read.
    # Resolved with getattr() so per-instance overrides are honoured.
    _INPUT_FIELD_ATTRIBUTES = (
        "POVERTY_LESS_THAN_200_FPL_FIELD",
        "POVERTY_LESS_THAN_100_FPL_FIELD",
        "NATIONAL_RISK_FIELD",
        "ENERGY_BURDEN_FIELD",
        "DIESEL_FIELD",
        "PM25_FIELD",
        "TRAFFIC_FIELD",
        "LEAD_PAINT_FIELD",
        "HOUSING_BURDEN_FIELD",
        "WASTEWATER_FIELD",
        "DIABETES_FIELD",
        "ASTHMA_FIELD",
        "HEART_DISEASE_FIELD",
        "LIFE_EXPECTANCY_FIELD",
        "UNEMPLOYMENT_FIELD",
        "MEDIAN_INCOME_FIELD",
        "LINGUISTIC_ISO_FIELD",
        "HIGH_SCHOOL_ED_FIELD",
    )

    def __init__(self, df: pd.DataFrame):
        """Store the frame whose columns the factor methods read.

        Args:
            df: Frame containing the input columns named by the ``*_FIELD``
                attributes. It is kept by reference, not copied.
        """
        self.df = df

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------
    def add_definition_l_factors(self) -> pd.DataFrame:
        """Add every Definition L factor column plus the roll-up columns.

        Columns are written in this order: the eight factor columns, then
        ``"Definition L (communities)"`` (any factor is true), then
        ``"Any Non-Workforce Factor (Definition L)"`` (any factor other than
        workforce is true).

        Returns:
            ``self.df``, modified in place.

        Raises:
            KeyError: If any required input column is absent. The check runs
                before anything is written, so the frame is left untouched.
        """
        missing = [
            column
            for column in self._required_input_fields()
            if column not in self.df.columns
        ]
        if missing:
            raise KeyError(
                "Cannot compute Definition L factors; missing input "
                f"column(s): {missing}"
            )

        # Insertion order defines the order in which columns are appended.
        factor_builders = {
            self.CLIMATE_FACTOR_FIELD: self.climate_factor,
            self.ENERGY_FACTOR_FIELD: self.energy_factor,
            self.TRANSPORTATION_FACTOR_FIELD: self.transportation_factor,
            self.HOUSING_FACTOR_FIELD: self.housing_factor,
            self.POLLUTION_FACTOR_FIELD: self.pollution_factor,
            self.WATER_FACTOR_FIELD: self.water_factor,
            self.HEALTH_FACTOR_FIELD: self.health_factor,
            self.WORKFORCE_FACTOR_FIELD: self.workforce_factor,
        }
        for column, build in factor_builders.items():
            self.df[column] = build()

        factor_columns = list(factor_builders)
        self.df[self.DEFINITION_L_COMMUNITIES_FIELD] = self.df[
            factor_columns
        ].any(axis=1)

        # Note: this is purely used for comparison tool analysis, and can be
        # removed at a later date. - LMB.
        non_workforce_columns = [
            column
            for column in factor_columns
            if column != self.WORKFORCE_FACTOR_FIELD
        ]
        self.df[self.ANY_NON_WORKFORCE_FACTOR_FIELD] = self.df[
            non_workforce_columns
        ].any(axis=1)

        return self.df

    # ------------------------------------------------------------------
    # Shared building blocks
    # ------------------------------------------------------------------
    def _required_input_fields(self) -> list:
        """Return the input column names the factor methods read."""
        return [getattr(self, name) for name in self._INPUT_FIELD_ATTRIBUTES]

    def _is_low_income(self) -> pd.Series:
        """Rows whose <200% FPL percentile exceeds LOW_INCOME_THRESHOLD."""
        return (
            self.df[self.POVERTY_LESS_THAN_200_FPL_FIELD]
            > self.LOW_INCOME_THRESHOLD
        )

    def _is_high_burden(self, field: str) -> pd.Series:
        """Rows where ``field`` exceeds ENVIRONMENTAL_BURDEN_THRESHOLD.

        NOTE(review): the methodology comments say "in Xth percentile or
        above", while this comparison is strict (``>``). Kept strict to
        preserve existing results; confirm the intended boundary.
        """
        return self.df[field] > self.ENVIRONMENTAL_BURDEN_THRESHOLD

    def _is_low_on_beneficial(self, field: str) -> pd.Series:
        """Rows where a "higher is better" ``field`` is in the bottom tail.

        A high value is good for these fields, so the burden threshold is
        inverted (1 minus the threshold) and rows *below* it are selected.
        """
        return self.df[field] < 1 - self.ENVIRONMENTAL_BURDEN_THRESHOLD

    # ------------------------------------------------------------------
    # Individual factors
    # ------------------------------------------------------------------
    def climate_factor(self) -> pd.Series:
        """Low income AND high FEMA Risk Index percentile.

        Low income means above the 60th percentile for share of population
        with household income at or below twice the federal poverty level.
        """
        return self._is_low_income() & self._is_high_burden(
            self.NATIONAL_RISK_FIELD
        )

    def energy_factor(self) -> pd.Series:
        """Low income AND high DOE energy cost burden percentile."""
        return self._is_low_income() & self._is_high_burden(
            self.ENERGY_BURDEN_FIELD
        )

    def transportation_factor(self) -> pd.Series:
        """Low income AND high burden on any transportation indicator.

        Indicators: diesel particulate matter, PM2.5, or traffic proximity
        and volume.
        """
        transportation_criteria = (
            self._is_high_burden(self.DIESEL_FIELD)
            | self._is_high_burden(self.PM25_FIELD)
            | self._is_high_burden(self.TRAFFIC_FIELD)
        )
        return self._is_low_income() & transportation_criteria

    def housing_factor(self) -> pd.Series:
        """Low income AND high burden on lead paint or housing cost.

        Lead paint is proxied by the percent of housing units built
        pre-1960.
        """
        housing_criteria = self._is_high_burden(
            self.LEAD_PAINT_FIELD
        ) | self._is_high_burden(self.HOUSING_BURDEN_FIELD)
        return self._is_low_income() & housing_criteria

    def pollution_factor(self) -> pd.Series:
        """Placeholder: criteria are TBD, so no row qualifies yet.

        Returns an all-``False`` Series aligned with ``self.df`` so this
        factor has the same return type as its siblings.
        """
        return pd.Series(False, index=self.df.index, dtype=bool)

    def water_factor(self) -> pd.Series:
        """Low income AND high wastewater discharge percentile."""
        return self._is_low_income() & self._is_high_burden(
            self.WASTEWATER_FIELD
        )

    def health_factor(self) -> pd.Series:
        """Low income AND high burden on any health indicator.

        Indicators: diabetes, asthma, heart disease, or *low* life
        expectancy.
        """
        health_criteria = (
            self._is_high_burden(self.DIABETES_FIELD)
            | self._is_high_burden(self.ASTHMA_FIELD)
            | self._is_high_burden(self.HEART_DISEASE_FIELD)
            # A high life expectancy is good, so look for life expectancies
            # below the inverted threshold rather than above the threshold.
            | self._is_low_on_beneficial(self.LIFE_EXPECTANCY_FIELD)
        )
        return self._is_low_income() & health_criteria

    def workforce_factor(self) -> pd.Series:
        """High-school screen AND high burden on any workforce indicator.

        Indicators: unemployment, *low* median income as a share of area
        median income, poverty below 100% of the federal poverty level, or
        linguistic isolation. The high-school screen is meant to exclude
        university block groups. Unlike the other factors, this one does not
        apply the low-income (<200% FPL) condition.
        """
        workforce_criteria = (
            self._is_high_burden(self.UNEMPLOYMENT_FIELD)
            # A high median income as a % of AMI is good, so look for values
            # below the inverted threshold.
            | self._is_low_on_beneficial(self.MEDIAN_INCOME_FIELD)
            | self._is_high_burden(self.POVERTY_LESS_THAN_100_FPL_FIELD)
            | self._is_high_burden(self.LINGUISTIC_ISO_FIELD)
        )
        passes_high_school_screen = (
            self.df[self.HIGH_SCHOOL_ED_FIELD]
            > self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
        )
        return passes_high_school_screen & workforce_criteria
matter (percentile)\",\n", + " \"Particulate matter (PM2.5) (percentile)\",\n", + " \"Traffic proximity and volume (percentile)\",\n", + " \"Percent of individuals < 200% Federal Poverty Line (percentile)\",\n", "]:\n", " print(f\"\\n~~~~Analysis for field `{field}`~~~~\")\n", " print(cejst_df[field].describe())\n", @@ -234,7 +235,7 @@ "execution_count": null, "id": "8da016db", "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ @@ -278,64 +279,83 @@ "\n", "# Define the indices used for CEJST scoring (`census_block_group_indices`) as well as comparison\n", "# (`census_tract_indices`).\n", - "census_block_group_indices = [\n", - " Index(\n", - " method_name=\"Score G\",\n", - " priority_communities_field=\"Score G (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Score H\",\n", - " priority_communities_field=\"Score H (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Score I\",\n", - " priority_communities_field=\"Score I (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"NMTC\",\n", - " priority_communities_field=\"NMTC (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Score A\",\n", - " priority_communities_field=\"Score A (top 25th percentile)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Score B\",\n", - " priority_communities_field=\"Score B (top 25th percentile)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Score C\",\n", - " priority_communities_field=\"Score C (top 25th percentile)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Score D (25th percentile)\",\n", - " priority_communities_field=\"Score D (top 25th percentile)\",\n", - " 
other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Score F\",\n", - " priority_communities_field=\"Score F (communities)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Poverty\",\n", - " priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", - " Index(\n", - " method_name=\"Persistent Poverty (CBG)\",\n", - " priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n", - " other_census_tract_fields_to_keep=[],\n", - " ),\n", + "\n", + "definition_l_factors = [\n", + " \"Climate Factor (Definition L)\",\n", + " \"Energy Factor (Definition L)\",\n", + " \"Transportation Factor (Definition L)\",\n", + " \"Housing Factor (Definition L)\",\n", + " \"Pollution Factor (Definition L)\",\n", + " \"Water Factor (Definition L)\",\n", + " \"Health Factor (Definition L)\",\n", + " \"Workforce Factor (Definition L)\",\n", + " # Also include a combined factor for all the non-workforce elements.\n", + " \"Any Non-Workforce Factor (Definition L)\",\n", "]\n", "\n", + "census_block_group_indices = (\n", + " [\n", + " Index(\n", + " method_name=\"Definition L\",\n", + " priority_communities_field=\"Definition L (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " ]\n", + " # Insert indices for each of the factors from Definition L.\n", + " # Note: since these involve no renaming, we write them using list comprehension.\n", + " + [\n", + " Index(\n", + " method_name=factor,\n", + " priority_communities_field=factor,\n", + " other_census_tract_fields_to_keep=[],\n", + " )\n", + " for factor in definition_l_factors\n", + " ]\n", + " + [\n", + " Index(\n", + " # Note: we're renaming Score G as NMTC Modified for clarity, since that's what Score G is under the hood.\n", + " method_name=\"NMTC Modified\",\n", + " priority_communities_field=\"Score G 
(communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"NMTC\",\n", + " priority_communities_field=\"NMTC (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"Score C\",\n", + " priority_communities_field=\"Score C (top 25th percentile)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"Score D (30th percentile)\",\n", + " priority_communities_field=\"Score D (top 30th percentile)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"Score D (25th percentile)\",\n", + " priority_communities_field=\"Score D (top 25th percentile)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"Score F\",\n", + " priority_communities_field=\"Score F (communities)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"Poverty\",\n", + " priority_communities_field=\"Poverty (Less than 200% of federal poverty line) (top 25th percentile)\",\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " Index(\n", + " method_name=\"Persistent Poverty (CBG)\",\n", + " priority_communities_field=PERSISTENT_POVERTY_CBG_LEVEL_FIELD,\n", + " other_census_tract_fields_to_keep=[],\n", + " ),\n", + " ]\n", + ")\n", + "\n", "census_tract_indices = [\n", " Index(\n", " method_name=\"Persistent Poverty\",\n", @@ -623,9 +643,7 @@ "write_state_distribution_excel(\n", " state_distribution_df=state_distribution_df,\n", " file_path=COMPARISON_OUTPUTS_DIR / f\"{file_prefix}.xlsx\",\n", - ")\n", - "\n", - "state_distribution_df.head()" + ")" ] }, { @@ -633,7 +651,7 @@ "execution_count": null, "id": "8790cd64", "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ @@ -1461,7 +1479,7 @@ "execution_count": null, "id": "908e0ad4", "metadata": { - "scrolled": true + "scrolled": false }, 
"outputs": [], "source": [