From d298f7dedbb4c52bd36fdedaee6cba6347363dcf Mon Sep 17 00:00:00 2001 From: Carlos Felix Date: Tue, 3 Dec 2024 16:11:56 -0500 Subject: [PATCH] Removed unused files --- .../data_pipeline/score/score_a.py | 19 - .../data_pipeline/score/score_b.py | 21 - .../data_pipeline/score/score_c.py | 102 -- .../data_pipeline/score/score_d.py | 34 - .../data_pipeline/score/score_f.py | 97 -- .../data_pipeline/score/score_g.py | 34 - .../data_pipeline/score/score_h.py | 32 - .../data_pipeline/score/score_i.py | 33 - .../data_pipeline/score/score_k.py | 33 - .../data_pipeline/score/score_l.py | 690 -------------- .../data_pipeline/score/score_m.py | 888 ------------------ 11 files changed, 1983 deletions(-) delete mode 100644 data/data-pipeline/data_pipeline/score/score_a.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_b.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_c.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_d.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_f.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_g.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_h.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_i.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_k.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_l.py delete mode 100644 data/data-pipeline/data_pipeline/score/score_m.py diff --git a/data/data-pipeline/data_pipeline/score/score_a.py b/data/data-pipeline/data_pipeline/score/score_a.py deleted file mode 100644 index 0788f094..00000000 --- a/data/data-pipeline/data_pipeline/score/score_a.py +++ /dev/null @@ -1,19 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreA(Score): - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score A") - self.df[field_names.SCORE_A] = self.df[ - [ - field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.HIGH_SCHOOL_ED_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - ] - ].mean(axis=1) - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_b.py b/data/data-pipeline/data_pipeline/score/score_b.py deleted file mode 100644 index 2e980b3d..00000000 --- a/data/data-pipeline/data_pipeline/score/score_b.py +++ /dev/null @@ -1,21 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreB(Score): - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score B") - self.df[field_names.SCORE_B] = ( - self.df[ - field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - * self.df[ - field_names.HIGH_SCHOOL_ED_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - ) - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_c.py b/data/data-pipeline/data_pipeline/score/score_c.py deleted file mode 100644 index b91b7417..00000000 --- a/data/data-pipeline/data_pipeline/score/score_c.py +++ /dev/null @@ -1,102 +0,0 @@ -from collections import namedtuple - -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreC(Score): - def __init__(self, df: pd.DataFrame) -> None: - Bucket = namedtuple(typename="Bucket", field_names=["name", "fields"]) - - # Note: we use percentiles for every field below. - # To do so, we add the percentile suffix to all the field names. - self.BUCKET_SOCIOECONOMIC = Bucket( - field_names.C_SOCIOECONOMIC, - [ - field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.HIGH_SCHOOL_ED_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.UNEMPLOYMENT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.HOUSING_BURDEN_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - ], - ) - self.BUCKET_SENSITIVE = Bucket( - field_names.C_SENSITIVE, - [ - field_names.UNDER_5_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.OVER_64_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LINGUISTIC_ISO_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - ], - ) - self.BUCKET_ENVIRONMENTAL = Bucket( - field_names.C_ENVIRONMENTAL, - [ - field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.WASTEWATER_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LEAD_PAINT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - ], - ) - self.BUCKET_EXPOSURES = Bucket( - field_names.C_EXPOSURES, - [ - field_names.AIR_TOXICS_CANCER_RISK_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.RESPIRATORY_HAZARD_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.OZONE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - ], - ) - self.BUCKETS = [ - self.BUCKET_SOCIOECONOMIC, - self.BUCKET_SENSITIVE, - self.BUCKET_ENVIRONMENTAL, - self.BUCKET_EXPOSURES, - ] - super().__init__(df) - - # "CalEnviroScreen for the US" score - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score C") - # Average all the percentile values in each bucket into a single score for each of the four buckets. - for bucket in self.BUCKETS: - self.df[bucket.name] = self.df[bucket.fields].mean(axis=1) - - # Combine the score from the two Exposures and Environmental Effects buckets - # into a single score called "Pollution Burden". - # The math for this score is: - # (1.0 * Exposures Score + 0.5 * Environment Effects score) / 1.5. - self.df[field_names.AGGREGATION_POLLUTION_FIELD] = ( - 1.0 * self.df[self.BUCKET_EXPOSURES.name] - + 0.5 * self.df[self.BUCKET_ENVIRONMENTAL.name] - ) / 1.5 - - # Average the score from the two Sensitive populations and - # Socioeconomic factors buckets into a single score called - # "Population Characteristics". - self.df[field_names.AGGREGATION_POPULATION_FIELD] = self.df[ - [self.BUCKET_SENSITIVE.name, self.BUCKET_SOCIOECONOMIC.name] - ].mean(axis=1) - - # Multiply the "Pollution Burden" score and the "Population Characteristics" - # together to produce the cumulative impact score. - self.df[field_names.SCORE_C] = ( - self.df[field_names.AGGREGATION_POLLUTION_FIELD] - * self.df[field_names.AGGREGATION_POPULATION_FIELD] - ) - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_d.py b/data/data-pipeline/data_pipeline/score/score_d.py deleted file mode 100644 index 08e55f01..00000000 --- a/data/data-pipeline/data_pipeline/score/score_d.py +++ /dev/null @@ -1,34 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreD(Score): - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Scores D and E") - fields_to_use_in_score = [ - field_names.UNEMPLOYMENT_FIELD, - field_names.LINGUISTIC_ISO_FIELD, - field_names.HOUSING_BURDEN_FIELD, - field_names.POVERTY_FIELD, - field_names.HIGH_SCHOOL_ED_FIELD, - ] - - fields_min_max = [ - f"{field}{field_names.MIN_MAX_FIELD_SUFFIX}" - for field in fields_to_use_in_score - ] - fields_percentile = [ - f"{field}{field_names.PERCENTILE_FIELD_SUFFIX}" - for field in fields_to_use_in_score - ] - - # Calculate "Score D", which uses min-max normalization - # and calculate "Score E", which uses percentile normalization for the same fields - self.df[field_names.SCORE_D] = self.df[fields_min_max].mean(axis=1) - self.df[field_names.SCORE_E] = self.df[fields_percentile].mean(axis=1) - - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_f.py b/data/data-pipeline/data_pipeline/score/score_f.py deleted file mode 100644 index be38e514..00000000 --- a/data/data-pipeline/data_pipeline/score/score_f.py +++ /dev/null @@ -1,97 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreF(Score): - # TODO Make variables and constants clearer (meaning and type) - - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score F") - ami_and_high_school_field = "Low AMI, Low HS graduation" - meets_socio_field = "Meets socioeconomic criteria" - meets_burden_field = "Meets burden criteria" - - self.df[ami_and_high_school_field] = ( - self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD] < 0.80 - ) & (self.df[field_names.HIGH_SCHOOL_ED_FIELD] > 0.2) - - self.df[meets_socio_field] = ( - self.df[ami_and_high_school_field] - | (self.df[field_names.POVERTY_FIELD] > 0.40) - | (self.df[field_names.LINGUISTIC_ISO_FIELD] > 0.10) - | (self.df[field_names.HIGH_SCHOOL_ED_FIELD] > 0.4) - ) - - self.df[meets_burden_field] = ( - ( - self.df[ - field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.RESPIRATORY_HAZARD_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.TRAFFIC_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.LEAD_PAINT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.ASTHMA_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.HEART_DISEASE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.CANCER_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - | ( - self.df[ - field_names.DIABETES_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - > 0.9 - ) - ) - - self.df[field_names.SCORE_F_COMMUNITIES] = ( - self.df[meets_socio_field] & self.df[meets_burden_field] - ) - - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_g.py b/data/data-pipeline/data_pipeline/score/score_g.py deleted file mode 100644 index d4e10726..00000000 --- a/data/data-pipeline/data_pipeline/score/score_g.py +++ /dev/null @@ -1,34 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreG(Score): - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score G") - - high_school_cutoff_threshold = 0.05 - - # Score G is now modified NMTC - self.df[field_names.SCORE_G_COMMUNITIES] = ( - (self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.8) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) | ( - (self.df[field_names.POVERTY_LESS_THAN_100_FPL_FIELD] > 0.20) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) - self.df[field_names.SCORE_G] = self.df[ - field_names.SCORE_G_COMMUNITIES - ].astype(int) - self.df["Score G (percentile)"] = self.df[field_names.SCORE_G] - - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_h.py b/data/data-pipeline/data_pipeline/score/score_h.py deleted file mode 100644 index 56168c53..00000000 --- a/data/data-pipeline/data_pipeline/score/score_h.py +++ /dev/null @@ -1,32 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreH(Score): - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score H") - - high_school_cutoff_threshold = 0.06 - - self.df[field_names.SCORE_H_COMMUNITIES] = ( - (self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.8) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) | ( - (self.df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD] > 0.40) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) - self.df[field_names.SCORE_H] = self.df[ - field_names.SCORE_H_COMMUNITIES - ].astype(int) - - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_i.py b/data/data-pipeline/data_pipeline/score/score_i.py deleted file mode 100644 index 25f2c098..00000000 --- a/data/data-pipeline/data_pipeline/score/score_i.py +++ /dev/null @@ -1,33 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreI(Score): - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score I") - - high_school_cutoff_threshold = 0.05 - - self.df[field_names.SCORE_I_COMMUNITIES] = ( - (self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.7) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) | ( - (self.df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD] > 0.50) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) - self.df[field_names.SCORE_I] = self.df[ - field_names.SCORE_I_COMMUNITIES - ].astype(int) - self.df["Score I (percentile)"] = self.df[field_names.SCORE_I] - - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_k.py b/data/data-pipeline/data_pipeline/score/score_k.py deleted file mode 100644 index db05344c..00000000 --- a/data/data-pipeline/data_pipeline/score/score_k.py +++ /dev/null @@ -1,33 +0,0 @@ -import data_pipeline.score.field_names as field_names -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreK(Score): - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score K") - - high_school_cutoff_threshold = 0.06 - - self.df[field_names.SCORE_K] = ( - (self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.8) - ) | (self.df[field_names.POVERTY_LESS_THAN_100_FPL_FIELD] > 0.20) - - self.df[field_names.SCORE_K_COMMUNITIES] = ( - (self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.8) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) | ( - (self.df[field_names.POVERTY_LESS_THAN_100_FPL_FIELD] > 0.20) - & ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - > high_school_cutoff_threshold - ) - ) - - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_l.py b/data/data-pipeline/data_pipeline/score/score_l.py deleted file mode 100644 index 322b9129..00000000 --- a/data/data-pipeline/data_pipeline/score/score_l.py +++ /dev/null @@ -1,690 +0,0 @@ -import data_pipeline.score.field_names as field_names -import numpy as np -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreL(Score): - def __init__(self, df: pd.DataFrame) -> None: - self.LOW_INCOME_THRESHOLD: float = 0.65 - self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 - self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90 - self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10 - - super().__init__(df) - - def _combine_island_areas_with_states_and_set_thresholds( - self, - df: pd.DataFrame, - column_from_island_areas: str, - column_from_decennial_census: str, - combined_column_name: str, - threshold_cutoff_for_island_areas: float, - ) -> (pd.DataFrame, str): - """Steps to set thresholds for island areas. - - This function is fairly logically complicated. It takes the following steps: - - 1. Combine the two different fields into a single field. - 2. Calculate the 90th percentile cutoff raw value for the combined field. - 3. Create a boolean series that is true for any census tract in the island - areas (and only the island areas) that exceeds this cutoff. - - For step one, it combines data that is either the island area's Decennial Census - value in 2009 or the state's value in 5-year ACS ending in 2010. - - This will be used to generate the percentile cutoff for the 90th percentile. - - The stateside decennial census stopped asking economic comparisons, - so this is as close to apples-to-apples as we get. We use 5-year ACS for data - robustness over 1-year ACS. - """ - # Create the combined field. - # TODO: move this combined field percentile calculation to `etl_score`, - # since most other percentile logic is there. - # There should only be one entry in either 2009 or 2019 fields, not one in both. - # But just to be safe, we take the mean and ignore null values so if there - # *were* entries in both, this result would make sense. - df[combined_column_name] = df[ - [column_from_island_areas, column_from_decennial_census] - ].mean(axis=1, skipna=True) - - logger.debug( - f"Combined field `{combined_column_name}` has " - f"{df[combined_column_name].isnull().sum()} " - f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) " - f"missing values for census tracts. " - ) - - # Calculate the percentile threshold raw value. - raw_threshold = np.nanquantile( - a=df[combined_column_name], q=threshold_cutoff_for_island_areas - ) - - logger.debug( - f"For combined field `{combined_column_name}`, " - f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a " - f"raw value of {raw_threshold:.3f}." - ) - - threshold_column_name = ( - f"{column_from_island_areas} exceeds " - f"{threshold_cutoff_for_island_areas*100:.0f}th percentile" - ) - - df[threshold_column_name] = ( - df[column_from_island_areas] >= raw_threshold - ) - - percent_of_tracts_highlighted = ( - 100 - * df[threshold_column_name].sum() - / df[column_from_island_areas].notnull().sum() - ) - - logger.info( - f"For `{threshold_column_name}`, " - f"{df[threshold_column_name].sum()} (" - f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data " - f"in the column) have a value of TRUE." - ) - - return df, threshold_column_name - - def _create_low_income_threshold(self, df: pd.DataFrame) -> pd.Series: - """ - Returns a pandas series (really a numpy array) - of booleans based on the condition of the FPL at 200% - is at or more than some established threshold - """ - return ( - df[ - field_names.POVERTY_LESS_THAN_200_FPL_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.LOW_INCOME_THRESHOLD - ) - - def _increment_total_eligibility_exceeded( - self, columns_for_subset: list - ) -> None: - """ - Increments the total eligible factors for a given tract - """ - - self.df[field_names.THRESHOLD_COUNT] += self.df[columns_for_subset].sum( - axis=1, skipna=True - ) - - def _climate_factor(self) -> bool: - # In Xth percentile or above for FEMA’s Risk Index (Source: FEMA - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level. Source: Census's American Community Survey] - - climate_eligibility_columns = [ - field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD, - field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD, - ] - - expected_population_loss_threshold = ( - self.df[ - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - expected_agriculture_loss_threshold = ( - self.df[ - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - expected_building_loss_threshold = ( - self.df[ - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD] = ( - expected_population_loss_threshold - & self.df[field_names.FPL_200_SERIES] - ) - - self.df[field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD] = ( - expected_agriculture_loss_threshold - & self.df[field_names.FPL_200_SERIES] - ) - - self.df[field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD] = ( - expected_building_loss_threshold - & self.df[field_names.FPL_200_SERIES] - ) - - self._increment_total_eligibility_exceeded(climate_eligibility_columns) - - return self.df[climate_eligibility_columns].any(axis="columns") - - def _energy_factor(self) -> bool: - # In Xth percentile or above for DOE’s energy cost burden score (Source: LEAD Score) - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level. Source: Census's American Community Survey] - - energy_eligibility_columns = [ - field_names.PM25_EXPOSURE_LOW_INCOME_FIELD, - field_names.ENERGY_BURDEN_LOW_INCOME_FIELD, - ] - - energy_burden_threshold = ( - self.df[ - field_names.ENERGY_BURDEN_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - pm25_threshold = ( - self.df[ - field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.PM25_EXPOSURE_LOW_INCOME_FIELD] = ( - pm25_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self.df[field_names.ENERGY_BURDEN_LOW_INCOME_FIELD] = ( - energy_burden_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self._increment_total_eligibility_exceeded(energy_eligibility_columns) - - return self.df[energy_eligibility_columns].any(axis="columns") - - def _transportation_factor(self) -> bool: - # In Xth percentile or above for diesel particulate matter (Source: EPA National Air Toxics Assessment (NATA) - # or - # In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)] - # or - # In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level. Source: Census's American Community Survey] - - transportion_eligibility_columns = [ - field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD, - field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD, - ] - - diesel_threshold = ( - self.df[ - field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - traffic_threshold = ( - self.df[ - field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD] = ( - diesel_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self.df[field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD] = ( - traffic_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self._increment_total_eligibility_exceeded( - transportion_eligibility_columns - ) - - return self.df[transportion_eligibility_columns].any(axis="columns") - - def _housing_factor(self) -> bool: - # ( - # In Xth percentile or above for lead paint (Source: Census's American Community Survey’s - # percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes) - # AND - # In Yth percentile or below for Median House Value (Source: Census's American Community Survey) - # ) - # or - # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level. Source: Census's American Community Survey] - - housing_eligibility_columns = [ - field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD, - field_names.HOUSING_BURDEN_LOW_INCOME_FIELD, - ] - - lead_paint_median_home_value_threshold = ( - self.df[ - field_names.LEAD_PAINT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) & ( - self.df[ - field_names.MEDIAN_HOUSE_VALUE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - <= self.MEDIAN_HOUSE_VALUE_THRESHOLD - ) - - housing_burden_threshold = ( - self.df[ - field_names.HOUSING_BURDEN_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - # series by series indicators - self.df[field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD] = ( - lead_paint_median_home_value_threshold - & self.df[field_names.FPL_200_SERIES] - ) - - self.df[field_names.HOUSING_BURDEN_LOW_INCOME_FIELD] = ( - housing_burden_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self._increment_total_eligibility_exceeded(housing_eligibility_columns) - - return self.df[housing_eligibility_columns].any(axis="columns") - - def _pollution_factor(self) -> bool: - # Proximity to Risk Management Plan sites is > X - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level. Source: Census's American Community Survey] - - pollution_eligibility_columns = [ - field_names.RMP_LOW_INCOME_FIELD, - field_names.SUPERFUND_LOW_INCOME_FIELD, - field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD, - ] - - rmp_sites_threshold = ( - self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - npl_sites_threshold = ( - self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - tsdf_sites_threshold = ( - self.df[ - field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - # individual series-by-series - self.df[field_names.RMP_LOW_INCOME_FIELD] = ( - rmp_sites_threshold & self.df[field_names.FPL_200_SERIES] - ) - self.df[field_names.SUPERFUND_LOW_INCOME_FIELD] = ( - npl_sites_threshold & self.df[field_names.FPL_200_SERIES] - ) - self.df[field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD] = ( - tsdf_sites_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self._increment_total_eligibility_exceeded( - pollution_eligibility_columns - ) - - return self.df[pollution_eligibility_columns].any(axis="columns") - - def _water_factor(self) -> bool: - # In Xth percentile or above for wastewater discharge (Source: EPA Risk-Screening Environmental Indicators (RSEI) Model) - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level. Source: Census's American Community Survey] - - wastewater_threshold = ( - self.df[ - field_names.WASTEWATER_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD] = ( - wastewater_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self._increment_total_eligibility_exceeded( - [field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD] - ) - - return self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD] - - def _health_factor(self) -> bool: - # In Xth percentile or above for diabetes (Source: CDC Places) - # or - # In Xth percentile or above for asthma (Source: CDC Places) - # or - # In Xth percentile or above for heart disease - # or - # In Xth percentile or above for low life expectancy (Source: CDC Places) - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level. Source: Census's American Community Survey] - - health_eligibility_columns = [ - field_names.DIABETES_LOW_INCOME_FIELD, - field_names.ASTHMA_LOW_INCOME_FIELD, - field_names.HEART_DISEASE_LOW_INCOME_FIELD, - field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD, - ] - - diabetes_threshold = ( - self.df[ - field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - asthma_threshold = ( - self.df[ - field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - heart_disease_threshold = ( - self.df[ - field_names.HEART_DISEASE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - low_life_expectancy_threshold = ( - self.df[ - field_names.LOW_LIFE_EXPECTANCY_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.DIABETES_LOW_INCOME_FIELD] = ( - diabetes_threshold & self.df[field_names.FPL_200_SERIES] - ) - self.df[field_names.ASTHMA_LOW_INCOME_FIELD] = ( - asthma_threshold & self.df[field_names.FPL_200_SERIES] - ) - self.df[field_names.HEART_DISEASE_LOW_INCOME_FIELD] = ( - heart_disease_threshold & self.df[field_names.FPL_200_SERIES] - ) - self.df[field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD] = ( - low_life_expectancy_threshold & self.df[field_names.FPL_200_SERIES] - ) - - self._increment_total_eligibility_exceeded(health_eligibility_columns) - - return self.df[health_eligibility_columns].any(axis="columns") - - def _workforce_factor(self) -> bool: - # Where unemployment is above Xth percentile - # or - # Where median income as a percent of area median income is above Xth percentile - # or - # Where the percent of households at or below 100% of the federal poverty level - # is above Xth percentile - # or - # Where linguistic isolation is above Xth percentile - # AND - # Where the high school degree achievement rates for adults 25 years and older - # is less than Y% - # (necessary to screen out university tracts) - - # Workforce criteria for states fields. - workforce_eligibility_columns = [ - field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, - field_names.POVERTY_LOW_HS_EDUCATION_FIELD, - field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD, - field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, - ] - - self.df[field_names.LOW_HS_EDUCATION_FIELD] = ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD - ) - - unemployment_threshold = ( - self.df[ - field_names.UNEMPLOYMENT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - low_median_income_threshold = ( - self.df[ - field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - linguistic_isolation_threshold = ( - self.df[ - field_names.LINGUISTIC_ISO_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - poverty_threshold = ( - self.df[ - field_names.POVERTY_LESS_THAN_100_FPL_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD] = ( - linguistic_isolation_threshold - & self.df[field_names.LOW_HS_EDUCATION_FIELD] - ) - - self.df[field_names.POVERTY_LOW_HS_EDUCATION_FIELD] = ( - poverty_threshold & self.df[field_names.LOW_HS_EDUCATION_FIELD] - ) - - self.df[field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD] = ( - low_median_income_threshold - & self.df[field_names.LOW_HS_EDUCATION_FIELD] - ) - - self.df[field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD] = ( - unemployment_threshold & self.df[field_names.LOW_HS_EDUCATION_FIELD] - ) - - workforce_combined_criteria_for_states = self.df[ - workforce_eligibility_columns - ].any(axis="columns") - - self._increment_total_eligibility_exceeded( - workforce_eligibility_columns - ) - - # Now, calculate workforce criteria for island territories. - island_areas_workforce_eligibility_columns = [ - field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, - field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD, - field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, - ] - - # First, combine unemployment. - ( - self.df, - island_areas_unemployment_criteria_field_name, - ) = self._combine_island_areas_with_states_and_set_thresholds( - df=self.df, - column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009, - column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010, - combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010, - threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD, - ) - - # Next, combine poverty. - ( - self.df, - island_areas_poverty_criteria_field_name, - ) = self._combine_island_areas_with_states_and_set_thresholds( - df=self.df, - column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009, - column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010, - combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010, - threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD, - ) - - # Also check whether low area median income is 90th percentile or higher - # within the islands. - island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name = ( - f"{field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009} exceeds " - f"{field_names.PERCENTILE}th percentile" - ) - self.df[ - island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name - ] = ( - self.df[ - field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] = ( - self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009] - >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD - ) - - self.df[ - field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD - ] = ( - self.df[island_areas_unemployment_criteria_field_name] - & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] - ) - - self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = ( - self.df[island_areas_poverty_criteria_field_name] - & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] - ) - - self.df[ - field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD - ] = ( - self.df[ - island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name - ] - & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] - ) - - workforce_combined_criteria_for_island_areas = self.df[ - island_areas_workforce_eligibility_columns - ].any(axis="columns") - - self._increment_total_eligibility_exceeded( - island_areas_workforce_eligibility_columns - ) - - percent_of_island_tracts_highlighted = ( - 100 - * workforce_combined_criteria_for_island_areas.sum() - # Choosing a random column from island areas to calculate the denominator. - / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009] - .notnull() - .sum() - ) - - logger.debug( - f"For workforce criteria in island areas, " - f"{workforce_combined_criteria_for_island_areas.sum()} (" - f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data " - f"in the column) have a value of TRUE." - ) - - # A tract is included if it meets either the states tract criteria or the - # island areas tract criteria. - return ( - workforce_combined_criteria_for_states - | workforce_combined_criteria_for_island_areas - ) - - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score L") - - self.df[field_names.THRESHOLD_COUNT] = 0 - self.df[field_names.FPL_200_SERIES] = self._create_low_income_threshold( - self.df - ) - self.df[field_names.L_CLIMATE] = self._climate_factor() - self.df[field_names.L_ENERGY] = self._energy_factor() - self.df[field_names.L_TRANSPORTATION] = self._transportation_factor() - self.df[field_names.L_HOUSING] = self._housing_factor() - self.df[field_names.L_POLLUTION] = self._pollution_factor() - self.df[field_names.L_WATER] = self._water_factor() - self.df[field_names.L_HEALTH] = self._health_factor() - self.df[field_names.L_WORKFORCE] = self._workforce_factor() - - factors = [ - field_names.L_CLIMATE, - field_names.L_ENERGY, - field_names.L_TRANSPORTATION, - field_names.L_HOUSING, - field_names.L_POLLUTION, - field_names.L_WATER, - field_names.L_HEALTH, - field_names.L_WORKFORCE, - ] - self.df[field_names.SCORE_L_COMMUNITIES] = self.df[factors].any(axis=1) - - # Note: this is purely used for comparison tool analysis, and can be removed at a later date. - LMB. - non_workforce_factors = [ - field_names.L_CLIMATE, - field_names.L_ENERGY, - field_names.L_TRANSPORTATION, - field_names.L_HOUSING, - field_names.L_POLLUTION, - field_names.L_WATER, - field_names.L_HEALTH, - ] - self.df[field_names.L_NON_WORKFORCE] = self.df[ - non_workforce_factors - ].any(axis=1) - - self.df[ - field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX - ] = self.df[field_names.SCORE_L_COMMUNITIES].astype(int) - - return self.df diff --git a/data/data-pipeline/data_pipeline/score/score_m.py b/data/data-pipeline/data_pipeline/score/score_m.py deleted file mode 100644 index a8751c2a..00000000 --- a/data/data-pipeline/data_pipeline/score/score_m.py +++ /dev/null @@ -1,888 +0,0 @@ -from typing import Tuple - -import data_pipeline.etl.score.constants as constants -import data_pipeline.score.field_names as field_names -import numpy as np -import pandas as pd -from data_pipeline.score.score import Score -from data_pipeline.utils import get_module_logger - -logger = get_module_logger(__name__) - - -class ScoreM(Score): - """Very similar to Score L, with a few minor modifications.""" - - def __init__(self, df: pd.DataFrame) -> None: - self.LOW_INCOME_THRESHOLD: float = 0.65 - self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20 - self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90 - self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90 - self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10 - - super().__init__(df) - - def _combine_island_areas_with_states_and_set_thresholds( - self, - df: pd.DataFrame, - column_from_island_areas: str, - column_from_decennial_census: str, - combined_column_name: str, - threshold_cutoff_for_island_areas: float, - ) -> Tuple[pd.DataFrame, str]: - """Steps to set thresholds for island areas. - - This function is fairly logically complicated. It takes the following steps: - - 1. Combine the two different fields into a single field. - 2. Calculate the 90th percentile for the combined field. - 3. Create a boolean series that is true for any census tract in the island - areas (and only the island areas) that exceeds this percentile. - - For step one, it combines data that is either the island area's Decennial Census - value in 2009 or the state's value in 5-year ACS ending in 2010. - - This will be used to generate the percentile cutoff for the 90th percentile. - - The stateside decennial census stopped asking economic comparisons, - so this is as close to apples-to-apples as we get. We use 5-year ACS for data - robustness over 1-year ACS. - """ - # Create the combined field. - # TODO: move this combined field percentile calculation to `etl_score`, - # since most other percentile logic is there. - # There should only be one entry in either 2009 or 2019 fields, not one in both. - # But just to be safe, we take the mean and ignore null values so if there - # *were* entries in both, this result would make sense. - df[combined_column_name] = df[ - [column_from_island_areas, column_from_decennial_census] - ].mean(axis=1, skipna=True) - - # Create a percentile field for use in the Islands / PR visualization - # TODO: move this code - # In the code below, percentiles are constructed based on the combined column - # of census and island data, but only reported for the island areas (where there - # is no other comprehensive percentile information) - return_series_name = ( - column_from_island_areas - + field_names.ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ) - df[return_series_name] = np.where( - df[column_from_decennial_census].isna(), - df[combined_column_name].rank(pct=True), - np.nan, - ) - - threshold_column_name = ( - f"{column_from_island_areas} exceeds " - f"{threshold_cutoff_for_island_areas*100:.0f}th percentile" - ) - - df[threshold_column_name] = ( - df[return_series_name] >= threshold_cutoff_for_island_areas - ) - - return df, threshold_column_name - - def _create_low_income_and_low_college_attendance_threshold( - self, df: pd.DataFrame - ) -> pd.Series: - """ - Returns a pandas series (really a numpy array) - of booleans based on the condition of the FPL at 200% - is at or more than some established threshold - """ - - return (df[field_names.LOW_INCOME_THRESHOLD]) & ( - df[field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD] - | ( - # If college attendance data is null for this tract, just rely on the - # poverty data - df[field_names.COLLEGE_ATTENDANCE_FIELD].isna() - ) - ) - - def _increment_total_eligibility_exceeded( - self, columns_for_subset: list, skip_fips: tuple = () - ) -> None: - """ - Increments the total eligible factors for a given tract - - The new skip_fips argument specifies which (if any) fips codes to - skip over for incrementing. - This allows us to essentially skip data we think is of limited veracity, - without overriding any values in the data. - THIS IS A TEMPORARY FIX. - """ - if skip_fips: - self.df[field_names.THRESHOLD_COUNT] += np.where( - self.df[field_names.GEOID_TRACT_FIELD].str.startswith( - skip_fips - ), - 0, - self.df[columns_for_subset].sum(axis=1, skipna=True), - ) - else: - self.df[field_names.THRESHOLD_COUNT] += self.df[ - columns_for_subset - ].sum(axis=1, skipna=True) - - def _climate_factor(self) -> bool: - # In Xth percentile or above for FEMA’s Risk Index (Source: FEMA - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level and there is low higher ed attendance - # Source: Census's American Community Survey - - climate_eligibility_columns = [ - field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD, - ] - - self.df[ - field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD - ] = ( - self.df[ - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[ - field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD - ] = ( - self.df[ - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD] = ( - self.df[ - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.CLIMATE_THRESHOLD_EXCEEDED] = ( - self.df[ - field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD - ] - | self.df[ - field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD - ] - | self.df[ - field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD - ] - ) - - self.df[ - field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[ - field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD - ] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self.df[ - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[ - field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD - ] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self.df[ - field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self._increment_total_eligibility_exceeded( - climate_eligibility_columns, - skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, - ) - - return self.df[climate_eligibility_columns].any(axis="columns") - - def _energy_factor(self) -> bool: - # In Xth percentile or above for DOE’s energy cost burden score (Source: LEAD Score) - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level and has low higher ed attendance. - # Source: Census's American Community Survey - - energy_eligibility_columns = [ - field_names.PM25_EXPOSURE_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.ENERGY_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD, - ] - - self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD] = ( - self.df[ - field_names.ENERGY_BURDEN_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD] = ( - self.df[ - field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.ENERGY_THRESHOLD_EXCEEDED] = ( - self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD] - | self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD] - ) - - self.df[field_names.PM25_EXPOSURE_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self.df[field_names.ENERGY_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self._increment_total_eligibility_exceeded( - energy_eligibility_columns, - skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, - ) - - return self.df[energy_eligibility_columns].any(axis="columns") - - def _transportation_factor(self) -> bool: - # In Xth percentile or above for diesel particulate matter (Source: EPA National Air Toxics Assessment (NATA) - # or - # In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)] - # or - # In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level and has a low percent of higher ed students. - # Source: Census's American Community Survey - - transportion_eligibility_columns = [ - field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.TRAFFIC_PROXIMITY_LOW_INCOME_LOW_HIGHER_ED_FIELD, - ] - - self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD] = ( - self.df[ - field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD] = ( - self.df[ - field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.TRAFFIC_THRESHOLD_EXCEEDED] = ( - self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD] - | self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD] - ) - - self.df[ - field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self.df[ - field_names.TRAFFIC_PROXIMITY_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self._increment_total_eligibility_exceeded( - transportion_eligibility_columns, - skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, - ) - - return self.df[transportion_eligibility_columns].any(axis="columns") - - def _housing_factor(self) -> bool: - # ( - # In Xth percentile or above for lead paint (Source: Census's American Community Survey’s - # percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes) - # AND - # In Yth percentile or below for Median House Value (Source: Census's American Community Survey) - # ) - # or - # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level and has a low percent of higher ed students. - # Source: Census's American Community Survey - - housing_eligibility_columns = [ - field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.HOUSING_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD, - ] - - self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD] = ( - self.df[ - field_names.LEAD_PAINT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) & ( - self.df[ - field_names.MEDIAN_HOUSE_VALUE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - <= self.MEDIAN_HOUSE_VALUE_THRESHOLD - ) - - self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD] = ( - self.df[ - field_names.HOUSING_BURDEN_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.HOUSING_THREHSOLD_EXCEEDED] = ( - self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD] - | self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD] - ) - - # series by series indicators - self.df[ - field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self.df[field_names.HOUSING_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self._increment_total_eligibility_exceeded( - housing_eligibility_columns, - skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, - ) - - return self.df[housing_eligibility_columns].any(axis="columns") - - def _pollution_factor(self) -> bool: - # Proximity to Risk Management Plan sites is > X - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level and has a low percent of higher ed students. - # Source: Census's American Community Survey - - pollution_eligibility_columns = [ - field_names.RMP_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.SUPERFUND_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.HAZARDOUS_WASTE_LOW_INCOME_LOW_HIGHER_ED_FIELD, - ] - - self.df[field_names.RMP_PCTILE_THRESHOLD] = ( - self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.NPL_PCTILE_THRESHOLD] = ( - self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.TSDF_PCTILE_THRESHOLD] = ( - self.df[ - field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = ( - self.df[field_names.RMP_PCTILE_THRESHOLD] - | self.df[field_names.NPL_PCTILE_THRESHOLD] - ) | self.df[field_names.TSDF_PCTILE_THRESHOLD] - - # individual series-by-series - self.df[field_names.RMP_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.RMP_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - self.df[field_names.SUPERFUND_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.NPL_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - self.df[field_names.HAZARDOUS_WASTE_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.TSDF_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self._increment_total_eligibility_exceeded( - pollution_eligibility_columns, - skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, - ) - - return self.df[pollution_eligibility_columns].any(axis="columns") - - def _water_factor(self) -> bool: - # In Xth percentile or above for wastewater discharge (Source: EPA Risk-Screening Environmental Indicators (RSEI) Model) - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level and has a low percent of higher ed students - # Source: Census's American Community Survey - - self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = ( - self.df[ - field_names.WASTEWATER_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - # Straight copy here in case we add additional water fields. - self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[ - field_names.WASTEWATER_PCTILE_THRESHOLD - ].copy() - - self.df[ - field_names.WASTEWATER_DISCHARGE_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self._increment_total_eligibility_exceeded( - [field_names.WASTEWATER_DISCHARGE_LOW_INCOME_LOW_HIGHER_ED_FIELD], - skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, - ) - - return self.df[ - field_names.WASTEWATER_DISCHARGE_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] - - def _health_factor(self) -> bool: - # In Xth percentile or above for diabetes (Source: CDC Places) - # or - # In Xth percentile or above for asthma (Source: CDC Places) - # or - # In Xth percentile or above for heart disease - # or - # In Xth percentile or above for low life expectancy (Source: CDC Places) - # AND - # Low income: In Nth percentile or above for percent of block group population - # of households where household income is less than or equal to twice the federal - # poverty level and has a low percent of higher ed students - # Source: Census's American Community Survey - - health_eligibility_columns = [ - field_names.DIABETES_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.ASTHMA_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.HEART_DISEASE_LOW_INCOME_LOW_HIGHER_ED_FIELD, - field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_LOW_HIGHER_ED_FIELD, - ] - - self.df[field_names.DIABETES_PCTILE_THRESHOLD] = ( - self.df[ - field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.ASTHMA_PCTILE_THRESHOLD] = ( - self.df[ - field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD] = ( - self.df[ - field_names.HEART_DISEASE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD] = ( - self.df[ - field_names.LOW_LIFE_EXPECTANCY_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.HEALTH_THRESHOLD_EXCEEDED] = ( - ( - self.df[field_names.DIABETES_PCTILE_THRESHOLD] - | self.df[field_names.ASTHMA_PCTILE_THRESHOLD] - ) - | self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD] - ) | self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD] - - self.df[field_names.DIABETES_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.DIABETES_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - self.df[field_names.ASTHMA_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.ASTHMA_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - self.df[field_names.HEART_DISEASE_LOW_INCOME_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - self.df[ - field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_LOW_HIGHER_ED_FIELD - ] = ( - self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD] - & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES] - ) - - self._increment_total_eligibility_exceeded( - health_eligibility_columns, - skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, - ) - - return self.df[health_eligibility_columns].any(axis="columns") - - def _workforce_factor(self) -> bool: - # Where unemployment is above Xth percentile - # or - # Where median income as a percent of area median income is above Xth percentile - # or - # Where the percent of households at or below 100% of the federal poverty level - # is above Xth percentile - # or - # Where linguistic isolation is above Xth percentile - # AND - # Where the high school degree achievement rates for adults 25 years and older - # is less than Y% - # AND the higher ed attendance rates are under Z% - # (necessary to screen out university tracts) - - # Workforce criteria for states fields. - workforce_eligibility_columns = [ - field_names.UNEMPLOYMENT_LOW_HS_LOW_HIGHER_ED_FIELD, - field_names.POVERTY_LOW_HS_LOW_HIGHER_ED_FIELD, - field_names.LINGUISTIC_ISOLATION_LOW_HS_LOW_HIGHER_ED_FIELD, - field_names.LOW_MEDIAN_INCOME_LOW_HS_LOW_HIGHER_ED_FIELD, - ] - - self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.HIGH_SCHOOL_ED_FIELD] - >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD - ) & ( - ( - self.df[field_names.COLLEGE_ATTENDANCE_FIELD] - <= self.MAX_COLLEGE_ATTENDANCE_THRESHOLD - ) - | ( - # If college attendance data is null for this tract, just rely on the - # poverty/AMI data - self.df[field_names.COLLEGE_ATTENDANCE_FIELD].isna() - ) - ) - - self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD] = ( - self.df[ - field_names.UNEMPLOYMENT_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] = ( - self.df[ - field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD] = ( - self.df[ - field_names.LINGUISTIC_ISO_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.POVERTY_PCTILE_THRESHOLD] = ( - self.df[ - field_names.POVERTY_LESS_THAN_100_FPL_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.LINGUISTIC_ISOLATION_LOW_HS_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD] - & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD] - ) - - self.df[field_names.POVERTY_LOW_HS_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.POVERTY_PCTILE_THRESHOLD] - & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD] - ) - - self.df[field_names.LOW_MEDIAN_INCOME_LOW_HS_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] - & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD] - ) - - self.df[field_names.UNEMPLOYMENT_LOW_HS_LOW_HIGHER_ED_FIELD] = ( - self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD] - & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD] - ) - - workforce_combined_criteria_for_states = self.df[ - workforce_eligibility_columns - ].any(axis="columns") - - self._increment_total_eligibility_exceeded( - workforce_eligibility_columns - ) - - # Now, calculate workforce criteria for island territories. - island_areas_workforce_eligibility_columns = [ - field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, - field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD, - field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, - ] - - # First, combine unemployment. - # This will include an adjusted percentile column for the island areas - # to be used by the front end. - ( - self.df, - island_areas_unemployment_criteria_field_name, - ) = self._combine_island_areas_with_states_and_set_thresholds( - df=self.df, - column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009, - column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010, - combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010, - threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD, - ) - - # TODO: Remove this, it's for checking only - assert ( - island_areas_unemployment_criteria_field_name - == field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD - ), "Error combining island columns" - - # Next, combine poverty. - # This will include an adjusted percentile column for the island areas - # to be used by the front end. - ( - self.df, - island_areas_poverty_criteria_field_name, - ) = self._combine_island_areas_with_states_and_set_thresholds( - df=self.df, - column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009, - column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010, - combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010, - threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD, - ) - - # TODO: Remove this, it's for checking only - assert ( - island_areas_poverty_criteria_field_name - == field_names.ISLAND_POVERTY_PCTILE_THRESHOLD - ), "Error combining island columns" - - # Also check whether low area median income is 90th percentile or higher - # within the islands. - - # Note that because the field for low median does not have to be combined, - # unlike the other fields, we do not need to create a new percentile - # column. This code should probably be refactored when (TODO) we do the big - # refactor. - self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] = ( - self.df[ - field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.ENVIRONMENTAL_BURDEN_THRESHOLD - ) - - self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] = ( - self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009] - >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD - ) - - self.df[ - field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD - ] = ( - self.df[island_areas_unemployment_criteria_field_name] - & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] - ) - - self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = ( - self.df[island_areas_poverty_criteria_field_name] - & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] - ) - - self.df[ - field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD - ] = ( - self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] - & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] - ) - - workforce_combined_criteria_for_island_areas = self.df[ - island_areas_workforce_eligibility_columns - ].any(axis="columns") - - self._increment_total_eligibility_exceeded( - island_areas_workforce_eligibility_columns - ) - - percent_of_island_tracts_highlighted = ( - 100 - * workforce_combined_criteria_for_island_areas.sum() - # Choosing a random column from island areas to calculate the denominator. - / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009] - .notnull() - .sum() - ) - - logger.debug( - f"For workforce criteria in island areas, " - f"{workforce_combined_criteria_for_island_areas.sum()} (" - f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data " - f"in the column) have a value of TRUE." - ) - - # Because these criteria are calculated differently for the islands, we also calculate the - # thresholds to pass to the FE slightly differently - - self.df[field_names.WORKFORCE_THRESHOLD_EXCEEDED] = ( - ## First we calculate for the non-island areas - ( - ( - self.df[field_names.POVERTY_PCTILE_THRESHOLD] - | self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD] - ) - | self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] - ) - | self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD] - ) | ( - ## then we calculate just for the island areas - ( - self.df[field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD] - | self.df[field_names.ISLAND_POVERTY_PCTILE_THRESHOLD] - ) - | self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] - ) - - # Because of the island complications, we also have to separately calculate the threshold for - # socioeconomic thresholds - self.df[field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED] = ( - self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] - | self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD] - ) - - # A tract is included if it meets either the states tract criteria or the - # island areas tract criteria. - return ( - workforce_combined_criteria_for_states - | workforce_combined_criteria_for_island_areas - ) - - def add_columns(self) -> pd.DataFrame: - logger.debug("Adding Score M") - - self.df[field_names.THRESHOLD_COUNT] = 0 - - # TODO: move this inside of - # `_create_low_income_and_low_college_attendance_threshold` - # and change the return signature of that method. - # Create a standalone field that captures the college attendance boolean - # threshold. - self.df[field_names.LOW_INCOME_THRESHOLD] = ( - self.df[ - field_names.POVERTY_LESS_THAN_200_FPL_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX - ] - >= self.LOW_INCOME_THRESHOLD - ) - - # Because we are moving this variable to be in the same direction as all - # other variables, we change this to be < rather than <=. This translates - # to "80% or more of residents are not college students", rather than - # "Strictly greater than 80% of residents are not college students." - # There are two tracts that are impacted by this (that is, they have exactly) - # 20% college students -- neither of these has been a DAC under any score. - self.df[field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD] = ( - self.df[field_names.COLLEGE_ATTENDANCE_FIELD] - < self.MAX_COLLEGE_ATTENDANCE_THRESHOLD - ) - - self.df[ - field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES - ] = self._create_low_income_and_low_college_attendance_threshold( - self.df - ) - self.df[field_names.M_CLIMATE] = self._climate_factor() - self.df[field_names.M_ENERGY] = self._energy_factor() - self.df[field_names.M_TRANSPORTATION] = self._transportation_factor() - self.df[field_names.M_HOUSING] = self._housing_factor() - self.df[field_names.M_POLLUTION] = self._pollution_factor() - self.df[field_names.M_WATER] = self._water_factor() - self.df[field_names.M_HEALTH] = self._health_factor() - self.df[field_names.M_WORKFORCE] = self._workforce_factor() - - factors = [ - field_names.M_CLIMATE, - field_names.M_ENERGY, - field_names.M_TRANSPORTATION, - field_names.M_HOUSING, - field_names.M_POLLUTION, - field_names.M_WATER, - field_names.M_HEALTH, - field_names.M_WORKFORCE, - ] - self.df[field_names.CATEGORY_COUNT] = self.df[factors].sum(axis=1) - self.df[field_names.SCORE_M_COMMUNITIES] = self.df[factors].any(axis=1) - - # Note: this is purely used for comparison tool analysis, and can be removed at a later date. - LMB. - non_workforce_factors = [ - field_names.M_CLIMATE, - field_names.M_ENERGY, - field_names.M_TRANSPORTATION, - field_names.M_HOUSING, - field_names.M_POLLUTION, - field_names.M_WATER, - field_names.M_HEALTH, - ] - self.df[field_names.M_NON_WORKFORCE] = self.df[ - non_workforce_factors - ].any(axis=1) - - self.df[ - field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX - ] = self.df[field_names.SCORE_M_COMMUNITIES].astype(int) - - return self.df