Removed unused files

This commit is contained in:
Carlos Felix 2024-12-03 16:11:56 -05:00
commit d298f7dedb
11 changed files with 0 additions and 1983 deletions

View file

@ -1,19 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreA(Score):
    """Score A: the mean of the poverty and high-school-education percentiles."""

    def add_columns(self) -> pd.DataFrame:
        """Append the Score A column to ``self.df`` and return the dataframe."""
        logger.debug("Adding Score A")
        input_columns = [
            field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
            field_names.HIGH_SCHOOL_ED_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX,
        ]
        # Row-wise mean of the two percentile columns.
        self.df[field_names.SCORE_A] = self.df[input_columns].mean(axis=1)
        return self.df

View file

@ -1,21 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreB(Score):
    """Score B: the product of the poverty and high-school-education percentiles."""

    def add_columns(self) -> pd.DataFrame:
        """Append the Score B column to ``self.df`` and return the dataframe."""
        logger.debug("Adding Score B")
        poverty_percentile = self.df[
            field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        high_school_percentile = self.df[
            field_names.HIGH_SCHOOL_ED_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        # Element-wise product of the two percentile columns.
        self.df[field_names.SCORE_B] = (
            poverty_percentile * high_school_percentile
        )
        return self.df

View file

@ -1,102 +0,0 @@
from collections import namedtuple
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreC(Score):
    """Score C: a "CalEnviroScreen for the US" style cumulative-impact score."""

    def __init__(self, df: pd.DataFrame) -> None:
        Bucket = namedtuple(typename="Bucket", field_names=["name", "fields"])

        # Note: we use percentiles for every field below.
        # To do so, we add the percentile suffix to all the field names
        # in one place via this helper.
        def with_percentile_suffix(raw_fields):
            return [
                raw_field + field_names.PERCENTILE_FIELD_SUFFIX
                for raw_field in raw_fields
            ]

        self.BUCKET_SOCIOECONOMIC = Bucket(
            field_names.C_SOCIOECONOMIC,
            with_percentile_suffix(
                [
                    field_names.HOUSEHOLDS_LINGUISTIC_ISO_FIELD,
                    field_names.POVERTY_FIELD,
                    field_names.HIGH_SCHOOL_ED_FIELD,
                    field_names.UNEMPLOYMENT_FIELD,
                    field_names.HOUSING_BURDEN_FIELD,
                ]
            ),
        )
        self.BUCKET_SENSITIVE = Bucket(
            field_names.C_SENSITIVE,
            with_percentile_suffix(
                [
                    field_names.UNDER_5_FIELD,
                    field_names.OVER_64_FIELD,
                    field_names.LINGUISTIC_ISO_FIELD,
                ]
            ),
        )
        self.BUCKET_ENVIRONMENTAL = Bucket(
            field_names.C_ENVIRONMENTAL,
            with_percentile_suffix(
                [
                    field_names.RMP_FIELD,
                    field_names.TSDF_FIELD,
                    field_names.NPL_FIELD,
                    field_names.WASTEWATER_FIELD,
                    field_names.LEAD_PAINT_FIELD,
                ]
            ),
        )
        self.BUCKET_EXPOSURES = Bucket(
            field_names.C_EXPOSURES,
            with_percentile_suffix(
                [
                    field_names.AIR_TOXICS_CANCER_RISK_FIELD,
                    field_names.RESPIRATORY_HAZARD_FIELD,
                    field_names.DIESEL_FIELD,
                    field_names.PM25_FIELD,
                    field_names.OZONE_FIELD,
                    field_names.TRAFFIC_FIELD,
                ]
            ),
        )
        self.BUCKETS = [
            self.BUCKET_SOCIOECONOMIC,
            self.BUCKET_SENSITIVE,
            self.BUCKET_ENVIRONMENTAL,
            self.BUCKET_EXPOSURES,
        ]
        super().__init__(df)

    # "CalEnviroScreen for the US" score
    def add_columns(self) -> pd.DataFrame:
        """Compute bucket averages, combine them, and return the updated df."""
        logger.debug("Adding Score C")
        # Average all the percentile values in each bucket into a single
        # score for each of the four buckets.
        for bucket in self.BUCKETS:
            self.df[bucket.name] = self.df[bucket.fields].mean(axis=1)
        # Combine the score from the two Exposures and Environmental Effects
        # buckets into a single score called "Pollution Burden":
        # (1.0 * Exposures Score + 0.5 * Environment Effects score) / 1.5.
        exposures_score = self.df[self.BUCKET_EXPOSURES.name]
        environmental_score = self.df[self.BUCKET_ENVIRONMENTAL.name]
        self.df[field_names.AGGREGATION_POLLUTION_FIELD] = (
            1.0 * exposures_score + 0.5 * environmental_score
        ) / 1.5
        # Average the Sensitive populations and Socioeconomic factors buckets
        # into a single score called "Population Characteristics".
        self.df[field_names.AGGREGATION_POPULATION_FIELD] = self.df[
            [self.BUCKET_SENSITIVE.name, self.BUCKET_SOCIOECONOMIC.name]
        ].mean(axis=1)
        # Multiply "Pollution Burden" by "Population Characteristics" to
        # produce the cumulative impact score.
        self.df[field_names.SCORE_C] = (
            self.df[field_names.AGGREGATION_POLLUTION_FIELD]
            * self.df[field_names.AGGREGATION_POPULATION_FIELD]
        )
        return self.df

View file

@ -1,34 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreD(Score):
    """Scores D and E: the same indicators under two normalization schemes."""

    def add_columns(self) -> pd.DataFrame:
        """Append Score D (min-max) and Score E (percentile) columns."""
        logger.debug("Adding Scores D and E")
        base_fields = [
            field_names.UNEMPLOYMENT_FIELD,
            field_names.LINGUISTIC_ISO_FIELD,
            field_names.HOUSING_BURDEN_FIELD,
            field_names.POVERTY_FIELD,
            field_names.HIGH_SCHOOL_ED_FIELD,
        ]
        # Score D averages the min-max normalized variants of the fields;
        # Score E averages the percentile variants of those same fields.
        min_max_columns = [
            f"{field}{field_names.MIN_MAX_FIELD_SUFFIX}"
            for field in base_fields
        ]
        percentile_columns = [
            f"{field}{field_names.PERCENTILE_FIELD_SUFFIX}"
            for field in base_fields
        ]
        self.df[field_names.SCORE_D] = self.df[min_max_columns].mean(axis=1)
        self.df[field_names.SCORE_E] = self.df[percentile_columns].mean(axis=1)
        return self.df

View file

@ -1,97 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreF(Score):
    """Score F: tracts meeting both socioeconomic and burden criteria."""

    # TODO Make variables and constants clearer (meaning and type)
    def add_columns(self) -> pd.DataFrame:
        """Append the Score F community flag and its intermediate columns."""
        logger.debug("Adding Score F")
        ami_and_high_school_field = "Low AMI, Low HS graduation"
        meets_socio_field = "Meets socioeconomic criteria"
        meets_burden_field = "Meets burden criteria"

        # Low median income relative to the state combined with low
        # high-school attainment.
        self.df[ami_and_high_school_field] = (
            self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD] < 0.80
        ) & (self.df[field_names.HIGH_SCHOOL_ED_FIELD] > 0.2)

        self.df[meets_socio_field] = (
            self.df[ami_and_high_school_field]
            | (self.df[field_names.POVERTY_FIELD] > 0.40)
            | (self.df[field_names.LINGUISTIC_ISO_FIELD] > 0.10)
            | (self.df[field_names.HIGH_SCHOOL_ED_FIELD] > 0.4)
        )

        # A tract meets the burden criteria when ANY of these indicators'
        # percentile columns exceeds 0.9.
        burden_fields = [
            field_names.PM25_FIELD,
            field_names.RESPIRATORY_HAZARD_FIELD,
            field_names.TRAFFIC_FIELD,
            field_names.LEAD_PAINT_FIELD,
            field_names.RMP_FIELD,
            field_names.ASTHMA_FIELD,
            field_names.HEART_DISEASE_FIELD,
            field_names.CANCER_FIELD,
            field_names.DIABETES_FIELD,
        ]
        burden_flags = pd.concat(
            [
                self.df[burden_field + field_names.PERCENTILE_FIELD_SUFFIX]
                > 0.9
                for burden_field in burden_fields
            ],
            axis=1,
        )
        self.df[meets_burden_field] = burden_flags.any(axis=1)

        self.df[field_names.SCORE_F_COMMUNITIES] = (
            self.df[meets_socio_field] & self.df[meets_burden_field]
        )
        return self.df

View file

@ -1,34 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreG(Score):
    """Score G: low AMI income or high poverty, gated on low HS attainment."""

    def add_columns(self) -> pd.DataFrame:
        """Append Score G columns (community flag, int score, percentile copy)."""
        logger.debug("Adding Score G")
        high_school_cutoff_threshold = 0.05
        # Score G is now modified NMTC.
        low_ami = (
            self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.8
        )
        high_poverty = (
            self.df[field_names.POVERTY_LESS_THAN_100_FPL_FIELD] > 0.20
        )
        low_hs_attainment = (
            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
            > high_school_cutoff_threshold
        )
        self.df[field_names.SCORE_G_COMMUNITIES] = (
            low_ami & low_hs_attainment
        ) | (high_poverty & low_hs_attainment)
        self.df[field_names.SCORE_G] = self.df[
            field_names.SCORE_G_COMMUNITIES
        ].astype(int)
        self.df["Score G (percentile)"] = self.df[field_names.SCORE_G]
        return self.df

View file

@ -1,32 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreH(Score):
    """Score H: low AMI income or high poverty, gated on low HS attainment."""

    def add_columns(self) -> pd.DataFrame:
        """Append the Score H community flag and its integer form."""
        logger.debug("Adding Score H")
        high_school_cutoff_threshold = 0.06
        low_hs_attainment = (
            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
            > high_school_cutoff_threshold
        )
        low_ami = (
            self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.8
        )
        high_poverty = (
            self.df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD] > 0.40
        )
        self.df[field_names.SCORE_H_COMMUNITIES] = (
            low_ami & low_hs_attainment
        ) | (high_poverty & low_hs_attainment)
        self.df[field_names.SCORE_H] = self.df[
            field_names.SCORE_H_COMMUNITIES
        ].astype(int)
        return self.df

View file

@ -1,33 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreI(Score):
    """Score I: low AMI income or high poverty, gated on low HS attainment."""

    def add_columns(self) -> pd.DataFrame:
        """Append Score I columns (community flag, int score, percentile copy)."""
        logger.debug("Adding Score I")
        high_school_cutoff_threshold = 0.05
        low_hs_attainment = (
            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
            > high_school_cutoff_threshold
        )
        low_ami = (
            self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.7
        )
        high_poverty = (
            self.df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD] > 0.50
        )
        self.df[field_names.SCORE_I_COMMUNITIES] = (
            low_ami & low_hs_attainment
        ) | (high_poverty & low_hs_attainment)
        self.df[field_names.SCORE_I] = self.df[
            field_names.SCORE_I_COMMUNITIES
        ].astype(int)
        self.df["Score I (percentile)"] = self.df[field_names.SCORE_I]
        return self.df

View file

@ -1,33 +0,0 @@
import data_pipeline.score.field_names as field_names
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreK(Score):
    """Score K: income/poverty criteria, with and without the HS education gate."""

    def add_columns(self) -> pd.DataFrame:
        """Append the Score K flag and the gated Score K community flag."""
        logger.debug("Adding Score K")
        high_school_cutoff_threshold = 0.06
        low_ami = (
            self.df[field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD] < 0.8
        )
        high_poverty = (
            self.df[field_names.POVERTY_LESS_THAN_100_FPL_FIELD] > 0.20
        )
        low_hs_attainment = (
            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
            > high_school_cutoff_threshold
        )
        # SCORE_K applies the income/poverty tests alone ...
        self.df[field_names.SCORE_K] = low_ami | high_poverty
        # ... while SCORE_K_COMMUNITIES additionally requires low
        # high-school attainment on both branches.
        self.df[field_names.SCORE_K_COMMUNITIES] = (
            low_ami & low_hs_attainment
        ) | (high_poverty & low_hs_attainment)
        return self.df

View file

@ -1,690 +0,0 @@
import data_pipeline.score.field_names as field_names
import numpy as np
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreL(Score):
    """Score L: flags a tract when any of eight burden "factors" is met.

    Each factor column is a boolean series combining one or more indicators
    at/above a percentile threshold with a low-income (or, for workforce,
    low high-school education) qualifier. A running count of how many
    individual eligibility criteria each tract exceeds is kept in
    ``field_names.THRESHOLD_COUNT``.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        # Percentile cutoffs used throughout the factor calculations.
        self.LOW_INCOME_THRESHOLD: float = 0.65
        self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
        self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
        self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
        super().__init__(df)

    def _combine_island_areas_with_states_and_set_thresholds(
        self,
        df: pd.DataFrame,
        column_from_island_areas: str,
        column_from_decennial_census: str,
        combined_column_name: str,
        threshold_cutoff_for_island_areas: float,
    ) -> "tuple[pd.DataFrame, str]":
        """Steps to set thresholds for island areas.

        This function is fairly logically complicated. It takes the following steps:
        1. Combine the two different fields into a single field.
        2. Calculate the 90th percentile cutoff raw value for the combined field.
        3. Create a boolean series that is true for any census tract in the island
        areas (and only the island areas) that exceeds this cutoff.

        For step one, it combines data that is either the island area's Decennial Census
        value in 2009 or the state's value in 5-year ACS ending in 2010.
        This will be used to generate the percentile cutoff for the 90th percentile.
        The stateside decennial census stopped asking economic comparisons,
        so this is as close to apples-to-apples as we get. We use 5-year ACS for data
        robustness over 1-year ACS.

        Returns the (modified in place) dataframe and the name of the new
        boolean threshold column.
        """
        # Create the combined field.
        # TODO: move this combined field percentile calculation to `etl_score`,
        # since most other percentile logic is there.
        # There should only be one entry in either 2009 or 2019 fields, not one in both.
        # But just to be safe, we take the mean and ignore null values so if there
        # *were* entries in both, this result would make sense.
        df[combined_column_name] = df[
            [column_from_island_areas, column_from_decennial_census]
        ].mean(axis=1, skipna=True)
        logger.debug(
            f"Combined field `{combined_column_name}` has "
            f"{df[combined_column_name].isnull().sum()} "
            f"({df[combined_column_name].isnull().sum() * 100 / len(df):.2f}%) "
            f"missing values for census tracts. "
        )
        # Calculate the percentile threshold raw value (ignoring NaNs).
        raw_threshold = np.nanquantile(
            a=df[combined_column_name], q=threshold_cutoff_for_island_areas
        )
        logger.debug(
            f"For combined field `{combined_column_name}`, "
            f"the {threshold_cutoff_for_island_areas*100:.0f} percentile cutoff is a "
            f"raw value of {raw_threshold:.3f}."
        )
        threshold_column_name = (
            f"{column_from_island_areas} exceeds "
            f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
        )
        # Only tracts with data in the island-areas column can be flagged,
        # which restricts the boolean series to the island areas.
        df[threshold_column_name] = (
            df[column_from_island_areas] >= raw_threshold
        )
        percent_of_tracts_highlighted = (
            100
            * df[threshold_column_name].sum()
            / df[column_from_island_areas].notnull().sum()
        )
        logger.info(
            f"For `{threshold_column_name}`, "
            f"{df[threshold_column_name].sum()} ("
            f"{percent_of_tracts_highlighted:.2f}% of tracts that have non-null data "
            f"in the column) have a value of TRUE."
        )
        return df, threshold_column_name

    def _create_low_income_threshold(self, df: pd.DataFrame) -> pd.Series:
        """
        Returns a pandas series (really a numpy array)
        of booleans based on the condition of the FPL at 200%
        is at or more than some established threshold
        """
        return (
            df[
                field_names.POVERTY_LESS_THAN_200_FPL_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.LOW_INCOME_THRESHOLD
        )

    def _increment_total_eligibility_exceeded(
        self, columns_for_subset: list
    ) -> None:
        """
        Increments the total eligible factors for a given tract
        """
        self.df[field_names.THRESHOLD_COUNT] += self.df[columns_for_subset].sum(
            axis=1, skipna=True
        )

    def _climate_factor(self) -> pd.Series:
        """Boolean series: any climate (expected loss) criterion is met."""
        # In Xth percentile or above for FEMA's Risk Index (Source: FEMA)
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]
        climate_eligibility_columns = [
            field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD,
            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD,
            field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD,
        ]
        expected_population_loss_threshold = (
            self.df[
                field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        expected_agriculture_loss_threshold = (
            self.df[
                field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        expected_building_loss_threshold = (
            self.df[
                field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        # Persist the per-criterion boolean columns (each gated on low income).
        self.df[field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD] = (
            expected_population_loss_threshold
            & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD] = (
            expected_agriculture_loss_threshold
            & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD] = (
            expected_building_loss_threshold
            & self.df[field_names.FPL_200_SERIES]
        )
        self._increment_total_eligibility_exceeded(climate_eligibility_columns)
        return self.df[climate_eligibility_columns].any(axis="columns")

    def _energy_factor(self) -> pd.Series:
        """Boolean series: any energy (PM2.5 or energy burden) criterion is met."""
        # In Xth percentile or above for DOE's energy cost burden score (Source: LEAD Score)
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]
        energy_eligibility_columns = [
            field_names.PM25_EXPOSURE_LOW_INCOME_FIELD,
            field_names.ENERGY_BURDEN_LOW_INCOME_FIELD,
        ]
        energy_burden_threshold = (
            self.df[
                field_names.ENERGY_BURDEN_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        pm25_threshold = (
            self.df[
                field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        self.df[field_names.PM25_EXPOSURE_LOW_INCOME_FIELD] = (
            pm25_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.ENERGY_BURDEN_LOW_INCOME_FIELD] = (
            energy_burden_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self._increment_total_eligibility_exceeded(energy_eligibility_columns)
        return self.df[energy_eligibility_columns].any(axis="columns")

    def _transportation_factor(self) -> pd.Series:
        """Boolean series: any transportation (diesel/traffic) criterion is met."""
        # In Xth percentile or above for diesel particulate matter (Source: EPA National Air Toxics Assessment (NATA)
        # or
        # In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)]
        # or
        # In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]
        transportion_eligibility_columns = [
            field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
            field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
        ]
        diesel_threshold = (
            self.df[
                field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        traffic_threshold = (
            self.df[
                field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        self.df[field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD] = (
            diesel_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD] = (
            traffic_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self._increment_total_eligibility_exceeded(
            transportion_eligibility_columns
        )
        return self.df[transportion_eligibility_columns].any(axis="columns")

    def _housing_factor(self) -> pd.Series:
        """Boolean series: any housing (lead paint / housing burden) criterion is met."""
        # (
        # In Xth percentile or above for lead paint (Source: Census's American Community Surveys
        # percent of housing units built pre-1960, used as an indicator of potential lead paint exposure in homes)
        # AND
        # In Yth percentile or below for Median House Value (Source: Census's American Community Survey)
        # )
        # or
        # In Xth percentile or above for housing cost burden (Source: HUD's Comprehensive Housing Affordability Strategy dataset
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]
        housing_eligibility_columns = [
            field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD,
            field_names.HOUSING_BURDEN_LOW_INCOME_FIELD,
        ]
        # Lead paint is only considered when median house value is *at or
        # below* its cutoff (note the `<=` direction, unlike other thresholds).
        lead_paint_median_home_value_threshold = (
            self.df[
                field_names.LEAD_PAINT_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        ) & (
            self.df[
                field_names.MEDIAN_HOUSE_VALUE_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            <= self.MEDIAN_HOUSE_VALUE_THRESHOLD
        )
        housing_burden_threshold = (
            self.df[
                field_names.HOUSING_BURDEN_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        # series by series indicators
        self.df[field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD] = (
            lead_paint_median_home_value_threshold
            & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.HOUSING_BURDEN_LOW_INCOME_FIELD] = (
            housing_burden_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self._increment_total_eligibility_exceeded(housing_eligibility_columns)
        return self.df[housing_eligibility_columns].any(axis="columns")

    def _pollution_factor(self) -> pd.Series:
        """Boolean series: any pollution (RMP/NPL/TSDF proximity) criterion is met."""
        # Proximity to Risk Management Plan sites is > X
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]
        pollution_eligibility_columns = [
            field_names.RMP_LOW_INCOME_FIELD,
            field_names.SUPERFUND_LOW_INCOME_FIELD,
            field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD,
        ]
        rmp_sites_threshold = (
            self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        npl_sites_threshold = (
            self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        tsdf_sites_threshold = (
            self.df[
                field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        # individual series-by-series
        self.df[field_names.RMP_LOW_INCOME_FIELD] = (
            rmp_sites_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.SUPERFUND_LOW_INCOME_FIELD] = (
            npl_sites_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD] = (
            tsdf_sites_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self._increment_total_eligibility_exceeded(
            pollution_eligibility_columns
        )
        return self.df[pollution_eligibility_columns].any(axis="columns")

    def _water_factor(self) -> pd.Series:
        """Boolean series: the wastewater discharge criterion is met."""
        # In Xth percentile or above for wastewater discharge (Source: EPA Risk-Screening Environmental Indicators (RSEI) Model)
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]
        wastewater_threshold = (
            self.df[
                field_names.WASTEWATER_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD] = (
            wastewater_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self._increment_total_eligibility_exceeded(
            [field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD]
        )
        return self.df[field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD]

    def _health_factor(self) -> pd.Series:
        """Boolean series: any health criterion is met."""
        # In Xth percentile or above for diabetes (Source: CDC Places)
        # or
        # In Xth percentile or above for asthma (Source: CDC Places)
        # or
        # In Xth percentile or above for heart disease
        # or
        # In Xth percentile or above for low life expectancy (Source: CDC Places)
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
        # poverty level. Source: Census's American Community Survey]
        health_eligibility_columns = [
            field_names.DIABETES_LOW_INCOME_FIELD,
            field_names.ASTHMA_LOW_INCOME_FIELD,
            field_names.HEART_DISEASE_LOW_INCOME_FIELD,
            field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD,
        ]
        diabetes_threshold = (
            self.df[
                field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        asthma_threshold = (
            self.df[
                field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        heart_disease_threshold = (
            self.df[
                field_names.HEART_DISEASE_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        low_life_expectancy_threshold = (
            self.df[
                field_names.LOW_LIFE_EXPECTANCY_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        self.df[field_names.DIABETES_LOW_INCOME_FIELD] = (
            diabetes_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.ASTHMA_LOW_INCOME_FIELD] = (
            asthma_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.HEART_DISEASE_LOW_INCOME_FIELD] = (
            heart_disease_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self.df[field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD] = (
            low_life_expectancy_threshold & self.df[field_names.FPL_200_SERIES]
        )
        self._increment_total_eligibility_exceeded(health_eligibility_columns)
        return self.df[health_eligibility_columns].any(axis="columns")

    def _workforce_factor(self) -> pd.Series:
        """Boolean series: workforce criteria met for states OR island areas."""
        # Where unemployment is above Xth percentile
        # or
        # Where median income as a percent of area median income is above Xth percentile
        # or
        # Where the percent of households at or below 100% of the federal poverty level
        # is above Xth percentile
        # or
        # Where linguistic isolation is above Xth percentile
        # AND
        # Where the high school degree achievement rates for adults 25 years and older
        # is less than Y%
        # (necessary to screen out university tracts)
        # Workforce criteria for states fields.
        workforce_eligibility_columns = [
            field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
            field_names.POVERTY_LOW_HS_EDUCATION_FIELD,
            field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD,
            field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
        ]
        self.df[field_names.LOW_HS_EDUCATION_FIELD] = (
            self.df[field_names.HIGH_SCHOOL_ED_FIELD]
            >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
        )
        unemployment_threshold = (
            self.df[
                field_names.UNEMPLOYMENT_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        low_median_income_threshold = (
            self.df[
                field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        linguistic_isolation_threshold = (
            self.df[
                field_names.LINGUISTIC_ISO_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        poverty_threshold = (
            self.df[
                field_names.POVERTY_LESS_THAN_100_FPL_FIELD
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        self.df[field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD] = (
            linguistic_isolation_threshold
            & self.df[field_names.LOW_HS_EDUCATION_FIELD]
        )
        self.df[field_names.POVERTY_LOW_HS_EDUCATION_FIELD] = (
            poverty_threshold & self.df[field_names.LOW_HS_EDUCATION_FIELD]
        )
        self.df[field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD] = (
            low_median_income_threshold
            & self.df[field_names.LOW_HS_EDUCATION_FIELD]
        )
        self.df[field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD] = (
            unemployment_threshold & self.df[field_names.LOW_HS_EDUCATION_FIELD]
        )
        workforce_combined_criteria_for_states = self.df[
            workforce_eligibility_columns
        ].any(axis="columns")
        self._increment_total_eligibility_exceeded(
            workforce_eligibility_columns
        )
        # Now, calculate workforce criteria for island territories.
        island_areas_workforce_eligibility_columns = [
            field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
            field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
            field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
        ]
        # First, combine unemployment.
        (
            self.df,
            island_areas_unemployment_criteria_field_name,
        ) = self._combine_island_areas_with_states_and_set_thresholds(
            df=self.df,
            column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
            column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
            combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
        )
        # Next, combine poverty.
        (
            self.df,
            island_areas_poverty_criteria_field_name,
        ) = self._combine_island_areas_with_states_and_set_thresholds(
            df=self.df,
            column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
            column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
            combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
            threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
        )
        # Also check whether low area median income is 90th percentile or higher
        # within the islands.
        island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name = (
            f"{field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009} exceeds "
            f"{field_names.PERCENTILE}th percentile"
        )
        self.df[
            island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
        ] = (
            self.df[
                field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
                + field_names.PERCENTILE_FIELD_SUFFIX
            ]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )
        self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] = (
            self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
            >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
        )
        self.df[
            field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD
        ] = (
            self.df[island_areas_unemployment_criteria_field_name]
            & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
        )
        self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = (
            self.df[island_areas_poverty_criteria_field_name]
            & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
        )
        self.df[
            field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD
        ] = (
            self.df[
                island_areas_low_median_income_as_a_percent_of_ami_criteria_field_name
            ]
            & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
        )
        workforce_combined_criteria_for_island_areas = self.df[
            island_areas_workforce_eligibility_columns
        ].any(axis="columns")
        self._increment_total_eligibility_exceeded(
            island_areas_workforce_eligibility_columns
        )
        percent_of_island_tracts_highlighted = (
            100
            * workforce_combined_criteria_for_island_areas.sum()
            # Choosing a random column from island areas to calculate the denominator.
            / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
            .notnull()
            .sum()
        )
        logger.debug(
            f"For workforce criteria in island areas, "
            f"{workforce_combined_criteria_for_island_areas.sum()} ("
            f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
            f"in the column) have a value of TRUE."
        )
        # A tract is included if it meets either the states tract criteria or the
        # island areas tract criteria.
        return (
            workforce_combined_criteria_for_states
            | workforce_combined_criteria_for_island_areas
        )

    def add_columns(self) -> pd.DataFrame:
        """Compute all eight factors and the overall Score L community flag.

        Note: the factor methods mutate ``self.df`` and increment the shared
        threshold count, so the call order below matters for the counter.
        """
        logger.debug("Adding Score L")
        # Reset the running criteria counter before the factor methods add to it.
        self.df[field_names.THRESHOLD_COUNT] = 0
        self.df[field_names.FPL_200_SERIES] = self._create_low_income_threshold(
            self.df
        )
        self.df[field_names.L_CLIMATE] = self._climate_factor()
        self.df[field_names.L_ENERGY] = self._energy_factor()
        self.df[field_names.L_TRANSPORTATION] = self._transportation_factor()
        self.df[field_names.L_HOUSING] = self._housing_factor()
        self.df[field_names.L_POLLUTION] = self._pollution_factor()
        self.df[field_names.L_WATER] = self._water_factor()
        self.df[field_names.L_HEALTH] = self._health_factor()
        self.df[field_names.L_WORKFORCE] = self._workforce_factor()
        factors = [
            field_names.L_CLIMATE,
            field_names.L_ENERGY,
            field_names.L_TRANSPORTATION,
            field_names.L_HOUSING,
            field_names.L_POLLUTION,
            field_names.L_WATER,
            field_names.L_HEALTH,
            field_names.L_WORKFORCE,
        ]
        self.df[field_names.SCORE_L_COMMUNITIES] = self.df[factors].any(axis=1)
        # Note: this is purely used for comparison tool analysis, and can be removed at a later date. - LMB.
        non_workforce_factors = [
            field_names.L_CLIMATE,
            field_names.L_ENERGY,
            field_names.L_TRANSPORTATION,
            field_names.L_HOUSING,
            field_names.L_POLLUTION,
            field_names.L_WATER,
            field_names.L_HEALTH,
        ]
        self.df[field_names.L_NON_WORKFORCE] = self.df[
            non_workforce_factors
        ].any(axis=1)
        self.df[
            field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX
        ] = self.df[field_names.SCORE_L_COMMUNITIES].astype(int)
        return self.df

View file

@ -1,888 +0,0 @@
from typing import Tuple
import data_pipeline.etl.score.constants as constants
import data_pipeline.score.field_names as field_names
import numpy as np
import pandas as pd
from data_pipeline.score.score import Score
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class ScoreM(Score):
    """Very similar to Score L, with a few minor modifications."""

    def __init__(self, df: pd.DataFrame) -> None:
        # Percentile floor (0-1) that a tract's share of population at or
        # below 200% of the federal poverty level must meet to count as
        # low income (see `add_columns`).
        self.LOW_INCOME_THRESHOLD: float = 0.65
        # Maximum share of residents enrolled in higher ed; tracts above
        # this are screened out as likely university tracts.
        self.MAX_COLLEGE_ATTENDANCE_THRESHOLD: float = 0.20
        # Percentile cutoff applied to every environmental burden
        # indicator (climate, energy, transportation, ...).
        self.ENVIRONMENTAL_BURDEN_THRESHOLD: float = 0.90
        # Percentile ceiling on median house value used by the lead paint
        # proxy in `_housing_factor`.
        self.MEDIAN_HOUSE_VALUE_THRESHOLD: float = 0.90
        # Minimum share of adults 25+ without a high school diploma used
        # by the workforce criteria.
        self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD: float = 0.10
        super().__init__(df)
def _combine_island_areas_with_states_and_set_thresholds(
    self,
    df: pd.DataFrame,
    column_from_island_areas: str,
    column_from_decennial_census: str,
    combined_column_name: str,
    threshold_cutoff_for_island_areas: float,
) -> Tuple[pd.DataFrame, str]:
    """Build an island-area percentile column and its threshold flag.

    Steps:
      1. Merge the island-area 2009 Decennial value with the stateside
         5-year ACS (ending 2010) value into ``combined_column_name``.
      2. Rank the combined column as percentiles, reported only for rows
         where the stateside value is missing (i.e. the island areas).
      3. Flag rows at/above ``threshold_cutoff_for_island_areas``.

    The stateside decennial census stopped asking economic comparisons,
    so this is as close to apples-to-apples as we get; 5-year ACS is used
    for data robustness over 1-year ACS.

    Returns the mutated dataframe and the name of the new boolean
    threshold column.
    """
    # TODO: move this combined-field percentile calculation to
    # `etl_score`, since most other percentile logic is there.
    # Only one of the two source columns should be populated for any
    # tract, but a null-skipping row mean keeps the result sensible even
    # if both were somehow present.
    df[combined_column_name] = df[
        [column_from_island_areas, column_from_decennial_census]
    ].mean(axis=1, skipna=True)

    # Adjusted percentile column for the Islands / PR visualization.
    # TODO: move this code.
    adjusted_percentile_column = (
        column_from_island_areas
        + field_names.ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD
        + field_names.PERCENTILE_FIELD_SUFFIX
    )
    # Percentiles are ranked over the combined column but reported only
    # where there is no stateside decennial value (island areas only,
    # where no other comprehensive percentile information exists).
    df[adjusted_percentile_column] = np.where(
        df[column_from_decennial_census].isna(),
        df[combined_column_name].rank(pct=True),
        np.nan,
    )

    threshold_column_name = (
        f"{column_from_island_areas} exceeds "
        f"{threshold_cutoff_for_island_areas*100:.0f}th percentile"
    )
    df[threshold_column_name] = (
        df[adjusted_percentile_column] >= threshold_cutoff_for_island_areas
    )
    return df, threshold_column_name
def _create_low_income_and_low_college_attendance_threshold(
    self, df: pd.DataFrame
) -> pd.Series:
    """Return a boolean series flagging low-income, low-college tracts.

    A tract qualifies when it meets the 200%-FPL low-income threshold
    and either has low college attendance or has no college attendance
    data at all (in which case we rely on the poverty data alone).
    """
    meets_income_threshold = df[field_names.LOW_INCOME_THRESHOLD]
    low_or_unknown_college_attendance = (
        df[field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD]
        # If college attendance data is null for this tract, just rely
        # on the poverty data.
        | df[field_names.COLLEGE_ATTENDANCE_FIELD].isna()
    )
    return meets_income_threshold & low_or_unknown_college_attendance
def _increment_total_eligibility_exceeded(
    self, columns_for_subset: list, skip_fips: tuple = ()
) -> None:
    """Add each tract's count of exceeded thresholds to the running total.

    ``skip_fips`` lists FIPS prefixes whose tracts should not be
    incremented — a way to sideline data we think is of limited
    veracity without overriding any values in the data.
    THIS IS A TEMPORARY FIX.
    """
    per_tract_counts = self.df[columns_for_subset].sum(axis=1, skipna=True)
    if skip_fips:
        # Zero out the increment for tracts in the skipped states.
        in_skipped_states = self.df[
            field_names.GEOID_TRACT_FIELD
        ].str.startswith(skip_fips)
        per_tract_counts = np.where(in_skipped_states, 0, per_tract_counts)
    self.df[field_names.THRESHOLD_COUNT] += per_tract_counts
def _climate_factor(self) -> pd.Series:
    """Return a boolean series for the climate factor.

    True where the tract is at/above the burden percentile for any of
    FEMA's expected-loss rates (population, agriculture, buildings)
    AND meets the low income (<= 200% FPL) / low higher-ed attendance
    threshold. Sources: FEMA Risk Index; Census American Community
    Survey. Also mutates ``self.df``: adds the per-indicator threshold
    columns, the combined ``CLIMATE_THRESHOLD_EXCEEDED`` column, and
    increments the per-tract threshold counter.
    """
    climate_eligibility_columns = [
        field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD,
    ]
    # Percentile-threshold columns for each expected-loss indicator.
    self.df[
        field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD
    ] = (
        self.df[
            field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[
        field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD
    ] = (
        self.df[
            field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD] = (
        self.df[
            field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    # Combined indicator: any of the three loss-rate thresholds exceeded.
    self.df[field_names.CLIMATE_THRESHOLD_EXCEEDED] = (
        self.df[
            field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD
        ]
        | self.df[
            field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD
        ]
        | self.df[
            field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD
        ]
    )
    # Per-indicator columns combined with the socioeconomic threshold.
    self.df[
        field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD
    ] = (
        self.df[
            field_names.EXPECTED_POPULATION_LOSS_EXCEEDS_PCTILE_THRESHOLD
        ]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self.df[
        field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD
    ] = (
        self.df[
            field_names.EXPECTED_AGRICULTURAL_LOSS_EXCEEDS_PCTILE_THRESHOLD
        ]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self.df[
        field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD
    ] = (
        self.df[field_names.EXPECTED_BUILDING_LOSS_EXCEEDS_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self._increment_total_eligibility_exceeded(
        climate_eligibility_columns,
        skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
    )
    return self.df[climate_eligibility_columns].any(axis="columns")
def _energy_factor(self) -> bool:
    """Return a boolean series for the energy factor.

    True where the tract is at/above the burden percentile for DOE's
    energy cost burden (LEAD Score) or for PM2.5 exposure, AND meets
    the low income / low higher-ed attendance threshold (Census ACS).
    Also writes the per-indicator threshold columns and increments the
    per-tract threshold counter.
    """
    burden_percentile = self.df[
        field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
    ]
    pm25_percentile = self.df[
        field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
    ]
    low_income_series = self.df[
        field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES
    ]

    self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD] = (
        burden_percentile >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD] = (
        pm25_percentile >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.ENERGY_THRESHOLD_EXCEEDED] = (
        self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD]
        | self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD]
    )
    self.df[field_names.PM25_EXPOSURE_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.PM25_EXCEEDS_PCTILE_THRESHOLD]
        & low_income_series
    )
    self.df[field_names.ENERGY_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.ENERGY_BURDEN_EXCEEDS_PCTILE_THRESHOLD]
        & low_income_series
    )

    energy_eligibility_columns = [
        field_names.PM25_EXPOSURE_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.ENERGY_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD,
    ]
    self._increment_total_eligibility_exceeded(
        energy_eligibility_columns,
        skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
    )
    return self.df[energy_eligibility_columns].any(axis="columns")
def _transportation_factor(self) -> bool:
    """Return a boolean series for the transportation factor.

    True where the tract is at/above the burden percentile for diesel
    particulate matter (EPA NATA) or traffic proximity and volume (2017
    DOT traffic data), AND meets the low income / low higher-ed
    attendance threshold (Census ACS). Also writes the per-indicator
    threshold columns and increments the per-tract threshold counter.
    """
    cutoff = self.ENVIRONMENTAL_BURDEN_THRESHOLD
    low_income_series = self.df[
        field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES
    ]

    self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD] = (
        self.df[
            field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= cutoff
    )
    self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD] = (
        self.df[
            field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= cutoff
    )
    self.df[field_names.TRAFFIC_THRESHOLD_EXCEEDED] = (
        self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD]
        | self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD]
    )
    self.df[
        field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_LOW_HIGHER_ED_FIELD
    ] = (
        self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD]
        & low_income_series
    )
    self.df[
        field_names.TRAFFIC_PROXIMITY_LOW_INCOME_LOW_HIGHER_ED_FIELD
    ] = (
        self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD]
        & low_income_series
    )

    transportation_eligibility_columns = [
        field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.TRAFFIC_PROXIMITY_LOW_INCOME_LOW_HIGHER_ED_FIELD,
    ]
    self._increment_total_eligibility_exceeded(
        transportation_eligibility_columns,
        skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
    )
    return self.df[transportation_eligibility_columns].any(axis="columns")
def _housing_factor(self) -> pd.Series:
    """Return a boolean series for the housing factor.

    True where the tract either (a) is at/above the burden percentile
    for lead paint (share of pre-1960 housing units, a proxy for lead
    paint exposure) while also at/below the median-house-value
    percentile ceiling, or (b) is at/above the burden percentile for
    housing cost burden (HUD CHAS) — AND meets the low income / low
    higher-ed attendance threshold (Census ACS). Also writes the
    per-indicator threshold columns and increments the per-tract
    threshold counter.
    """
    housing_eligibility_columns = [
        field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.HOUSING_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD,
    ]
    # Lead paint proxy: high pre-1960 housing share AND relatively low
    # median house value.
    self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD] = (
        self.df[
            field_names.LEAD_PAINT_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    ) & (
        self.df[
            field_names.MEDIAN_HOUSE_VALUE_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        <= self.MEDIAN_HOUSE_VALUE_THRESHOLD
    )
    self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD] = (
        self.df[
            field_names.HOUSING_BURDEN_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    # NOTE(review): the "THREHSOLD" spelling presumably comes from the
    # constant's declaration in field_names — confirm before renaming.
    self.df[field_names.HOUSING_THREHSOLD_EXCEEDED] = (
        self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD]
        | self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD]
    )
    # Series-by-series indicators combined with the socioeconomic
    # threshold.
    self.df[
        field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_LOW_HIGHER_ED_FIELD
    ] = (
        self.df[field_names.LEAD_PAINT_PROXY_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self.df[field_names.HOUSING_BURDEN_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.HOUSING_BURDEN_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self._increment_total_eligibility_exceeded(
        housing_eligibility_columns,
        skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
    )
    return self.df[housing_eligibility_columns].any(axis="columns")
def _pollution_factor(self) -> bool:
    """Return a boolean series for the pollution factor.

    True where the tract is at/above the burden percentile for proximity
    to Risk Management Plan (RMP) sites, Superfund (NPL) sites, or
    hazardous waste (TSDF) facilities, AND meets the low income / low
    higher-ed attendance threshold (Census ACS). Also writes the
    per-indicator threshold columns and increments the per-tract
    threshold counter.
    """
    cutoff = self.ENVIRONMENTAL_BURDEN_THRESHOLD
    low_income_series = self.df[
        field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES
    ]

    # Percentile-threshold columns for each proximity indicator.
    self.df[field_names.RMP_PCTILE_THRESHOLD] = (
        self.df[field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
        >= cutoff
    )
    self.df[field_names.NPL_PCTILE_THRESHOLD] = (
        self.df[field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX]
        >= cutoff
    )
    self.df[field_names.TSDF_PCTILE_THRESHOLD] = (
        self.df[
            field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= cutoff
    )
    self.df[field_names.POLLUTION_THRESHOLD_EXCEEDED] = (
        self.df[field_names.RMP_PCTILE_THRESHOLD]
        | self.df[field_names.NPL_PCTILE_THRESHOLD]
        | self.df[field_names.TSDF_PCTILE_THRESHOLD]
    )

    # Individual series-by-series indicators.
    self.df[field_names.RMP_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.RMP_PCTILE_THRESHOLD] & low_income_series
    )
    self.df[field_names.SUPERFUND_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.NPL_PCTILE_THRESHOLD] & low_income_series
    )
    self.df[field_names.HAZARDOUS_WASTE_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.TSDF_PCTILE_THRESHOLD] & low_income_series
    )

    pollution_eligibility_columns = [
        field_names.RMP_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.SUPERFUND_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.HAZARDOUS_WASTE_LOW_INCOME_LOW_HIGHER_ED_FIELD,
    ]
    self._increment_total_eligibility_exceeded(
        pollution_eligibility_columns,
        skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
    )
    return self.df[pollution_eligibility_columns].any(axis="columns")
def _water_factor(self) -> bool:
    """Return a boolean series for the water factor.

    True where the tract is at/above the burden percentile for
    wastewater discharge (EPA RSEI model) AND meets the low income /
    low higher-ed attendance threshold (Census ACS). Also writes the
    threshold columns and increments the per-tract threshold counter.
    """
    wastewater_percentile = self.df[
        field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
    ]
    self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = (
        wastewater_percentile >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    # Straight copy here in case we add additional water fields.
    self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
        field_names.WASTEWATER_PCTILE_THRESHOLD
    ].copy()

    combined_field = (
        field_names.WASTEWATER_DISCHARGE_LOW_INCOME_LOW_HIGHER_ED_FIELD
    )
    self.df[combined_field] = (
        self.df[field_names.WASTEWATER_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self._increment_total_eligibility_exceeded(
        [combined_field],
        skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
    )
    return self.df[combined_field]
def _health_factor(self) -> pd.Series:
    """Return a boolean series for the health factor.

    True where the tract is at/above the burden percentile for diabetes,
    asthma, heart disease, or low life expectancy (Source: CDC Places)
    AND meets the low income / low higher-ed attendance threshold
    (Source: Census's American Community Survey). Also writes the
    per-indicator threshold columns and increments the per-tract
    threshold counter.
    """
    health_eligibility_columns = [
        field_names.DIABETES_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.ASTHMA_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.HEART_DISEASE_LOW_INCOME_LOW_HIGHER_ED_FIELD,
        field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_LOW_HIGHER_ED_FIELD,
    ]
    # Percentile-threshold columns for each health indicator.
    self.df[field_names.DIABETES_PCTILE_THRESHOLD] = (
        self.df[
            field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.ASTHMA_PCTILE_THRESHOLD] = (
        self.df[
            field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD] = (
        self.df[
            field_names.HEART_DISEASE_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD] = (
        self.df[
            field_names.LOW_LIFE_EXPECTANCY_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    # Any single indicator exceeding its percentile trips the combined
    # health threshold.
    self.df[field_names.HEALTH_THRESHOLD_EXCEEDED] = (
        (
            self.df[field_names.DIABETES_PCTILE_THRESHOLD]
            | self.df[field_names.ASTHMA_PCTILE_THRESHOLD]
        )
        | self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD]
    ) | self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD]
    # Indicator-by-indicator columns combined with the socioeconomic
    # threshold.
    self.df[field_names.DIABETES_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.DIABETES_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self.df[field_names.ASTHMA_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.ASTHMA_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self.df[field_names.HEART_DISEASE_LOW_INCOME_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.HEART_DISEASE_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self.df[
        field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_LOW_HIGHER_ED_FIELD
    ] = (
        self.df[field_names.LOW_LIFE_EXPECTANCY_PCTILE_THRESHOLD]
        & self.df[field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES]
    )
    self._increment_total_eligibility_exceeded(
        health_eligibility_columns,
        skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
    )
    return self.df[health_eligibility_columns].any(axis="columns")
def _workforce_factor(self) -> pd.Series:
    """Return a boolean series for the workforce factor.

    For stateside tracts: unemployment, low median income (as a percent
    of area median income), poverty (<= 100% FPL), or linguistic
    isolation at/above the burden percentile, AND low high-school
    attainment with low higher-ed attendance (the latter screens out
    university tracts). Island areas are handled separately because
    their economic data comes from the 2009 Decennial Census rather
    than the ACS. A tract qualifies if it meets either the states
    criteria or the island-areas criteria. Also writes the threshold
    columns used by the front end and increments the per-tract
    threshold counter.
    """
    # Workforce criteria for states fields.
    workforce_eligibility_columns = [
        field_names.UNEMPLOYMENT_LOW_HS_LOW_HIGHER_ED_FIELD,
        field_names.POVERTY_LOW_HS_LOW_HIGHER_ED_FIELD,
        field_names.LINGUISTIC_ISOLATION_LOW_HS_LOW_HIGHER_ED_FIELD,
        field_names.LOW_MEDIAN_INCOME_LOW_HS_LOW_HIGHER_ED_FIELD,
    ]
    # Socioeconomic screen: low HS attainment AND low (or unknown)
    # higher-ed attendance.
    self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.HIGH_SCHOOL_ED_FIELD]
        >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
    ) & (
        (
            self.df[field_names.COLLEGE_ATTENDANCE_FIELD]
            <= self.MAX_COLLEGE_ATTENDANCE_THRESHOLD
        )
        | (
            # If college attendance data is null for this tract, just rely on the
            # poverty/AMI data
            self.df[field_names.COLLEGE_ATTENDANCE_FIELD].isna()
        )
    )
    # Percentile-threshold columns for each stateside indicator.
    self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD] = (
        self.df[
            field_names.UNEMPLOYMENT_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] = (
        self.df[
            field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD] = (
        self.df[
            field_names.LINGUISTIC_ISO_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.POVERTY_PCTILE_THRESHOLD] = (
        self.df[
            field_names.POVERTY_LESS_THAN_100_FPL_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    # Indicator-by-indicator columns combined with the socioeconomic
    # screen.
    self.df[field_names.LINGUISTIC_ISOLATION_LOW_HS_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD]
        & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD]
    )
    self.df[field_names.POVERTY_LOW_HS_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.POVERTY_PCTILE_THRESHOLD]
        & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD]
    )
    self.df[field_names.LOW_MEDIAN_INCOME_LOW_HS_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
        & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD]
    )
    self.df[field_names.UNEMPLOYMENT_LOW_HS_LOW_HIGHER_ED_FIELD] = (
        self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD]
        & self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD]
    )
    workforce_combined_criteria_for_states = self.df[
        workforce_eligibility_columns
    ].any(axis="columns")
    # Note: unlike the other factors, no skip_fips is passed here.
    self._increment_total_eligibility_exceeded(
        workforce_eligibility_columns
    )
    # Now, calculate workforce criteria for island territories.
    island_areas_workforce_eligibility_columns = [
        field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD,
        field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD,
        field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD,
    ]
    # First, combine unemployment.
    # This will include an adjusted percentile column for the island areas
    # to be used by the front end.
    (
        self.df,
        island_areas_unemployment_criteria_field_name,
    ) = self._combine_island_areas_with_states_and_set_thresholds(
        df=self.df,
        column_from_island_areas=field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009,
        column_from_decennial_census=field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
        combined_column_name=field_names.COMBINED_UNEMPLOYMENT_2010,
        threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
    )
    # TODO: Remove this, it's for checking only
    # NOTE(review): asserts are stripped under `python -O`; if this check
    # must hold in production, raise instead.
    assert (
        island_areas_unemployment_criteria_field_name
        == field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD
    ), "Error combining island columns"
    # Next, combine poverty.
    # This will include an adjusted percentile column for the island areas
    # to be used by the front end.
    (
        self.df,
        island_areas_poverty_criteria_field_name,
    ) = self._combine_island_areas_with_states_and_set_thresholds(
        df=self.df,
        column_from_island_areas=field_names.CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009,
        column_from_decennial_census=field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
        combined_column_name=field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
        threshold_cutoff_for_island_areas=self.ENVIRONMENTAL_BURDEN_THRESHOLD,
    )
    # TODO: Remove this, it's for checking only
    assert (
        island_areas_poverty_criteria_field_name
        == field_names.ISLAND_POVERTY_PCTILE_THRESHOLD
    ), "Error combining island columns"
    # Also check whether low area median income is 90th percentile or higher
    # within the islands.
    # Note that because the field for low median does not have to be combined,
    # unlike the other fields, we do not need to create a new percentile
    # column. This code should probably be refactored when (TODO) we do the big
    # refactor.
    self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD] = (
        self.df[
            field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
    )
    self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD] = (
        self.df[field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009]
        >= self.LACK_OF_HIGH_SCHOOL_MINIMUM_THRESHOLD
    )
    self.df[
        field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD
    ] = (
        self.df[island_areas_unemployment_criteria_field_name]
        & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
    )
    self.df[field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD] = (
        self.df[island_areas_poverty_criteria_field_name]
        & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
    )
    self.df[
        field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD
    ] = (
        self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
        & self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
    )
    workforce_combined_criteria_for_island_areas = self.df[
        island_areas_workforce_eligibility_columns
    ].any(axis="columns")
    self._increment_total_eligibility_exceeded(
        island_areas_workforce_eligibility_columns
    )
    percent_of_island_tracts_highlighted = (
        100
        * workforce_combined_criteria_for_island_areas.sum()
        # Choosing a random column from island areas to calculate the denominator.
        / self.df[field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009]
        .notnull()
        .sum()
    )
    logger.debug(
        f"For workforce criteria in island areas, "
        f"{workforce_combined_criteria_for_island_areas.sum()} ("
        f"{percent_of_island_tracts_highlighted:.2f}% of tracts that have non-null data "
        f"in the column) have a value of TRUE."
    )
    # Because these criteria are calculated differently for the islands, we also calculate the
    # thresholds to pass to the FE slightly differently
    self.df[field_names.WORKFORCE_THRESHOLD_EXCEEDED] = (
        ## First we calculate for the non-island areas
        (
            (
                self.df[field_names.POVERTY_PCTILE_THRESHOLD]
                | self.df[field_names.LINGUISTIC_ISOLATION_PCTILE_THRESHOLD]
            )
            | self.df[field_names.LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
        )
        | self.df[field_names.UNEMPLOYMENT_PCTILE_THRESHOLD]
    ) | (
        ## then we calculate just for the island areas
        (
            self.df[field_names.ISLAND_UNEMPLOYMENT_PCTILE_THRESHOLD]
            | self.df[field_names.ISLAND_POVERTY_PCTILE_THRESHOLD]
        )
        | self.df[field_names.ISLAND_LOW_MEDIAN_INCOME_PCTILE_THRESHOLD]
    )
    # Because of the island complications, we also have to separately calculate the threshold for
    # socioeconomic thresholds
    self.df[field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED] = (
        self.df[field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD]
        | self.df[field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD]
    )
    # A tract is included if it meets either the states tract criteria or the
    # island areas tract criteria.
    return (
        workforce_combined_criteria_for_states
        | workforce_combined_criteria_for_island_areas
    )
def add_columns(self) -> pd.DataFrame:
    """Compute all Score M factor columns and the final community flag.

    Each ``_*_factor`` helper both returns the factor's boolean series
    and increments the per-tract threshold counter as a side effect, so
    the call order below is significant and preserved.
    """
    logger.debug("Adding Score M")
    self.df[field_names.THRESHOLD_COUNT] = 0

    # TODO: move this inside of
    # `_create_low_income_and_low_college_attendance_threshold`
    # and change the return signature of that method.
    # Standalone boolean columns for the income and college-attendance
    # thresholds.
    self.df[field_names.LOW_INCOME_THRESHOLD] = (
        self.df[
            field_names.POVERTY_LESS_THAN_200_FPL_FIELD
            + field_names.PERCENTILE_FIELD_SUFFIX
        ]
        >= self.LOW_INCOME_THRESHOLD
    )
    # Strict `<` (not `<=`) so the column reads as "80% or more of
    # residents are not college students". Exactly two tracts sit at
    # exactly 20% college students, and neither has been a DAC under
    # any score.
    self.df[field_names.COLLEGE_ATTENDANCE_LESS_THAN_20_FIELD] = (
        self.df[field_names.COLLEGE_ATTENDANCE_FIELD]
        < self.MAX_COLLEGE_ATTENDANCE_THRESHOLD
    )
    self.df[
        field_names.FPL_200_AND_COLLEGE_ATTENDANCE_SERIES
    ] = self._create_low_income_and_low_college_attendance_threshold(
        self.df
    )

    # Map each output column to the helper that produces it. A dict keeps
    # insertion order, so columns are created in the original order.
    factor_methods = {
        field_names.M_CLIMATE: self._climate_factor,
        field_names.M_ENERGY: self._energy_factor,
        field_names.M_TRANSPORTATION: self._transportation_factor,
        field_names.M_HOUSING: self._housing_factor,
        field_names.M_POLLUTION: self._pollution_factor,
        field_names.M_WATER: self._water_factor,
        field_names.M_HEALTH: self._health_factor,
        field_names.M_WORKFORCE: self._workforce_factor,
    }
    for column_name, factor_method in factor_methods.items():
        self.df[column_name] = factor_method()

    factors = list(factor_methods)
    self.df[field_names.CATEGORY_COUNT] = self.df[factors].sum(axis=1)
    self.df[field_names.SCORE_M_COMMUNITIES] = self.df[factors].any(axis=1)

    # Note: this is purely used for comparison tool analysis, and can be
    # removed at a later date. - LMB.
    self.df[field_names.M_NON_WORKFORCE] = self.df[
        factors[:-1]  # every factor except workforce (the last entry)
    ].any(axis=1)

    # Expose the boolean community flag as a 0/1 column under the
    # percentile-suffixed name for downstream consumers.
    self.df[
        field_names.SCORE_M + field_names.PERCENTILE_FIELD_SUFFIX
    ] = self.df[field_names.SCORE_M_COMMUNITIES].astype(int)
    return self.df