Changing LHE in tiles to a boolean (#1767)

Also includes merging and cleanup of the release.
This commit is contained in:
Emma Nechamkin 2022-08-03 13:55:58 -04:00 committed by Emma Nechamkin
parent b0a728437c
commit 0d90ae563a
5 changed files with 18 additions and 89 deletions

View file

@ -342,8 +342,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
+ field_names.PERCENTILE_FIELD_SUFFIX, + field_names.PERCENTILE_FIELD_SUFFIX,
# Island areas HS degree attainment rate # Island areas HS degree attainment rate
field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009, field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD,
field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.COLLEGE_NON_ATTENDANCE_FIELD, field_names.COLLEGE_NON_ATTENDANCE_FIELD,
field_names.COLLEGE_ATTENDANCE_FIELD, field_names.COLLEGE_ATTENDANCE_FIELD,

View file

@ -274,26 +274,6 @@ class ScoreETL(ExtractTransformLoad):
E.g., "PM2.5 exposure (percentile)". E.g., "PM2.5 exposure (percentile)".
This will be for the entire country. This will be for the entire country.
For an "apples-to-apples" comparison, urban tracts are compared to other urban tracts,
and rural tracts are compared to other rural tracts.
This percentile will be created and returned as
f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}".
E.g., "PM2.5 exposure (percentile urban/rural)".
This field exists for every tract, but for urban tracts this value will be the
percentile compared to other urban tracts, and for rural tracts this value
will be the percentile compared to other rural tracts.
Specific methodology:
1. Decide a methodology for confirming whether a tract counts as urban or
rural. Currently in the codebase, we use Geocorr to identify the % rural of
a tract, and mark the tract as rural if the percentage is >50% and urban
otherwise. This may or may not be the right methodology.
2. Once tracts are marked as urban or rural, create one percentile rank
that only ranks urban tracts, and one percentile rank that only ranks rural
tracts.
3. Combine into a single field.
`output_column_name_root` is different from `input_column_name` to enable the `output_column_name_root` is different from `input_column_name` to enable the
reverse percentile use case. In that use case, `input_column_name` may be reverse percentile use case. In that use case, `input_column_name` may be
something like "3rd grade reading proficiency" and `output_column_name_root` something like "3rd grade reading proficiency" and `output_column_name_root`
@ -413,7 +393,6 @@ class ScoreETL(ExtractTransformLoad):
field_names.NPL_FIELD, field_names.NPL_FIELD,
field_names.WASTEWATER_FIELD, field_names.WASTEWATER_FIELD,
field_names.LEAD_PAINT_FIELD, field_names.LEAD_PAINT_FIELD,
field_names.UST_FIELD,
field_names.UNDER_5_FIELD, field_names.UNDER_5_FIELD,
field_names.OVER_64_FIELD, field_names.OVER_64_FIELD,
field_names.LINGUISTIC_ISO_FIELD, field_names.LINGUISTIC_ISO_FIELD,
@ -436,6 +415,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.EXTREME_HEAT_FIELD, field_names.EXTREME_HEAT_FIELD,
field_names.HEALTHY_FOOD_FIELD, field_names.HEALTHY_FOOD_FIELD,
field_names.IMPENETRABLE_SURFACES_FIELD, field_names.IMPENETRABLE_SURFACES_FIELD,
field_names.UST_FIELD,
# We have to pass this boolean here in order to include it in ag value loss percentiles. # We have to pass this boolean here in order to include it in ag value loss percentiles.
field_names.AGRICULTURAL_VALUE_BOOL_FIELD, field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD, field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
@ -489,7 +469,13 @@ class ScoreETL(ExtractTransformLoad):
df_copy = df[columns_to_keep].copy() df_copy = df[columns_to_keep].copy()
df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric) assert len(numeric_columns) == len(
set(numeric_columns)
), "You have a double-entered column in the numeric columns list"
df_copy[numeric_columns] = df_copy[numeric_columns].apply(
pd.to_numeric
)
# Convert all columns to numeric and do math # Convert all columns to numeric and do math
# Note that we have a few special conditions here, that we handle explicitly. # Note that we have a few special conditions here, that we handle explicitly.
@ -535,24 +521,6 @@ class ScoreETL(ExtractTransformLoad):
drop_tracts=drop_tracts, drop_tracts=drop_tracts,
) )
# Min-max normalization:
# (
# Observed value
# - minimum of all values
# )
# divided by
# (
# Maximum of all values
# - minimum of all values
# )
min_value = df_copy[numeric_column].min(skipna=True)
max_value = df_copy[numeric_column].max(skipna=True)
df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
df_copy[numeric_column] - min_value
) / (max_value - min_value)
# Create reversed percentiles for these fields # Create reversed percentiles for these fields
for reverse_percentile in reverse_percentiles: for reverse_percentile in reverse_percentiles:
# Calculate reverse percentiles # Calculate reverse percentiles

View file

@ -432,7 +432,6 @@ HAZARDOUS_WASTE_LOW_INCOME_FIELD = (
WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income?" WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income?"
UST_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for leaky underground storage tanks and is low income?" UST_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for leaky underground storage tanks and is low income?"
# Health Burdens # Health Burdens
DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income?" DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income?"
ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income?" ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income?"

View file

@ -435,6 +435,11 @@ class ScoreNarwhal(Score):
# poverty level and has a low percent of higher ed students # poverty level and has a low percent of higher ed students
# Source: Census's American Community Survey # Source: Census's American Community Survey
eligibility_columns = [
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
field_names.UST_LOW_INCOME_FIELD,
]
self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = ( self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = (
self.df[ self.df[
field_names.WASTEWATER_FIELD field_names.WASTEWATER_FIELD
@ -457,28 +462,17 @@ class ScoreNarwhal(Score):
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
) )
self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
[
field_names.WASTEWATER_PCTILE_THRESHOLD,
field_names.UST_PCTILE_THRESHOLD,
]
].max(axis=1)
self._increment_total_eligibility_exceeded( self._increment_total_eligibility_exceeded(
[ eligibility_columns,
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
field_names.UST_LOW_INCOME_FIELD,
],
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
) )
return self.df[ self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
[ eligibility_columns
field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
field_names.UST_LOW_INCOME_FIELD,
]
].any(axis=1) ].any(axis=1)
return self.df[field_names.WATER_THRESHOLD_EXCEEDED]
def _health_factor(self) -> bool: def _health_factor(self) -> bool:
# In Xth percentile or above for diabetes (Source: CDC Places) # In Xth percentile or above for diabetes (Source: CDC Places)
# or # or

View file

@ -28,7 +28,6 @@ class ScoreRunner:
self.df = ScoreA(df=self.df).add_columns() self.df = ScoreA(df=self.df).add_columns()
self.df = ScoreB(df=self.df).add_columns() self.df = ScoreB(df=self.df).add_columns()
self.df = ScoreC(df=self.df).add_columns() self.df = ScoreC(df=self.df).add_columns()
self.df = ScoreD(df=self.df).add_columns()
self.df = ScoreF(df=self.df).add_columns() self.df = ScoreF(df=self.df).add_columns()
self.df = ScoreG(df=self.df).add_columns() self.df = ScoreG(df=self.df).add_columns()
self.df = ScoreH(df=self.df).add_columns() self.df = ScoreH(df=self.df).add_columns()
@ -38,33 +37,4 @@ class ScoreRunner:
self.df = ScoreM(df=self.df).add_columns() self.df = ScoreM(df=self.df).add_columns()
self.df = ScoreNarwhal(df=self.df).add_columns() self.df = ScoreNarwhal(df=self.df).add_columns()
# TODO do this with each score instead of in a bundle
# Create percentiles for these index scores
self.df = self._add_score_percentiles()
return self.df
def _add_score_percentiles(self) -> pd.DataFrame:
logger.info("Adding Score Percentiles")
for score_field in [
field_names.SCORE_A,
field_names.SCORE_B,
field_names.SCORE_C,
field_names.SCORE_D,
field_names.SCORE_E,
]:
self.df[
f"{score_field}{field_names.PERCENTILE_FIELD_SUFFIX}"
] = self.df[score_field].rank(pct=True)
for threshold in [0.25, 0.3, 0.35, 0.4]:
fraction_converted_to_percent = int(100 * threshold)
self.df[
f"{score_field} (top {fraction_converted_to_percent}th percentile)"
] = (
self.df[
f"{score_field}{field_names.PERCENTILE_FIELD_SUFFIX}"
]
>= 1 - threshold
)
return self.df return self.df