Issue 970: reverse percentiles for AMI and life expectancy (#1018)

* switching to low

* fixing score-etl-post

* updating comments

* fixing comparison

* create separate field for clarity

* comment fix

* removing healthy food

* fixing bug in score post

* running black and adding comment

* Update pickles and add helpful notes to README

Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
This commit is contained in:
Lucas Merrill Brown 2021-12-10 10:16:22 -05:00 committed by GitHub
commit 7fcecaee42
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 144 additions and 100 deletions

View file

@ -81,7 +81,7 @@ TILES_SCORE_COLUMNS = [
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
@ -89,7 +89,7 @@ TILES_SCORE_COLUMNS = [
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
@ -115,7 +115,7 @@ TILES_SCORE_FLOAT_COLUMNS = [
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
@ -123,7 +123,7 @@ TILES_SCORE_FLOAT_COLUMNS = [
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
@ -137,7 +137,6 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
field_names.DIABETES_FIELD,
field_names.ASTHMA_FIELD,
field_names.HEART_DISEASE_FIELD,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.TRAFFIC_FIELD,
field_names.FEMA_RISK_FIELD,
field_names.ENERGY_BURDEN_FIELD,
@ -149,11 +148,11 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
field_names.TOTAL_POP_FIELD,
]
# For every indicator above, we want to include percentile and min-max normalized variants also
# For every indicator above, we want to include percentile also.
DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
pd.core.common.flatten(
[
[p, f"{p} (percentile)"]
[p, f"{p}{field_names.PERCENTILE_FIELD_SUFFIX}"]
for p in DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC
]
)
@ -165,8 +164,15 @@ DOWNLOADABLE_SCORE_COLUMNS = [
field_names.COUNTY_FIELD,
field_names.STATE_FIELD,
field_names.SCORE_G_COMMUNITIES,
# Note: the reverse percentile fields are listed explicitly here because
# the download includes the raw value along with the *reversed* percentile.
# All other fields are emitted as f"{field_name}" and
# f"{field_name}{field_names.PERCENTILE_FIELD_SUFFIX}", which doesn't work for the
# reversed percentile fields: their percentile column is named after the
# corresponding "low" field rather than the raw field.
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD
field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
*DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL,
]

View file

@ -404,9 +404,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.POVERTY_LESS_THAN_150_FPL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD,
field_names.AMI_FIELD,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.MEDIAN_INCOME_FIELD,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.FEMA_RISK_FIELD,
field_names.URBAN_HEURISTIC_FIELD,
@ -439,7 +437,6 @@ class ScoreETL(ExtractTransformLoad):
field_names.CENSUS_UNEMPLOYMENT_FIELD_2010,
field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010,
field_names.CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009,
field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
field_names.EXTREME_HEAT_FIELD,
field_names.HEALTHY_FOOD_FIELD,
field_names.IMPENETRABLE_SURFACES_FIELD,
@ -468,7 +465,19 @@ class ScoreETL(ExtractTransformLoad):
ReversePercentile(
field_name=field_names.READING_FIELD,
low_field_name=field_names.LOW_READING_FIELD,
)
),
ReversePercentile(
field_name=field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
low_field_name=field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
),
ReversePercentile(
field_name=field_names.LIFE_EXPECTANCY_FIELD,
low_field_name=field_names.LOW_LIFE_EXPECTANCY_FIELD,
),
ReversePercentile(
field_name=field_names.CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
low_field_name=field_names.LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009,
),
]
columns_to_keep = (
@ -505,10 +514,6 @@ class ScoreETL(ExtractTransformLoad):
max_value = df_copy[numeric_column].max(skipna=True)
logger.info(
f"For data set {numeric_column}, the min value is {min_value} and the max value is {max_value}."
)
df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
df_copy[numeric_column] - min_value
) / (max_value - min_value)

File diff suppressed because one or more lines are too long