Update etl constants to use score field_names and put strings around tract IDs in downloadable CSV (#985)

* Update etl constants to use score field_names

Put quotes around tract IDs in downloadable CSV

No need to modify the xls file creation because the string type is
preserved and interpreted correctly in Excel already.

One note is that this does cause the ID in the CSV to have quotes
around it, which might be annoying. Maybe we don't want this behavior?

* Update based on PR feedback and lint needs

* Change field we're using in downloadable

This reverts the downloadable csv field list to use
MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD instead of
MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD in order to get the test to pass.
The point of this PR is a refactor (and a small change to the CSV
quotations), not to change the output. That will be a different PR
later.

Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
This commit is contained in:
Shelby Switzer 2021-12-06 13:17:17 -05:00 committed by GitHub
commit 819f3ff478
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 101 additions and 85 deletions

View file

@ -4,6 +4,8 @@ import datetime
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.score import field_names
# Base Paths
DATA_PATH = Path(settings.APP_ROOT) / "data"
TMP_PATH = DATA_PATH / "tmp"
@ -59,88 +61,92 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
# Column subsets
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
TILES_SCORE_COLUMNS = [
"GEOID10_TRACT",
"State Name",
"County Name",
"Total population",
"Score D (percentile)",
"Score D (top 25th percentile)",
"Score E (percentile)",
"Score E (top 25th percentile)",
"Score G (communities)",
"Score G",
"Definition L (communities)",
"Definition L (percentile)",
"Poverty (Less than 200% of federal poverty line) (percentile)",
"Percent individuals age 25 or over with less than high school degree (percentile)",
"Linguistic isolation (percent) (percentile)",
"Unemployed civilians (percent) (percentile)",
"Housing burden (percent) (percentile)",
"Diagnosed diabetes among adults aged >=18 years (percentile)",
"Current asthma among adults aged >=18 years (percentile)",
"Coronary heart disease among adults aged >=18 years (percentile)",
"Life expectancy (years) (percentile)",
"Traffic proximity and volume (percentile)",
"FEMA Risk Index Expected Annual Loss Score (percentile)",
"Energy burden (percentile)",
"Wastewater discharge (percentile)",
"Percent pre-1960s housing (lead paint indicator) (percentile)",
"Diesel particulate matter (percentile)",
"Particulate matter (PM2.5) (percentile)",
"Median household income (% of AMI) (percentile)",
"Percent of individuals < 200% Federal Poverty Line (percentile)",
field_names.GEOID_TRACT_FIELD,
field_names.STATE_FIELD,
field_names.COUNTY_FIELD,
field_names.TOTAL_POP_FIELD,
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_G_COMMUNITIES,
field_names.SCORE_G,
field_names.SCORE_L_COMMUNITIES,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
]
# columns to round floats to 2 decimals
TILES_SCORE_FLOAT_COLUMNS = [
"Score D (percentile)",
"Score D (top 25th percentile)",
"Score E (percentile)",
"Score E (top 25th percentile)",
"Definition L (percentile)",
"Poverty (Less than 200% of federal poverty line)",
"Percent individuals age 25 or over with less than high school degree",
"Linguistic isolation (percent)",
"Unemployed civilians (percent)",
"Housing burden (percent)",
"Poverty (Less than 200% of federal poverty line) (percentile)",
"Percent individuals age 25 or over with less than high school degree (percentile)",
"Linguistic isolation (percent) (percentile)",
"Unemployed civilians (percent) (percentile)",
"Housing burden (percent) (percentile)",
"Diagnosed diabetes among adults aged >=18 years (percentile)",
"Current asthma among adults aged >=18 years (percentile)",
"Coronary heart disease among adults aged >=18 years (percentile)",
"Life expectancy (years) (percentile)",
"Traffic proximity and volume (percentile)",
"FEMA Risk Index Expected Annual Loss Score (percentile)",
"Energy burden (percentile)",
"Wastewater discharge (percentile)",
"Percent pre-1960s housing (lead paint indicator) (percentile)",
"Diesel particulate matter (percentile)",
"Particulate matter (PM2.5) (percentile)",
"Median household income (% of AMI) (percentile)",
"Percent of individuals < 200% Federal Poverty Line (percentile)",
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.LINGUISTIC_ISO_FIELD,
field_names.UNEMPLOYMENT_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
]
TILES_ROUND_NUM_DECIMALS = 2
DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
"Area Median Income (State or metropolitan)",
"Percent of individuals < 100% Federal Poverty Line",
"Percent individuals age 25 or over with less than high school degree",
"Diagnosed diabetes among adults aged >=18 years",
"Current asthma among adults aged >=18 years",
"Coronary heart disease among adults aged >=18 years",
"Life expectancy (years)",
"Traffic proximity and volume",
"FEMA Risk Index Expected Annual Loss Score",
"Energy burden",
"Housing burden (percent)",
"Wastewater discharge",
"Percent pre-1960s housing (lead paint indicator)",
"Diesel particulate matter",
"Particulate matter (PM2.5)",
"Total population",
field_names.AMI_FIELD,
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
field_names.HIGH_SCHOOL_ED_FIELD,
field_names.DIABETES_FIELD,
field_names.ASTHMA_FIELD,
field_names.HEART_DISEASE_FIELD,
field_names.LIFE_EXPECTANCY_FIELD,
field_names.TRAFFIC_FIELD,
field_names.FEMA_RISK_FIELD,
field_names.ENERGY_BURDEN_FIELD,
field_names.HOUSING_BURDEN_FIELD,
field_names.WASTEWATER_FIELD,
field_names.LEAD_PAINT_FIELD,
field_names.DIESEL_FIELD,
field_names.PM25_FIELD,
field_names.TOTAL_POP_FIELD,
]
# For every indicator above, we want to include percentile and min-max normalized variants also
@ -155,11 +161,12 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
# Finally we augment with the GEOID10, county, and state
DOWNLOADABLE_SCORE_COLUMNS = [
"GEOID10_TRACT",
"County Name",
"State Name",
"Score G (communities)",
"Median household income (% of AMI)",
"Median household income (% of state median household income) (percentile)",
field_names.GEOID_TRACT_FIELD,
field_names.COUNTY_FIELD,
field_names.STATE_FIELD,
field_names.SCORE_G_COMMUNITIES,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
field_names.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX,
*DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL,
]

View file

@ -276,12 +276,15 @@ class PostScoreETL(ExtractTransformLoad):
inplace=False,
)
logger.info("Writing downloadable csv")
downloadable_df_copy.to_csv(csv_path, index=False)
logger.info("Writing downloadable excel")
downloadable_df_copy.to_excel(excel_path, index=False)
logger.info("Writing downloadable csv")
downloadable_df_copy[self.GEOID_TRACT_FIELD_NAME] = (
'"' + downloadable_df_copy[self.GEOID_TRACT_FIELD_NAME] + '"'
)
downloadable_df_copy.to_csv(csv_path, index=False)
logger.info("Compressing files")
files_to_compress = [csv_path, excel_path, pdf_path]
zip_files(zip_path, files_to_compress)

View file

@ -1,6 +1,12 @@
# Suffixes
PERCENTILE_FIELD_SUFFIX = " (percentile)"
MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)"
# Geographic field names
GEOID_TRACT_FIELD = "GEOID10_TRACT"
STATE_FIELD = "State Name"
COUNTY_FIELD = "County Name"
# Score file field names
SCORE_A = "Score A"
@ -21,6 +27,7 @@ SCORE_I = "Score I"
SCORE_I_COMMUNITIES = "Score I (communities)"
SCORE_K = "NMTC (communities)"
SCORE_K_COMMUNITIES = "Score K (communities)"
SCORE_L = "Definition L"
SCORE_L_COMMUNITIES = "Definition L (communities)"
L_CLIMATE = "Climate Factor (Definition L)"
L_ENERGY = "Energy Factor (Definition L)"
@ -45,7 +52,6 @@ POVERTY_LESS_THAN_150_FPL_FIELD = (
POVERTY_LESS_THAN_100_FPL_FIELD = (
"Percent of individuals < 100% Federal Poverty Line"
)
MEDIAN_INCOME_PERCENT_AMI_FIELD = "Median household income (% of AMI)"
STATE_MEDIAN_INCOME_FIELD = (
"Median household income (State; 2019 inflation-adjusted dollars)"
)

View file

@ -528,7 +528,7 @@ class ScoreL(Score):
median_income_threshold = (
self.df[
field_names.MEDIAN_INCOME_PERCENT_AMI_FIELD
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
# Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it.