mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 14:21:16 -07:00
Update etl constants to use score field_names and put strings around tract IDs in downloadable CSV (#985)
* Update etl constants to use score field_names. Put strings around tract IDs in downloadable CSV. No need to modify the xls file creation because the string type is preserved and interpreted correctly in Excel already. One note is that this does cause the ID in the CSV to have quotes around it, which might be annoying. Maybe we don't want this behavior? * Update based on PR feedback and lint needs * Change field we're using in downloadable. This reverts the downloadable csv field list to use MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD instead of MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD in order to get the test to pass. The point of this PR is a refactor (and a small change to the CSV quotations), not to change the output. That will be a different PR later. Co-authored-by: Shelby Switzer <shelby.switzer@cms.hhs.gov>
This commit is contained in:
parent
bbc4a4dec0
commit
819f3ff478
4 changed files with 101 additions and 85 deletions
|
@ -4,6 +4,8 @@ import datetime
|
|||
import pandas as pd
|
||||
from data_pipeline.config import settings
|
||||
|
||||
from data_pipeline.score import field_names
|
||||
|
||||
# Base Paths
|
||||
DATA_PATH = Path(settings.APP_ROOT) / "data"
|
||||
TMP_PATH = DATA_PATH / "tmp"
|
||||
|
@ -59,88 +61,92 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
|
|||
# Column subsets
|
||||
CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
|
||||
TILES_SCORE_COLUMNS = [
|
||||
"GEOID10_TRACT",
|
||||
"State Name",
|
||||
"County Name",
|
||||
"Total population",
|
||||
"Score D (percentile)",
|
||||
"Score D (top 25th percentile)",
|
||||
"Score E (percentile)",
|
||||
"Score E (top 25th percentile)",
|
||||
"Score G (communities)",
|
||||
"Score G",
|
||||
"Definition L (communities)",
|
||||
"Definition L (percentile)",
|
||||
"Poverty (Less than 200% of federal poverty line) (percentile)",
|
||||
"Percent individuals age 25 or over with less than high school degree (percentile)",
|
||||
"Linguistic isolation (percent) (percentile)",
|
||||
"Unemployed civilians (percent) (percentile)",
|
||||
"Housing burden (percent) (percentile)",
|
||||
"Diagnosed diabetes among adults aged >=18 years (percentile)",
|
||||
"Current asthma among adults aged >=18 years (percentile)",
|
||||
"Coronary heart disease among adults aged >=18 years (percentile)",
|
||||
"Life expectancy (years) (percentile)",
|
||||
"Traffic proximity and volume (percentile)",
|
||||
"FEMA Risk Index Expected Annual Loss Score (percentile)",
|
||||
"Energy burden (percentile)",
|
||||
"Wastewater discharge (percentile)",
|
||||
"Percent pre-1960s housing (lead paint indicator) (percentile)",
|
||||
"Diesel particulate matter (percentile)",
|
||||
"Particulate matter (PM2.5) (percentile)",
|
||||
"Median household income (% of AMI) (percentile)",
|
||||
"Percent of individuals < 200% Federal Poverty Line (percentile)",
|
||||
field_names.GEOID_TRACT_FIELD,
|
||||
field_names.STATE_FIELD,
|
||||
field_names.COUNTY_FIELD,
|
||||
field_names.TOTAL_POP_FIELD,
|
||||
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
|
||||
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
|
||||
field_names.SCORE_G_COMMUNITIES,
|
||||
field_names.SCORE_G,
|
||||
field_names.SCORE_L_COMMUNITIES,
|
||||
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
]
|
||||
|
||||
# columns to round floats to 2 decimals
|
||||
TILES_SCORE_FLOAT_COLUMNS = [
|
||||
"Score D (percentile)",
|
||||
"Score D (top 25th percentile)",
|
||||
"Score E (percentile)",
|
||||
"Score E (top 25th percentile)",
|
||||
"Definition L (percentile)",
|
||||
"Poverty (Less than 200% of federal poverty line)",
|
||||
"Percent individuals age 25 or over with less than high school degree",
|
||||
"Linguistic isolation (percent)",
|
||||
"Unemployed civilians (percent)",
|
||||
"Housing burden (percent)",
|
||||
"Poverty (Less than 200% of federal poverty line) (percentile)",
|
||||
"Percent individuals age 25 or over with less than high school degree (percentile)",
|
||||
"Linguistic isolation (percent) (percentile)",
|
||||
"Unemployed civilians (percent) (percentile)",
|
||||
"Housing burden (percent) (percentile)",
|
||||
"Diagnosed diabetes among adults aged >=18 years (percentile)",
|
||||
"Current asthma among adults aged >=18 years (percentile)",
|
||||
"Coronary heart disease among adults aged >=18 years (percentile)",
|
||||
"Life expectancy (years) (percentile)",
|
||||
"Traffic proximity and volume (percentile)",
|
||||
"FEMA Risk Index Expected Annual Loss Score (percentile)",
|
||||
"Energy burden (percentile)",
|
||||
"Wastewater discharge (percentile)",
|
||||
"Percent pre-1960s housing (lead paint indicator) (percentile)",
|
||||
"Diesel particulate matter (percentile)",
|
||||
"Particulate matter (PM2.5) (percentile)",
|
||||
"Median household income (% of AMI) (percentile)",
|
||||
"Percent of individuals < 200% Federal Poverty Line (percentile)",
|
||||
field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
|
||||
field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
|
||||
field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.POVERTY_FIELD,
|
||||
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||
field_names.LINGUISTIC_ISO_FIELD,
|
||||
field_names.UNEMPLOYMENT_FIELD,
|
||||
field_names.HOUSING_BURDEN_FIELD,
|
||||
field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
field_names.POVERTY_LESS_THAN_200_FPL_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
]
|
||||
TILES_ROUND_NUM_DECIMALS = 2
|
||||
|
||||
DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
|
||||
"Area Median Income (State or metropolitan)",
|
||||
"Percent of individuals < 100% Federal Poverty Line",
|
||||
"Percent individuals age 25 or over with less than high school degree",
|
||||
"Diagnosed diabetes among adults aged >=18 years",
|
||||
"Current asthma among adults aged >=18 years",
|
||||
"Coronary heart disease among adults aged >=18 years",
|
||||
"Life expectancy (years)",
|
||||
"Traffic proximity and volume",
|
||||
"FEMA Risk Index Expected Annual Loss Score",
|
||||
"Energy burden",
|
||||
"Housing burden (percent)",
|
||||
"Wastewater discharge",
|
||||
"Percent pre-1960s housing (lead paint indicator)",
|
||||
"Diesel particulate matter",
|
||||
"Particulate matter (PM2.5)",
|
||||
"Total population",
|
||||
field_names.AMI_FIELD,
|
||||
field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
|
||||
field_names.HIGH_SCHOOL_ED_FIELD,
|
||||
field_names.DIABETES_FIELD,
|
||||
field_names.ASTHMA_FIELD,
|
||||
field_names.HEART_DISEASE_FIELD,
|
||||
field_names.LIFE_EXPECTANCY_FIELD,
|
||||
field_names.TRAFFIC_FIELD,
|
||||
field_names.FEMA_RISK_FIELD,
|
||||
field_names.ENERGY_BURDEN_FIELD,
|
||||
field_names.HOUSING_BURDEN_FIELD,
|
||||
field_names.WASTEWATER_FIELD,
|
||||
field_names.LEAD_PAINT_FIELD,
|
||||
field_names.DIESEL_FIELD,
|
||||
field_names.PM25_FIELD,
|
||||
field_names.TOTAL_POP_FIELD,
|
||||
]
|
||||
|
||||
# For every indicator above, we want to include percentile and min-max normalized variants also
|
||||
|
@ -155,11 +161,12 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
|
|||
|
||||
# Finally we augment with the GEOID10, county, and state
|
||||
DOWNLOADABLE_SCORE_COLUMNS = [
|
||||
"GEOID10_TRACT",
|
||||
"County Name",
|
||||
"State Name",
|
||||
"Score G (communities)",
|
||||
"Median household income (% of AMI)",
|
||||
"Median household income (% of state median household income) (percentile)",
|
||||
field_names.GEOID_TRACT_FIELD,
|
||||
field_names.COUNTY_FIELD,
|
||||
field_names.STATE_FIELD,
|
||||
field_names.SCORE_G_COMMUNITIES,
|
||||
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
|
||||
field_names.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX,
|
||||
*DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL,
|
||||
]
|
||||
|
|
|
@ -276,12 +276,15 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
inplace=False,
|
||||
)
|
||||
|
||||
logger.info("Writing downloadable csv")
|
||||
downloadable_df_copy.to_csv(csv_path, index=False)
|
||||
|
||||
logger.info("Writing downloadable excel")
|
||||
downloadable_df_copy.to_excel(excel_path, index=False)
|
||||
|
||||
logger.info("Writing downloadable csv")
|
||||
downloadable_df_copy[self.GEOID_TRACT_FIELD_NAME] = (
|
||||
'"' + downloadable_df_copy[self.GEOID_TRACT_FIELD_NAME] + '"'
|
||||
)
|
||||
downloadable_df_copy.to_csv(csv_path, index=False)
|
||||
|
||||
logger.info("Compressing files")
|
||||
files_to_compress = [csv_path, excel_path, pdf_path]
|
||||
zip_files(zip_path, files_to_compress)
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
# Suffixes
|
||||
PERCENTILE_FIELD_SUFFIX = " (percentile)"
|
||||
MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
|
||||
TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)"
|
||||
|
||||
# Geographic field names
|
||||
GEOID_TRACT_FIELD = "GEOID10_TRACT"
|
||||
STATE_FIELD = "State Name"
|
||||
COUNTY_FIELD = "County Name"
|
||||
|
||||
# Score file field names
|
||||
SCORE_A = "Score A"
|
||||
|
@ -21,6 +27,7 @@ SCORE_I = "Score I"
|
|||
SCORE_I_COMMUNITIES = "Score I (communities)"
|
||||
SCORE_K = "NMTC (communities)"
|
||||
SCORE_K_COMMUNITIES = "Score K (communities)"
|
||||
SCORE_L = "Definition L"
|
||||
SCORE_L_COMMUNITIES = "Definition L (communities)"
|
||||
L_CLIMATE = "Climate Factor (Definition L)"
|
||||
L_ENERGY = "Energy Factor (Definition L)"
|
||||
|
@ -45,7 +52,6 @@ POVERTY_LESS_THAN_150_FPL_FIELD = (
|
|||
POVERTY_LESS_THAN_100_FPL_FIELD = (
|
||||
"Percent of individuals < 100% Federal Poverty Line"
|
||||
)
|
||||
MEDIAN_INCOME_PERCENT_AMI_FIELD = "Median household income (% of AMI)"
|
||||
STATE_MEDIAN_INCOME_FIELD = (
|
||||
"Median household income (State; 2019 inflation-adjusted dollars)"
|
||||
)
|
||||
|
|
|
@ -528,7 +528,7 @@ class ScoreL(Score):
|
|||
|
||||
median_income_threshold = (
|
||||
self.df[
|
||||
field_names.MEDIAN_INCOME_PERCENT_AMI_FIELD
|
||||
field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
|
||||
+ field_names.PERCENTILE_FIELD_SUFFIX
|
||||
]
|
||||
# Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue