Summary (reviewer preamble; everything before the first "diff --git" line
is outside the patch proper and is skipped by patch tools):

  * etl/score/constants.py: the hard-coded column-name strings in
    TILES_SCORE_COLUMNS, TILES_SCORE_FLOAT_COLUMNS,
    DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC and DOWNLOADABLE_SCORE_COLUMNS
    are replaced by constants from data_pipeline.score.field_names
    (optionally concatenated with a suffix constant).
  * etl/score/etl_score_post.py: the Excel file is now written before the
    CSV, and the GEOID tract column is wrapped in literal double quotes
    immediately before to_csv().
  * score/field_names.py: adds TOP_25_PERCENTILE_SUFFIX, GEOID_TRACT_FIELD,
    STATE_FIELD, COUNTY_FIELD and SCORE_L; removes
    MEDIAN_INCOME_PERCENT_AMI_FIELD.
  * score/score_l.py: switches from the removed constant to
    MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD.

NOTE(review): many constants used below (SCORE_D, SCORE_E, SCORE_G,
POVERTY_FIELD, MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD, ...) are not defined
by this patch; presumably they already exist in field_names.py with the
same string values as the literals they replace. Confirm before merging.
NOTE(review): the quoting step mutates downloadable_df_copy in place, so
the Excel write must stay ahead of it; string concatenation also assumes
the GEOID column holds strings. Verify against the caller.
NOTE(review): line breaks and indentation were restored from a
whitespace-collapsed copy. Blank context lines were placed so every hunk
matches the counts in its @@ header, and indentation follows conventional
4-space Python layout. Check against the target files before applying.

diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
index 11a29b11..d3338a82 100644
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -4,6 +4,8 @@ import datetime
 import pandas as pd
 from data_pipeline.config import settings
 
+from data_pipeline.score import field_names
+
 # Base Paths
 DATA_PATH = Path(settings.APP_ROOT) / "data"
 TMP_PATH = DATA_PATH / "tmp"
@@ -59,88 +61,92 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
 # Column subsets
 CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
 TILES_SCORE_COLUMNS = [
-    "GEOID10_TRACT",
-    "State Name",
-    "County Name",
-    "Total population",
-    "Score D (percentile)",
-    "Score D (top 25th percentile)",
-    "Score E (percentile)",
-    "Score E (top 25th percentile)",
-    "Score G (communities)",
-    "Score G",
-    "Definition L (communities)",
-    "Definition L (percentile)",
-    "Poverty (Less than 200% of federal poverty line) (percentile)",
-    "Percent individuals age 25 or over with less than high school degree (percentile)",
-    "Linguistic isolation (percent) (percentile)",
-    "Unemployed civilians (percent) (percentile)",
-    "Housing burden (percent) (percentile)",
-    "Diagnosed diabetes among adults aged >=18 years (percentile)",
-    "Current asthma among adults aged >=18 years (percentile)",
-    "Coronary heart disease among adults aged >=18 years (percentile)",
-    "Life expectancy (years) (percentile)",
-    "Traffic proximity and volume (percentile)",
-    "FEMA Risk Index Expected Annual Loss Score (percentile)",
-    "Energy burden (percentile)",
-    "Wastewater discharge (percentile)",
-    "Percent pre-1960s housing (lead paint indicator) (percentile)",
-    "Diesel particulate matter (percentile)",
-    "Particulate matter (PM2.5) (percentile)",
-    "Median household income (% of AMI) (percentile)",
-    "Percent of individuals < 200% Federal Poverty Line (percentile)",
+    field_names.GEOID_TRACT_FIELD,
+    field_names.STATE_FIELD,
+    field_names.COUNTY_FIELD,
+    field_names.TOTAL_POP_FIELD,
+    field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
+    field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
+    field_names.SCORE_G_COMMUNITIES,
+    field_names.SCORE_G,
+    field_names.SCORE_L_COMMUNITIES,
+    field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
 
 # columns to round floats to 2 decimals
 TILES_SCORE_FLOAT_COLUMNS = [
-    "Score D (percentile)",
-    "Score D (top 25th percentile)",
-    "Score E (percentile)",
-    "Score E (top 25th percentile)",
-    "Definition L (percentile)",
-    "Poverty (Less than 200% of federal poverty line)",
-    "Percent individuals age 25 or over with less than high school degree",
-    "Linguistic isolation (percent)",
-    "Unemployed civilians (percent)",
-    "Housing burden (percent)",
-    "Poverty (Less than 200% of federal poverty line) (percentile)",
-    "Percent individuals age 25 or over with less than high school degree (percentile)",
-    "Linguistic isolation (percent) (percentile)",
-    "Unemployed civilians (percent) (percentile)",
-    "Housing burden (percent) (percentile)",
-    "Diagnosed diabetes among adults aged >=18 years (percentile)",
-    "Current asthma among adults aged >=18 years (percentile)",
-    "Coronary heart disease among adults aged >=18 years (percentile)",
-    "Life expectancy (years) (percentile)",
-    "Traffic proximity and volume (percentile)",
-    "FEMA Risk Index Expected Annual Loss Score (percentile)",
-    "Energy burden (percentile)",
-    "Wastewater discharge (percentile)",
-    "Percent pre-1960s housing (lead paint indicator) (percentile)",
-    "Diesel particulate matter (percentile)",
-    "Particulate matter (PM2.5) (percentile)",
-    "Median household income (% of AMI) (percentile)",
-    "Percent of individuals < 200% Federal Poverty Line (percentile)",
+    field_names.SCORE_D + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.SCORE_D + field_names.TOP_25_PERCENTILE_SUFFIX,
+    field_names.SCORE_E + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.SCORE_E + field_names.TOP_25_PERCENTILE_SUFFIX,
+    field_names.SCORE_L + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_FIELD,
+    field_names.HIGH_SCHOOL_ED_FIELD,
+    field_names.LINGUISTIC_ISO_FIELD,
+    field_names.UNEMPLOYMENT_FIELD,
+    field_names.HOUSING_BURDEN_FIELD,
+    field_names.POVERTY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.FEMA_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.POVERTY_LESS_THAN_200_FPL_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
 TILES_ROUND_NUM_DECIMALS = 2
 
 DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_BASIC = [
-    "Area Median Income (State or metropolitan)",
-    "Percent of individuals < 100% Federal Poverty Line",
-    "Percent individuals age 25 or over with less than high school degree",
-    "Diagnosed diabetes among adults aged >=18 years",
-    "Current asthma among adults aged >=18 years",
-    "Coronary heart disease among adults aged >=18 years",
-    "Life expectancy (years)",
-    "Traffic proximity and volume",
-    "FEMA Risk Index Expected Annual Loss Score",
-    "Energy burden",
-    "Housing burden (percent)",
-    "Wastewater discharge",
-    "Percent pre-1960s housing (lead paint indicator)",
-    "Diesel particulate matter",
-    "Particulate matter (PM2.5)",
-    "Total population",
+    field_names.AMI_FIELD,
+    field_names.POVERTY_LESS_THAN_100_FPL_FIELD,
+    field_names.HIGH_SCHOOL_ED_FIELD,
+    field_names.DIABETES_FIELD,
+    field_names.ASTHMA_FIELD,
+    field_names.HEART_DISEASE_FIELD,
+    field_names.LIFE_EXPECTANCY_FIELD,
+    field_names.TRAFFIC_FIELD,
+    field_names.FEMA_RISK_FIELD,
+    field_names.ENERGY_BURDEN_FIELD,
+    field_names.HOUSING_BURDEN_FIELD,
+    field_names.WASTEWATER_FIELD,
+    field_names.LEAD_PAINT_FIELD,
+    field_names.DIESEL_FIELD,
+    field_names.PM25_FIELD,
+    field_names.TOTAL_POP_FIELD,
 ]
 
 # For every indicator above, we want to include percentile and min-max normalized variants also
@@ -155,11 +161,12 @@ DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL = list(
 
 # Finally we augment with the GEOID10, county, and state
 DOWNLOADABLE_SCORE_COLUMNS = [
-    "GEOID10_TRACT",
-    "County Name",
-    "State Name",
-    "Score G (communities)",
-    "Median household income (% of AMI)",
-    "Median household income (% of state median household income) (percentile)",
+    field_names.GEOID_TRACT_FIELD,
+    field_names.COUNTY_FIELD,
+    field_names.STATE_FIELD,
+    field_names.SCORE_G_COMMUNITIES,
+    field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD,
+    field_names.MEDIAN_INCOME_AS_PERCENT_OF_STATE_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX,
     *DOWNLOADABLE_SCORE_INDICATOR_COLUMNS_FULL,
 ]
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
index 585595cf..4723296c 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -276,12 +276,15 @@ class PostScoreETL(ExtractTransformLoad):
             inplace=False,
         )
 
-        logger.info("Writing downloadable csv")
-        downloadable_df_copy.to_csv(csv_path, index=False)
-
         logger.info("Writing downloadable excel")
         downloadable_df_copy.to_excel(excel_path, index=False)
 
+        logger.info("Writing downloadable csv")
+        downloadable_df_copy[self.GEOID_TRACT_FIELD_NAME] = (
+            '"' + downloadable_df_copy[self.GEOID_TRACT_FIELD_NAME] + '"'
+        )
+        downloadable_df_copy.to_csv(csv_path, index=False)
+
         logger.info("Compressing files")
         files_to_compress = [csv_path, excel_path, pdf_path]
         zip_files(zip_path, files_to_compress)
diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py
index 39b6b7f9..1bbcb37b 100644
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -1,6 +1,12 @@
 # Suffixes
 PERCENTILE_FIELD_SUFFIX = " (percentile)"
 MIN_MAX_FIELD_SUFFIX = " (min-max normalized)"
+TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)"
+
+# Geographic field names
+GEOID_TRACT_FIELD = "GEOID10_TRACT"
+STATE_FIELD = "State Name"
+COUNTY_FIELD = "County Name"
 
 # Score file field names
 SCORE_A = "Score A"
@@ -21,6 +27,7 @@ SCORE_I = "Score I"
 SCORE_I_COMMUNITIES = "Score I (communities)"
 SCORE_K = "NMTC (communities)"
 SCORE_K_COMMUNITIES = "Score K (communities)"
+SCORE_L = "Definition L"
 SCORE_L_COMMUNITIES = "Definition L (communities)"
 L_CLIMATE = "Climate Factor (Definition L)"
 L_ENERGY = "Energy Factor (Definition L)"
@@ -45,7 +52,6 @@ POVERTY_LESS_THAN_150_FPL_FIELD = (
 POVERTY_LESS_THAN_100_FPL_FIELD = (
     "Percent of individuals < 100% Federal Poverty Line"
 )
-MEDIAN_INCOME_PERCENT_AMI_FIELD = "Median household income (% of AMI)"
 STATE_MEDIAN_INCOME_FIELD = (
     "Median household income (State; 2019 inflation-adjusted dollars)"
 )
diff --git a/data/data-pipeline/data_pipeline/score/score_l.py b/data/data-pipeline/data_pipeline/score/score_l.py
index d745f5b9..35c56a65 100644
--- a/data/data-pipeline/data_pipeline/score/score_l.py
+++ b/data/data-pipeline/data_pipeline/score/score_l.py
@@ -528,7 +528,7 @@ class ScoreL(Score):
 
         median_income_threshold = (
             self.df[
-                field_names.MEDIAN_INCOME_PERCENT_AMI_FIELD
+                field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD
                 + field_names.PERCENTILE_FIELD_SUFFIX
             ]
             # Note: a high median income as a % of AMI is good, so take 1 minus the threshold to invert it.