From fb8be1a8cedddf7eb3fcb97fb2be1e450735bee3 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Fri, 17 Dec 2021 13:48:31 -0500 Subject: [PATCH 1/9] Download column order completed --- .../data_pipeline/etl/score/constants.py | 98 +++++++++---------- .../data_pipeline/etl/score/etl_score_post.py | 8 +- 2 files changed, 52 insertions(+), 54 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index e1c15c77..bc8076ab 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -194,88 +194,84 @@ DOWNLOADABLE_SCORE_COLUMNS = [ field_names.SCORE_L_COMMUNITIES, field_names.TOTAL_POP_FIELD, field_names.FPL_200_SERIES, - field_names.POVERTY_LESS_THAN_200_FPL_FIELD, - field_names.POVERTY_LESS_THAN_200_FPL_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD, + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD, field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD, - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD, + field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD, field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD, - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD, + field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD, field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD, - field_names.ENERGY_BURDEN_FIELD, - field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, field_names.ENERGY_BURDEN_LOW_INCOME_FIELD, - field_names.PM25_FIELD, - field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.ENERGY_BURDEN_FIELD, field_names.PM25_EXPOSURE_LOW_INCOME_FIELD, - field_names.DIESEL_FIELD, - field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.PM25_FIELD, field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD, - field_names.TRAFFIC_FIELD, - field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.DIESEL_FIELD, field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD, - field_names.HOUSING_BURDEN_FIELD, - field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.TRAFFIC_FIELD, field_names.HOUSING_BURDEN_LOW_INCOME_FIELD, - field_names.LEAD_PAINT_FIELD, - field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.HOUSING_BURDEN_FIELD, field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD, - field_names.MEDIAN_HOUSE_VALUE_FIELD, + field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.LEAD_PAINT_FIELD, field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TSDF_FIELD, - field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.MEDIAN_HOUSE_VALUE_FIELD, field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD, - field_names.NPL_FIELD, - field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.TSDF_FIELD, field_names.SUPERFUND_LOW_INCOME_FIELD, - field_names.RMP_FIELD, - field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.NPL_FIELD, field_names.RMP_LOW_INCOME_FIELD, - field_names.WASTEWATER_FIELD, - field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.RMP_FIELD, field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD, - field_names.ASTHMA_FIELD, - field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.WASTEWATER_FIELD, field_names.ASTHMA_LOW_INCOME_FIELD, - field_names.DIABETES_FIELD, - field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.ASTHMA_FIELD, field_names.DIABETES_LOW_INCOME_FIELD, - field_names.HEART_DISEASE_FIELD, - field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.DIABETES_FIELD, field_names.HEART_DISEASE_LOW_INCOME_FIELD, - field_names.LIFE_EXPECTANCY_FIELD, - field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.HEART_DISEASE_FIELD, field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD, - field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD, + field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.LIFE_EXPECTANCY_FIELD, + field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, - field_names.LINGUISTIC_ISO_FIELD, + field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD, field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD, - field_names.UNEMPLOYMENT_FIELD, - field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.POVERTY_LESS_THAN_100_FPL_FIELD, + field_names.LINGUISTIC_ISO_FIELD, + field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, + field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.UNEMPLOYMENT_FIELD, + field_names.POVERTY_LOW_HS_EDUCATION_FIELD, + field_names.POVERTY_LESS_THAN_200_FPL_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX, field_names.POVERTY_LESS_THAN_100_FPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.POVERTY_LOW_HS_EDUCATION_FIELD, - field_names.HIGH_SCHOOL_ED_FIELD, + field_names.POVERTY_LESS_THAN_200_FPL_FIELD, + field_names.POVERTY_LESS_THAN_100_FPL_FIELD, field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LOW_HS_EDUCATION_FIELD, - field_names.THRESHOLD_COUNT, - field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, + field_names.HIGH_SCHOOL_ED_FIELD, field_names.COMBINED_UNEMPLOYMENT_2010, - field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009, field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010, field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD, field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, - field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD, + field_names.THRESHOLD_COUNT, ] diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 8f527c7a..048b0bce 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -235,9 +235,11 @@ class PostScoreETL(ExtractTransformLoad): def _create_downloadable_data( self, score_county_state_merged_df: pd.DataFrame ) -> pd.DataFrame: - return score_county_state_merged_df[ - constants.DOWNLOADABLE_SCORE_COLUMNS - ] + df = score_county_state_merged_df[constants.DOWNLOADABLE_SCORE_COLUMNS] + + # rename fields + + return df def transform(self) -> None: logger.info("Transforming data sources for Score + County CSVs") From 943f6283f4d1cde9ed4df5a9be3092b47cf2652e Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Fri, 17 Dec 2021 15:16:43 -0500 Subject: [PATCH 2/9] Kameron changes --- .../data_pipeline/score/field_names.py | 98 +++++++++++-------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index bdca25d5..e3618916 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -45,7 +45,7 @@ MEDIAN_HOUSE_VALUE_PERCENTILE = 90 # Poverty / Income POVERTY_FIELD = "Poverty (Less than 200% of federal poverty line)" POVERTY_LESS_THAN_200_FPL_FIELD = ( - "Percent of individuals < 200% Federal Poverty Line" + "Percent of individuals below 200% Federal Poverty Line" ) POVERTY_LESS_THAN_150_FPL_FIELD = ( "Percent of individuals < 150% Federal Poverty Line" @@ -106,16 +106,20 @@ HT_INDEX_FIELD = ( ENERGY_BURDEN_FIELD = "Energy burden" # Health -DIABETES_FIELD = "Diagnosed diabetes among adults aged >=18 years" -ASTHMA_FIELD = "Current asthma among adults aged >=18 years" -HEART_DISEASE_FIELD = "Coronary heart disease among adults aged >=18 years" -CANCER_FIELD = "Cancer (excluding skin cancer) among adults aged >=18 years" +DIABETES_FIELD = ( + "Diagnosed diabetes among adults aged greater than or equal to 18 years" +) +ASTHMA_FIELD = ( + "Current asthma among adults aged greater than or equal to 18 years" +) +HEART_DISEASE_FIELD = ( + "Coronary heart disease among adults aged greater than or equal to 18 years" +) +CANCER_FIELD = "Cancer (excluding skin cancer) among adults aged greater than or equal to 18 years" HEALTH_INSURANCE_FIELD = ( "Current lack of health insurance among adults aged 18-64 years" ) -PHYS_HEALTH_NOT_GOOD_FIELD = ( - "Physical health not good for >=14 days among adults aged >=18 years" -) +PHYS_HEALTH_NOT_GOOD_FIELD = "Physical health not good for greater than or equal to 14 days among adults aged greater than or equal to 18 years" LIFE_EXPECTANCY_FIELD = "Life expectancy (years)" LOW_LIFE_EXPECTANCY_FIELD = "Low life expectancy" @@ -150,7 +154,7 @@ LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = "Low median househo # Fields from 2010 ACS (loaded for comparison with the territories) CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010" CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = ( - "Percent of individuals < 100% Federal Poverty Line in 2010" + "Percent of individuals less than 100% Federal Poverty Line in 2010" ) # Combined fields that merge island areas and states data @@ -233,66 +237,74 @@ LOW_READING_FIELD = "Low third grade reading proficiency" ##### # Names for individual factors being exceeded # Climate Change -EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for expected population loss rate and is low income" -EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for expected agriculture loss rate and is low income" -EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for expected building loss rate and is low income" +EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD = ( + f"Greater than or equal to the {PERCENTILE}th percentile" + f" for expected population loss rate and is low income" +) +EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD = ( + f"Greater than or equal to the {PERCENTILE}th percentile" + f" for expected agriculture loss rate and is low income" +) +EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD = ( + f"Greater than or equal to the {PERCENTILE}th percentile" + f" for expected building loss rate and is low income" +) # Clean energy and efficiency -PM25_EXPOSURE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for PM2.5 exposure and is low income" -ENERGY_BURDEN_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for energy burden and is low income" +PM25_EXPOSURE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for PM2.5 exposure and is low income" +ENERGY_BURDEN_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for energy burden and is low income" # Clean transportation -DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for diesel particulate matter and is low income" -TRAFFIC_PROXIMITY_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for traffic proximity and is low income" +DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diesel particulate matter and is low income" +TRAFFIC_PROXIMITY_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for traffic proximity and is low income" # Affordable and Sustainable Housing LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = ( - f"At or above the {PERCENTILE}th percentile for lead paint and" + f"Greater than or equal to the {PERCENTILE}th percentile for lead paint and" f" the median house value is less than {MEDIAN_HOUSE_VALUE_PERCENTILE}th " f"percentile and is low income" ) -HOUSING_BURDEN_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for housing burden and is low income" +HOUSING_BURDEN_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for housing burden and is low income" # Remediation and Reduction of Legacy Pollution -RMP_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for proximity to RMP sites and is low income" -SUPERFUND_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for proximity to superfund sites and is low income" -HAZARDOUS_WASTE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for proximity to hazardous waste facilities and is low income" +RMP_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for proximity to RMP sites and is low income" +SUPERFUND_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for proximity to superfund sites and is low income" +HAZARDOUS_WASTE_LOW_INCOME_FIELD = ( + f"Greater than or equal to the {PERCENTILE}th percentile" + f" for proximity to hazardous waste facilities and is low income" +) # Critical Clean Water and Waste Infrastructure -WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for wastewater discharge and is low income" +WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income" # Health Burdens -DIABETES_LOW_INCOME_FIELD = ( - f"At or above the {PERCENTILE}th percentile for diabetes and is low income" -) -ASTHMA_LOW_INCOME_FIELD = ( - f"At or above the {PERCENTILE}th percentile for asthma and is low income" -) -HEART_DISEASE_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for heart disease and is low income" +DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income" +ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income" +HEART_DISEASE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for heart disease and is low income" LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD = ( - f"At or above the {PERCENTILE}th percentile " + f"Greater than or equal to the {PERCENTILE}th percentile " f"for low life expectancy and is low income" ) # Workforce UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for unemployment" + f"Greater than or equal to the {PERCENTILE}th percentile for unemployment" " and has low HS education" ) LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for households in linguistic isolation" + f"Greater than or equal to the {PERCENTILE}th percentile for households in linguistic isolation" " and has low HS education" ) POVERTY_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for households at or below 100% federal poverty level" + f"Greater than or equal to the {PERCENTILE}th percentile for households at or below 100% federal poverty level" " and has low HS education" ) LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for low median household income as a " + f"Greater than or equal to the {PERCENTILE}th percentile for low median household income as a " f"percent of area median income and has low HS education" ) @@ -301,17 +313,17 @@ LOW_HS_EDUCATION_FIELD = "Low high school education" # Workforce for island areas ISLAND_AREAS_SUFFIX = " in 2009 (island areas)" ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for unemployment" + f"Greater than or equal to the {PERCENTILE}th percentile for unemployment" f" and has low HS education{ISLAND_AREAS_SUFFIX}" ) ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for households at or below 100% federal poverty level" + f"Greater than or equal to the {PERCENTILE}th percentile for households at or below 100% federal poverty level" f" and has low HS education{ISLAND_AREAS_SUFFIX}" ) ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for low median household income as a " + f"Greater than or equal to the {PERCENTILE}th percentile for low median household income as a " f"percent of area median income" f" and has low HS education{ISLAND_AREAS_SUFFIX}" ) @@ -322,22 +334,22 @@ ISLAND_AREAS_LOW_HS_EDUCATION_FIELD = ( # Not currently used in a factor EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = ( - f"At or above the {PERCENTILE}th percentile for summer days above 90F and " + f"Greater than or equal to the {PERCENTILE}th percentile for summer days above 90F and " f"the median house value is less than {MEDIAN_HOUSE_VALUE_PERCENTILE}th " f"percentile and is low income" ) IMPENETRABLE_SURFACES_LOW_INCOME_FIELD = ( - f"At or above the {PERCENTILE}th percentile for impenetrable surfaces and is low " + f"Greater than or equal to the {PERCENTILE}th percentile for impenetrable surfaces and is low " f"income" ) -AIR_TOXICS_CANCER_RISK_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for air toxics cancer risk and is low income" -RESPIRATORY_HAZARD_LOW_INCOME_FIELD = f"At or above the {PERCENTILE}th percentile for respiratory hazard index and is low income" +AIR_TOXICS_CANCER_RISK_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for air toxics cancer risk and is low income" +RESPIRATORY_HAZARD_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for respiratory hazard index and is low income" HEALTHY_FOOD_LOW_INCOME_FIELD = ( - f"At or above the {PERCENTILE}th percentile for low " + f"Greater than or equal to the {PERCENTILE}th percentile for low " f"access to healthy food and is low income" ) LOW_READING_LOW_HS_EDUCATION_FIELD = ( - f"At or above the {PERCENTILE}th percentile for low 3rd grade reading proficiency" + f"Greater than or equal to the {PERCENTILE}th percentile for low 3rd grade reading proficiency" " and has low HS education" ) From cec0c4210213d49aeb5ffe4bfabcb5d9048c1382 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Tue, 21 Dec 2021 14:32:24 -0500 Subject: [PATCH 3/9] Lucas and Beth column order changes --- data/data-pipeline/data_pipeline/etl/score/constants.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index bc8076ab..ae1d70a1 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -191,6 +191,7 @@ DOWNLOADABLE_SCORE_COLUMNS = [ field_names.GEOID_TRACT_FIELD, field_names.COUNTY_FIELD, field_names.STATE_FIELD, + field_names.THRESHOLD_COUNT, field_names.SCORE_L_COMMUNITIES, field_names.TOTAL_POP_FIELD, field_names.FPL_200_SERIES, @@ -201,10 +202,11 @@ DOWNLOADABLE_SCORE_COLUMNS = [ field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD, field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD, field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, + field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD, field_names.ENERGY_BURDEN_LOW_INCOME_FIELD, field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, field_names.ENERGY_BURDEN_FIELD, @@ -273,5 +275,4 @@ DOWNLOADABLE_SCORE_COLUMNS = [ field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD, field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, - field_names.THRESHOLD_COUNT, ] From c9ee6a43c1e51627ef885fcad94208e60cf1d0e0 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Mon, 3 Jan 2022 18:14:36 -0500 Subject: [PATCH 4/9] cdc_places update --- .../data_pipeline/etl/sources/cdc_places/etl.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py index 337c60ef..ad58a23d 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py @@ -2,6 +2,7 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.utils import get_module_logger, download_file_from_url +from data_pipeline.score import field_names logger = get_module_logger(__name__) @@ -49,6 +50,20 @@ class CDCPlacesETL(ExtractTransformLoad): values=self.CDC_VALUE_FIELD_NAME, ) + # rename columns to be used in score + rename_fields = { + "Current asthma among adults aged >=18 years": field_names.ASTHMA_FIELD, # 'Current asthma among adults aged greater than or equal to 18 years' + "Coronary heart disease among adults aged >=18 years": field.names.HEART_DISEASE_FIELD, # "Coronary heart disease among adults aged greater than or equal to 18 years" + "Cancer (excluding skin cancer) among adults aged >=18 years": field_names.CANCER_FIELD, # 'Cancer (excluding skin cancer) among adults aged greater than or equal to 18 years', + "Diagnosed diabetes among adults aged >=18 years": field_names.DIABETES_FIELD, # 'Diagnosed diabetes among adults aged greater than or equal to 18 years', + "Physical health not good for >=14 days among adults aged >=18 years": field_names.PHYS_HEALTH_NOT_GOOD_FIELD, # 'Physical health not good for greater than or equal to 14 days among adults aged greater than or equal to 18 years', + } + self.df.rename( + columns=rename_fields, + inplace=True, + errors="raise", + ) + # Make the index (the census tract ID) a column, not the index. self.df.reset_index(inplace=True) From a16bf7cf5849250d3ca98b55dfab21da2b4a6f37 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Tue, 4 Jan 2022 16:58:08 -0500 Subject: [PATCH 5/9] passing score --- .../etl/sources/cdc_places/etl.py | 10 +++--- .../etl/sources/census_acs/etl.py | 18 ++++++++-- .../etl/sources/census_acs_2010/etl.py | 33 ++++++++++++------- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py index ad58a23d..9527b242 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py @@ -52,11 +52,11 @@ class CDCPlacesETL(ExtractTransformLoad): # rename columns to be used in score rename_fields = { - "Current asthma among adults aged >=18 years": field_names.ASTHMA_FIELD, # 'Current asthma among adults aged greater than or equal to 18 years' - "Coronary heart disease among adults aged >=18 years": field.names.HEART_DISEASE_FIELD, # "Coronary heart disease among adults aged greater than or equal to 18 years" - "Cancer (excluding skin cancer) among adults aged >=18 years": field_names.CANCER_FIELD, # 'Cancer (excluding skin cancer) among adults aged greater than or equal to 18 years', - "Diagnosed diabetes among adults aged >=18 years": field_names.DIABETES_FIELD, # 'Diagnosed diabetes among adults aged greater than or equal to 18 years', - "Physical health not good for >=14 days among adults aged >=18 years": field_names.PHYS_HEALTH_NOT_GOOD_FIELD, # 'Physical health not good for greater than or equal to 14 days among adults aged greater than or equal to 18 years', + "Current asthma among adults aged >=18 years": field_names.ASTHMA_FIELD, + "Coronary heart disease among adults aged >=18 years": field_names.HEART_DISEASE_FIELD, + "Cancer (excluding skin cancer) among adults aged >=18 years": field_names.CANCER_FIELD, + "Diagnosed diabetes among adults aged >=18 years": field_names.DIABETES_FIELD, + "Physical health not good for >=14 days among adults aged >=18 years": field_names.PHYS_HEALTH_NOT_GOOD_FIELD, } self.df.rename( columns=rename_fields, diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index 51097cbe..af6b3c48 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import ( retrieve_census_acs_data, ) from data_pipeline.utils import get_module_logger +from data_pipeline.score import field_names logger = get_module_logger(__name__) @@ -353,18 +354,29 @@ class CensusACSETL(ExtractTransformLoad): + df[self.COLLEGE_ATTENDANCE_FEMALE_ENROLLED_PRIVATE] ) / df[self.COLLEGE_ATTENDANCE_TOTAL_POPULATION_ASKED] + # strip columns + df = df[self.COLUMNS_TO_KEEP] + # Save results to self. self.df = df + # rename columns to be used in score + rename_fields = { + "Percent of individuals < 200% Federal Poverty Line": field_names.POVERTY_LESS_THAN_200_FPL_FIELD, + } + self.df.rename( + columns=rename_fields, + inplace=True, + errors="raise", + ) + def load(self) -> None: logger.info("Saving Census ACS Data") # mkdir census self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) - self.df[self.COLUMNS_TO_KEEP].to_csv( - path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False - ) + self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False) def validate(self) -> None: logger.info("Validating Census ACS Data") diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py index 05d823a6..ebc98121 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py @@ -5,6 +5,7 @@ from data_pipeline.etl.sources.census_acs.etl_utils import ( retrieve_census_acs_data, ) from data_pipeline.utils import get_module_logger +from data_pipeline.score import field_names logger = get_module_logger(__name__) @@ -149,15 +150,6 @@ class CensusACS2010ETL(ExtractTransformLoad): + df["C17002_007E"] ) / df["C17002_001E"] - # Save results to self. - self.df = df - - def load(self) -> None: - logger.info("Saving Census ACS Data") - - # mkdir census - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) - columns_to_include = [ self.GEOID_TRACT_FIELD_NAME, self.UNEMPLOYED_FIELD_NAME, @@ -166,7 +158,7 @@ class CensusACS2010ETL(ExtractTransformLoad): self.POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME, ] - output_df = self.df[columns_to_include] + output_df = df[columns_to_include] # Add the year to the end of every column, so when it's all joined in the # score df, it's obvious which year this data is from. @@ -178,7 +170,26 @@ class CensusACS2010ETL(ExtractTransformLoad): } ) - output_df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False) + # rename columns to be used in score + rename_fields = { + "Percent of individuals < 100% Federal Poverty Line in 2010": field_names.CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010, + } + output_df.rename( + columns=rename_fields, + inplace=True, + errors="raise", + ) + + # Save results to self. + self.df = output_df + + def load(self) -> None: + logger.info("Saving Census ACS Data") + + # mkdir census + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + + self.df.to_csv(path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False) def validate(self) -> None: logger.info("Validating Census ACS Data") From 7029a0dff5107e5151229e993a5102ad8ee3d1b0 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Thu, 6 Jan 2022 12:43:25 -0500 Subject: [PATCH 6/9] pandas error --- data/data-pipeline/data_pipeline/etl/score/etl_score_post.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 89f22601..f2f61674 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -206,7 +206,9 @@ class PostScoreETL(ExtractTransformLoad): tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys()) # filter the columns on full score - score_tiles = score_county_state_merged_df[tiles_score_column_titles] + score_tiles = score_county_state_merged_df[ + tiles_score_column_titles + ].copy() score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[ constants.TILES_SCORE_FLOAT_COLUMNS From a995542e70da33b835523238154beda83eb2b5d7 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Fri, 7 Jan 2022 16:47:42 -0500 Subject: [PATCH 7/9] checkpoint --- data/data-pipeline/.vscode/launch.json | 9 +++ .../data_pipeline/etl/score/etl_score_post.py | 18 ++++- .../etl/sources/census_acs/etl.py | 2 +- .../etl/sources/census_acs_2010/etl.py | 2 +- .../data_pipeline/score/field_names.py | 67 ++++++++++--------- 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/data/data-pipeline/.vscode/launch.json b/data/data-pipeline/.vscode/launch.json index 42bb1622..6dcae4dd 100644 --- a/data/data-pipeline/.vscode/launch.json +++ b/data/data-pipeline/.vscode/launch.json @@ -13,6 +13,15 @@ "score-run" ] }, + { + "name": "Generate Score Post", + "type": "python", + "request": "launch", + "module": "data_pipeline.application", + "args": [ + "generate-score-post" + ] + }, { "name": "Data Cleanup", "type": "python", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index f2f61674..30bcf493 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -129,7 +129,7 @@ class PostScoreETL(ExtractTransformLoad): new_df = initial_states_df.rename( columns={ "fips": "State Code", - "state_name": "State Name", + "state_name": field_names.STATE_FIELD, "state_abbreviation": "State Abbreviation", } ) @@ -242,7 +242,19 @@ class PostScoreETL(ExtractTransformLoad): ) -> pd.DataFrame: df = score_county_state_merged_df[constants.DOWNLOADABLE_SCORE_COLUMNS] - # rename fields + float_columns = df.select_dtypes(include=["float64"]) + + # score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[ + # constants.TILES_SCORE_FLOAT_COLUMNS + # ].apply( + # func=lambda series: floor_series( + # series=series, + # number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS, + # ), + # axis=0, + # ) + + # [x for x in df.columns if field_names.PERCENTILE_FIELD_SUFFIX in x] return df @@ -301,7 +313,7 @@ class PostScoreETL(ExtractTransformLoad): # Rename score column downloadable_df_copy = downloadable_df.rename( columns={ - field_names.SCORE_L_COMMUNITIES: "Community of focus (v0.1)" + field_names.SCORE_L_COMMUNITIES: "Identified as disadvantaged (v0.1)" }, inplace=False, ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index af6b3c48..5ba83dc0 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -23,7 +23,7 @@ class CensusACSETL(ExtractTransformLoad): self.TOTAL_UNEMPLOYED_FIELD, self.TOTAL_IN_LABOR_FORCE, ] - self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)" + self.UNEMPLOYED_FIELD_NAME = "Unemployment (percent)" self.LINGUISTIC_ISOLATION_FIELD_NAME = "Linguistic isolation (percent)" self.LINGUISTIC_ISOLATION_TOTAL_FIELD_NAME = ( diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py index ebc98121..b113bc00 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py @@ -74,7 +74,7 @@ class CensusACS2010ETL(ExtractTransformLoad): self.EMPLOYMENT_COLLEGE_IN_LABOR_FORCE, ] - self.UNEMPLOYED_FIELD_NAME = "Unemployed civilians (percent)" + self.UNEMPLOYED_FIELD_NAME = "Unemployment (percent)" self.POVERTY_FIELDS = [ "C17002_001E", # Estimate!!Total, diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index ae779231..75090667 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -6,7 +6,7 @@ TOP_25_PERCENTILE_SUFFIX = " (top 25th percentile)" # Geographic field names GEOID_TRACT_FIELD = "GEOID10_TRACT" -STATE_FIELD = "State Name" +STATE_FIELD = "State/Territory" COUNTY_FIELD = "County Name" # Score file field names @@ -83,15 +83,15 @@ EXPECTED_POPULATION_LOSS_RATE_FIELD = ( ) # Environment -DIESEL_FIELD = "Diesel particulate matter" -PM25_FIELD = "Particulate matter (PM2.5)" +DIESEL_FIELD = "Diesel particulate matter exposure" +PM25_FIELD = "PM2.5 in the air" OZONE_FIELD = "Ozone" TRAFFIC_FIELD = "Traffic proximity and volume" LEAD_PAINT_FIELD = "Percent pre-1960s housing (lead paint indicator)" WASTEWATER_FIELD = "Wastewater discharge" AGGREGATION_POLLUTION_FIELD = "Pollution Burden" RMP_FIELD = "Proximity to Risk Management Plan (RMP) facilities" -TSDF_FIELD = "Proximity to TSDF sites" +TSDF_FIELD = "Proximity to hazardous waste sites" NPL_FIELD = "Proximity to NPL sites" AIR_TOXICS_CANCER_RISK_FIELD = "Air toxics cancer risk" RESPIRATORY_HAZARD_FIELD = "Respiratory hazard index" @@ -125,7 +125,7 @@ LOW_LIFE_EXPECTANCY_FIELD = "Low life expectancy" # Other Demographics TOTAL_POP_FIELD = "Total population" -UNEMPLOYMENT_FIELD = "Unemployed civilians (percent)" +UNEMPLOYMENT_FIELD = "Unemployment (percent)" LINGUISTIC_ISO_FIELD = "Linguistic isolation (percent)" HOUSEHOLDS_LINGUISTIC_ISO_FIELD = ( "Percent of households in linguistic isolation" @@ -143,16 +143,14 @@ CENSUS_DECENNIAL_POVERTY_LESS_THAN_100_FPL_FIELD_2009 = ( "Percentage households below 100% of federal poverty line in 2009" ) CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009 = "Percent individuals age 25 or over with less than high school degree in 2009" -CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = ( - "Unemployed civilians (percent) in 2009" -) +CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 = "Unemployment (percent) in 2009" CENSUS_DECENNIAL_TOTAL_POPULATION_FIELD_2009 = "Total population in 2009" CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = ( "Median household income as a percent of territory median income in 2009" ) LOW_CENSUS_DECENNIAL_AREA_MEDIAN_INCOME_PERCENT_FIELD_2009 = "Low median household income as a percent of territory median income in 2009" # Fields from 2010 ACS (loaded for comparison with the territories) -CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployed civilians (percent) in 2010" +CENSUS_UNEMPLOYMENT_FIELD_2010 = "Unemployment (percent) in 2010" CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = ( "Percent of individuals less than 100% Federal Poverty Line in 2010" ) @@ -161,7 +159,9 @@ CENSUS_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = ( COMBINED_CENSUS_TOTAL_POPULATION_2010 = ( "Total population in 2009 (island areas) and 2019 (states and PR)" ) -COMBINED_UNEMPLOYMENT_2010 = "Unemployed civilians (percent) in 2009 (island areas) and 2010 (states and PR)" +COMBINED_UNEMPLOYMENT_2010 = ( + "Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)" +) COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010 = ( "Percentage households below 100% of federal poverty line in 2009 (island areas) " "and 2010 (states and PR)" @@ -265,52 +265,55 @@ TRANSPORTATION_COSTS = "Transportation Costs" # Climate Change EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile" - f" for expected population loss rate and is low income" + f" for expected population loss rate and is low income?" ) EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile" - f" for expected agriculture loss rate and is low income" + f" for expected agriculture loss rate and is low income?" ) EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile" - f" for expected building loss rate and is low income" + f" for expected building loss rate and is low income?" ) # Clean energy and efficiency -PM25_EXPOSURE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for PM2.5 exposure and is low income" -ENERGY_BURDEN_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for energy burden and is low income" +PM25_EXPOSURE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for PM2.5 exposure and is low income?" +ENERGY_BURDEN_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for energy burden and is low income?" # Clean transportation -DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diesel particulate matter and is low income" -TRAFFIC_PROXIMITY_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for traffic proximity and is low income" +DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD = ( + f"Greater than or equal to the {PERCENTILE}th percentile for " + "diesel particulate matter and is low income?" +) +TRAFFIC_PROXIMITY_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for traffic proximity and is low income?" # Affordable and Sustainable Housing LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile for lead paint and" f" the median house value is less than {MEDIAN_HOUSE_VALUE_PERCENTILE}th " - f"percentile and is low income" + f"percentile and is low income?" ) -HOUSING_BURDEN_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for housing burden and is low income" +HOUSING_BURDEN_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for housing burden and is low income?" # Remediation and Reduction of Legacy Pollution -RMP_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for proximity to RMP sites and is low income" -SUPERFUND_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for proximity to superfund sites and is low income" +RMP_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for proximity to RMP sites and is low income?" +SUPERFUND_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for proximity to superfund sites and is low income?" HAZARDOUS_WASTE_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile" - f" for proximity to hazardous waste facilities and is low income" + f" for proximity to hazardous waste facilities and is low income?" ) # Critical Clean Water and Waste Infrastructure -WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income" +WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income?" # Health Burdens -DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income" -ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income" -HEART_DISEASE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for heart disease and is low income" +DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income?" +ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income?" +HEART_DISEASE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for heart disease and is low income?" LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile " - f"for low life expectancy and is low income" + f"for low life expectancy and is low income?" ) # Workforce @@ -362,17 +365,17 @@ ISLAND_AREAS_LOW_HS_EDUCATION_FIELD = ( EXTREME_HEAT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile for summer days above 90F and " f"the median house value is less than {MEDIAN_HOUSE_VALUE_PERCENTILE}th " - f"percentile and is low income" + f"percentile and is low income?" ) IMPENETRABLE_SURFACES_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile for impenetrable surfaces and is low " f"income" ) -AIR_TOXICS_CANCER_RISK_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for air toxics cancer risk and is low income" -RESPIRATORY_HAZARD_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for respiratory hazard index and is low income" +AIR_TOXICS_CANCER_RISK_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for air toxics cancer risk and is low income?" +RESPIRATORY_HAZARD_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for respiratory hazard index and is low income?" HEALTHY_FOOD_LOW_INCOME_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile for low " - f"access to healthy food and is low income" + f"access to healthy food and is low income?" ) LOW_READING_LOW_HS_EDUCATION_FIELD = ( f"Greater than or equal to the {PERCENTILE}th percentile for low 3rd grade reading proficiency" @@ -381,6 +384,6 @@ LOW_READING_LOW_HS_EDUCATION_FIELD = ( THRESHOLD_COUNT = "Total threshold criteria exceeded" -FPL_200_SERIES = "Is low income" +FPL_200_SERIES = "Is low income?" # End of names for individual factors being exceeded #### From 9ab75a156ff83ac9f08da3593b9f1bab0527ab08 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Tue, 11 Jan 2022 14:07:04 -0500 Subject: [PATCH 8/9] score passing --- .../data_pipeline/etl/sources/census_decennial/etl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index cffb28d2..0f255e89 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -6,6 +6,7 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.utils import get_module_logger +from data_pipeline.score import field_names pd.options.mode.chained_assignment = "raise" @@ -141,7 +142,9 @@ class CensusDecennialETL(ExtractTransformLoad): "PBG036014" # Total!!Female!!In labor force!!Civilian!!Unemployed ) - self.UNEMPLOYMENT_FIELD_NAME = "Unemployed civilians (percent) in 2009" + self.UNEMPLOYMENT_FIELD_NAME = ( + field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009 + ) var_list = [ self.MEDIAN_INCOME_FIELD, From c6ed82fd2edc2169457ac25974f052d4c1bf4611 Mon Sep 17 00:00:00 2001 From: Jorge Escobar Date: Tue, 11 Jan 2022 15:03:18 -0500 Subject: [PATCH 9/9] checkpoint --- .../data_pipeline/etl/score/constants.py | 7 ++++ .../data_pipeline/etl/score/etl_score_post.py | 33 +++++++++++++++---- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index ae1d70a1..29dac8f2 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -63,6 +63,13 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = ( # Column subsets CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"] +# Percent prefixes for rounding +PERCENT_PREFIXES_SUFFIXES = [ + "Percent", + "Percentage", + field_names.PERCENTILE_FIELD_SUFFIX, +] + TILES_ROUND_NUM_DECIMALS = 2 # Tiles data: full field name, tile index name TILES_SCORE_COLUMNS = { diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 30bcf493..b7168e67 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -240,12 +240,35 @@ class PostScoreETL(ExtractTransformLoad): def _create_downloadable_data( self, score_county_state_merged_df: pd.DataFrame ) -> pd.DataFrame: - df = score_county_state_merged_df[constants.DOWNLOADABLE_SCORE_COLUMNS] + df = score_county_state_merged_df[ + constants.DOWNLOADABLE_SCORE_COLUMNS + ].copy() - float_columns = df.select_dtypes(include=["float64"]) + float_columns = df.select_dtypes(include=["float64"]).columns - # score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[ - # constants.TILES_SCORE_FLOAT_COLUMNS + # convert percentile_columns + percent_target_columns = [] + for x in float_columns: + for col in constants.PERCENT_PREFIXES_SUFFIXES: + if col in x: + percent_target_columns.append(x) + + df[percent_target_columns] = df[percent_target_columns].apply( + func=lambda series: floor_series( + series=series * 100, + number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS, + ) + ) + + # # convert percentile_columns + # non_percentile_float_columns = [ + # x + # for x in float_columns + # if x not in constants.PERCENT_PREFIXES_SUFFIXES + # ] + + # df[non_percentile_float_columns] = df[ + # non_percentile_float_columns # ].apply( # func=lambda series: floor_series( # series=series, @@ -254,8 +277,6 @@ class PostScoreETL(ExtractTransformLoad): # axis=0, # ) - # [x for x in df.columns if field_names.PERCENTILE_FIELD_SUFFIX in x] - return df def transform(self) -> None: