diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 29dac8f2..e1c15c77 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -63,13 +63,6 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = ( # Column subsets CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"] -# Percent prefixes for rounding -PERCENT_PREFIXES_SUFFIXES = [ - "Percent", - "Percentage", - field_names.PERCENTILE_FIELD_SUFFIX, -] - TILES_ROUND_NUM_DECIMALS = 2 # Tiles data: full field name, tile index name TILES_SCORE_COLUMNS = { @@ -198,88 +191,91 @@ DOWNLOADABLE_SCORE_COLUMNS = [ field_names.GEOID_TRACT_FIELD, field_names.COUNTY_FIELD, field_names.STATE_FIELD, - field_names.THRESHOLD_COUNT, field_names.SCORE_L_COMMUNITIES, field_names.TOTAL_POP_FIELD, field_names.FPL_200_SERIES, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD, - field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD, - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, - field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD, - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD, - field_names.ENERGY_BURDEN_LOW_INCOME_FIELD, - field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.ENERGY_BURDEN_FIELD, - field_names.PM25_EXPOSURE_LOW_INCOME_FIELD, - field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.PM25_FIELD, - field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD, - field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.DIESEL_FIELD, - field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD, - field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TRAFFIC_FIELD, - field_names.HOUSING_BURDEN_LOW_INCOME_FIELD, - field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.HOUSING_BURDEN_FIELD, - field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD, - field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LEAD_PAINT_FIELD, - field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.MEDIAN_HOUSE_VALUE_FIELD, - field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD, - field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.TSDF_FIELD, - field_names.SUPERFUND_LOW_INCOME_FIELD, - field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.NPL_FIELD, - field_names.RMP_LOW_INCOME_FIELD, - field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.RMP_FIELD, - field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD, - field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.WASTEWATER_FIELD, - field_names.ASTHMA_LOW_INCOME_FIELD, - field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.ASTHMA_FIELD, - field_names.DIABETES_LOW_INCOME_FIELD, - field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.DIABETES_FIELD, - field_names.HEART_DISEASE_LOW_INCOME_FIELD, - field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.HEART_DISEASE_FIELD, - field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD, - field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LIFE_EXPECTANCY_FIELD, - field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, - field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD - + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD, - field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD, - field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.LINGUISTIC_ISO_FIELD, - field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, - field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.UNEMPLOYMENT_FIELD, - field_names.POVERTY_LOW_HS_EDUCATION_FIELD, + field_names.POVERTY_LESS_THAN_200_FPL_FIELD, field_names.POVERTY_LESS_THAN_200_FPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD, + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_FIELD, + field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD, + field_names.EXPECTED_BUILDING_LOSS_RATE_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_FIELD, + field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD, + field_names.EXPECTED_POPULATION_LOSS_RATE_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_FIELD, + field_names.ENERGY_BURDEN_FIELD, + field_names.ENERGY_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.ENERGY_BURDEN_LOW_INCOME_FIELD, + field_names.PM25_FIELD, + field_names.PM25_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.PM25_EXPOSURE_LOW_INCOME_FIELD, + field_names.DIESEL_FIELD, + field_names.DIESEL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD, + field_names.TRAFFIC_FIELD, + field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD, + field_names.HOUSING_BURDEN_FIELD, + field_names.HOUSING_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.HOUSING_BURDEN_LOW_INCOME_FIELD, + field_names.LEAD_PAINT_FIELD, + field_names.LEAD_PAINT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD, + field_names.MEDIAN_HOUSE_VALUE_FIELD, + field_names.MEDIAN_HOUSE_VALUE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.TSDF_FIELD, + field_names.TSDF_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.HAZARDOUS_WASTE_LOW_INCOME_FIELD, + field_names.NPL_FIELD, + field_names.NPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.SUPERFUND_LOW_INCOME_FIELD, + field_names.RMP_FIELD, + field_names.RMP_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.RMP_LOW_INCOME_FIELD, + field_names.WASTEWATER_FIELD, + field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD, + field_names.ASTHMA_FIELD, + field_names.ASTHMA_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.ASTHMA_LOW_INCOME_FIELD, + field_names.DIABETES_FIELD, + field_names.DIABETES_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.DIABETES_LOW_INCOME_FIELD, + field_names.HEART_DISEASE_FIELD, + field_names.HEART_DISEASE_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.HEART_DISEASE_LOW_INCOME_FIELD, + field_names.LIFE_EXPECTANCY_FIELD, + field_names.LOW_LIFE_EXPECTANCY_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.LOW_LIFE_EXPECTANCY_LOW_INCOME_FIELD, + field_names.MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD, + field_names.LOW_MEDIAN_INCOME_AS_PERCENT_OF_AMI_FIELD + + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, + field_names.LINGUISTIC_ISO_FIELD, + field_names.LINGUISTIC_ISOLATION_LOW_HS_EDUCATION_FIELD, + field_names.UNEMPLOYMENT_FIELD, + field_names.UNEMPLOYMENT_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.LINGUISTIC_ISO_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.POVERTY_LESS_THAN_100_FPL_FIELD, field_names.POVERTY_LESS_THAN_100_FPL_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, - field_names.POVERTY_LESS_THAN_200_FPL_FIELD, - field_names.POVERTY_LESS_THAN_100_FPL_FIELD, - field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.POVERTY_LOW_HS_EDUCATION_FIELD, field_names.HIGH_SCHOOL_ED_FIELD, + field_names.HIGH_SCHOOL_ED_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, + field_names.LOW_HS_EDUCATION_FIELD, + field_names.THRESHOLD_COUNT, + field_names.UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, field_names.COMBINED_UNEMPLOYMENT_2010, + field_names.CENSUS_DECENNIAL_UNEMPLOYMENT_FIELD_2009, field_names.COMBINED_POVERTY_LESS_THAN_100_FPL_FIELD_2010, field_names.ISLAND_AREAS_UNEMPLOYMENT_LOW_HS_EDUCATION_FIELD, field_names.ISLAND_AREAS_POVERTY_LOW_HS_EDUCATION_FIELD, field_names.ISLAND_AREAS_LOW_MEDIAN_INCOME_LOW_HS_EDUCATION_FIELD, + field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD, ] diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index b7168e67..244cccbe 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -129,7 +129,7 @@ class PostScoreETL(ExtractTransformLoad): new_df = initial_states_df.rename( columns={ "fips": "State Code", - "state_name": field_names.STATE_FIELD, + "state_name": "State Name", "state_abbreviation": "State Abbreviation", } ) @@ -206,9 +206,7 @@ class PostScoreETL(ExtractTransformLoad): tiles_score_column_titles = list(constants.TILES_SCORE_COLUMNS.keys()) # filter the columns on full score - score_tiles = score_county_state_merged_df[ - tiles_score_column_titles - ].copy() + score_tiles = score_county_state_merged_df[tiles_score_column_titles] score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[ constants.TILES_SCORE_FLOAT_COLUMNS @@ -240,44 +238,9 @@ class PostScoreETL(ExtractTransformLoad): def _create_downloadable_data( self, score_county_state_merged_df: pd.DataFrame ) -> pd.DataFrame: - df = score_county_state_merged_df[ + return score_county_state_merged_df[ constants.DOWNLOADABLE_SCORE_COLUMNS - ].copy() - - float_columns = df.select_dtypes(include=["float64"]).columns - - # convert percentile_columns - percent_target_columns = [] - for x in float_columns: - for col in constants.PERCENT_PREFIXES_SUFFIXES: - if col in x: - percent_target_columns.append(x) - - df[percent_target_columns] = df[percent_target_columns].apply( - func=lambda series: floor_series( - series=series * 100, - number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS, - ) - ) - - # # convert percentile_columns - # non_percentile_float_columns = [ - # x - # for x in float_columns - # if x not in constants.PERCENT_PREFIXES_SUFFIXES - # ] - - # df[non_percentile_float_columns] = df[ - # non_percentile_float_columns - # ].apply( - # func=lambda series: floor_series( - # series=series, - # number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS, - # ), - # axis=0, - # ) - - return df + ] def transform(self) -> None: logger.info("Transforming data sources for Score + County CSVs") @@ -334,7 +297,7 @@ class PostScoreETL(ExtractTransformLoad): # Rename score column downloadable_df_copy = downloadable_df.rename( columns={ - field_names.SCORE_L_COMMUNITIES: "Identified as disadvantaged (v0.1)" + field_names.SCORE_L_COMMUNITIES: "Community of focus (v0.1)" }, inplace=False, )