diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
index ae1d70a1..29dac8f2 100644
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -63,6 +63,13 @@ SCORE_DOWNLOADABLE_ZIP_FILE_PATH = (
 # Column subsets
 CENSUS_COUNTIES_COLUMNS = ["USPS", "GEOID", "NAME"]
 
+# Percent prefixes for rounding
+PERCENT_PREFIXES_SUFFIXES = [
+    "Percent",
+    "Percentage",
+    field_names.PERCENTILE_FIELD_SUFFIX,
+]
+
 TILES_ROUND_NUM_DECIMALS = 2
 # Tiles data: full field name, tile index name
 TILES_SCORE_COLUMNS = {
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
index 30bcf493..b7168e67 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -240,12 +240,48 @@ class PostScoreETL(ExtractTransformLoad):
     def _create_downloadable_data(
         self, score_county_state_merged_df: pd.DataFrame
     ) -> pd.DataFrame:
-        df = score_county_state_merged_df[constants.DOWNLOADABLE_SCORE_COLUMNS]
+        """Build the downloadable score table from the merged score frame.
+
+        Copies the constants.DOWNLOADABLE_SCORE_COLUMNS subset of
+        score_county_state_merged_df, multiplies every float64 column whose
+        name contains one of constants.PERCENT_PREFIXES_SUFFIXES by 100, and
+        passes the result through floor_series with
+        constants.TILES_ROUND_NUM_DECIMALS decimals.
+        """
+        # Copy so the assignments below do not write into the caller's frame.
+        df = score_county_state_merged_df[
+            constants.DOWNLOADABLE_SCORE_COLUMNS
+        ].copy()
 
-        float_columns = df.select_dtypes(include=["float64"])
+        float_columns = df.select_dtypes(include=["float64"]).columns
 
-        # score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
-        #     constants.TILES_SCORE_FLOAT_COLUMNS
+        # Pick each float column whose name contains a percent marker. Using
+        # any() lists a column once even when several markers match it
+        # ("Percent" is a substring of "Percentage"), so the assignment below
+        # never sees duplicate column labels.
+        percent_target_columns = [
+            x
+            for x in float_columns
+            if any(col in x for col in constants.PERCENT_PREFIXES_SUFFIXES)
+        ]
+
+        df[percent_target_columns] = df[percent_target_columns].apply(
+            func=lambda series: floor_series(
+                series=series * 100,
+                number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
+            )
+        )
+
+        # # convert the remaining (non-percent) float columns
+        # non_percentile_float_columns = [
+        #     x
+        #     for x in float_columns
+        #     if x not in percent_target_columns
+        # ]
+
+        # df[non_percentile_float_columns] = df[
+        #     non_percentile_float_columns
         # ].apply(
         #     func=lambda series: floor_series(
         #         series=series,
@@ -254,8 +290,6 @@ class PostScoreETL(ExtractTransformLoad):
         #     axis=0,
         # )
 
-        # [x for x in df.columns if field_names.PERCENTILE_FIELD_SUFFIX in x]
-
         return df
 
     def transform(self) -> None: