From bbb5bbc60a7ef601a21c785c06d515024ad0a963 Mon Sep 17 00:00:00 2001 From: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:55:58 -0400 Subject: [PATCH] Changing LHE in tiles to a boolean (#1767) also includes merging / clean up of the release --- .../data_pipeline/etl/score/constants.py | 2 - .../data_pipeline/etl/score/etl_score.py | 48 ++++--------------- .../data_pipeline/score/field_names.py | 1 - .../data_pipeline/score/score_narwhal.py | 26 ++++------ .../data_pipeline/score/score_runner.py | 30 ------------ 5 files changed, 18 insertions(+), 89 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 8e7d17b1..5db119e3 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -342,8 +342,6 @@ TILES_SCORE_FLOAT_COLUMNS = [ + field_names.PERCENTILE_FIELD_SUFFIX, # Island areas HS degree attainment rate field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009, - field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD, - field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD, field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX, field_names.COLLEGE_NON_ATTENDANCE_FIELD, field_names.COLLEGE_ATTENDANCE_FIELD, diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index b073ce5a..d43cc325 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -274,26 +274,6 @@ class ScoreETL(ExtractTransformLoad): E.g., "PM2.5 exposure (percentile)". This will be for the entire country. - For an "apples-to-apples" comparison of urban tracts to other urban tracts, - and compare rural tracts to other rural tracts. - - This percentile will be created and returned as - f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}". - E.g., "PM2.5 exposure (percentile urban/rural)". - This field exists for every tract, but for urban tracts this value will be the - percentile compared to other urban tracts, and for rural tracts this value - will be the percentile compared to other rural tracts. - - Specific methdology: - 1. Decide a methodology for confirming whether a tract counts as urban or - rural. Currently in the codebase, we use Geocorr to identify the % rural of - a tract, and mark the tract as rural if the percentage is >50% and urban - otherwise. This may or may not be the right methodology. - 2. Once tracts are marked as urban or rural, create one percentile rank - that only ranks urban tracts, and one percentile rank that only ranks rural - tracts. - 3. Combine into a single field. - `output_column_name_root` is different from `input_column_name` to enable the reverse percentile use case. In that use case, `input_column_name` may be something like "3rd grade reading proficiency" and `output_column_name_root` @@ -413,7 +393,6 @@ class ScoreETL(ExtractTransformLoad): field_names.NPL_FIELD, field_names.WASTEWATER_FIELD, field_names.LEAD_PAINT_FIELD, - field_names.UST_FIELD, field_names.UNDER_5_FIELD, field_names.OVER_64_FIELD, field_names.LINGUISTIC_ISO_FIELD, @@ -436,6 +415,7 @@ class ScoreETL(ExtractTransformLoad): field_names.EXTREME_HEAT_FIELD, field_names.HEALTHY_FOOD_FIELD, field_names.IMPENETRABLE_SURFACES_FIELD, + field_names.UST_FIELD, # We have to pass this boolean here in order to include it in ag value loss percentiles. field_names.AGRICULTURAL_VALUE_BOOL_FIELD, field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD, @@ -488,7 +468,13 @@ class ScoreETL(ExtractTransformLoad): df_copy = df[columns_to_keep].copy() - df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric) + assert len(numeric_columns) == len( + set(numeric_columns) + ), "You have a double-entered column in the numeric columns list" + + df_copy[numeric_columns] = df_copy[numeric_columns].apply( + pd.to_numeric + ) # Convert all columns to numeric and do math # Note that we have a few special conditions here, that we handle explicitly. @@ -534,24 +520,6 @@ class ScoreETL(ExtractTransformLoad): drop_tracts=drop_tracts, ) - # Min-max normalization: - # ( - # Observed value - # - minimum of all values - # ) - # divided by - # ( - # Maximum of all values - # - minimum of all values - # ) - min_value = df_copy[numeric_column].min(skipna=True) - - max_value = df_copy[numeric_column].max(skipna=True) - - df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = ( - df_copy[numeric_column] - min_value - ) / (max_value - min_value) - # Create reversed percentiles for these fields for reverse_percentile in reverse_percentiles: # Calculate reverse percentiles diff --git a/data/data-pipeline/data_pipeline/score/field_names.py b/data/data-pipeline/data_pipeline/score/field_names.py index 5b7a88af..895c52b1 100644 --- a/data/data-pipeline/data_pipeline/score/field_names.py +++ b/data/data-pipeline/data_pipeline/score/field_names.py @@ -432,7 +432,6 @@ HAZARDOUS_WASTE_LOW_INCOME_FIELD = ( WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income?" UST_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for leaky underground storage tanks and is low income?" - # Health Burdens DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income?" ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income?" diff --git a/data/data-pipeline/data_pipeline/score/score_narwhal.py b/data/data-pipeline/data_pipeline/score/score_narwhal.py index 1299acaa..0480d93a 100644 --- a/data/data-pipeline/data_pipeline/score/score_narwhal.py +++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py @@ -435,6 +435,11 @@ class ScoreNarwhal(Score): # poverty level and has a low percent of higher ed students # Source: Census's American Community Survey + eligibility_columns = [ + field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD, + field_names.UST_LOW_INCOME_FIELD, + ] + self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = ( self.df[ field_names.WASTEWATER_FIELD @@ -457,28 +462,17 @@ class ScoreNarwhal(Score): & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED] ) - self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[ - [ - field_names.WASTEWATER_PCTILE_THRESHOLD, - field_names.UST_PCTILE_THRESHOLD, - ] - ].max(axis=1) - self._increment_total_eligibility_exceeded( - [ - field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD, - field_names.UST_LOW_INCOME_FIELD, - ], + eligibility_columns, skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS, ) - return self.df[ - [ - field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD, - field_names.UST_LOW_INCOME_FIELD, - ] + self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[ + eligibility_columns ].any(axis=1) + return self.df[field_names.WATER_THRESHOLD_EXCEEDED] + def _health_factor(self) -> bool: # In Xth percentile or above for diabetes (Source: CDC Places) # or diff --git a/data/data-pipeline/data_pipeline/score/score_runner.py b/data/data-pipeline/data_pipeline/score/score_runner.py index 64b192f4..c1838ca1 100644 --- a/data/data-pipeline/data_pipeline/score/score_runner.py +++ b/data/data-pipeline/data_pipeline/score/score_runner.py @@ -28,7 +28,6 @@ class ScoreRunner: self.df = ScoreA(df=self.df).add_columns() self.df = ScoreB(df=self.df).add_columns() self.df = ScoreC(df=self.df).add_columns() - self.df = ScoreD(df=self.df).add_columns() self.df = ScoreF(df=self.df).add_columns() self.df = ScoreG(df=self.df).add_columns() self.df = ScoreH(df=self.df).add_columns() @@ -38,33 +37,4 @@ class ScoreRunner: self.df = ScoreM(df=self.df).add_columns() self.df = ScoreNarwhal(df=self.df).add_columns() - # TODO do this with each score instead of in a bundle - # Create percentiles for these index scores - self.df = self._add_score_percentiles() - - return self.df - - def _add_score_percentiles(self) -> pd.DataFrame: - logger.info("Adding Score Percentiles") - for score_field in [ - field_names.SCORE_A, - field_names.SCORE_B, - field_names.SCORE_C, - field_names.SCORE_D, - field_names.SCORE_E, - ]: - self.df[ - f"{score_field}{field_names.PERCENTILE_FIELD_SUFFIX}" - ] = self.df[score_field].rank(pct=True) - - for threshold in [0.25, 0.3, 0.35, 0.4]: - fraction_converted_to_percent = int(100 * threshold) - self.df[ - f"{score_field} (top {fraction_converted_to_percent}th percentile)" - ] = ( - self.df[ - f"{score_field}{field_names.PERCENTILE_FIELD_SUFFIX}" - ] - >= 1 - threshold - ) return self.df