Changing LHE in tiles to a boolean (#1767)

also includes merging / clean up of the release
2025-02-23 10:04:18 -08:00 · 2022-08-03 13:55:58 -04:00 · 2022-08-03 13:55:58 -04:00 · 0d90ae563a
commit 0d90ae563a
parent b0a728437c
5 changed files with 18 additions and 89 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -342,8 +342,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
    + field_names.PERCENTILE_FIELD_SUFFIX,
    # Island areas HS degree attainment rate
    field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
    field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD,
    field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.COLLEGE_NON_ATTENDANCE_FIELD,
    field_names.COLLEGE_ATTENDANCE_FIELD,
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -274,26 +274,6 @@ class ScoreETL(ExtractTransformLoad):
        E.g., "PM2.5 exposure (percentile)".
        This will be for the entire country.
        A separate percentile enables an "apples-to-apples" comparison: urban tracts
        are compared to other urban tracts, and rural tracts to other rural tracts.
        This percentile will be created and returned as
        f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}".
        E.g., "PM2.5 exposure (percentile urban/rural)".
        This field exists for every tract, but for urban tracts this value will be the
        percentile compared to other urban tracts, and for rural tracts this value
        will be the percentile compared to other rural tracts.
        Specific methodology:
            1. Decide a methodology for confirming whether a tract counts as urban or
            rural. Currently in the codebase, we use Geocorr to identify the % rural of
            a tract, and mark the tract as rural if the percentage is >50% and urban
            otherwise. This may or may not be the right methodology.
            2. Once tracts are marked as urban or rural, create one percentile rank
            that only ranks urban tracts, and one percentile rank that only ranks rural
            tracts.
            3. Combine into a single field.
        `output_column_name_root` is different from `input_column_name` to enable the
        reverse percentile use case. In that use case, `input_column_name` may be
        something like "3rd grade reading proficiency" and `output_column_name_root`
@ -413,7 +393,6 @@ class ScoreETL(ExtractTransformLoad):
            field_names.NPL_FIELD,
            field_names.WASTEWATER_FIELD,
            field_names.LEAD_PAINT_FIELD,
            field_names.UST_FIELD,
            field_names.UNDER_5_FIELD,
            field_names.OVER_64_FIELD,
            field_names.LINGUISTIC_ISO_FIELD,
@ -436,6 +415,7 @@ class ScoreETL(ExtractTransformLoad):
            field_names.EXTREME_HEAT_FIELD,
            field_names.HEALTHY_FOOD_FIELD,
            field_names.IMPENETRABLE_SURFACES_FIELD,
            field_names.UST_FIELD,
            # We have to pass this boolean here in order to include it in ag value loss percentiles.
            field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
@ -489,7 +469,13 @@ class ScoreETL(ExtractTransformLoad):
        df_copy = df[columns_to_keep].copy()
-        df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)
+        assert len(numeric_columns) == len(
            set(numeric_columns)
        ), "You have a double-entered column in the numeric columns list"
        df_copy[numeric_columns] = df_copy[numeric_columns].apply(
            pd.to_numeric
        )
        # Convert all columns to numeric and do math
        # Note that we have a few special conditions here, that we handle explicitly.
@ -535,24 +521,6 @@ class ScoreETL(ExtractTransformLoad):
                drop_tracts=drop_tracts,
            )
            # Min-max normalization:
            # (
            #     Observed value
            #     - minimum of all values
            # )
            # divided by
            # (
            #    Maximum of all values
            #     - minimum of all values
            # )
            min_value = df_copy[numeric_column].min(skipna=True)
            max_value = df_copy[numeric_column].max(skipna=True)
            df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
                df_copy[numeric_column] - min_value
            ) / (max_value - min_value)
        # Create reversed percentiles for these fields
        for reverse_percentile in reverse_percentiles:
            # Calculate reverse percentiles
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@ -432,7 +432,6 @@ HAZARDOUS_WASTE_LOW_INCOME_FIELD = (
 WASTEWATER_DISCHARGE_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for wastewater discharge and is low income?"
 UST_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for leaky underground storage tanks and is low income?"
 # Health Burdens
 DIABETES_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for diabetes and is low income?"
 ASTHMA_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for asthma and is low income?"
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@ -435,6 +435,11 @@ class ScoreNarwhal(Score):
        # poverty level and has a low percent of higher ed students
        # Source: Census's American Community Survey
        eligibility_columns = [
            field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
            field_names.UST_LOW_INCOME_FIELD,
        ]
        self.df[field_names.WASTEWATER_PCTILE_THRESHOLD] = (
            self.df[
                field_names.WASTEWATER_FIELD
@ -457,28 +462,17 @@ class ScoreNarwhal(Score):
            & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
        )
        self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
            [
                field_names.WASTEWATER_PCTILE_THRESHOLD,
                field_names.UST_PCTILE_THRESHOLD,
            ]
        ].max(axis=1)
        self._increment_total_eligibility_exceeded(
-            [
+            eligibility_columns,
                field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
                field_names.UST_LOW_INCOME_FIELD,
            ],
            skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,
        )
-        return self.df[
+        self.df[field_names.WATER_THRESHOLD_EXCEEDED] = self.df[
-            [
+            eligibility_columns
                field_names.WASTEWATER_DISCHARGE_LOW_INCOME_FIELD,
                field_names.UST_LOW_INCOME_FIELD,
            ]
        ].any(axis=1)
        return self.df[field_names.WATER_THRESHOLD_EXCEEDED]
    def _health_factor(self) -> bool:
        # In Xth percentile or above for diabetes (Source: CDC Places)
        # or
--- a/data/data-pipeline/data_pipeline/score/score_runner.py
+++ b/data/data-pipeline/data_pipeline/score/score_runner.py
@ -28,7 +28,6 @@ class ScoreRunner:
        self.df = ScoreA(df=self.df).add_columns()
        self.df = ScoreB(df=self.df).add_columns()
        self.df = ScoreC(df=self.df).add_columns()
        self.df = ScoreD(df=self.df).add_columns()
        self.df = ScoreF(df=self.df).add_columns()
        self.df = ScoreG(df=self.df).add_columns()
        self.df = ScoreH(df=self.df).add_columns()
@ -38,33 +37,4 @@ class ScoreRunner:
        self.df = ScoreM(df=self.df).add_columns()
        self.df = ScoreNarwhal(df=self.df).add_columns()
        # TODO do this with each score instead of in a bundle
        # Create percentiles for these index scores
        self.df = self._add_score_percentiles()
        return self.df
    def _add_score_percentiles(self) -> pd.DataFrame:
        """Add percentile-rank and top-percentile flag columns for index scores.

        For each of scores A through E this writes two kinds of columns
        onto ``self.df``:

        * ``"<score><PERCENTILE_FIELD_SUFFIX>"`` -- the result of
          ``rank(pct=True)`` on the score column.
        * ``"<score> (top <N>th percentile)"`` for N in 25, 30, 35, 40 --
          the result of comparing the percentile column against
          ``1 - N/100`` with ``>=``.

        Returns:
            ``self.df``, mutated in place with the added columns.
        """
        logger.info("Adding Score Percentiles")

        index_scores = (
            field_names.SCORE_A,
            field_names.SCORE_B,
            field_names.SCORE_C,
            field_names.SCORE_D,
            field_names.SCORE_E,
        )
        top_fractions = (0.25, 0.3, 0.35, 0.4)

        for score_name in index_scores:
            percentile_column = (
                f"{score_name}{field_names.PERCENTILE_FIELD_SUFFIX}"
            )
            self.df[percentile_column] = self.df[score_name].rank(pct=True)

            for top_fraction in top_fractions:
                # Express the fraction as a whole number for the column label.
                whole_percent = int(100 * top_fraction)
                flag_column = f"{score_name} (top {whole_percent}th percentile)"
                # A row is in the "top X%" when its percentile is at or
                # above the complementary cutoff.
                cutoff = 1 - top_fraction
                self.df[flag_column] = self.df[percentile_column] >= cutoff

        return self.df