Changing LHE in tiles to a boolean (#1767)

Also includes merging and cleanup of the release.
2025-07-28 14:01:16 -07:00 · 2022-08-03 13:55:58 -04:00 · 2022-08-03 13:55:58 -04:00 · 0d90ae563a
commit 0d90ae563a
parent b0a728437c
5 changed files with 18 additions and 89 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -342,8 +342,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
    + field_names.PERCENTILE_FIELD_SUFFIX,
    # Island areas HS degree attainment rate
    field_names.CENSUS_DECENNIAL_HIGH_SCHOOL_ED_FIELD_2009,
-    field_names.LOW_HS_EDUCATION_LOW_HIGHER_ED_FIELD,
-    field_names.ISLAND_AREAS_LOW_HS_EDUCATION_FIELD,
    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.COLLEGE_NON_ATTENDANCE_FIELD,
    field_names.COLLEGE_ATTENDANCE_FIELD,
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -274,26 +274,6 @@ class ScoreETL(ExtractTransformLoad):
        E.g., "PM2.5 exposure (percentile)".
        This will be for the entire country.

-        For an "apples-to-apples" comparison of urban tracts to other urban tracts,
-        and compare rural tracts to other rural tracts.
-
-        This percentile will be created and returned as
-        f"{output_column_name_root}{field_names.PERCENTILE_URBAN_RURAL_FIELD_SUFFIX}".
-        E.g., "PM2.5 exposure (percentile urban/rural)".
-        This field exists for every tract, but for urban tracts this value will be the
-        percentile compared to other urban tracts, and for rural tracts this value
-        will be the percentile compared to other rural tracts.
-
-        Specific methdology:
-            1. Decide a methodology for confirming whether a tract counts as urban or
-            rural. Currently in the codebase, we use Geocorr to identify the % rural of
-            a tract, and mark the tract as rural if the percentage is >50% and urban
-            otherwise. This may or may not be the right methodology.
-            2. Once tracts are marked as urban or rural, create one percentile rank
-            that only ranks urban tracts, and one percentile rank that only ranks rural
-            tracts.
-            3. Combine into a single field.
-
        `output_column_name_root` is different from `input_column_name` to enable the
        reverse percentile use case. In that use case, `input_column_name` may be
        something like "3rd grade reading proficiency" and `output_column_name_root`
@@ -413,7 +393,6 @@ class ScoreETL(ExtractTransformLoad):
            field_names.NPL_FIELD,
            field_names.WASTEWATER_FIELD,
            field_names.LEAD_PAINT_FIELD,
-            field_names.UST_FIELD,
            field_names.UNDER_5_FIELD,
            field_names.OVER_64_FIELD,
            field_names.LINGUISTIC_ISO_FIELD,
@@ -436,6 +415,7 @@ class ScoreETL(ExtractTransformLoad):
            field_names.EXTREME_HEAT_FIELD,
            field_names.HEALTHY_FOOD_FIELD,
            field_names.IMPENETRABLE_SURFACES_FIELD,
+            field_names.UST_FIELD,
            # We have to pass this boolean here in order to include it in ag value loss percentiles.
            field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
@@ -489,7 +469,13 @@ class ScoreETL(ExtractTransformLoad):

        df_copy = df[columns_to_keep].copy()

-        df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)
+        assert len(numeric_columns) == len(
+            set(numeric_columns)
+        ), "You have a double-entered column in the numeric columns list"
+
+        df_copy[numeric_columns] = df_copy[numeric_columns].apply(
+            pd.to_numeric
+        )

        # Convert all columns to numeric and do math
        # Note that we have a few special conditions here, that we handle explicitly.
@@ -535,24 +521,6 @@ class ScoreETL(ExtractTransformLoad):
                drop_tracts=drop_tracts,
            )

-            # Min-max normalization:
-            # (
-            #     Observed value
-            #     - minimum of all values
-            # )
-            # divided by
-            # (
-            #    Maximum of all values
-            #     - minimum of all values
-            # )
-            min_value = df_copy[numeric_column].min(skipna=True)
-
-            max_value = df_copy[numeric_column].max(skipna=True)
-
-            df_copy[f"{numeric_column}{field_names.MIN_MAX_FIELD_SUFFIX}"] = (
-                df_copy[numeric_column] - min_value
-            ) / (max_value - min_value)
-
        # Create reversed percentiles for these fields
        for reverse_percentile in reverse_percentiles:
            # Calculate reverse percentiles