Revert "Fast flag update (#1844)"

This reverts commit d892bce6cf.
2025-02-22 01:31:25 -08:00 · 2022-08-19 14:05:45 -04:00 · 2022-08-19 14:05:45 -04:00 · 5c41c95764
commit 5c41c95764
parent d892bce6cf
14 changed files with 31 additions and 63 deletions
--- a/data/data-pipeline/data_pipeline/content/config/csv.yml
+++ b/data/data-pipeline/data_pipeline/content/config/csv.yml
@@ -21,14 +21,17 @@ fields:
    label: Total categories exceeded
    format: int64
  - score_name: Definition N (communities)
-    label: Identified as disadvantaged without considering neighbors
-    format: bool
-  - score_name: Definition N (communities) (based on adjacency index and low income alone)
-    label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
-    format: bool
-  - score_name: Definition M community, including adjacency index tracts
    label: Identified as disadvantaged
    format: bool
+  - score_name: Definition N (communities) (including adjacency index)
+    label: Identified as disadvantaged (including adjacency index)
+    format: bool
+  - score_name: Is the tract surrounded by disadvantaged communities?
+    label: Is the tract surrounded by disadvantaged communities?
+    format: bool
+  - score_name: Meets the less stringent low income criterion for the adjacency index?
+    label: Meets the less stringent low income criterion for the adjacency index?
+    format: bool
  - score_name: Definition N (communities) (average of neighbors)
    label: Share of neighbors that are identified as disadvantaged
    format: percentage
@@ -338,6 +341,3 @@ fields:
  - score_name: Tract-level redlining score meets or exceeds 3.25
    label: Tract experienced historic underinvestment
    format: bool
-  - score_name: Income data has been estimated based on neighbor income
-    label: Income data has been estimated based on geographic neighbor income
-    format: bool
--- a/data/data-pipeline/data_pipeline/content/config/excel.yml
+++ b/data/data-pipeline/data_pipeline/content/config/excel.yml
@@ -25,14 +25,17 @@ sheets:
        label: Total categories exceeded
        format: int64
      - score_name: Definition N (communities)
-        label: Identified as disadvantaged without considering neighbors
-        format: bool
-      - score_name: Definition N (communities) (based on adjacency index and low income alone)
-        label: Identified as disadvantaged based on neighbors and relaxed low income threshold only
-        format: bool
-      - score_name: Definition M community, including adjacency index tracts
        label: Identified as disadvantaged
        format: bool
+      - score_name: Definition N (communities) (including adjacency index)
+        label: Identified as disadvantaged (including adjacency index)
+        format: bool
+      - score_name: Is the tract surrounded by disadvantaged communities?
+        label: Is the tract surrounded by disadvantaged communities?
+        format: bool
+      - score_name: Meets the less stringent low income criterion for the adjacency index?
+        label: Meets the less stringent low income criterion for the adjacency index?
+        format: bool
      - score_name: Definition N (communities) (average of neighbors)
        label: Share of neighbors that are identified as disadvantaged
        format: percentage
@@ -342,6 +345,3 @@ sheets:
      - score_name: Tract-level redlining score meets or exceeds 3.25
        label: Tract experienced historic underinvestment
        format: bool
-      - score_name: Income data has been estimated based on neighbor income
-        label: Income data has been estimated based on geographic neighbor income
-        format: bool
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -208,10 +208,9 @@ TILES_SCORE_COLUMNS = {
    field_names.M_HEALTH: "M_HLTH",
    # temporarily update this so that it's the Narwhal score that gets visualized on the map
    # The NEW final score value INCLUDES the adjacency index.
-    field_names.FINAL_SCORE_N_BOOLEAN: "SM_C",
+    field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX: "SM_C",
    field_names.SCORE_N_COMMUNITIES
-    + field_names.ADJACENT_MEAN_SUFFIX: "SM_DON",
-    field_names.SCORE_N_COMMUNITIES: "SM_NO_DON",
+    + field_names.PERCENTILE_FIELD_SUFFIX: "SM_PFS",
    field_names.EXPECTED_POPULATION_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EPLRLI",
    field_names.EXPECTED_AGRICULTURE_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EALRLI",
    field_names.EXPECTED_BUILDING_LOSS_RATE_LOW_INCOME_LOW_HIGHER_ED_FIELD: "EBLRLI",
@@ -314,8 +313,7 @@ TILES_SCORE_COLUMNS = {
    + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
    field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
    field_names.AML_BOOLEAN: "AML_ET",
-    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET",
-    field_names.IMPUTED_INCOME_FLAG_FIELD_NAME: "IMP_FLG"
+    field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME: "FUDS_ET"
    ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
    ## FPL_200 (there is no higher ed in narwhal)
 }
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -471,7 +471,6 @@ class ScoreETL(ExtractTransformLoad):
            field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
            field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
            field_names.AML_BOOLEAN,
-            field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
        ]

        # For some columns, high values are "good", so we want to reverse the percentile
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -521,6 +521,8 @@ class PostScoreETL(ExtractTransformLoad):
        score_tiles_df.to_csv(tile_score_path, index=False, encoding="utf-8")

    def _load_downloadable_zip(self, downloadable_info_path: Path) -> None:
+        logger.info("Saving Downloadable CSV")
+
        downloadable_info_path.mkdir(parents=True, exist_ok=True)
        csv_path = constants.SCORE_DOWNLOADABLE_CSV_FILE_PATH
        excel_path = constants.SCORE_DOWNLOADABLE_EXCEL_FILE_PATH
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py
@@ -227,7 +227,6 @@ class CensusACSETL(ExtractTransformLoad):
                self.COLLEGE_ATTENDANCE_FIELD,
                self.COLLEGE_NON_ATTENDANCE_FIELD,
                self.IMPUTED_COLLEGE_ATTENDANCE_FIELD,
-                field_names.IMPUTED_INCOME_FLAG_FIELD_NAME,
            ]
            + self.RE_OUTPUT_FIELDS
            + [self.PERCENT_PREFIX + field for field in self.RE_OUTPUT_FIELDS]
@@ -504,13 +503,6 @@ class CensusACSETL(ExtractTransformLoad):
            }
        )

-        # We generate a boolean that is TRUE when there is an imputed income but not a baseline income, and FALSE otherwise.
-        # This allows us to see which tracts have an imputed income. 
-        df[field_names.IMPUTED_INCOME_FLAG_FIELD_NAME] = (
-            df[field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD].notna()
-            & df[field_names.POVERTY_LESS_THAN_200_FPL_FIELD].isna()
-        )
-
        # Strip columns and save results to self.
        self.df = df[self.COLUMNS_TO_KEEP]

--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl_imputations.py
@@ -92,17 +92,12 @@ def calculate_income_measures(
    )

    # Iterate through the dataframe to impute in place
-    ## TODO: We should probably convert this to a spatial join now that we are doing >1 imputation and it's taking a lot
-    ## of time, but thinking through how to do this while maintaining the masking will take some time. I think the best
-    ## way would be to (1) spatial join to all neighbors, and then (2) iterate to take the "smallest" set of neighbors...
-    ## but haven't implemented it yet.
    for index, row in geo_df.iterrows():
        if row[geoid_field] in tract_list:
            neighbor_mask = _get_neighbor_mask(geo_df, row)
            county_mask = _get_fips_mask(
                geo_df=geo_df, row=row, fips_digits=5, geoid_field=geoid_field
            )
-            ## TODO: Did CEQ decide to cut this?
            state_mask = _get_fips_mask(
                geo_df=geo_df, row=row, fips_digits=2, geoid_field=geoid_field
            )
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@@ -1,7 +1,7 @@
 # Suffixes
 PERCENTILE_FIELD_SUFFIX = " (percentile)"
 ISLAND_AREAS_PERCENTILE_ADJUSTMENT_FIELD = " for island areas"
-ADJACENT_MEAN_SUFFIX = " (based on adjacency index and low income alone)"
+ADJACENT_MEAN_SUFFIX = " (including adjacency index)"
 ADJACENCY_INDEX_SUFFIX = " (average of neighbors)"

 # Geographic field names
@@ -12,9 +12,6 @@ COUNTY_FIELD = "County Name"
 # Score file field names
 # Definition M fields
 SCORE_M = "Definition M"
-FINAL_SCORE_N_BOOLEAN = (
-    "Definition M community, including adjacency index tracts"
-)
 SCORE_M_COMMUNITIES = "Definition M (communities)"
 M_CLIMATE = "Climate Factor (Definition M)"
 M_ENERGY = "Energy Factor (Definition M)"
@@ -70,9 +67,6 @@ ADJUSTED_POVERTY_LESS_THAN_200_PERCENT_FPL_FIELD_NAME = (

 # this is what gets used in the score
 POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD = "Percent of individuals below 200% Federal Poverty Line, imputed and adjusted"
-IMPUTED_INCOME_FLAG_FIELD_NAME = (
-    "Income data has been estimated based on neighbor income"
-)
 POVERTY_LESS_THAN_150_FPL_FIELD = (
    "Percent of individuals < 150% Federal Poverty Line"
 )
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@@ -385,10 +385,8 @@ class ScoreNarwhal(Score):

        # Kitchen / plumbing
        self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_PCTILE_THRESHOLD] = (
-            self.df[
-                field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD
-                + field_names.PERCENTILE_FIELD_SUFFIX
-            ]
+            self.df[field_names.NO_KITCHEN_OR_INDOOR_PLUMBING_FIELD 
+                    + field_names.PERCENTILE_FIELD_SUFFIX]
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

@@ -973,8 +971,8 @@ class ScoreNarwhal(Score):
            >= self.SCORE_THRESHOLD_DONUT
        )

-        # This constructs the boolean for whether it's a donut hole community
-        # This can also be true when the tract itself is a DAC on its own
+        # This should be the "final list" of Score Narwhal communities, meaning that we would
+        # expect this to be True if either the tract is a donut hole community OR the tract is a DAC
        self.df[
            field_names.SCORE_N_COMMUNITIES + field_names.ADJACENT_MEAN_SUFFIX
        ] = (
@@ -982,16 +980,6 @@ class ScoreNarwhal(Score):
            & self.df[field_names.ADJACENT_TRACT_SCORE_ABOVE_DONUT_THRESHOLD]
        )

-        # This should be the "final list" of Score Narwhal communities, meaning that we would
-        # expect this to be True if either the tract is a donut hole community OR the tract is a DAC
-        self.df[field_names.FINAL_SCORE_N_BOOLEAN] = (
-            self.df[field_names.SCORE_N_COMMUNITIES]
-            | self.df[
-                field_names.SCORE_N_COMMUNITIES
-                + field_names.ADJACENT_MEAN_SUFFIX
-            ]
-        )
-
    def add_columns(self) -> pd.DataFrame:
        logger.info("Adding Score Narhwal")