Adding NLCD data (#1826)

Adding NLCD's natural space indicator end to end to the score.
2025-07-28 13:51:16 -07:00 · 2022-08-17 14:21:28 -04:00 · 2022-08-17 14:21:28 -04:00 · 7d89d41e49
commit 7d89d41e49
parent 49623e4da0
18 changed files with 288 additions and 18 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@ -35,7 +35,6 @@ datasets:
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true
-
      - short_name: "ex_ag_loss"
        df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME"
        long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)"
@ -54,7 +53,6 @@ datasets:
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true
-
      - short_name: "ex_bldg_loss"
        df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME"
        long_name: "Expected building loss rate (Natural Hazards Risk Index)"
@ -72,7 +70,6 @@ datasets:
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true
-
      - short_name: "has_ag_val"
        df_field_name: "CONTAINS_AGRIVALUE"
        long_name: "Contains agricultural value"
@ -168,7 +165,6 @@ datasets:
        field_type: float
        include_in_tiles: true
        include_in_downloadable_files: true
-
  - long_name: "First Street Foundation Flood Risk"
    short_name: "FSF Flood Risk"
    module_name: fsf_flood_risk
@ -209,7 +205,6 @@ datasets:
        include_in_tiles: false
        include_in_downloadable_files: true
        create_percentile: true
-
  - long_name: "First Street Foundation Wildfire Risk"
    short_name: "FSF Wildfire Risk"
    module_name: fsf_wildfire_risk
@ -250,7 +245,6 @@ datasets:
        include_in_tiles: false
        include_in_downloadable_files: true
        create_percentile: true
-
  - long_name: "DOT Travel Disadvantage Index"
    short_name: "DOT"
    module_name: "travel_composite"
@ -263,3 +257,36 @@ datasets:
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true
+  - long_name: "National Land Cover Database (NLCD) Lack of Green Space / Nature-Deprived Communities dataset, as compiled by TPL"
+    short_name: "nlcd_nature_deprived"
+    module_name: "nlcd_nature_deprived"
+    input_geoid_tract_field_name: "GEOID10_TRACT"
+    load_fields:
+      - short_name: "ncld_eligible"
+        df_field_name: "ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME"
+        long_name: "Does the tract have at least 35 acres in it?" 
+        field_type: bool
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: false
+      - short_name: "percent_impervious"
+        df_field_name: "TRACT_PERCENT_IMPERVIOUS_FIELD_NAME"
+        long_name: "Share of the tract's land area that is covered by impervious surface as a percent" 
+        field_type: percentage
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
+      - short_name: "percent_nonnatural"
+        df_field_name: "TRACT_PERCENT_NON_NATURAL_FIELD_NAME"
+        long_name: "Share of the tract's land area that is covered by impervious surface or cropland as a percent" 
+        field_type: percentage
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
+      - short_name: "percent_cropland"
+        df_field_name: "TRACT_PERCENT_CROPLAND_FIELD_NAME"
+        long_name: "Share of the tract's land area that is covered by cropland as a percent" 
+        field_type: percentage
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -305,6 +305,9 @@ TILES_SCORE_COLUMNS = {
    + field_names.PERCENTILE_FIELD_SUFFIX: "WF_PFS",
    field_names.HIGH_FUTURE_FLOOD_RISK_FIELD: "FLD_ET",
    field_names.HIGH_FUTURE_WILDFIRE_RISK_FIELD: "WF_ET",
+    field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
+    + field_names.PERCENTILE_FIELD_SUFFIX: "IS_PFS",
+    field_names.NON_NATURAL_LOW_INCOME_FIELD_NAME: "IS_ET",
    ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
    ## FPL_200 (there is no higher ed in narwhal)
 }
@ -361,4 +364,6 @@ TILES_SCORE_FLOAT_COLUMNS = [
    field_names.FUTURE_FLOOD_RISK_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.FUTURE_WILDFIRE_RISK_FIELD
    + field_names.PERCENTILE_FIELD_SUFFIX,
+    field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME
+    + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -14,6 +14,7 @@ from data_pipeline.etl.sources.dot_travel_composite.etl import (
 from data_pipeline.etl.sources.fsf_flood_risk.etl import (
    FloodRiskETL,
 )
+from data_pipeline.etl.sources.nlcd_nature_deprived.etl import NatureDeprivedETL
 from data_pipeline.etl.sources.fsf_wildfire_risk.etl import WildfireRiskETL
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.score import field_names
@ -47,6 +48,7 @@ class ScoreETL(ExtractTransformLoad):
        self.dot_travel_disadvantage_df: pd.DataFrame
        self.fsf_flood_df: pd.DataFrame
        self.fsf_fire_df: pd.DataFrame
+        self.nature_deprived_df: pd.DataFrame

    def extract(self) -> None:
        logger.info("Loading data sets from disk.")
@ -134,6 +136,9 @@ class ScoreETL(ExtractTransformLoad):
        # Load flood risk data
        self.fsf_flood_df = FloodRiskETL.get_data_frame()

+        # Load NLCD Nature-Deprived Communities data
+        self.nature_deprived_df = NatureDeprivedETL.get_data_frame()
+
        # Load GeoCorr Urban Rural Map
        geocorr_urban_rural_csv = (
            constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@ -356,6 +361,7 @@ class ScoreETL(ExtractTransformLoad):
            self.dot_travel_disadvantage_df,
            self.fsf_flood_df,
            self.fsf_fire_df,
+            self.nature_deprived_df,
        ]

        # Sanity check each data frame before merging.
@ -439,9 +445,9 @@ class ScoreETL(ExtractTransformLoad):
            field_names.IMPENETRABLE_SURFACES_FIELD,
            field_names.UST_FIELD,
            field_names.DOT_TRAVEL_BURDEN_FIELD,
-            field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
            field_names.FUTURE_FLOOD_RISK_FIELD,
            field_names.FUTURE_WILDFIRE_RISK_FIELD,
+            field_names.TRACT_PERCENT_NON_NATURAL_FIELD_NAME,
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
        ]

@ -449,6 +455,8 @@ class ScoreETL(ExtractTransformLoad):
            self.GEOID_TRACT_FIELD_NAME,
            field_names.PERSISTENT_POVERTY_FIELD,
            field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
+            field_names.TRACT_ELIGIBLE_FOR_NONNATURAL_THRESHOLD,
+            field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
        ]

        # For some columns, high values are "good", so we want to reverse the percentile
@ -500,7 +508,7 @@ class ScoreETL(ExtractTransformLoad):
        df_copy[numeric_columns] = df_copy[numeric_columns].apply(pd.to_numeric)

        # Convert all columns to numeric and do math
-        # Note that we have a few special conditions here, that we handle explicitly.
+        # Note that we have a few special conditions here and we handle them explicitly.
        #     For *Linguistic Isolation*, we do NOT want to include Puerto Rico in the percentile
        #     calculation. This is because linguistic isolation as a category doesn't make much sense
        #     in Puerto Rico, where Spanish is a recognized language. Thus, we construct a list
@ -509,6 +517,10 @@ class ScoreETL(ExtractTransformLoad):
        #     For *Expected Agricultural Loss*, we only want to include in the percentile tracts
        #     in which there is some agricultural value. This helps us adjust the data such that we have
        #     the ability to discern which tracts truly are at the 90th percentile, since many tracts have 0 value.
+        #
+        #     For *Non-Natural Space*, we may want to include only tracts that have at least 35 acres. This would
+        #     get rid of tracts that we think are statistical aberrations. Right now, we have left this filter out
+        #     pending ground-truthing.

        for numeric_column in numeric_columns:
            drop_tracts = []
@ -524,7 +536,6 @@ class ScoreETL(ExtractTransformLoad):
                logger.info(
                    f"Dropping {len(drop_tracts)} tracts from Agricultural Value Loss"
                )
-
            elif numeric_column == field_names.LINGUISTIC_ISO_FIELD:
                drop_tracts = df_copy[
                    # 72 is the FIPS code for Puerto Rico
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl