Adding DOT composite to travel score (#1820)

This adds the DOT dataset to the ETL and to the score. Note that currently we take a percentile of an average of percentiles.
2025-07-25 07:20:18 -07:00 · 2022-08-16 14:44:39 -04:00 · 2022-08-16 14:44:39 -04:00 · ebac552d75
commit ebac552d75
parent 932179841f
17 changed files with 553 additions and 354 deletions
--- a/data/data-pipeline/data_pipeline/content/config/csv.yml
+++ b/data/data-pipeline/data_pipeline/content/config/csv.yml
@ -260,6 +260,12 @@ fields:
  - score_name: Greater than or equal to the 90th percentile for leaky underground storage tanks and is low income?
    label: Greater than or equal to the 90th percentile for leaky underground storage tanks and is low income?
    format: bool
+  - score_name: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
+    label: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
+    format: bool
+  - score_name: DOT Travel Barriers Score (percentile)
+    label: DOT Travel Barriers Score (percentile)
+    format: percentage
  - score_name: Leaky underground storage tanks (percentile)
    label: Leaky underground storage tanks (percentile)
    format: percentage
--- a/data/data-pipeline/data_pipeline/content/config/excel.yml
+++ b/data/data-pipeline/data_pipeline/content/config/excel.yml
@ -258,6 +258,12 @@ sheets:
      - score_name: Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)
        label: Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)
        format: percentage
+      - score_name: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
+        label: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
+        format: bool
+      - score_name: DOT Travel Barriers Score (percentile)
+        label: DOT Travel Barriers Score (percentile)
+        format: percentage
      - score_name: Percentage households below 100% of federal poverty line in 2009 (island areas) and 2010 (states and PR)
        label: Percentage households below 100% of federal poverty line in 2009 (island areas) and 2010 (states and PR)
        format: percentage
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@ -9,6 +9,11 @@ DATASET_LIST = [
        "module_dir": "national_risk_index",
        "class_name": "NationalRiskIndexETL",
    },
+    {
+        "name": "travel_composite",
+        "module_dir": "dot_travel_composite",
+        "class_name": "TravelCompositeETL",
+    },
    {
        "name": "tree_equity_score",
        "module_dir": "tree_equity_score",
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@ -156,3 +156,16 @@ datasets:
        field_type: float
        include_in_tiles: true
        include_in_downloadable_files: true
+
+  - long_name: "DOT Travel Disadvantage Index"
+    short_name: "DOT"
+    module_name: "travel_composite"
+    input_geoid_tract_field_name: "GEOID10_TRACT"
+    load_fields:
+      - short_name: "travel_burden"
+        df_field_name: "TRAVEL_BURDEN_FIELD_NAME"
+        long_name: "DOT Travel Barriers Score" 
+        field_type: float
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -296,6 +296,9 @@ TILES_SCORE_COLUMNS = {
    field_names.FPL_200_SERIES: "FPL200S",
    ## Low high school for t&wd
    field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
+    field_names.DOT_BURDEN_PCTILE_THRESHOLD: "TD_ET",
+    field_names.DOT_TRAVEL_BURDEN_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "TD_PFS"
    ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
    ## FPL_200 (there is no higher ed in narwhal)
 }
@ -348,4 +351,5 @@ TILES_SCORE_FLOAT_COLUMNS = [
    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.COLLEGE_NON_ATTENDANCE_FIELD,
    field_names.COLLEGE_ATTENDANCE_FIELD,
+    field_names.DOT_TRAVEL_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -8,6 +8,9 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.national_risk_index.etl import (
    NationalRiskIndexETL,
 )
+from data_pipeline.etl.sources.dot_travel_composite.etl import (
+    TravelCompositeETL,
+)
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.score import field_names
 from data_pipeline.etl.score import constants
@ -37,6 +40,7 @@ class ScoreETL(ExtractTransformLoad):
        self.census_2010_df: pd.DataFrame
        self.child_opportunity_index_df: pd.DataFrame
        self.hrs_df: pd.DataFrame
+        self.dot_travel_disadvantage_df: pd.DataFrame

    def extract(self) -> None:
        logger.info("Loading data sets from disk.")
@ -115,6 +119,9 @@ class ScoreETL(ExtractTransformLoad):
        # Load FEMA national risk index data
        self.national_risk_index_df = NationalRiskIndexETL.get_data_frame()

+        # Load DOT Travel Disadvantage
+        self.dot_travel_disadvantage_df = TravelCompositeETL.get_data_frame()
+
        # Load GeoCorr Urban Rural Map
        geocorr_urban_rural_csv = (
            constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@ -334,6 +341,7 @@ class ScoreETL(ExtractTransformLoad):
            self.census_2010_df,
            self.child_opportunity_index_df,
            self.hrs_df,
+            self.dot_travel_disadvantage_df,
        ]

        # Sanity check each data frame before merging.
@ -416,6 +424,7 @@ class ScoreETL(ExtractTransformLoad):
            field_names.HEALTHY_FOOD_FIELD,
            field_names.IMPENETRABLE_SURFACES_FIELD,
            field_names.UST_FIELD,
+            field_names.DOT_TRAVEL_BURDEN_FIELD,
            field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
        ]
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/README.md
+++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/README.md
@ -0,0 +1,16 @@
+# DOT travel barriers
+
+The description below is taken directly from DOT:
+
+Consistent with OMB’s Interim Guidance for the Justice40 Initiative, DOT’s interim definition of DACs includes (a) certain qualifying census tracts, (b) any Tribal land, or (c) any territory or possession of the United States. DOT has provided a mapping tool to assist applicants in identifying whether a project is located in a Disadvantaged Community, available at Transportation Disadvantaged Census Tracts (arcgis.com). A shapefile of the geospatial data is available as the Transportation Disadvantaged Census Tracts shapefile (version 2.0, posted 5/10/22).
+
+The DOT interim definition for DACs was developed by an internal and external collaborative research process (see recordings from November 2021 public meetings). It includes data for 22 indicators collected at the census tract level and grouped into six (6) categories of transportation disadvantage. The numbers in parentheses show how many indicators fall in that category:
+
+- Transportation access disadvantage identifies communities and places that spend more, and take longer, to get where they need to go. (4)
+- Health disadvantage identifies communities based on variables associated with adverse health outcomes, disability, as well as environmental exposures. (3)
+- Environmental disadvantage identifies communities with disproportionately high levels of certain air pollutants and high potential presence of lead-based paint in housing units. (6)
+- Economic disadvantage identifies areas and populations with high poverty, low wealth, lack of local jobs, low homeownership, low educational attainment, and high inequality. (7)
+- Resilience disadvantage identifies communities vulnerable to hazards caused by climate change. (1)
+- Equity disadvantage identifies communities with a high percentile of persons (age 5+) who speak English "less than well." (1)
+
+The CEJST uses only Transportation Access Disadvantage. 
--- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/__init__.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/__init__.py
--- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py
@ -0,0 +1,59 @@
+# pylint: disable=unsubscriptable-object
+# pylint: disable=unsupported-assignment-operation
+
+import pandas as pd
+import geopandas as gpd
+
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class TravelCompositeETL(ExtractTransformLoad):
+    """ETL class for the DOT Travel Disadvantage Dataset"""
+
+    NAME = "travel_composite"
+    SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
+    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+
+    # Output score variables (values set on datasets.yml) for linting purposes
+    TRAVEL_BURDEN_FIELD_NAME: str
+
+    def __init__(self):
+        # define the full path for the input shapefile
+        self.INPUT_SHP = (
+            self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
+        )
+
+        # this is the main dataframe
+        self.df: pd.DataFrame
+
+        # Start dataset-specific vars here
+        ## Average of Transportation Indicator Percentiles (calculated)
+        ## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
+        ## See metadata for more information
+        self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
+        self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
+
+    def transform(self) -> None:
+        """Reads the unzipped data file into memory and applies the following
+        transformations to prepare it for the load() method:
+
+        - Renames the Census Tract and travel burden columns to the output names
+        - Drops rows that have no Census Tract ID
+        """
+        logger.info("Transforming DOT Travel Disadvantage Data")
+
+        # read in the unzipped shapefile from data source, rename the tract ID
+        # and travel burden columns to GEOID_TRACT_FIELD_NAME and
+        # TRAVEL_BURDEN_FIELD_NAME, then drop rows that have no tract ID
+        df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
+        df_dot = df_dot.rename(
+            columns={
+                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
+                self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
+            }
+        ).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
+        # Assign the final df to the class' output_df for the load method
+        self.output_df = df_dot
--- a/data/data-pipeline/data_pipeline/score/field_names.py
+++ b/data/data-pipeline/data_pipeline/score/field_names.py
@ -344,6 +344,9 @@ CDC_SVI_INDEX_RPL_THEMES_OVERALL_FIELD: str = (
 )
 CDC_SVI_INDEX_THEMES_PRIORITY_COMMUNITY: str = "At or above 90 for overall percentile ranking according to Social Vulnerability Indices"

+# DOT Travel Burden Data
+DOT_TRAVEL_BURDEN_FIELD: str = "DOT Travel Barriers Score"
+
 # Maryland EJSCREEN Data.
 MARYLAND_EJSCREEN_SCORE_FIELD: str = "Maryland Environmental Justice Score"

@ -416,6 +419,7 @@ DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD = (
 )
 TRAFFIC_PROXIMITY_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for traffic proximity and is low income?"

+
 # Affordable and Sustainable Housing
 LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = (
    f"Greater than or equal to the {PERCENTILE}th percentile for lead paint and"
@ -494,6 +498,10 @@ TRAFFIC_PROXIMITY_LOW_INCOME_LOW_HIGHER_ED_FIELD = (
    f"traffic proximity{SCORE_M_LOW_INCOME_SUFFIX}?"
 )

+DOT_TRAVEL_BURDEN_LOW_INCOME_FIELD = (
+    f"Greater than or equal to the {PERCENTILE}th percentile "
+    f"for DOT transit barriers and is low income?"
+)
 # Affordable and Sustainable Housing
 LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_LOW_HIGHER_ED_FIELD = (
    f"Greater than or equal to the {PERCENTILE}th percentile for lead paint,"
@ -624,6 +632,7 @@ PM25_EXCEEDS_PCTILE_THRESHOLD = (
 )
 DIESEL_EXCEEDS_PCTILE_THRESHOLD = f"Greater than or equal to the {PERCENTILE}th percentile for diesel particulate matter"
 TRAFFIC_PROXIMITY_PCTILE_THRESHOLD = f"Greater than or equal to the {PERCENTILE}th percentile for traffic proximity"
+DOT_BURDEN_PCTILE_THRESHOLD = f"Greater than or equal to the {PERCENTILE}th percentile for DOT travel barriers"
 LEAD_PAINT_PROXY_PCTILE_THRESHOLD = (
    f"Greater than or equal to the {PERCENTILE}th percentile for lead paint and"
    f" the median house value is less than {MEDIAN_HOUSE_VALUE_PERCENTILE}th "
--- a/data/data-pipeline/data_pipeline/score/score_narwhal.py
+++ b/data/data-pipeline/data_pipeline/score/score_narwhal.py
@ -246,6 +246,8 @@ class ScoreNarwhal(Score):
        # In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)]
        # or
        # In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data
+        # or
+        # In Xth percentile or above for DOT Travel Disadvantage
        # AND
        # Low income: In Nth percentile or above for percent of block group population
        # of households where household income is less than or equal to twice the federal
@ -255,6 +257,7 @@ class ScoreNarwhal(Score):
        transportion_eligibility_columns = [
            field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
            field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
+            field_names.DOT_TRAVEL_BURDEN_LOW_INCOME_FIELD,
        ]

        self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD] = (
@ -264,6 +267,14 @@ class ScoreNarwhal(Score):
            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
        )

+        self.df[field_names.DOT_BURDEN_PCTILE_THRESHOLD] = (
+            self.df[
+                field_names.DOT_TRAVEL_BURDEN_FIELD
+                + field_names.PERCENTILE_FIELD_SUFFIX
+            ]
+            >= self.ENVIRONMENTAL_BURDEN_THRESHOLD
+        )
+
        self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD] = (
            self.df[
                field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
@ -274,6 +285,7 @@ class ScoreNarwhal(Score):
        self.df[field_names.TRAFFIC_THRESHOLD_EXCEEDED] = (
            self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD]
            | self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD]
+            | self.df[field_names.DOT_BURDEN_PCTILE_THRESHOLD]
        )

        self.df[field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD] = (
@ -286,6 +298,11 @@ class ScoreNarwhal(Score):
            & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
        )

+        self.df[field_names.DOT_TRAVEL_BURDEN_LOW_INCOME_FIELD] = (
+            self.df[field_names.DOT_BURDEN_PCTILE_THRESHOLD]
+            & self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
+        )
+
        self._increment_total_eligibility_exceeded(
            transportion_eligibility_columns,
            skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,