Adding DOT composite to travel score (#1820)

This adds the DOT dataset to the ETL and to the score. Note that currently we take a percentile of an average of percentiles.
2025-07-26 19:01:17 -07:00 · 2022-08-16 14:44:39 -04:00 · 2022-08-16 14:44:39 -04:00 · ebac552d75
commit ebac552d75
parent 932179841f
17 changed files with 553 additions and 354 deletions
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@ -9,6 +9,11 @@ DATASET_LIST = [
        "module_dir": "national_risk_index",
        "class_name": "NationalRiskIndexETL",
    },
+    {
+        "name": "travel_composite",
+        "module_dir": "dot_travel_composite",
+        "class_name": "TravelCompositeETL",
+    },
    {
        "name": "tree_equity_score",
        "module_dir": "tree_equity_score",
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@ -156,3 +156,16 @@ datasets:
        field_type: float
        include_in_tiles: true
        include_in_downloadable_files: true
+
+  - long_name: "DOT Travel Disadvantage Index"
+    short_name: "DOT"
+    module_name: "travel_composite"
+    input_geoid_tract_field_name: "GEOID10_TRACT"
+    load_fields:
+      - short_name: "travel_burden"
+        df_field_name: "TRAVEL_BURDEN_FIELD_NAME"
+        long_name: "DOT Travel Barriers Score" 
+        field_type: float
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -296,6 +296,9 @@ TILES_SCORE_COLUMNS = {
    field_names.FPL_200_SERIES: "FPL200S",
    ## Low high school for t&wd
    field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
+    field_names.DOT_BURDEN_PCTILE_THRESHOLD: "TD_ET",
+    field_names.DOT_TRAVEL_BURDEN_FIELD
+    + field_names.PERCENTILE_FIELD_SUFFIX: "TD_PFS"
    ## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
    ## FPL_200 (there is no higher ed in narwhal)
 }
@ -348,4 +351,5 @@ TILES_SCORE_FLOAT_COLUMNS = [
    field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
    field_names.COLLEGE_NON_ATTENDANCE_FIELD,
    field_names.COLLEGE_ATTENDANCE_FIELD,
+    field_names.DOT_TRAVEL_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
 ]
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -8,6 +8,9 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.national_risk_index.etl import (
    NationalRiskIndexETL,
 )
+from data_pipeline.etl.sources.dot_travel_composite.etl import (
+    TravelCompositeETL,
+)
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.score import field_names
 from data_pipeline.etl.score import constants
@ -37,6 +40,7 @@ class ScoreETL(ExtractTransformLoad):
        self.census_2010_df: pd.DataFrame
        self.child_opportunity_index_df: pd.DataFrame
        self.hrs_df: pd.DataFrame
+        self.dot_travel_disadvantage_df: pd.DataFrame

    def extract(self) -> None:
        logger.info("Loading data sets from disk.")
@ -115,6 +119,9 @@ class ScoreETL(ExtractTransformLoad):
        # Load FEMA national risk index data
        self.national_risk_index_df = NationalRiskIndexETL.get_data_frame()

+        # Load DOT Travel Disadvantage
+        self.dot_travel_disadvantage_df = TravelCompositeETL.get_data_frame()
+
        # Load GeoCorr Urban Rural Map
        geocorr_urban_rural_csv = (
            constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@ -334,6 +341,7 @@ class ScoreETL(ExtractTransformLoad):
            self.census_2010_df,
            self.child_opportunity_index_df,
            self.hrs_df,
+            self.dot_travel_disadvantage_df,
        ]

        # Sanity check each data frame before merging.
@ -416,6 +424,7 @@ class ScoreETL(ExtractTransformLoad):
            field_names.HEALTHY_FOOD_FIELD,
            field_names.IMPENETRABLE_SURFACES_FIELD,
            field_names.UST_FIELD,
+            field_names.DOT_TRAVEL_BURDEN_FIELD,
            field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
            field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
        ]
--- a/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/sample_data/score_data_initial.csv
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/downloadable_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/score_transformed_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/README.md
+++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/README.md
@ -0,0 +1,16 @@
+# DOT travel barriers
+
+The below description is taken from DOT directly: 
+
+Consistent with OMB’s Interim Guidance for the Justice40 Initiative, DOT’s interim definition of DACs includes (a) certain qualifying census tracts, (b) any Tribal land, or (c) any territory or possession of the United States. DOT has provided a mapping tool to assist applicants in identifying whether a project is located in a Disadvantaged Community, available at Transportation Disadvantaged Census Tracts (arcgis.com). A shapefile of the geospatial data is also available: Transportation Disadvantaged Census Tracts shapefile (version 2.0, posted 5/10/22).
+
+The DOT interim definition for DACs was developed by an internal and external collaborative research process (see recordings from November 2021 public meetings). It includes data for 22 indicators collected at the census tract level and grouped into six (6) categories of transportation disadvantage. The numbers in parentheses show how many indicators fall in that category:
+
+- Transportation access disadvantage identifies communities and places that spend more, and take longer, to get where they need to go. (4)
+- Health disadvantage identifies communities based on variables associated with adverse health outcomes, disability, as well as environmental exposures. (3)
+- Environmental disadvantage identifies communities with disproportionately high levels of certain air pollutants and high potential presence of lead-based paint in housing units. (6)
+- Economic disadvantage identifies areas and populations with high poverty, low wealth, lack of local jobs, low homeownership, low educational attainment, and high inequality. (7)
+- Resilience disadvantage identifies communities vulnerable to hazards caused by climate change. (1)
+- Equity disadvantage identifies communities with a high percentile of persons (age 5+) who speak English "less than well." (1)
+
+The CEJST uses only Transportation Access Disadvantage. 
--- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/init.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/init.py
--- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py
@ -0,0 +1,59 @@
+# pylint: disable=unsubscriptable-object
+# pylint: disable=unsupported-assignment-operation
+
+import pandas as pd
+import geopandas as gpd
+
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
+from data_pipeline.utils import get_module_logger
+
+logger = get_module_logger(__name__)
+
+
+class TravelCompositeETL(ExtractTransformLoad):
+    """ETL class for the DOT Travel Disadvantage Dataset"""
+
+    NAME = "travel_composite"
+    SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
+    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+
+    # Output score variables (values set on datasets.yml) for linting purposes
+    TRAVEL_BURDEN_FIELD_NAME: str
+
+    def __init__(self):
+        # define the full path for the input shapefile
+        self.INPUT_SHP = (
+            self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
+        )
+
+        # this is the main dataframe
+        self.df: pd.DataFrame
+
+        # Start dataset-specific vars here
+        ## Average of Transportation Indicator Percentiles (calculated)
+        ## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
+        ## See metadata for more information
+        self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
+        self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
+
+    def transform(self) -> None:
+        """Reads the unzipped data file into memory and applies the following
+        transformations to prepare it for the load() method:
+
+        - Renames the Census Tract column to match the other datasets
+        - Converts to CSV
+        """
+        logger.info("Transforming DOT Travel Disadvantage Data")
+
+        # read in the unzipped shapefile from data source
+        # reformat it to be standard df, remove unassigned rows, and
+        # then rename the Census Tract column for merging
+        df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
+        df_dot = df_dot.rename(
+            columns={
+                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
+                self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
+            }
+        ).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
+        # Assign the final df to the class' output_df for the load method
+        self.output_df = df_dot