Adding DOT composite to travel score (#1820)

This adds the DOT dataset to the ETL and to the score. Note that currently we take a percentile of an average of percentiles.
This commit is contained in:
Emma Nechamkin 2022-08-16 14:44:39 -04:00 committed by GitHub
commit ebac552d75
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 553 additions and 354 deletions

View file

@ -9,6 +9,11 @@ DATASET_LIST = [
"module_dir": "national_risk_index",
"class_name": "NationalRiskIndexETL",
},
{
"name": "travel_composite",
"module_dir": "dot_travel_composite",
"class_name": "TravelCompositeETL",
},
{
"name": "tree_equity_score",
"module_dir": "tree_equity_score",

View file

@ -156,3 +156,16 @@ datasets:
field_type: float
include_in_tiles: true
include_in_downloadable_files: true
- long_name: "DOT Travel Disadvantage Index"
short_name: "DOT"
module_name: "travel_composite"
input_geoid_tract_field_name: "GEOID10_TRACT"
load_fields:
- short_name: "travel_burden"
df_field_name: "TRAVEL_BURDEN_FIELD_NAME"
long_name: "DOT Travel Barriers Score"
field_type: float
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: true

View file

@ -296,6 +296,9 @@ TILES_SCORE_COLUMNS = {
field_names.FPL_200_SERIES: "FPL200S",
## Low high school for t&wd
field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
field_names.DOT_BURDEN_PCTILE_THRESHOLD: "TD_ET",
field_names.DOT_TRAVEL_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "TD_PFS"
## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
## FPL_200 (there is no higher ed in narwhal)
}
@ -348,4 +351,5 @@ TILES_SCORE_FLOAT_COLUMNS = [
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.COLLEGE_NON_ATTENDANCE_FIELD,
field_names.COLLEGE_ATTENDANCE_FIELD,
field_names.DOT_TRAVEL_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
]

View file

@ -8,6 +8,9 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.national_risk_index.etl import (
NationalRiskIndexETL,
)
from data_pipeline.etl.sources.dot_travel_composite.etl import (
TravelCompositeETL,
)
from data_pipeline.score.score_runner import ScoreRunner
from data_pipeline.score import field_names
from data_pipeline.etl.score import constants
@ -37,6 +40,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_2010_df: pd.DataFrame
self.child_opportunity_index_df: pd.DataFrame
self.hrs_df: pd.DataFrame
self.dot_travel_disadvantage_df: pd.DataFrame
def extract(self) -> None:
logger.info("Loading data sets from disk.")
@ -115,6 +119,9 @@ class ScoreETL(ExtractTransformLoad):
# Load FEMA national risk index data
self.national_risk_index_df = NationalRiskIndexETL.get_data_frame()
# Load DOT Travel Disadvantage
self.dot_travel_disadvantage_df = TravelCompositeETL.get_data_frame()
# Load GeoCorr Urban Rural Map
geocorr_urban_rural_csv = (
constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@ -334,6 +341,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_2010_df,
self.child_opportunity_index_df,
self.hrs_df,
self.dot_travel_disadvantage_df,
]
# Sanity check each data frame before merging.
@ -416,6 +424,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.HEALTHY_FOOD_FIELD,
field_names.IMPENETRABLE_SURFACES_FIELD,
field_names.UST_FIELD,
field_names.DOT_TRAVEL_BURDEN_FIELD,
field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
]

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,16 @@
# DOT travel barriers
The below description is taken from DOT directly:
Consistent with OMB's Interim Guidance for the Justice40 Initiative, DOT's interim definition of DACs includes (a) certain qualifying census tracts, (b) any Tribal land, or (c) any territory or possession of the United States. DOT has provided a mapping tool to assist applicants in identifying whether a project is located in a Disadvantaged Community, available at Transportation Disadvantaged Census Tracts (arcgis.com). A shapefile of the geospatial data is available as the Transportation Disadvantaged Census Tracts shapefile (version 2.0, posted 5/10/22).
The DOT interim definition for DACs was developed by an internal and external collaborative research process (see recordings from November 2021 public meetings). It includes data for 22 indicators collected at the census tract level and grouped into six (6) categories of transportation disadvantage. The numbers in parentheses show how many indicators fall in that category:
- Transportation access disadvantage identifies communities and places that spend more, and take longer, to get where they need to go. (4)
- Health disadvantage identifies communities based on variables associated with adverse health outcomes, disability, as well as environmental exposures. (3)
- Environmental disadvantage identifies communities with disproportionately high levels of certain air pollutants and high potential presence of lead-based paint in housing units. (6)
- Economic disadvantage identifies areas and populations with high poverty, low wealth, lack of local jobs, low homeownership, low educational attainment, and high inequality. (7)
- Resilience disadvantage identifies communities vulnerable to hazards caused by climate change. (1)
- Equity disadvantage identifies communities with a high percentile of persons (age 5+) who speak English "less than well." (1)
The CEJST uses only Transportation Access Disadvantage.

View file

@ -0,0 +1,59 @@
# pylint: disable=unsubscriptable-object
# pylint: disable=unsupported-assignment-operation
import pandas as pd
import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class TravelCompositeETL(ExtractTransformLoad):
    """ETL step for the DOT Travel Disadvantage shapefile.

    Reads the DOT disadvantage layer, renames the tract-ID and travel-burden
    columns to the pipeline's field names, and drops rows that have no
    tract ID.
    """

    NAME = "travel_composite"
    SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

    # Annotation only (keeps linters happy); the value itself is supplied
    # through the dataset configuration in datasets.yml.
    TRAVEL_BURDEN_FIELD_NAME: str

    def __init__(self):
        # Full path of the input shapefile inside this ETL's temp directory.
        self.INPUT_SHP = (
            self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
        )

        # Main dataframe (declared here, not assigned).
        self.df: pd.DataFrame

        # Dataset-specific column names as they appear in the shapefile.
        # "Transp_TH" is the average of the transportation indicator
        # percentiles: mean of EPL_TCB, EPL_NWKI, EPL_NOVEH and EPL_COMMUTE,
        # excluding NULLs. See the dataset metadata for details.
        self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
        self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"

    def transform(self) -> None:
        """Load the shapefile and prepare ``self.output_df`` for ``load()``.

        Steps:
        - Read the shapefile at ``self.INPUT_SHP``.
        - Rename the tract-ID column to ``self.GEOID_TRACT_FIELD_NAME`` and
          the travel-disadvantage column to ``self.TRAVEL_BURDEN_FIELD_NAME``.
        - Drop rows whose tract ID is missing.

        NOTE(review): ``GEOID_TRACT_FIELD_NAME`` is not defined in this class;
        presumably it is inherited from ``ExtractTransformLoad`` -- confirm.
        """
        logger.info("Transforming DOT Travel Disadvantage Data")

        # Read the raw layer first, then map source columns onto the
        # pipeline's standard names so the frame can be merged downstream.
        raw_layer: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
        column_renames = {
            self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
            self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
        }
        renamed_layer = raw_layer.rename(columns=column_renames)

        # Rows without a tract ID cannot be joined to other datasets.
        tracts_only = renamed_layer.dropna(subset=[self.GEOID_TRACT_FIELD_NAME])

        # Hand the result to the class' output_df for the load method.
        self.output_df = tracts_only