Adding DOT composite to travel score (#1820)

This adds the DOT dataset to the ETL and to the score. Note that currently we take a percentile of an average of percentiles.
This commit is contained in:
Emma Nechamkin 2022-08-16 14:44:39 -04:00 committed by GitHub
commit ebac552d75
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 553 additions and 354 deletions

View file

@ -260,6 +260,12 @@ fields:
- score_name: Greater than or equal to the 90th percentile for leaky underground storage tanks and is low income?
label: Greater than or equal to the 90th percentile for leaky underground storage tanks and is low income?
format: bool
- score_name: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
label: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
format: bool
- score_name: DOT Travel Barriers Score (percentile)
label: DOT Travel Barriers Score (percentile)
format: percentage
- score_name: Leaky underground storage tanks (percentile)
label: Leaky underground storage tanks (percentile)
format: percentage

View file

@ -258,6 +258,12 @@ sheets:
- score_name: Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)
label: Unemployment (percent) in 2009 (island areas) and 2010 (states and PR)
format: percentage
- score_name: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
label: Greater than or equal to the 90th percentile for DOT transit barriers and is low income?
format: bool
- score_name: DOT Travel Barriers Score (percentile)
label: DOT Travel Barriers Score (percentile)
format: percentage
- score_name: Percentage households below 100% of federal poverty line in 2009 (island areas) and 2010 (states and PR)
label: Percentage households below 100% of federal poverty line in 2009 (island areas) and 2010 (states and PR)
format: percentage

View file

@ -9,6 +9,11 @@ DATASET_LIST = [
"module_dir": "national_risk_index",
"class_name": "NationalRiskIndexETL",
},
{
"name": "travel_composite",
"module_dir": "dot_travel_composite",
"class_name": "TravelCompositeETL",
},
{
"name": "tree_equity_score",
"module_dir": "tree_equity_score",

View file

@ -156,3 +156,16 @@ datasets:
field_type: float
include_in_tiles: true
include_in_downloadable_files: true
- long_name: "DOT Travel Disadvantage Index"
short_name: "DOT"
module_name: "travel_composite"
input_geoid_tract_field_name: "GEOID10_TRACT"
load_fields:
- short_name: "travel_burden"
df_field_name: "TRAVEL_BURDEN_FIELD_NAME"
long_name: "DOT Travel Barriers Score"
field_type: float
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: true

View file

@ -296,6 +296,9 @@ TILES_SCORE_COLUMNS = {
field_names.FPL_200_SERIES: "FPL200S",
## Low high school for t&wd
field_names.WORKFORCE_SOCIO_INDICATORS_EXCEEDED: "M_WKFC_EBSI",
field_names.DOT_BURDEN_PCTILE_THRESHOLD: "TD_ET",
field_names.DOT_TRAVEL_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX: "TD_PFS"
## FPL 200 and low higher ed for all others should no longer be M_EBSI, but rather
## FPL_200 (there is no higher ed in narwhal)
}
@ -348,4 +351,5 @@ TILES_SCORE_FLOAT_COLUMNS = [
field_names.WASTEWATER_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
field_names.COLLEGE_NON_ATTENDANCE_FIELD,
field_names.COLLEGE_ATTENDANCE_FIELD,
field_names.DOT_TRAVEL_BURDEN_FIELD + field_names.PERCENTILE_FIELD_SUFFIX,
]

View file

@ -8,6 +8,9 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.national_risk_index.etl import (
NationalRiskIndexETL,
)
from data_pipeline.etl.sources.dot_travel_composite.etl import (
TravelCompositeETL,
)
from data_pipeline.score.score_runner import ScoreRunner
from data_pipeline.score import field_names
from data_pipeline.etl.score import constants
@ -37,6 +40,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_2010_df: pd.DataFrame
self.child_opportunity_index_df: pd.DataFrame
self.hrs_df: pd.DataFrame
self.dot_travel_disadvantage_df: pd.DataFrame
def extract(self) -> None:
logger.info("Loading data sets from disk.")
@ -115,6 +119,9 @@ class ScoreETL(ExtractTransformLoad):
# Load FEMA national risk index data
self.national_risk_index_df = NationalRiskIndexETL.get_data_frame()
# Load DOT Travel Disadvantage
self.dot_travel_disadvantage_df = TravelCompositeETL.get_data_frame()
# Load GeoCorr Urban Rural Map
geocorr_urban_rural_csv = (
constants.DATA_PATH / "dataset" / "geocorr" / "usa.csv"
@ -334,6 +341,7 @@ class ScoreETL(ExtractTransformLoad):
self.census_2010_df,
self.child_opportunity_index_df,
self.hrs_df,
self.dot_travel_disadvantage_df,
]
# Sanity check each data frame before merging.
@ -416,6 +424,7 @@ class ScoreETL(ExtractTransformLoad):
field_names.HEALTHY_FOOD_FIELD,
field_names.IMPENETRABLE_SURFACES_FIELD,
field_names.UST_FIELD,
field_names.DOT_TRAVEL_BURDEN_FIELD,
field_names.AGRICULTURAL_VALUE_BOOL_FIELD,
field_names.POVERTY_LESS_THAN_200_FPL_IMPUTED_FIELD,
]

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,16 @@
# DOT travel barriers
The below description is taken from DOT directly:
Consistent with OMB's Interim Guidance for the Justice40 Initiative, DOT's interim definition of DACs includes (a) certain qualifying census tracts, (b) any Tribal land, or (c) any territory or possession of the United States. DOT has provided a mapping tool to assist applicants in identifying whether a project is located in a Disadvantaged Community, available at Transportation Disadvantaged Census Tracts (arcgis.com). A shapefile of the geospatial data is available as the Transportation Disadvantaged Census Tracts shapefile (version 2.0, posted 5/10/22).
The DOT interim definition for DACs was developed by an internal and external collaborative research process (see recordings from November 2021 public meetings). It includes data for 22 indicators collected at the census tract level and grouped into six (6) categories of transportation disadvantage. The numbers in parentheses show how many indicators fall in that category:
- Transportation access disadvantage identifies communities and places that spend more, and take longer, to get where they need to go. (4)
- Health disadvantage identifies communities based on variables associated with adverse health outcomes, disability, as well as environmental exposures. (3)
- Environmental disadvantage identifies communities with disproportionately high levels of certain air pollutants and high potential presence of lead-based paint in housing units. (6)
- Economic disadvantage identifies areas and populations with high poverty, low wealth, lack of local jobs, low homeownership, low educational attainment, and high inequality. (7)
- Resilience disadvantage identifies communities vulnerable to hazards caused by climate change. (1)
- Equity disadvantage identifies communities with a high percentile of persons (age 5+) who speak English "less than well." (1)
The CEJST uses only Transportation Access Disadvantage.

View file

@ -0,0 +1,59 @@
# pylint: disable=unsubscriptable-object
# pylint: disable=unsupported-assignment-operation
import pandas as pd
import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class TravelCompositeETL(ExtractTransformLoad):
    """ETL for the DOT Travel Disadvantage layer, keyed by census tract.

    The source is a zipped shapefile published by DOT. ``transform`` reads
    the unzipped ``.shp`` file, renames the tract identifier and the
    transportation composite column, and drops rows with no tract identifier.
    """

    NAME = "travel_composite"
    SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

    # Name of the output score column. Declared (without a value) so linters
    # know the attribute exists; the value is set from datasets.yml.
    TRAVEL_BURDEN_FIELD_NAME: str

    def __init__(self):
        # Full path to the unzipped input shapefile.
        self.INPUT_SHP = (
            self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
        )

        # Main dataframe (annotation only; nothing is assigned here).
        self.df: pd.DataFrame

        # Dataset-specific column names as they appear in the shapefile.
        #
        # "Transp_TH" is the average of the transportation indicator
        # percentiles: the mean of EPL_TCB, EPL_NWKI, EPL_NOVEH and
        # EPL_COMMUTE, excluding nulls. See the DOT metadata for details.
        self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
        self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"

    def transform(self) -> None:
        """Read the unzipped shapefile and prepare ``self.output_df``.

        Steps:
        - Load the shapefile at ``self.INPUT_SHP`` with geopandas.
        - Rename the source tract column to ``self.GEOID_TRACT_FIELD_NAME``
          and the source composite column to
          ``self.TRAVEL_BURDEN_FIELD_NAME`` so the data can be merged with
          the other datasets.
        - Drop rows whose tract identifier is missing.

        All other columns read from the shapefile are left in place.
        """
        logger.info("Transforming DOT Travel Disadvantage Data")

        # Load the layer first, then build the source -> output name mapping.
        raw_layer: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
        column_renames = {
            self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
            self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
        }
        renamed_layer = raw_layer.rename(columns=column_renames)

        # Rows without a tract identifier cannot be merged, so remove them.
        tract_rows = renamed_layer.dropna(subset=[self.GEOID_TRACT_FIELD_NAME])

        # Hand the result to the load step via output_df.
        self.output_df = tract_rows

View file

@ -344,6 +344,9 @@ CDC_SVI_INDEX_RPL_THEMES_OVERALL_FIELD: str = (
)
CDC_SVI_INDEX_THEMES_PRIORITY_COMMUNITY: str = "At or above 90 for overall percentile ranking according to Social Vulnerability Indices"
# DOT Travel Burden Data
DOT_TRAVEL_BURDEN_FIELD: str = "DOT Travel Barriers Score"
# Maryland EJSCREEN Data.
MARYLAND_EJSCREEN_SCORE_FIELD: str = "Maryland Environmental Justice Score"
@ -416,6 +419,7 @@ DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD = (
)
TRAFFIC_PROXIMITY_LOW_INCOME_FIELD = f"Greater than or equal to the {PERCENTILE}th percentile for traffic proximity and is low income?"
# Affordable and Sustainable Housing
LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_FIELD = (
f"Greater than or equal to the {PERCENTILE}th percentile for lead paint and"
@ -494,6 +498,10 @@ TRAFFIC_PROXIMITY_LOW_INCOME_LOW_HIGHER_ED_FIELD = (
f"traffic proximity{SCORE_M_LOW_INCOME_SUFFIX}?"
)
DOT_TRAVEL_BURDEN_LOW_INCOME_FIELD = (
f"Greater than or equal to the {PERCENTILE}th percentile "
f"for DOT transit barriers and is low income?"
)
# Affordable and Sustainable Housing
LEAD_PAINT_MEDIAN_HOUSE_VALUE_LOW_INCOME_LOW_HIGHER_ED_FIELD = (
f"Greater than or equal to the {PERCENTILE}th percentile for lead paint,"
@ -624,6 +632,7 @@ PM25_EXCEEDS_PCTILE_THRESHOLD = (
)
DIESEL_EXCEEDS_PCTILE_THRESHOLD = f"Greater than or equal to the {PERCENTILE}th percentile for diesel particulate matter"
TRAFFIC_PROXIMITY_PCTILE_THRESHOLD = f"Greater than or equal to the {PERCENTILE}th percentile for traffic proximity"
DOT_BURDEN_PCTILE_THRESHOLD = f"Greater than or equal to the {PERCENTILE}th percentile for DOT travel barriers"
LEAD_PAINT_PROXY_PCTILE_THRESHOLD = (
f"Greater than or equal to the {PERCENTILE}th percentile for lead paint and"
f" the median house value is less than {MEDIAN_HOUSE_VALUE_PERCENTILE}th "

View file

@ -246,6 +246,8 @@ class ScoreNarwhal(Score):
# In Xth percentile or above for PM 2.5 (Source: EPA, Office of Air and Radiation (OAR) fusion of model and monitor data)]
# or
# In Xth percentile or above traffic proximity and volume (Source: 2017 U.S. Department of Transportation (DOT) traffic data
# or
# In Xth percentile or above for DOT Travel Disadvantage
# AND
# Low income: In Nth percentile or above for percent of block group population
# of households where household income is less than or equal to twice the federal
@ -255,6 +257,7 @@ class ScoreNarwhal(Score):
transportion_eligibility_columns = [
field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD,
field_names.TRAFFIC_PROXIMITY_LOW_INCOME_FIELD,
field_names.DOT_TRAVEL_BURDEN_LOW_INCOME_FIELD,
]
self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD] = (
@ -264,6 +267,14 @@ class ScoreNarwhal(Score):
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.DOT_BURDEN_PCTILE_THRESHOLD] = (
self.df[
field_names.DOT_TRAVEL_BURDEN_FIELD
+ field_names.PERCENTILE_FIELD_SUFFIX
]
>= self.ENVIRONMENTAL_BURDEN_THRESHOLD
)
self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD] = (
self.df[
field_names.TRAFFIC_FIELD + field_names.PERCENTILE_FIELD_SUFFIX
@ -274,6 +285,7 @@ class ScoreNarwhal(Score):
self.df[field_names.TRAFFIC_THRESHOLD_EXCEEDED] = (
self.df[field_names.TRAFFIC_PROXIMITY_PCTILE_THRESHOLD]
| self.df[field_names.DIESEL_EXCEEDS_PCTILE_THRESHOLD]
| self.df[field_names.DOT_BURDEN_PCTILE_THRESHOLD]
)
self.df[field_names.DIESEL_PARTICULATE_MATTER_LOW_INCOME_FIELD] = (
@ -286,6 +298,11 @@ class ScoreNarwhal(Score):
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self.df[field_names.DOT_TRAVEL_BURDEN_LOW_INCOME_FIELD] = (
self.df[field_names.DOT_BURDEN_PCTILE_THRESHOLD]
& self.df[field_names.FPL_200_SERIES_IMPUTED_AND_ADJUSTED]
)
self._increment_total_eligibility_exceeded(
transportion_eligibility_columns,
skip_fips=constants.DROP_FIPS_FROM_NON_WTD_THRESHOLDS,