j40-cejst-2/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py

213 lines
7.6 KiB
Python
Raw Normal View History

import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import (
add_tracts_for_geometries,
get_tribal_geojson,
get_tract_geojson,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class TribalOverlapETL(ExtractTransformLoad):
"""Calculates the overlap between Census tracts and Tribal boundaries."""
# Metadata for the baseclass
NAME = "tribal_overlap"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
ALASKA_AND_HAWAII_EXPECTED_IN_DATA = True
EXPECTED_MISSING_STATES = [
# 15 is Hawaii, which has Hawaiian Home Lands, but they are not included in
# this dataset.
"15",
# The following states do not have any federally recognized Tribes in this
# dataset.
"10",
"11",
"13",
"17",
"18",
"21",
"24",
"33",
"34",
"39",
"50",
"51",
"54",
]
# A Tribal area that requires some special processing.
ANNETTE_ISLAND_TRIBAL_NAME = "Annette Island LAR"
CRS_INTEGER = 3857
# Define these for easy code completion
def __init__(self):
self.COLUMNS_TO_KEEP = [
self.GEOID_TRACT_FIELD_NAME,
field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT,
field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
]
self.output_df: pd.DataFrame
self.census_tract_gdf: gpd.GeoDataFrame
self.tribal_gdf: gpd.GeoDataFrame
@staticmethod
def _create_string_from_list(series: pd.Series) -> str:
"""Helper method that creates a sorted string list (for tribal names)."""
str_list = series.tolist()
str_list = sorted(str_list)
return ", ".join(str_list)
def extract(self) -> None:
self.census_tract_gdf = get_tract_geojson()
self.tribal_gdf = get_tribal_geojson()
def transform(self) -> None:
logger.info("Starting tribal overlap transforms.")
# First, calculate whether tracts include any areas from the Tribal areas,
# for both the points in AK and the polygons in the continental US (CONUS).
tribal_overlap_with_tracts = add_tracts_for_geometries(
df=self.tribal_gdf, tract_data=self.census_tract_gdf
)
tribal_overlap_with_tracts = tribal_overlap_with_tracts.groupby(
[self.GEOID_TRACT_FIELD_NAME]
).agg(
{
field_names.TRIBAL_ID: "count",
field_names.TRIBAL_LAND_AREA_NAME: self._create_string_from_list,
}
)
tribal_overlap_with_tracts = tribal_overlap_with_tracts.reset_index()
tribal_overlap_with_tracts = tribal_overlap_with_tracts.rename(
columns={
field_names.TRIBAL_ID: field_names.COUNT_OF_TRIBAL_AREAS_IN_TRACT,
field_names.TRIBAL_LAND_AREA_NAME: field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT,
}
)
# Second, calculate percentage overlap.
# Drop the points from the Tribal data (because these cannot be joined to a
# (Multi)Polygon tract data frame)
tribal_gdf_without_points = self.tribal_gdf[
self.tribal_gdf.geom_type.isin(["Polygon", "MultiPolygon"])
]
# Switch from geographic to projected CRSes
# because logically that's right
self.census_tract_gdf = self.census_tract_gdf.to_crs(
crs=self.CRS_INTEGER
)
tribal_gdf_without_points = tribal_gdf_without_points.to_crs(
crs=self.CRS_INTEGER
)
# Create a measure for the entire census tract area
self.census_tract_gdf["area_tract"] = self.census_tract_gdf.area
# Performing overlay funcion
# We have a mix of polygons and multipolygons, and we just want the overlaps
# without caring a ton about the specific types, so we ignore geom type.
# Realistically, this changes almost nothing in the calculation; True and False
# are the same within 9 digits of precision
gdf_joined = gpd.overlay(
self.census_tract_gdf,
tribal_gdf_without_points,
how="intersection",
keep_geom_type=False,
)
# Calculating the areas of the newly-created overlapping geometries
gdf_joined["area_joined"] = gdf_joined.area
# Calculating the areas of the newly-created geometries in relation
# to the original tract geometries
gdf_joined[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT] = (
gdf_joined["area_joined"] / gdf_joined["area_tract"]
)
# Aggregate the results
percentage_results = gdf_joined.groupby(
[self.GEOID_TRACT_FIELD_NAME]
).agg({field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT: "sum"})
percentage_results = percentage_results.reset_index()
# Merge the two results.
merged_output_df = tribal_overlap_with_tracts.merge(
right=percentage_results,
how="outer",
on=self.GEOID_TRACT_FIELD_NAME,
)
# Finally, fix one unique error.
# There is one unique Tribal area (self.ANNETTE_ISLAND_TRIBAL_NAME) that is a polygon in
# Alaska. All other Tribal areas in Alaska are points.
# For tracts that *only* contain that Tribal area, leave percentage as is.
# For tracts that include that Tribal area AND Alaska Native villages,
# null the percentage, because we cannot calculate the percent of the tract
# this is within Tribal areas.
# Create state FIPS codes.
merged_output_df_state_fips_code = merged_output_df[
self.GEOID_TRACT_FIELD_NAME
].str[0:2]
# Start by testing for Annette Island exception, to make sure data is as
# expected
alaskan_non_annette_matches = (
# Data from Alaska
(merged_output_df_state_fips_code == "02")
# Where the Tribal areas do *not* include Annette
& (
~merged_output_df[
field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT
].str.contains(self.ANNETTE_ISLAND_TRIBAL_NAME)
)
# But somehow percentage is greater than zero.
& (
merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT]
> 0
)
)
# There should be none of these matches.
if sum(alaskan_non_annette_matches) > 0:
raise ValueError(
"Data has changed. More than one Alaskan Tribal Area has polygon "
"boundaries. You'll need to refactor this ETL. \n"
f"Data:\n{merged_output_df[alaskan_non_annette_matches]}"
)
# Now, fix the exception that is already known.
merged_output_df[
field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT
] = np.where(
# For tracts inside Alaska
(merged_output_df_state_fips_code == "02")
# That are not only represented by Annette Island
& (
merged_output_df[field_names.NAMES_OF_TRIBAL_AREAS_IN_TRACT]
!= self.ANNETTE_ISLAND_TRIBAL_NAME
),
# Set the value to `None` for tracts with more than just Annette.
None,
# Otherwise, set the value to what it was.
merged_output_df[field_names.PERCENT_OF_TRIBAL_AREA_IN_TRACT],
)
self.output_df = merged_output_df