mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-25 02:54:19 -08:00
* working notebook * updating notebook * wip * fixing broken tests * adding tribal overlap files * WIP * WIP * WIP, calculated count and names * working * partial cleanup * partial cleanup * updating field names * fixing bug * removing pyogrio * removing unused imports * updating test fixtures to be more realistic * cleaning up notebook * fixing black * fixing flake8 errors * adding tox instructions * updating etl_score * suppressing warning * Use projected CRSes, ignore geom types (#1900) I looked into this a bit, and in general the geometry type mismatch changes very little about the calculation; we have a mix of multipolygons and polygons. The fastest thing to do is just not keep geom type; I did some runs with it set to both True and False, and they're the same within 9 digits of precision. Logically we just want the overlaps, regardless of how the actual geometries are encoded between the frames, so we can in this case ignore the geom types and feel OKAY. I also moved to projected CRSes, since we are actually trying to do area calculations and so like, we should. Again, the change is small in magnitude but logically more sound. * Readd CDC dataset config (#1900) * adding comments to fips code * delete unnecessary loggers Co-authored-by: matt bowen <matthew.r.bowen@omb.eop.gov>
207 lines
6.2 KiB
Python
207 lines
6.2 KiB
Python
from pathlib import Path
|
|
import geopandas as gpd
|
|
import pandas as pd
|
|
|
|
from data_pipeline.etl.base import ExtractTransformLoad
|
|
from data_pipeline.score import field_names
|
|
from data_pipeline.utils import get_module_logger, unzip_file_from_url
|
|
|
|
# Logger for this module, obtained through the project's get_module_logger helper.
logger = get_module_logger(__name__)
|
|
|
|
|
|
class TribalETL(ExtractTransformLoad):
    """ETL that combines the BIA and Alaska Native Villages tribal GeoJSON
    sources into a single national tribal GeoJSON file.

    ``DATA_PATH`` and ``TMP_PATH`` are not defined in this class; they are
    presumably supplied by ``ExtractTransformLoad`` -- confirm against the
    base class.
    """

    def __init__(self):
        self.GEOJSON_BASE_PATH = self.DATA_PATH / "tribal" / "geojson"
        # NOTE(review): CSV_BASE_PATH is not read anywhere in this class; it
        # is kept because code outside this file may rely on it.
        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
        self.NATIONAL_TRIBAL_GEOJSON_PATH = self.GEOJSON_BASE_PATH / "usa.json"
        # Accumulates one GeoDataFrame per source; filled by transform() and
        # concatenated by load().
        self.USA_TRIBAL_DF_LIST = []

    def extract(self) -> None:
        """Extract the tribal geojson zip files from Justice40 S3 data folder

        Each archive is handed to ``unzip_file_from_url`` together with
        ``self.TMP_PATH`` and a per-source directory under
        ``self.GEOJSON_BASE_PATH``.

        Returns:
            None
        """
        logger.info("Downloading Tribal Data")

        bia_geojson_url = "https://justice40-data.s3.amazonaws.com/data-sources/BIA_National_LAR_json.zip"
        alaska_geojson_url = "https://justice40-data.s3.amazonaws.com/data-sources/Alaska_Native_Villages_json.zip"

        # Target directories are built from GEOJSON_BASE_PATH so they stay in
        # step with the paths transform() reads from.
        unzip_file_from_url(
            bia_geojson_url,
            self.TMP_PATH,
            self.GEOJSON_BASE_PATH / "bia_national_lar",
        )
        unzip_file_from_url(
            alaska_geojson_url,
            self.TMP_PATH,
            self.GEOJSON_BASE_PATH / "alaska_native_villages",
        )

    def _append_tribal_geojson(
        self,
        tribal_geojson_path: Path,
        column_renames: dict,
        drop_columns: tuple = (),
    ) -> None:
        """Read one tribal GeoJSON, normalize its columns and append it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson
            column_renames (dict): source column name -> standard field name
            drop_columns (tuple): source columns to remove before renaming

        Returns:
            None

        Raises:
            KeyError: if a column in ``drop_columns`` is missing from the file
                (pandas ``drop`` default of ``errors="raise"``)
        """
        tribal_df = gpd.read_file(tribal_geojson_path)
        if drop_columns:
            tribal_df = tribal_df.drop(columns=list(drop_columns))
        tribal_df = tribal_df.rename(columns=column_renames)
        self.USA_TRIBAL_DF_LIST.append(tribal_df)

    def _transform_bia_national_lar(self, tribal_geojson_path: Path) -> None:
        """Transform the Tribal BIA National Lar Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """
        self._append_tribal_geojson(
            tribal_geojson_path,
            column_renames={
                "LARID": field_names.TRIBAL_ID,
                "LARName": field_names.TRIBAL_LAND_AREA_NAME,
            },
            drop_columns=("OBJECTID", "GISAcres", "Shape_Length", "Shape_Area"),
        )

    def _transform_bia_aian_supplemental(
        self, tribal_geojson_path: Path
    ) -> None:
        """Transform the Tribal BIA Supplemental Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """
        # This source keeps OBJECTID: it is renamed to the tribal id below.
        self._append_tribal_geojson(
            tribal_geojson_path,
            column_renames={
                "OBJECTID": field_names.TRIBAL_ID,
                "Land_Area_": field_names.TRIBAL_LAND_AREA_NAME,
            },
            drop_columns=("GISAcres", "Source", "Shape_Length", "Shape_Area"),
        )

    def _transform_bia_tsa(self, tribal_geojson_path: Path) -> None:
        """Transform the Tribal BIA TSA Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """
        self._append_tribal_geojson(
            tribal_geojson_path,
            column_renames={
                "TSAID": field_names.TRIBAL_ID,
                "LARName": field_names.TRIBAL_LAND_AREA_NAME,
            },
            drop_columns=("OBJECTID", "GISAcres", "Shape_Length", "Shape_Area"),
        )

    def _transform_alaska_native_villages(
        self, tribal_geojson_path: Path
    ) -> None:
        """Transform the Alaska Native Villages Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """
        # No columns are dropped for this source; only the id and name
        # columns are renamed.
        self._append_tribal_geojson(
            tribal_geojson_path,
            column_renames={
                "GlobalID": field_names.TRIBAL_ID,
                "TRIBALOFFICENAME": field_names.TRIBAL_LAND_AREA_NAME,
            },
        )

    def transform(self) -> None:
        """Transform the tribal geojson files into per-source GeoDataFrames
        collected in ``self.USA_TRIBAL_DF_LIST``

        Returns:
            None
        """
        logger.info("Transforming Tribal Data")

        # Start from an empty list so that calling transform() more than once
        # does not duplicate every feature in the national output.
        self.USA_TRIBAL_DF_LIST = []

        # locate the geojsons unpacked by extract()
        bia_dir = self.GEOJSON_BASE_PATH / "bia_national_lar"
        alaska_dir = self.GEOJSON_BASE_PATH / "alaska_native_villages"

        self._transform_bia_national_lar(bia_dir / "BIA_National_LAR.json")
        self._transform_bia_aian_supplemental(
            bia_dir / "BIA_AIAN_Supplemental.json"
        )
        self._transform_bia_tsa(bia_dir / "BIA_TSA.json")
        self._transform_alaska_native_villages(
            alaska_dir / "AlaskaNativeVillages.gdb.geojson"
        )

    def load(self) -> None:
        """Create the tribal national GeoJSON

        Concatenates the frames in ``self.USA_TRIBAL_DF_LIST``, reprojects the
        result to WGS84 longitude/latitude and writes it to
        ``self.NATIONAL_TRIBAL_GEOJSON_PATH``. No CSV is written here.

        Returns:
            None

        Raises:
            ValueError: if there is nothing to save because transform() has
                not populated ``self.USA_TRIBAL_DF_LIST``
        """
        logger.info("Saving Tribal GeoJson and CSV")

        # pd.concat raises an opaque ValueError on an empty list; fail with
        # the same exception type but an actionable message.
        if not self.USA_TRIBAL_DF_LIST:
            raise ValueError(
                "No tribal data to save; run transform() before load()"
            )

        usa_tribal_df = gpd.GeoDataFrame(
            pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True)
        )
        usa_tribal_df = usa_tribal_df.to_crs(
            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
        )
        logger.info("Writing national geojson file")
        usa_tribal_df.to_file(
            self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
        )