mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-03 07:14:18 -07:00
Update logging messages and message consistency. This update includes changes to the level of many log messages. Rather than everything being logged at the info level, it differentiates between debug, info, warning, and error messages. It also changes the default log level to info to avoid much of the noise previously in the logs. It also removes many extra log messages, and adds additional decorators at the beginning of each pipeline run.
231 lines
6.5 KiB
Python
231 lines
6.5 KiB
Python
from pathlib import Path
|
|
|
|
import geopandas as gpd
|
|
import pandas as pd
|
|
from data_pipeline.config import settings
|
|
from data_pipeline.etl.base import ExtractTransformLoad
|
|
from data_pipeline.score import field_names
|
|
from data_pipeline.utils import get_module_logger
|
|
from data_pipeline.utils import unzip_file_from_url
|
|
|
|
# Module-level logger, obtained from the project's get_module_logger helper
# using this module's name; used for the debug messages emitted below.
logger = get_module_logger(__name__)
|
|
|
|
|
|
class TribalETL(ExtractTransformLoad):
    """ETL that combines several tribal boundary datasets into one GeoJSON.

    ``extract`` downloads and unzips three archives, ``transform`` reads four
    geographic files from them and normalizes their id/name columns, and
    ``load`` concatenates the results and writes a single national GeoJSON.
    """

    def __init__(self):
        # self.DATA_PATH is not assigned in this class; it is expected to be
        # provided by the ExtractTransformLoad base class.
        tribal_root = self.DATA_PATH / "tribal"

        self.GEOGRAPHIC_BASE_PATH = tribal_root / "geographic_data"
        self.CSV_BASE_PATH = tribal_root / "csv"
        self.NATIONAL_TRIBAL_GEOJSON_PATH = (
            self.GEOGRAPHIC_BASE_PATH / "usa.json"
        )

        # Filled by the _transform_* helpers and consumed by load().
        self.USA_TRIBAL_DF_LIST = []

    def extract(self) -> None:
        """Download and unzip the tribal archives from the Justice40 data source.

        Each archive is fetched from ``settings.AWS_JUSTICE40_DATASOURCES_URL``
        and unpacked into its own folder under ``GEOGRAPHIC_BASE_PATH``.

        Returns:
            None
        """
        # (archive name appended to the base URL, destination folder name)
        archives = (
            ("/BIA_National_LAR_updated_20220929.zip", "bia_national_lar"),
            ("/BIA_TSA_and_AIAN_json.zip", "tsa_and_aian"),
            ("/Alaska_Native_Villages_json.zip", "alaska_native_villages"),
        )

        for archive_name, target_folder in archives:
            unzip_file_from_url(
                settings.AWS_JUSTICE40_DATASOURCES_URL + archive_name,
                self.TMP_PATH,
                self.GEOGRAPHIC_BASE_PATH / target_folder,
            )

    def _transform_bia_national_lar(self, path: Path) -> None:
        """Read the BIA National LAR data and queue it for the national output.

        Drops the ``GISAcres`` column, renames ``LARID``/``LARName`` to the
        shared tribal id / land-area-name fields, and appends the frame to
        ``USA_TRIBAL_DF_LIST``.

        Args:
            path (Path): the Path to the BIA National LAR data

        Returns:
            None
        """
        lar_df = gpd.read_file(path)

        # NOTE(review): the original author marked this log line "DELETE";
        # it looks like leftover debugging output. Confirm before removing.
        logger.debug(f"Columns: {lar_df.columns}\n")

        lar_df = lar_df.drop(columns=["GISAcres"]).rename(
            columns={
                "LARID": field_names.TRIBAL_ID,
                "LARName": field_names.TRIBAL_LAND_AREA_NAME,
            }
        )

        self.USA_TRIBAL_DF_LIST.append(lar_df)

    def _transform_bia_aian_supplemental(
        self, tribal_geojson_path: Path
    ) -> None:
        """Read the BIA AIAN Supplemental GeoJSON and queue it for the national output.

        Drops ``GISAcres``, ``Source``, ``Shape_Length`` and ``Shape_Area``,
        renames ``OBJECTID``/``Land_Area_`` to the shared tribal id /
        land-area-name fields, and appends the frame to
        ``USA_TRIBAL_DF_LIST``.

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """
        unused_columns = ["GISAcres", "Source", "Shape_Length", "Shape_Area"]

        supplemental_df = (
            gpd.read_file(tribal_geojson_path)
            .drop(columns=unused_columns)
            .rename(
                columns={
                    "OBJECTID": field_names.TRIBAL_ID,
                    "Land_Area_": field_names.TRIBAL_LAND_AREA_NAME,
                }
            )
        )

        self.USA_TRIBAL_DF_LIST.append(supplemental_df)

    def _transform_bia_tsa(self, tribal_geojson_path: Path) -> None:
        """Read the BIA TSA GeoJSON and queue it for the national output.

        Drops ``OBJECTID``, ``GISAcres``, ``Shape_Length`` and ``Shape_Area``,
        renames ``TSAID``/``LARName`` to the shared tribal id /
        land-area-name fields, and appends the frame to
        ``USA_TRIBAL_DF_LIST``.

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """
        unused_columns = ["OBJECTID", "GISAcres", "Shape_Length", "Shape_Area"]

        tsa_df = (
            gpd.read_file(tribal_geojson_path)
            .drop(columns=unused_columns)
            .rename(
                columns={
                    "TSAID": field_names.TRIBAL_ID,
                    "LARName": field_names.TRIBAL_LAND_AREA_NAME,
                }
            )
        )

        self.USA_TRIBAL_DF_LIST.append(tsa_df)

    def _transform_alaska_native_villages(
        self, tribal_geojson_path: Path
    ) -> None:
        """Read the Alaska Native Villages GeoJSON and queue it for the national output.

        No columns are dropped; ``GlobalID``/``TRIBALOFFICENAME`` are renamed
        to the shared tribal id / land-area-name fields and the frame is
        appended to ``USA_TRIBAL_DF_LIST``.

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """
        villages_df = gpd.read_file(tribal_geojson_path).rename(
            columns={
                "GlobalID": field_names.TRIBAL_ID,
                "TRIBALOFFICENAME": field_names.TRIBAL_LAND_AREA_NAME,
            }
        )

        self.USA_TRIBAL_DF_LIST.append(villages_df)

    def transform(self) -> None:
        """Run every per-dataset transform against the extracted files.

        The inputs are looked up under ``GEOGRAPHIC_BASE_PATH`` in the folders
        that ``extract`` unzips into; each helper appends its frame to
        ``USA_TRIBAL_DF_LIST``.

        Returns:
            None
        """
        geo_root = self.GEOGRAPHIC_BASE_PATH
        tsa_and_aian_dir = geo_root / "tsa_and_aian"

        self._transform_bia_national_lar(geo_root / "bia_national_lar")
        self._transform_bia_aian_supplemental(
            tsa_and_aian_dir / "BIA_AIAN_Supplemental.json"
        )
        self._transform_bia_tsa(tsa_and_aian_dir / "BIA_TSA.json")
        self._transform_alaska_native_villages(
            geo_root
            / "alaska_native_villages"
            / "AlaskaNativeVillages.gdb.geojson"
        )

    def load(self) -> None:
        """Write the combined tribal data as a national GeoJSON file.

        Concatenates every frame in ``USA_TRIBAL_DF_LIST``, reprojects the
        result to WGS84 longitude/latitude, and writes it to
        ``NATIONAL_TRIBAL_GEOJSON_PATH``. Only the GeoJSON is written by this
        method; no CSV output is produced here.

        Returns:
            None
        """
        logger.debug("Saving Tribal GeoJson and CSV")

        combined_df = pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True)
        usa_tribal_df = gpd.GeoDataFrame(combined_df).to_crs(
            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
        )

        logger.debug("Writing national geojson file")
        usa_tribal_df.to_file(
            self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
        )
|