j40-cejst-2/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py

from pathlib import Path
import geopandas as gpd
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import get_module_logger, unzip_file_from_url

logger = get_module_logger(__name__)


class TribalETL(ExtractTransformLoad):
    def __init__(self):
        self.GEOJSON_BASE_PATH = self.DATA_PATH / "tribal" / "geojson"
        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
        self.NATIONAL_TRIBAL_GEOJSON_PATH = self.GEOJSON_BASE_PATH / "usa.json"
        self.USA_TRIBAL_DF_LIST = []

    def extract(self) -> None:
        """Extract the tribal geojson zip files from Justice40 S3 data folder

        Returns:
            None
        """
        logger.info("Downloading Tribal Data")

        bia_geojson_url = "https://justice40-data.s3.amazonaws.com/data-sources/BIA_National_LAR_json.zip"
        alaska_geojson_url = "https://justice40-data.s3.amazonaws.com/data-sources/Alaska_Native_Villages_json.zip"

        unzip_file_from_url(
            bia_geojson_url,
            self.TMP_PATH,
            self.DATA_PATH / "tribal" / "geojson" / "bia_national_lar",
        )

        unzip_file_from_url(
            alaska_geojson_url,
            self.TMP_PATH,
            self.DATA_PATH / "tribal" / "geojson" / "alaska_native_villages",
        )
        pass

    def _transform_bia_national_lar(self, tribal_geojson_path: Path) -> None:
        """Transform the Tribal BIA National Lar Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """

        bia_national_lar_df = gpd.read_file(tribal_geojson_path)

        bia_national_lar_df.drop(
            ["OBJECTID", "GISAcres", "Shape_Length", "Shape_Area"],
            axis=1,
            inplace=True,
        )

        bia_national_lar_df.rename(
            columns={"TSAID": "tribalId", "LARName": "landAreaName"},
            inplace=True,
        )

        self.USA_TRIBAL_DF_LIST.append(bia_national_lar_df)

    def _transform_bia_aian_supplemental(
        self, tribal_geojson_path: Path
    ) -> None:
        """Transform the Tribal BIA Supplemental Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """

        bia_aian_supplemental_df = gpd.read_file(tribal_geojson_path)

        bia_aian_supplemental_df.drop(
            ["GISAcres", "Source", "Shape_Length", "Shape_Area"],
            axis=1,
            inplace=True,
        )

        bia_aian_supplemental_df.rename(
            columns={"OBJECTID": "tribalId", "Land_Area_": "landAreaName"},
            inplace=True,
        )

        self.USA_TRIBAL_DF_LIST.append(bia_aian_supplemental_df)

    def _transform_bia_tsa(self, tribal_geojson_path: Path) -> None:
        """Transform the Tribal BIA TSA Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """

        bia_tsa_df = gpd.read_file(tribal_geojson_path)

        bia_tsa_df.drop(
            ["OBJECTID", "GISAcres", "Shape_Length", "Shape_Area"],
            axis=1,
            inplace=True,
        )

        bia_tsa_df.rename(
            columns={"TSAID": "tribalId", "LARName": "landAreaName"},
            inplace=True,
        )

        self.USA_TRIBAL_DF_LIST.append(bia_tsa_df)

    def _transform_alaska_native_villages(
        self, tribal_geojson_path: Path
    ) -> None:
        """Transform the Alaska Native Villages Geodataframe and appends it to the
        national Tribal Dataframe List

        Args:
            tribal_geojson_path (Path): the Path to the Tribal Geojson

        Returns:
            None
        """

        alaska_native_villages_df = gpd.read_file(tribal_geojson_path)

        alaska_native_villages_df.rename(
            columns={
                "GlobalID": "tribalId",
                "TRIBALOFFICENAME": "landAreaName",
            },
            inplace=True,
        )

        self.USA_TRIBAL_DF_LIST.append(alaska_native_villages_df)

    def transform(self) -> None:
        """Transform the tribal geojson files to generate national CSVs and GeoJSONs

        Returns:
            None
        """
        logger.info("Transforming Tribal Data")

        # load the geojsons
        bia_national_lar_geojson = (
            self.GEOJSON_BASE_PATH / "bia_national_lar" / "BIA_TSA.json"
        )
        bia_aian_supplemental_geojson = (
            self.GEOJSON_BASE_PATH
            / "bia_national_lar"
            / "BIA_AIAN_Supplemental.json"
        )
        bia_tsa_geojson_geojson = (
            self.GEOJSON_BASE_PATH / "bia_national_lar" / "BIA_TSA.json"
        )
        alaska_native_villages_geojson = (
            self.GEOJSON_BASE_PATH
            / "alaska_native_villages"
            / "AlaskaNativeVillages.gdb.geojson"
        )

        self._transform_bia_national_lar(bia_national_lar_geojson)
        self._transform_bia_aian_supplemental(bia_aian_supplemental_geojson)
        self._transform_bia_tsa(bia_tsa_geojson_geojson)
        self._transform_alaska_native_villages(alaska_native_villages_geojson)

    def load(self) -> None:
        """Create tribal national CSV and GeoJSON

        Returns:
            None
        """
        logger.info("Saving Tribal GeoJson and CSV")

        usa_tribal_df = gpd.GeoDataFrame(
            pd.concat(self.USA_TRIBAL_DF_LIST, ignore_index=True)
        )
        usa_tribal_df = usa_tribal_df.to_crs(
            "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
        )
        logger.info("Writing national geojson file")
        usa_tribal_df.to_file(
            self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
        )