# j40-cejst-2/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py

import pandas as pd

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.utils import get_module_logger


logger = get_module_logger(__name__)


class HudRecapETL(ExtractTransformLoad):
    """ETL for HUD's Racially or Ethnically Concentrated Areas of Poverty
    (R/ECAP) CSV.

    The pipeline downloads the CSV, renames the tract identifier and the
    ``RCAP_Current`` flag, converts the flag to a boolean column and writes
    the result to ``<DATA_PATH>/dataset/hud_recap/usa.csv``.
    """

    def __init__(self):
        """Set up the download URL, the local paths and the field names."""

        # fetch
        # The CSV comes either from the project's AWS mirror or straight from
        # the ArcGIS open-data endpoint, depending on the settings flag.
        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
            self.hud_recap_csv_url = (
                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
            )
        else:
            self.hud_recap_csv_url = (
                "https://opendata.arcgis.com/api/v3/datasets/"
                "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
            )

        # input
        # Local location the downloaded CSV is stored at and read back from.
        self.hud_recap_source = (
            self.get_sources_path()
            / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
        )

        # output
        # Directory that ``load`` writes ``usa.csv`` into.
        self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"

        # Defining some variable names
        self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
            "hud_recap_priority_community"
        )

        # Populated by ``extract`` and reshaped in place by ``transform``.
        self.df: pd.DataFrame

    def get_data_sources(self) -> [DataSource]:
        """Return the single file data source this ETL needs.

        Returns:
            A one-element list holding a ``FileDataSource`` that maps the
            remote CSV URL to its local destination path.
        """
        return [
            FileDataSource(
                source=self.hud_recap_csv_url, destination=self.hud_recap_source
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        """Fetch the data sources and load the R/ECAP CSV into ``self.df``.

        Args:
            use_cached_data_sources: Forwarded unchanged to the parent
                class's ``extract``.
        """

        super().extract(
            use_cached_data_sources
        )  # download and extract data sources

        # Load the HUD R/ECAP data. GEOID is read as a string rather than a
        # number (presumably to keep leading zeros in tract IDs -- confirm).
        self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})

    def transform(self) -> None:
        """Rename the source columns, make the R/ECAP flag boolean and sort.

        ``self.df`` is modified in place; nothing is returned.
        """
        recap_field = self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME

        self.df.rename(
            columns={
                "GEOID": self.GEOID_TRACT_FIELD_NAME,
                # Interestingly, there's no data dictionary for the RECAP data that I could find.
                # However, this site (http://www.schousing.com/library/Tax%20Credit/2020/QAP%20Instructions%20(2).pdf)
                # suggests:
                # "If RCAP_Current for the tract in which the site is located is 1, the tract is an R/ECAP. If RCAP_Current is 0, it is not."
                "RCAP_Current": recap_field,
            },
            inplace=True,
        )

        # Convert to boolean. NaN is truthy under ``astype("bool")``, so a
        # missing flag would otherwise be reported as an R/ECAP tract; mask
        # missing values out so they become False instead.
        raw_flag = self.df[recap_field]
        self.df[recap_field] = raw_flag.notna() & raw_flag.astype("bool")

        self.df.sort_values(by=self.GEOID_TRACT_FIELD_NAME, inplace=True)

    def load(self) -> None:
        """Write the transformed frame to ``usa.csv`` under ``CSV_PATH``.

        The output directory is created if it does not exist yet.
        """
        # write nationwide csv
        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(self.CSV_PATH / "usa.csv", index=False)