Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for NRI (national_risk_index)
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc_places up
* removing unnecessary print
* moving H&T (housing_and_transportation) up
* adding parallel to geo post
* better census labels
* switching to concurrent.futures
* fixing output
2025-07-28 02:51:17 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py
@ -14,7 +14,7 @@ class CalEnviroScreenETL(ExtractTransformLoad):
            + "/CalEnviroScreen_4.0_2021.zip"
        )
        self.CALENVIROSCREEN_CSV = (
-            self.TMP_PATH / "CalEnviroScreen_4.0_2021.csv"
+            self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
        )
        self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"

@ -37,7 +37,7 @@ class CalEnviroScreenETL(ExtractTransformLoad):
        logger.info("Downloading CalEnviroScreen Data")
        super().extract(
            self.CALENVIROSCREEN_FTP_URL,
-            self.TMP_PATH,
+            self.get_tmp_path(),
        )

    def transform(self) -> None:
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
@ -29,7 +29,9 @@ class CDCLifeExpectancy(ExtractTransformLoad):
    def extract(self) -> None:
        logger.info("Starting data download.")

-        download_file_name = self.TMP_PATH / "cdc_life_expectancy" / "usa.csv"
+        download_file_name = (
+            self.get_tmp_path() / "cdc_life_expectancy" / "usa.csv"
+        )
        download_file_from_url(
            file_url=self.FILE_URL,
            download_file_name=download_file_name,
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py
@ -22,7 +22,7 @@ class CDCPlacesETL(ExtractTransformLoad):
        logger.info("Starting to download 520MB CDC Places file.")
        file_path = download_file_from_url(
            file_url=self.CDC_PLACES_URL,
-            download_file_name=self.TMP_PATH
+            download_file_name=self.get_tmp_path()
            / "cdc_places"
            / "census_tract.csv",
        )
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
@ -101,6 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
        self.df: pd.DataFrame

    def extract(self) -> None:
+        logger.info("Starting Census 2010 ACS Transform")
        # Define the variables to retrieve
        variables = (
            self.UNEMPLOYED_FIELDS
@ -118,7 +119,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
        )

    def transform(self) -> None:
-        logger.info("Starting Census ACS Transform")
+        logger.info("Starting Census 2010 ACS Transform")

        df = self.df

@ -184,7 +185,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
        self.df = output_df

    def load(self) -> None:
-        logger.info("Saving Census ACS Data")
+        logger.info("Saving Census 2010 ACS Data")

        # mkdir census
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py
@ -238,12 +238,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
        unzip_file_from_url(
            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/geocorr2014_all_states_tracts_only.csv.zip",
-            download_path=self.TMP_PATH,
-            unzipped_file_path=self.TMP_PATH / "geocorr",
+            download_path=self.get_tmp_path(),
+            unzipped_file_path=self.get_tmp_path() / "geocorr",
        )

        self.raw_geocorr_df = pd.read_csv(
-            filepath_or_buffer=self.TMP_PATH
+            filepath_or_buffer=self.get_tmp_path()
            / "geocorr"
            / "geocorr2014_all_states_tracts_only.csv",
            # Skip second row, which has descriptions.
--- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
@ -57,12 +57,12 @@ class ChildOpportunityIndex(ExtractTransformLoad):

        unzip_file_from_url(
            file_url=self.COI_FILE_URL,
-            download_path=self.TMP_PATH,
-            unzipped_file_path=self.TMP_PATH / "child_opportunity_index",
+            download_path=self.get_tmp_path(),
+            unzipped_file_path=self.get_tmp_path() / "child_opportunity_index",
        )

        self.raw_df = pd.read_csv(
-            filepath_or_buffer=self.TMP_PATH
+            filepath_or_buffer=self.get_tmp_path()
            / "child_opportunity_index"
            / "raw.csv",
            # The following need to remain as strings for all of their digits, not get
--- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
@ -37,12 +37,12 @@ class DOEEnergyBurden(ExtractTransformLoad):

        unzip_file_from_url(
            file_url=self.DOE_FILE_URL,
-            download_path=self.TMP_PATH,
-            unzipped_file_path=self.TMP_PATH / "doe_energy_burden",
+            download_path=self.get_tmp_path(),
+            unzipped_file_path=self.get_tmp_path() / "doe_energy_burden",
        )

        self.raw_df = pd.read_csv(
-            filepath_or_buffer=self.TMP_PATH
+            filepath_or_buffer=self.get_tmp_path()
            / "doe_energy_burden"
            / "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
            # The following need to remain as strings for all of their digits, not get converted to numbers.
--- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py
@ -16,7 +16,7 @@ class EJSCREENETL(ExtractTransformLoad):

    def __init__(self):
        self.EJSCREEN_FTP_URL = "https://edap-arcgiscloud-data-commons.s3.amazonaws.com/EJSCREEN2020/EJSCREEN_Tract_2020_USPR.csv.zip"
-        self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_Tract_2020_USPR.csv"
+        self.EJSCREEN_CSV = self.get_tmp_path() / "EJSCREEN_Tract_2020_USPR.csv"
        self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
        self.df: pd.DataFrame

@ -45,7 +45,7 @@ class EJSCREENETL(ExtractTransformLoad):
        logger.info("Downloading EJScreen Data")
        super().extract(
            self.EJSCREEN_FTP_URL,
-            self.TMP_PATH,
+            self.get_tmp_path(),
            verify=False,  # EPA EJScreen end point has certificate issues often
        )

--- a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py
@ -52,13 +52,13 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):

        unzip_file_from_url(
            file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
-            download_path=self.TMP_PATH,
-            unzipped_file_path=self.TMP_PATH
+            download_path=self.get_tmp_path(),
+            unzipped_file_path=self.get_tmp_path()
            / "energy_definition_alternative_draft",
        )

        self.df = pd.read_csv(
-            filepath_or_buffer=self.TMP_PATH
+            filepath_or_buffer=self.get_tmp_path()
            / "energy_definition_alternative_draft"
            / "J40 alternative DAC definition.csv",
            # The following need to remain as strings for all of their digits, not get converted to numbers.
--- a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py
@ -71,12 +71,12 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):

        unzip_file_from_url(
            file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
-            download_path=self.TMP_PATH,
-            unzipped_file_path=self.TMP_PATH / "epa_rsei",
+            download_path=self.get_tmp_path(),
+            unzipped_file_path=self.get_tmp_path() / "epa_rsei",
        )

        self.df = pd.read_csv(
-            filepath_or_buffer=self.TMP_PATH
+            filepath_or_buffer=self.get_tmp_path()
            / "epa_rsei"
            / "CensusMicroTracts2019_2019_aggregated.csv",
            # The following need to remain as strings for all of their digits, not get
--- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py
@ -34,12 +34,12 @@ class GeoCorrETL(ExtractTransformLoad):
        unzip_file_from_url(
            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/geocorr_urban_rural.csv.zip",
-            download_path=self.TMP_PATH,
-            unzipped_file_path=self.TMP_PATH / "geocorr",
+            download_path=self.get_tmp_path(),
+            unzipped_file_path=self.get_tmp_path() / "geocorr",
        )

        self.df = pd.read_csv(
-            filepath_or_buffer=self.TMP_PATH
+            filepath_or_buffer=self.get_tmp_path()
            / "geocorr"
            / "geocorr_urban_rural.csv",
            dtype={
--- a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py
@ -21,14 +21,16 @@ class HousingTransportationETL(ExtractTransformLoad):
    def extract(self) -> None:
        # Download each state / territory individually
        dfs = []
-        zip_file_dir = self.TMP_PATH / "housing_and_transportation_index"
+        zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
        for fips in get_state_fips_codes(self.DATA_PATH):
            logger.info(
                f"Downloading housing data for state/territory with FIPS code {fips}"
            )

            unzip_file_from_url(
-                f"{self.HOUSING_FTP_URL}{fips}", self.TMP_PATH, zip_file_dir
+                f"{self.HOUSING_FTP_URL}{fips}",
+                self.get_tmp_path(),
+                zip_file_dir,
            )

            # New file name:
--- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py
@ -10,7 +10,7 @@ class HudHousingETL(ExtractTransformLoad):
        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "hud_housing"
        self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
        self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
-        self.HOUSING_ZIP_FILE_DIR = self.TMP_PATH / "hud_housing"
+        self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path() / "hud_housing"

        # We measure households earning less than 80% of HUD Area Median Family Income by county
        # and paying greater than 30% of their income to housing costs.
--- a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py
@ -12,7 +12,7 @@ class HudRecapETL(ExtractTransformLoad):
        # pylint: disable=line-too-long
        self.HUD_RECAP_CSV_URL = "https://opendata.arcgis.com/api/v3/datasets/56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"  # noqa: E501
        self.HUD_RECAP_CSV = (
-            self.TMP_PATH
+            self.get_tmp_path()
            / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
        )
        self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
--- a/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py
@ -19,8 +19,8 @@ class MappingForEJETL(ExtractTransformLoad):
        self.MAPPING_FOR_EJ_CO_URL = (
            settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
        )
-        self.VA_SHP_FILE_PATH = self.TMP_PATH / "mej_virginia_7_1.shp"
-        self.CO_SHP_FILE_PATH = self.TMP_PATH / "mej_colorado_final.shp"
+        self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
+        self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"

        # Defining variables
        self.COLUMNS_TO_KEEP = [
@ -43,11 +43,11 @@ class MappingForEJETL(ExtractTransformLoad):
        logger.info("Downloading Mapping for EJ Data")
        super().extract(
            self.MAPPING_FOR_EJ_VA_URL,
-            self.TMP_PATH,
+            self.get_tmp_path(),
        )
        super().extract(
            self.MAPPING_FOR_EJ_CO_URL,
-            self.TMP_PATH,
+            self.get_tmp_path(),
        )

    def transform(self) -> None:
--- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py
@ -25,7 +25,9 @@ class MappingInequalityETL(ExtractTransformLoad):
            "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
            "main/2010_Census_Tracts/holc_tract_lookup.csv"
        )
-        self.MAPPING_INEQUALITY_CSV = self.TMP_PATH / "holc_tract_lookup.csv"
+        self.MAPPING_INEQUALITY_CSV = (
+            self.get_tmp_path() / "holc_tract_lookup.csv"
+        )
        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"

        self.HOLC_MANUAL_MAPPING_CSV_PATH = (
--- a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py
@ -21,7 +21,7 @@ class MarylandEJScreenETL(ExtractTransformLoad):
            settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
        )

-        self.SHAPE_FILES_PATH = self.TMP_PATH / "mdejscreen"
+        self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
        self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"

        self.COLUMNS_TO_KEEP = [
@ -36,7 +36,7 @@ class MarylandEJScreenETL(ExtractTransformLoad):
        logger.info("Downloading 207MB Maryland EJSCREEN Data")
        super().extract(
            self.MARYLAND_EJSCREEN_URL,
-            self.TMP_PATH,
+            self.get_tmp_path(),
        )

    def transform(self) -> None:
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@ -20,7 +20,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

    def __init__(self):
-        self.INPUT_CSV = self.TMP_PATH / "NRI_Table_CensusTracts.csv"
+        self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"

        self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
            "EAL_SCORE"
@ -68,7 +68,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        logger.info("Downloading 405MB National Risk Index Data")
        super().extract(
            source_url=self.SOURCE_URL,
-            extract_path=self.TMP_PATH,
+            extract_path=self.get_tmp_path(),
        )

    def transform(self) -> None:
--- a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py
@ -75,12 +75,12 @@ class PersistentPovertyETL(ExtractTransformLoad):
    def extract(self) -> None:
        logger.info("Starting to download 86MB persistent poverty file.")

-        unzipped_file_path = self.TMP_PATH / "persistent_poverty"
+        unzipped_file_path = self.get_tmp_path() / "persistent_poverty"

        unzip_file_from_url(
            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/LTDB_Std_All_Sample.zip",
-            download_path=self.TMP_PATH,
+            download_path=self.get_tmp_path(),
            unzipped_file_path=unzipped_file_path,
        )

@ -93,7 +93,6 @@ class PersistentPovertyETL(ExtractTransformLoad):
        temporary_input_dfs = []

        for file_name in file_names:
-            print(file_name)
            temporary_input_df = pd.read_csv(
                filepath_or_buffer=unzipped_file_path
                / f"ltdb_std_all_sample/{file_name}",
--- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py
@ -21,7 +21,7 @@ class TreeEquityScoreETL(ExtractTransformLoad):

    def __init__(self):
        self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
-        self.TES_CSV = self.TMP_PATH / "tes_2021_data.csv"
+        self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
        self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
        self.df: gpd.GeoDataFrame
        self.states = [
@ -81,7 +81,7 @@ class TreeEquityScoreETL(ExtractTransformLoad):
        for state in self.states:
            super().extract(
                f"{self.TES_URL}{state}.zip.zip",
-                f"{self.TMP_PATH}/{state}",
+                f"{self.get_tmp_path()}/{state}",
            )

    def transform(self) -> None:
@ -89,7 +89,7 @@ class TreeEquityScoreETL(ExtractTransformLoad):
        tes_state_dfs = []
        for state in self.states:
            tes_state_dfs.append(
-                gpd.read_file(f"{self.TMP_PATH}/{state}/{state}.shp")
+                gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
            )
        self.df = gpd.GeoDataFrame(
            pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs