Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
2025-07-28 10:51:16 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py
@@ -101,6 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
        self.df: pd.DataFrame

    def extract(self) -> None:
+        logger.info("Starting Census 2010 ACS Transform")
        # Define the variables to retrieve
        variables = (
            self.UNEMPLOYED_FIELDS
@@ -118,7 +119,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
        )

    def transform(self) -> None:
-        logger.info("Starting Census ACS Transform")
+        logger.info("Starting Census 2010 ACS Transform")

        df = self.df

@@ -184,7 +185,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
        self.df = output_df

    def load(self) -> None:
-        logger.info("Saving Census ACS Data")
+        logger.info("Saving Census 2010 ACS Data")

        # mkdir census
        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)