Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
2025-07-23 05:10:36 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/tests/sources/example/etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/example/etl.py
@@ -41,12 +41,12 @@ class ExampleETL(ExtractTransformLoad):

        logger.info(f"Extracting {zip_file_path}")
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-            zip_ref.extractall(self.TMP_PATH)
+            zip_ref.extractall(self.get_tmp_path())

    def transform(self):
-        logger.info(f"Loading file from {self.TMP_PATH / 'input.csv'}.")
+        logger.info(f"Loading file from {self.get_tmp_path() / 'input.csv'}.")
        df: pd.DataFrame = pd.read_csv(
-            self.TMP_PATH / "input.csv",
+            self.get_tmp_path() / "input.csv",
            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
            low_memory=False,
        )
--- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
@@ -543,7 +543,7 @@ class TestETL:
            f"Writing data to {self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME}"
        )
        copy_data_files(
-            src=etl.TMP_PATH / "input.csv",
+            src=etl.get_tmp_path() / "input.csv",
            dst=self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME,
        )