Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
2025-07-26 09:51:16 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@ -98,6 +98,17 @@ class ExtractTransformLoad:
        )
        return output_file_path

+    def get_tmp_path(self) -> pathlib.Path:
+        """Returns this ETL class's temporary path, creating the directory if needed."""
+        # Note: the path is computed on each call from the class of the instance (often
+        # a child class), so it is named after that class under DATA_PATH / "tmp".
+        tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
+
+        # Create directory (and any missing parents) if it doesn't exist
+        tmp_path.mkdir(parents=True, exist_ok=True)
+
+        return tmp_path
+
    def extract(
        self,
        source_url: str = None,
@ -112,7 +123,7 @@ class ExtractTransformLoad:
        if source_url and extract_path:
            unzip_file_from_url(
                file_url=source_url,
-                download_path=self.TMP_PATH,
+                download_path=self.get_tmp_path(),
                unzipped_file_path=extract_path,
                verify=verify,
            )
@ -265,4 +276,4 @@ class ExtractTransformLoad:

    def cleanup(self) -> None:
        """Clears out any files stored in the TMP folder"""
-        remove_all_from_dir(self.TMP_PATH)
+        remove_all_from_dir(self.get_tmp_path())