Run ETL processes in parallel (#1253)

* WIP on parallelizing

* switching to get_tmp_path for nri

* switching to get_tmp_path everywhere necessary

* fixing linter errors

* moving heavy ETLs to front of line

* add hold

* moving cdc places up

* removing unnecessary print

* moving h&t up

* adding parallel to geo post

* better census labels

* switching to concurrent futures (see the scheduling sketch just before the diff below)

* fixing output
Authored by Lucas Merrill Brown on 2022-02-11 14:04:53 -05:00; committed by GitHub
commit a0d6e55f0a
30 changed files with 286 additions and 160 deletions
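
Two of the commit-message items above, "moving heavy ETLs to front of line" and "switching to concurrent futures", describe one scheduling idea: with a fixed-size worker pool, submitting the longest-running jobs first tends to shorten total wall-clock time, because a heavy job started last would otherwise run alone after everything else has finished. A minimal, self-contained sketch of that idea follows; the job names and durations are invented for illustration, and the real ordering lives in the pipeline's dataset constants.

    import concurrent.futures
    import time

    # Invented stand-ins for ETL jobs; names and durations are illustrative only.
    JOBS = {"census": 8, "nri": 5, "cdc_places": 3, "small_source": 1}

    def run_job(name: str) -> str:
        time.sleep(JOBS[name] / 10)  # simulate work proportional to job size
        return name

    # Heaviest-first ordering: long jobs start early instead of trailing at the end.
    ordering = sorted(JOBS, key=JOBS.get, reverse=True)

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(run_job, name) for name in ordering]
        for fut in concurrent.futures.as_completed(futures):
            print(f"finished {fut.result()}")

The diff below, apparently from the pipeline's ETL runner module (data_pipeline/etl/runner.py, judging by its imports and function names), shows how the commit applies this pattern.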


@@ -1,13 +1,18 @@
 import importlib
+import concurrent.futures
+import typing

 from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL
 from data_pipeline.utils import get_module_logger

 from . import constants

 logger = get_module_logger(__name__)


-def get_datasets_to_run(dataset_to_run: str):
+def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
     """Returns a list of appropriate datasets to run given input args

     Args:
@@ -29,9 +34,36 @@ def get_datasets_to_run(dataset_to_run: str):
         else:
             # reset the list to just the dataset
             dataset_list = [dataset_element]

     return dataset_list


+def _run_one_dataset(dataset: dict) -> None:
+    """Runs one etl process."""
+    etl_module = importlib.import_module(
+        f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
+    )
+    etl_class = getattr(etl_module, dataset["class_name"])
+    etl_instance = etl_class()
+
+    # run extract
+    etl_instance.extract()
+
+    # run transform
+    etl_instance.transform()
+
+    # run load
+    etl_instance.load()
+
+    # run validate
+    etl_instance.validate()
+
+    # cleanup
+    etl_instance.cleanup()
+
+    logger.info(f"Finished `etl-run` for dataset `{dataset['name']}`.")
+
+
 def etl_runner(dataset_to_run: str = None) -> None:
     """Runs all etl processes or a specific one
@@ -41,33 +73,18 @@ def etl_runner(dataset_to_run: str = None) -> None:
     Returns:
         None
     """
-    dataset_list = get_datasets_to_run(dataset_to_run)
+    dataset_list = _get_datasets_to_run(dataset_to_run)

     # Run the ETLs for the dataset_list
-    for dataset in dataset_list:
-        etl_module = importlib.import_module(
-            f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
-        )
-        etl_class = getattr(etl_module, dataset["class_name"])
-        etl_instance = etl_class()
-
-        # run extract
-        etl_instance.extract()
-
-        # run transform
-        etl_instance.transform()
-
-        # run load
-        etl_instance.load()
-
-        # run validate
-        etl_instance.validate()
-
-        # cleanup
-        etl_instance.cleanup()
-
-    # update the front end JSON/CSV of list of data sources
-    pass
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(_run_one_dataset, dataset=dataset)
+            for dataset in dataset_list
+        }
+
+        for fut in concurrent.futures.as_completed(futures):
+            # Calling result will raise an exception if one occurred.
+            # Otherwise, the exceptions are silently ignored.
+            fut.result()


 def score_generate() -> None:
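
One detail worth underlining in the new etl_runner: executor.submit returns Future objects, and an exception raised inside a worker is stored on its Future rather than raised immediately, so the loop must call fut.result() to surface failures, exactly as the inline comment in the diff warns. A self-contained sketch of that behavior; the work function and its deliberate failure are invented for demonstration:

    import concurrent.futures

    def work(n: int) -> int:
        # Invented job: fails for n == 3 to show how worker errors surface.
        if n == 3:
            raise ValueError(f"job {n} failed")
        return n * n

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(work, n) for n in range(5)}
        for fut in concurrent.futures.as_completed(futures):
            try:
                print(fut.result())  # re-raises the worker's ValueError here
            except ValueError as err:
                print(f"caught: {err}")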