Mirror of https://github.com/DOI-DO/j40-cejst-2.git
Synced 2025-07-30 00:31:16 -07:00
Run ETL processes in parallel (#1253)
* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
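The headline change ("switching to concurrent futures") replaces a sequential for-loop over datasets with a concurrent.futures.ThreadPoolExecutor, as the diff below shows. A minimal, runnable sketch of the submit/as_completed pattern the commit adopts (the worker and task names here are illustrative, not the pipeline's own):

import concurrent.futures
import time


def run_one(name: str) -> str:
    """Stand-in for a per-dataset ETL job; sleeps to mimic I/O-bound work."""
    time.sleep(0.1)  # simulate a download or file write
    return f"finished {name}"


datasets = ["alpha", "bravo", "charlie"]
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit every job up front; the executor runs them on worker threads.
    futures = {executor.submit(run_one, name) for name in datasets}
    for fut in concurrent.futures.as_completed(futures):
        # result() re-raises any exception the worker hit; skipping this
        # call would let failures pass silently.
        print(fut.result())

Threads (rather than processes) are a plausible fit here because the heavy ETL steps are mostly downloads and file I/O, which release the GIL; that reasoning is an inference, not something the commit states.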
parent 389eb59ac4
commit a0d6e55f0a

30 changed files with 286 additions and 160 deletions
@@ -1,13 +1,18 @@
 import importlib
+import concurrent.futures
+import typing
 
 from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL
 from data_pipeline.utils import get_module_logger
 
 from . import constants
 
 logger = get_module_logger(__name__)
 
-def get_datasets_to_run(dataset_to_run: str):
+
+def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
     """Returns a list of appropriate datasets to run given input args
 
     Args:
@@ -29,9 +34,36 @@ def get_datasets_to_run(dataset_to_run: str):
         else:
             # reset the list to just the dataset
             dataset_list = [dataset_element]
 
     return dataset_list
 
 
+def _run_one_dataset(dataset: dict) -> None:
+    """Runs one etl process."""
+    etl_module = importlib.import_module(
+        f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
+    )
+    etl_class = getattr(etl_module, dataset["class_name"])
+    etl_instance = etl_class()
+
+    # run extract
+    etl_instance.extract()
+
+    # run transform
+    etl_instance.transform()
+
+    # run load
+    etl_instance.load()
+
+    # run validate
+    etl_instance.validate()
+
+    # cleanup
+    etl_instance.cleanup()
+
+    logger.info(f"Finished `etl-run` for dataset `{dataset['name']}`.")
+
+
 def etl_runner(dataset_to_run: str = None) -> None:
     """Runs all etl processes or a specific one
 
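The new _run_one_dataset factors the per-dataset lifecycle (extract, transform, load, validate, cleanup) out of etl_runner so each dataset becomes a single unit of work that can be handed to the executor. Its dynamic lookup builds a module path from the dataset dict's module_dir key and pulls the ETL class out by class_name. A runnable sketch of that importlib/getattr pattern, using a stdlib module in place of the real data_pipeline package:

import importlib

# Shape of a registry entry, per the keys _run_one_dataset reads; the
# module name here is stdlib-only so the sketch actually runs.
dataset = {"name": "decoder", "module_dir": "json", "class_name": "JSONDecoder"}

# Equivalent of: import data_pipeline.etl.sources.<module_dir>.etl
etl_module = importlib.import_module(dataset["module_dir"])
# Pull the class out of the module by name, then instantiate it.
etl_class = getattr(etl_module, dataset["class_name"])
etl_instance = etl_class()
print(type(etl_instance))  # <class 'json.decoder.JSONDecoder'>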
@@ -41,33 +73,18 @@ def etl_runner(dataset_to_run: str = None) -> None:
     Returns:
         None
     """
-    dataset_list = get_datasets_to_run(dataset_to_run)
+    dataset_list = _get_datasets_to_run(dataset_to_run)
 
-    # Run the ETLs for the dataset_list
-    for dataset in dataset_list:
-        etl_module = importlib.import_module(
-            f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
-        )
-        etl_class = getattr(etl_module, dataset["class_name"])
-        etl_instance = etl_class()
-
-        # run extract
-        etl_instance.extract()
-
-        # run transform
-        etl_instance.transform()
-
-        # run load
-        etl_instance.load()
-
-        # run validate
-        etl_instance.validate()
-
-        # cleanup
-        etl_instance.cleanup()
-
-        # update the front end JSON/CSV of list of data sources
-        pass
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(_run_one_dataset, dataset=dataset)
+            for dataset in dataset_list
+        }
+
+        for fut in concurrent.futures.as_completed(futures):
+            # Calling result will raise an exception if one occurred.
+            # Otherwise, the exceptions are silently ignored.
+            fut.result()
 
 
 def score_generate() -> None:
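The public entry point is unchanged: etl_runner still takes an optional dataset name. A hypothetical invocation (the import path and dataset name are assumptions; this diff shows neither the module's location nor the registry contents):

# Import path and dataset name are illustrative assumptions.
from data_pipeline.etl.runner import etl_runner

etl_runner()          # run every registered dataset, now in parallel
etl_runner("census")  # run a single dataset's ETL on its own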