mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-22 17:44:20 -08:00
Improve stability by limiting multithreading when running ETL
This commit is contained in:
parent
d4898b8f55
commit
6f3432d48a
2 changed files with 18 additions and 5 deletions
|
@@ -139,9 +139,14 @@ def pull_census_data(data_source: str):
|
||||||
@cli.command(
|
@cli.command(
|
||||||
help="Run all ETL processes or a specific one",
|
help="Run all ETL processes or a specific one",
|
||||||
)
|
)
|
||||||
|
@click.option(
|
||||||
|
"--no-concurrency",
|
||||||
|
is_flag=True,
|
||||||
|
help="Run ETLs sequentially instead of concurrently.",
|
||||||
|
)
|
||||||
@dataset_option
|
@dataset_option
|
||||||
@use_cache_option
|
@use_cache_option
|
||||||
def etl_run(dataset: str, use_cache: bool):
|
def etl_run(dataset: str, use_cache: bool, no_concurrency: bool):
|
||||||
"""Run a specific or all ETL processes
|
"""Run a specific or all ETL processes
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@@ -153,7 +158,7 @@ def etl_run(dataset: str, use_cache: bool):
|
||||||
log_title("Run ETL")
|
log_title("Run ETL")
|
||||||
|
|
||||||
log_info("Running dataset(s)")
|
log_info("Running dataset(s)")
|
||||||
etl_runner(dataset, use_cache)
|
etl_runner(dataset, use_cache, no_concurrency)
|
||||||
|
|
||||||
log_goodbye()
|
log_goodbye()
|
||||||
|
|
||||||
|
|
|
@@ -1,6 +1,7 @@
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import importlib
|
import importlib
|
||||||
import typing
|
import typing
|
||||||
|
import os
|
||||||
|
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
|
|
||||||
|
@@ -84,7 +85,11 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
|
||||||
logger.info(f"Finished ETL for dataset {dataset['name']}")
|
logger.info(f"Finished ETL for dataset {dataset['name']}")
|
||||||
|
|
||||||
|
|
||||||
def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
|
def etl_runner(
|
||||||
|
dataset_to_run: str = None,
|
||||||
|
use_cache: bool = False,
|
||||||
|
no_concurrency: bool = False,
|
||||||
|
) -> None:
|
||||||
"""Runs all etl processes or a specific one
|
"""Runs all etl processes or a specific one
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@@ -112,9 +117,12 @@ def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
|
||||||
dataset for dataset in dataset_list if dataset["is_memory_intensive"]
|
dataset for dataset in dataset_list if dataset["is_memory_intensive"]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
max_workers = 1 if no_concurrency else os.cpu_count()
|
||||||
if concurrent_datasets:
|
if concurrent_datasets:
|
||||||
logger.info("Running concurrent ETL jobs")
|
logger.info(f"Running concurrent ETL jobs on {max_workers} thread(s)")
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
with concurrent.futures.ThreadPoolExecutor(
|
||||||
|
max_workers=max_workers
|
||||||
|
) as executor:
|
||||||
futures = {
|
futures = {
|
||||||
executor.submit(
|
executor.submit(
|
||||||
_run_one_dataset, dataset=dataset, use_cache=use_cache
|
_run_one_dataset, dataset=dataset, use_cache=use_cache
|
||||||
|
|
Loading…
Add table
Reference in a new issue