mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-04 06:34:18 -07:00
Data Pipeline performance improvements for Census GeoJSON and Score file
This commit is contained in:
parent
d5d055864f
commit
c32bd1f363
37 changed files with 1305 additions and 1413 deletions
|
@ -1,5 +1,6 @@
|
|||
import concurrent.futures
|
||||
import importlib
|
||||
import time
|
||||
import typing
|
||||
import os
|
||||
|
||||
|
@ -27,9 +28,7 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
|
|||
None
|
||||
"""
|
||||
dataset_list = constants.DATASET_LIST
|
||||
etls_to_search = (
|
||||
dataset_list + [constants.CENSUS_INFO] + [constants.TRIBAL_INFO]
|
||||
)
|
||||
etls_to_search = dataset_list + [constants.CENSUS_INFO]
|
||||
|
||||
if dataset_to_run:
|
||||
dataset_element = next(
|
||||
|
@ -59,6 +58,8 @@ def _get_dataset(dataset: dict) -> ExtractTransformLoad:
|
|||
def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
|
||||
"""Runs one etl process."""
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
logger.info(f"Running ETL for {dataset['name']}")
|
||||
etl_instance = _get_dataset(dataset)
|
||||
|
||||
|
@ -83,6 +84,9 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
|
|||
etl_instance.cleanup()
|
||||
|
||||
logger.info(f"Finished ETL for dataset {dataset['name']}")
|
||||
logger.debug(
|
||||
f"Execution time for ETL for dataset {dataset['name']} was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def etl_runner(
|
||||
|
@ -197,10 +201,14 @@ def score_generate() -> None:
|
|||
"""
|
||||
|
||||
# Score Gen
|
||||
start_time = time.time()
|
||||
score_gen = ScoreETL()
|
||||
score_gen.extract()
|
||||
score_gen.transform()
|
||||
score_gen.load()
|
||||
logger.debug(
|
||||
f"Execution time for Score Generation was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def score_post(data_source: str = "local") -> None:
|
||||
|
@ -216,11 +224,15 @@ def score_post(data_source: str = "local") -> None:
|
|||
None
|
||||
"""
|
||||
# Post Score Processing
|
||||
start_time = time.time()
|
||||
score_post = PostScoreETL(data_source=data_source)
|
||||
score_post.extract()
|
||||
score_post.transform()
|
||||
score_post.load()
|
||||
score_post.cleanup()
|
||||
logger.debug(
|
||||
f"Execution time for Score Post was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def score_geo(data_source: str = "local") -> None:
|
||||
|
@ -237,10 +249,14 @@ def score_geo(data_source: str = "local") -> None:
|
|||
"""
|
||||
|
||||
# Score Geo
|
||||
start_time = time.time()
|
||||
score_geo = GeoScoreETL(data_source=data_source)
|
||||
score_geo.extract()
|
||||
score_geo.transform()
|
||||
score_geo.load()
|
||||
logger.debug(
|
||||
f"Execution time for Score Geo was {time.time() - start_time}s"
|
||||
)
|
||||
|
||||
|
||||
def _find_dataset_index(dataset_list, key, value):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue