Data Pipeline performance improvements for Census GeoJson and Score file

Carlos Felix 2025-01-13 09:28:14 -05:00 committed by Carlos Felix
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions
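
The hunks below bracket each pipeline stage (per-dataset ETL, Score Generation, Score Post, Score Geo) with time.time() and report the elapsed wall-clock time through logger.debug. Those timings only surface when DEBUG logging is enabled; a minimal sketch of doing that, assuming a standard logging-based logger rather than the project's own logger factory:

import logging

# Hypothetical setup: raise the log level so the new
# "Execution time for ..." messages emitted via logger.debug(...) are visible.
logging.basicConfig(level=logging.DEBUG)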


@@ -1,5 +1,6 @@
 import concurrent.futures
 import importlib
+import time
 import typing
 import os
@@ -27,9 +28,7 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
         None
     """
     dataset_list = constants.DATASET_LIST
-    etls_to_search = (
-        dataset_list + [constants.CENSUS_INFO] + [constants.TRIBAL_INFO]
-    )
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
     if dataset_to_run:
         dataset_element = next(
@@ -59,6 +58,8 @@ def _get_dataset(dataset: dict) -> ExtractTransformLoad:
 def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     """Runs one etl process."""
+    start_time = time.time()
     logger.info(f"Running ETL for {dataset['name']}")
     etl_instance = _get_dataset(dataset)
@@ -83,6 +84,9 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     etl_instance.cleanup()
     logger.info(f"Finished ETL for dataset {dataset['name']}")
+    logger.debug(
+        f"Execution time for ETL for dataset {dataset['name']} was {time.time() - start_time}s"
+    )

 def etl_runner(
@@ -197,10 +201,14 @@ def score_generate() -> None:
     """
     # Score Gen
+    start_time = time.time()
     score_gen = ScoreETL()
     score_gen.extract()
     score_gen.transform()
     score_gen.load()
+    logger.debug(
+        f"Execution time for Score Generation was {time.time() - start_time}s"
+    )

 def score_post(data_source: str = "local") -> None:
@@ -216,11 +224,15 @@ def score_post(data_source: str = "local") -> None:
         None
     """
     # Post Score Processing
+    start_time = time.time()
     score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
     score_post.cleanup()
+    logger.debug(
+        f"Execution time for Score Post was {time.time() - start_time}s"
+    )

 def score_geo(data_source: str = "local") -> None:
@@ -237,10 +249,14 @@ def score_geo(data_source: str = "local") -> None:
     """
     # Score Geo
+    start_time = time.time()
     score_geo = GeoScoreETL(data_source=data_source)
     score_geo.extract()
     score_geo.transform()
     score_geo.load()
+    logger.debug(
+        f"Execution time for Score Geo was {time.time() - start_time}s"
+    )

 def _find_dataset_index(dataset_list, key, value):
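
The same timing pattern (capture start_time, run the stage, emit a logger.debug message) is repeated in _run_one_dataset, score_generate, score_post, and score_geo above. A minimal sketch of one way to centralize it with a context manager; this is not part of the commit, and the logger here is a plain logging stand-in for the pipeline's own module logger:

import logging
import time
from contextlib import contextmanager
from typing import Iterator

logger = logging.getLogger(__name__)  # stand-in for the pipeline's module logger


@contextmanager
def log_execution_time(label: str) -> Iterator[None]:
    """Log the wall-clock duration of the wrapped block, mirroring the timing added in this commit."""
    start_time = time.time()
    try:
        yield
    finally:
        logger.debug(f"Execution time for {label} was {time.time() - start_time}s")


# Hypothetical usage, following the same shape as score_generate() above:
# with log_execution_time("Score Generation"):
#     score_gen = ScoreETL()
#     score_gen.extract()
#     score_gen.transform()
#     score_gen.load()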