Data Pipeline performance improvements for Census GeoJson and Score file

Carlos Felix 2025-01-13 09:28:14 -05:00 committed by Carlos Felix
commit c32bd1f363
37 changed files with 1305 additions and 1413 deletions
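
The hunks below bracket each pipeline stage (per-dataset ETL, Score Generation, Score Post, Score Geo) with time.time() and report the elapsed wall-clock time through logger.debug. Those timings only surface when DEBUG logging is enabled; a minimal sketch of doing that, assuming a standard logging-based logger rather than the project's own logger factory:

import logging

# Hypothetical setup: raise the log level so the new
# "Execution time for ..." messages emitted via logger.debug(...) are visible.
logging.basicConfig(level=logging.DEBUG)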


@@ -1,5 +1,6 @@
 import concurrent.futures
 import importlib
+import time
 import typing
 import os
@@ -27,9 +28,7 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
         None
     """
     dataset_list = constants.DATASET_LIST
-    etls_to_search = (
-        dataset_list + [constants.CENSUS_INFO] + [constants.TRIBAL_INFO]
-    )
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
     if dataset_to_run:
         dataset_element = next(
@@ -59,6 +58,8 @@ def _get_dataset(dataset: dict) -> ExtractTransformLoad:
 def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     """Runs one etl process."""
+    start_time = time.time()
     logger.info(f"Running ETL for {dataset['name']}")
     etl_instance = _get_dataset(dataset)
@@ -83,6 +84,9 @@ def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
     etl_instance.cleanup()
     logger.info(f"Finished ETL for dataset {dataset['name']}")
+    logger.debug(
+        f"Execution time for ETL for dataset {dataset['name']} was {time.time() - start_time}s"
+    )

 def etl_runner(
@@ -197,10 +201,14 @@ def score_generate() -> None:
     """
     # Score Gen
+    start_time = time.time()
     score_gen = ScoreETL()
     score_gen.extract()
     score_gen.transform()
     score_gen.load()
+    logger.debug(
+        f"Execution time for Score Generation was {time.time() - start_time}s"
+    )

 def score_post(data_source: str = "local") -> None:
@@ -216,11 +224,15 @@ def score_post(data_source: str = "local") -> None:
         None
     """
     # Post Score Processing
+    start_time = time.time()
     score_post = PostScoreETL(data_source=data_source)
     score_post.extract()
     score_post.transform()
     score_post.load()
     score_post.cleanup()
+    logger.debug(
+        f"Execution time for Score Post was {time.time() - start_time}s"
+    )

 def score_geo(data_source: str = "local") -> None:
@@ -237,10 +249,14 @@ def score_geo(data_source: str = "local") -> None:
     """
     # Score Geo
+    start_time = time.time()
     score_geo = GeoScoreETL(data_source=data_source)
     score_geo.extract()
     score_geo.transform()
     score_geo.load()
+    logger.debug(
+        f"Execution time for Score Geo was {time.time() - start_time}s"
+    )

 def _find_dataset_index(dataset_list, key, value):
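
The same timing pattern (capture start_time, run the stage, emit a logger.debug message) is repeated in _run_one_dataset, score_generate, score_post, and score_geo above. A minimal sketch of one way to centralize it with a context manager; this is not part of the commit, and the logger here is a plain logging stand-in for the pipeline's own module logger:

import logging
import time
from contextlib import contextmanager
from typing import Iterator

logger = logging.getLogger(__name__)  # stand-in for the pipeline's module logger


@contextmanager
def log_execution_time(label: str) -> Iterator[None]:
    """Log the wall-clock duration of the wrapped block, mirroring the timing added in this commit."""
    start_time = time.time()
    try:
        yield
    finally:
        logger.debug(f"Execution time for {label} was {time.time() - start_time}s")


# Hypothetical usage, following the same shape as score_generate() above:
# with log_execution_time("Score Generation"):
#     score_gen = ScoreETL()
#     score_gen.extract()
#     score_gen.transform()
#     score_gen.load()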