2021-07-12 15:50:44 -04:00
|
|
|
import importlib
|
2022-02-11 14:04:53 -05:00
|
|
|
import concurrent.futures
|
|
|
|
import typing
|
2021-07-12 15:50:44 -04:00
|
|
|
|
2021-08-05 15:35:54 -04:00
|
|
|
from data_pipeline.etl.score.etl_score import ScoreETL
|
|
|
|
from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
|
|
|
|
from data_pipeline.etl.score.etl_score_post import PostScoreETL
|
2022-02-11 14:04:53 -05:00
|
|
|
from data_pipeline.utils import get_module_logger
|
2021-07-12 15:50:44 -04:00
|
|
|
|
2021-08-09 09:52:06 -04:00
|
|
|
from . import constants
|
2021-07-12 15:50:44 -04:00
|
|
|
|
2022-02-11 14:04:53 -05:00
|
|
|
# Module-level logger, obtained from the project's get_module_logger helper.
logger = get_module_logger(__name__)
|
2021-08-09 09:52:06 -04:00
|
|
|
|
2022-02-11 14:04:53 -05:00
|
|
|
|
|
|
|
def _get_datasets_to_run(
    dataset_to_run: typing.Optional[str],
) -> typing.List[dict]:
    """Return the dataset definitions that an ETL run should process.

    Args:
        dataset_to_run (str): Name of a single dataset to run. If empty or
            ``None``, every entry of ``constants.DATASET_LIST`` is selected.

    Returns:
        typing.List[dict]: The dataset definitions to run. When a name is
        given, this is a one-element list holding the matching definition.

    Raises:
        ValueError: If ``dataset_to_run`` is given but matches no dataset.
    """
    if not dataset_to_run:
        # Return a copy so callers cannot mutate the shared module constant.
        return list(constants.DATASET_LIST)

    # The census and tribal definitions are only candidates when a dataset is
    # requested by name; the run-everything path above returns DATASET_LIST
    # alone.
    etls_to_search = constants.DATASET_LIST + [
        constants.CENSUS_INFO,
        constants.TRIBAL_INFO,
    ]
    dataset_element = next(
        (item for item in etls_to_search if item["name"] == dataset_to_run),
        None,
    )
    if dataset_element is None:
        raise ValueError(f"Invalid dataset name: {dataset_to_run!r}")

    # Narrow the run to just the requested dataset.
    return [dataset_element]
|
|
|
|
|
|
|
|
|
2022-02-11 14:04:53 -05:00
|
|
|
def _run_one_dataset(dataset: dict) -> None:
    """Run the complete ETL lifecycle for one dataset definition.

    The ETL class is resolved dynamically: ``dataset["module_dir"]`` selects
    the package under ``data_pipeline.etl.sources`` and
    ``dataset["class_name"]`` names the class in that package's ``etl`` module.

    Args:
        dataset (dict): Dataset definition; the ``module_dir``, ``class_name``
            and ``name`` keys are read here.

    Returns:
        None
    """
    module_path = f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
    etl_instance = getattr(
        importlib.import_module(module_path), dataset["class_name"]
    )()

    # Stages run strictly in this order; an exception raised by any stage
    # propagates and the remaining stages are not executed.
    for stage in ("extract", "transform", "load", "validate", "cleanup"):
        getattr(etl_instance, stage)()

    logger.info(f"Finished `etl-run` for dataset `{dataset['name']}`.")
|
2021-07-12 15:50:44 -04:00
|
|
|
|
2022-02-08 19:05:32 -05:00
|
|
|
|
2022-02-11 14:04:53 -05:00
|
|
|
def etl_runner(dataset_to_run: str = None) -> None:
    """Run every ETL process, or only the one requested.

    Each selected dataset is submitted to a thread pool, where it is processed
    by ``_run_one_dataset``.

    Args:
        dataset_to_run (str): Name of a single ETL process to run. If missing,
            all processes are run (optional)

    Returns:
        None
    """
    datasets = _get_datasets_to_run(dataset_to_run)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = set()
        for dataset in datasets:
            pending.add(pool.submit(_run_one_dataset, dataset=dataset))

        # Future.result() re-raises whatever the worker raised; without this
        # call a failed ETL would be silently ignored.
        for finished in concurrent.futures.as_completed(pending):
            finished.result()
|
2021-07-12 15:50:44 -04:00
|
|
|
|
|
|
|
|
|
|
|
def score_generate() -> None:
    """Run the score-generation ETL: extract, then transform, then load.

    Args:
        None

    Returns:
        None
    """
    # Score generation
    etl = ScoreETL()
    etl.extract()
    etl.transform()
    etl.load()
|
|
|
|
|
2021-10-13 15:54:15 -04:00
|
|
|
|
2021-11-01 18:05:05 -04:00
|
|
|
def score_post(data_source: str = "local") -> None:
    """Run the post-score ETL: extract, transform, load, then cleanup.

    Args:
        data_source (str): Source for the census data (optional)
            Options:
            - local (default): fetch census data from the local data directory
            - aws: fetch census from AWS S3 J40 data repository

    Returns:
        None
    """
    # Post-score processing. The local name is deliberately different from the
    # function name so the function is not shadowed inside its own body.
    post_etl = PostScoreETL(data_source=data_source)
    post_etl.extract()
    post_etl.transform()
    post_etl.load()
    post_etl.cleanup()
|
2021-07-12 15:50:44 -04:00
|
|
|
|
|
|
|
|
2021-10-13 16:00:33 -04:00
|
|
|
def score_geo(data_source: str = "local") -> None:
    """Run the geo-score ETL that produces geojson files with score data.

    Args:
        data_source (str): Source for the census data (optional)
            Options:
            - local (default): fetch census data from the local data directory
            - aws: fetch census from AWS S3 J40 data repository

    Returns:
        None
    """
    # Geo score processing. The local name is deliberately different from the
    # function name so the function is not shadowed inside its own body.
    geo_etl = GeoScoreETL(data_source=data_source)
    geo_etl.extract()
    geo_etl.transform()
    geo_etl.load()
|
|
|
|
|
|
|
|
|
2021-07-12 15:50:44 -04:00
|
|
|
def _find_dataset_index(dataset_list, key, value):
|
|
|
|
for i, element in enumerate(dataset_list):
|
|
|
|
if element[key] == value:
|
|
|
|
return i
|
|
|
|
return -1
|