mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-25 18:10:16 -07:00
Run ETL processes in parallel (#1253)
* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
This commit is contained in:
parent
389eb59ac4
commit
a0d6e55f0a
30 changed files with 286 additions and 160 deletions
|
@ -98,6 +98,17 @@ class ExtractTransformLoad:
|
|||
)
|
||||
return output_file_path
|
||||
|
||||
def get_tmp_path(self) -> pathlib.Path:
|
||||
"""Returns the temporary path associated with this ETL class."""
|
||||
# Note: the temporary path is computed each time this method is called, because it uses the class
|
||||
# of the instance, which is often a child class.
|
||||
tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
|
||||
|
||||
# Create directory if it doesn't exist
|
||||
tmp_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
return tmp_path
|
||||
|
||||
def extract(
|
||||
self,
|
||||
source_url: str = None,
|
||||
|
@ -112,7 +123,7 @@ class ExtractTransformLoad:
|
|||
if source_url and extract_path:
|
||||
unzip_file_from_url(
|
||||
file_url=source_url,
|
||||
download_path=self.TMP_PATH,
|
||||
download_path=self.get_tmp_path(),
|
||||
unzipped_file_path=extract_path,
|
||||
verify=verify,
|
||||
)
|
||||
|
@ -265,4 +276,4 @@ class ExtractTransformLoad:
|
|||
|
||||
def cleanup(self) -> None:
|
||||
"""Clears out any files stored in the TMP folder"""
|
||||
remove_all_from_dir(self.TMP_PATH)
|
||||
remove_all_from_dir(self.get_tmp_path())
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue