mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-23 05:10:36 -07:00
Run ETL processes in parallel (#1253)
* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
This commit is contained in:
parent
389eb59ac4
commit
a0d6e55f0a
30 changed files with 286 additions and 160 deletions
|
@@ -41,12 +41,12 @@ class ExampleETL(ExtractTransformLoad):
|
|||
|
||||
logger.info(f"Extracting {zip_file_path}")
|
||||
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
|
||||
zip_ref.extractall(self.TMP_PATH)
|
||||
zip_ref.extractall(self.get_tmp_path())
|
||||
|
||||
def transform(self):
|
||||
logger.info(f"Loading file from {self.TMP_PATH / 'input.csv'}.")
|
||||
logger.info(f"Loading file from {self.get_tmp_path() / 'input.csv'}.")
|
||||
df: pd.DataFrame = pd.read_csv(
|
||||
self.TMP_PATH / "input.csv",
|
||||
self.get_tmp_path() / "input.csv",
|
||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||
low_memory=False,
|
||||
)
|
||||
|
|
|
@@ -543,7 +543,7 @@ class TestETL:
|
|||
f"Writing data to {self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME}"
|
||||
)
|
||||
copy_data_files(
|
||||
src=etl.TMP_PATH / "input.csv",
|
||||
src=etl.get_tmp_path() / "input.csv",
|
||||
dst=self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME,
|
||||
)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue