Run ETL processes in parallel (#1253)

* WIP on parallelizing

* switching to get_tmp_path for nri

* switching to get_tmp_path everywhere necessary

* fixing linter errors

* moving heavy ETLs to front of line

* add hold

* moving cdc places up

* removing unnecessary print

* moving h&t up

* adding parallel to geo post

* better census labels

* switching to concurrent futures

* fixing output
This commit is contained in:
Lucas Merrill Brown 2022-02-11 14:04:53 -05:00 committed by GitHub
commit a0d6e55f0a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 286 additions and 160 deletions

View file

@@ -41,12 +41,12 @@ class ExampleETL(ExtractTransformLoad):
         logger.info(f"Extracting {zip_file_path}")
         with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-            zip_ref.extractall(self.TMP_PATH)
+            zip_ref.extractall(self.get_tmp_path())

     def transform(self):
-        logger.info(f"Loading file from {self.TMP_PATH / 'input.csv'}.")
+        logger.info(f"Loading file from {self.get_tmp_path() / 'input.csv'}.")
         df: pd.DataFrame = pd.read_csv(
-            self.TMP_PATH / "input.csv",
+            self.get_tmp_path() / "input.csv",
             dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
             low_memory=False,
         )

View file

@@ -543,7 +543,7 @@ class TestETL:
             f"Writing data to {self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME}"
         )
         copy_data_files(
-            src=etl.TMP_PATH / "input.csv",
+            src=etl.get_tmp_path() / "input.csv",
             dst=self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME,
         )

View file

@@ -98,7 +98,9 @@ class TestNationalRiskIndexETL(TestETL):
         # setup
         etl = NationalRiskIndexETL()
         data_path, tmp_path = mock_paths
-        input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
+        input_csv = (
+            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
+        )

         # validation
         assert etl.INPUT_CSV == input_csv
@@ -141,7 +143,9 @@ class TestNationalRiskIndexETL(TestETL):
         )

         # Assert that the extracted file exists
-        extracted_file_path = tmp_path / "NRI_Table_CensusTracts.csv"
+        extracted_file_path = (
+            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
+        )
         assert extracted_file_path.is_file()
         input_csv_path = (
input_csv_path = (

View file

@@ -1,9 +1,10 @@
+# pylint: disable=protected-access
 import pytest

 from data_pipeline.etl import constants, runner


 def test_get_datasets_to_run():
-    assert runner.get_datasets_to_run(None) == constants.DATASET_LIST
-    assert runner.get_datasets_to_run("census") == [constants.CENSUS_INFO]
+    assert runner._get_datasets_to_run(None) == constants.DATASET_LIST
+    assert runner._get_datasets_to_run("census") == [constants.CENSUS_INFO]
     with pytest.raises(ValueError):
-        runner.get_datasets_to_run("doesnt_exist")
+        runner._get_datasets_to_run("doesnt_exist")