Run ETL processes in parallel (#1253)

* WIP on parallelizing

* switching to get_tmp_path for nri

* switching to get_tmp_path everywhere necessary

* fixing linter errors

* moving heavy ETLs to front of line

* add hold

* moving cdc places up

* removing unnecessary print

* moving h&t up

* adding parallel to geo post

* better census labels

* switching to concurrent futures

* fixing output
This commit is contained in:
Lucas Merrill Brown 2022-02-11 14:04:53 -05:00 committed by GitHub
commit a0d6e55f0a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 286 additions and 160 deletions

View file

@@ -41,12 +41,12 @@ class ExampleETL(ExtractTransformLoad):
         logger.info(f"Extracting {zip_file_path}")
         with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-            zip_ref.extractall(self.TMP_PATH)
+            zip_ref.extractall(self.get_tmp_path())

     def transform(self):
-        logger.info(f"Loading file from {self.TMP_PATH / 'input.csv'}.")
+        logger.info(f"Loading file from {self.get_tmp_path() / 'input.csv'}.")
         df: pd.DataFrame = pd.read_csv(
-            self.TMP_PATH / "input.csv",
+            self.get_tmp_path() / "input.csv",
             dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
             low_memory=False,
         )

View file

@@ -543,7 +543,7 @@ class TestETL:
             f"Writing data to {self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME}"
         )
         copy_data_files(
-            src=etl.TMP_PATH / "input.csv",
+            src=etl.get_tmp_path() / "input.csv",
             dst=self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME,
         )

View file

@@ -98,7 +98,9 @@ class TestNationalRiskIndexETL(TestETL):
         # setup
         etl = NationalRiskIndexETL()
         data_path, tmp_path = mock_paths
-        input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
+        input_csv = (
+            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
+        )

         # validation
         assert etl.INPUT_CSV == input_csv
@@ -141,7 +143,9 @@ class TestNationalRiskIndexETL(TestETL):
         )

         # Assert that the extracted file exists
-        extracted_file_path = tmp_path / "NRI_Table_CensusTracts.csv"
+        extracted_file_path = (
+            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
+        )
         assert extracted_file_path.is_file()
         input_csv_path = (
input_csv_path = (

View file

@@ -1,9 +1,10 @@
+# pylint: disable=protected-access
 import pytest

 from data_pipeline.etl import constants, runner


 def test_get_datasets_to_run():
-    assert runner.get_datasets_to_run(None) == constants.DATASET_LIST
-    assert runner.get_datasets_to_run("census") == [constants.CENSUS_INFO]
+    assert runner._get_datasets_to_run(None) == constants.DATASET_LIST
+    assert runner._get_datasets_to_run("census") == [constants.CENSUS_INFO]
     with pytest.raises(ValueError):
-        runner.get_datasets_to_run("doesnt_exist")
+        runner._get_datasets_to_run("doesnt_exist")