Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
2025-07-29 02:31:17 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py
@@ -98,7 +98,9 @@ class TestNationalRiskIndexETL(TestETL):
        # setup
        etl = NationalRiskIndexETL()
        data_path, tmp_path = mock_paths
-        input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
+        input_csv = (
+            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
+        )

        # validation
        assert etl.INPUT_CSV == input_csv
@@ -141,7 +143,9 @@ class TestNationalRiskIndexETL(TestETL):
        )

        # Assert that the extracted file exists
-        extracted_file_path = tmp_path / "NRI_Table_CensusTracts.csv"
+        extracted_file_path = (
+            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
+        )
        assert extracted_file_path.is_file()

        input_csv_path = (