Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
2025-09-30 07:13:18 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/tests/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/test_etl.py
@@ -1,9 +1,10 @@
+# pylint: disable=protected-access
 import pytest
 from data_pipeline.etl import constants, runner


 def test_get_datasets_to_run():
-    assert runner.get_datasets_to_run(None) == constants.DATASET_LIST
-    assert runner.get_datasets_to_run("census") == [constants.CENSUS_INFO]
+    assert runner._get_datasets_to_run(None) == constants.DATASET_LIST
+    assert runner._get_datasets_to_run("census") == [constants.CENSUS_INFO]
    with pytest.raises(ValueError):
-        runner.get_datasets_to_run("doesnt_exist")
+        runner._get_datasets_to_run("doesnt_exist")