Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
2025-08-04 01:14:18 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -20,7 +20,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

    def __init__(self):
-        self.INPUT_CSV = self.TMP_PATH / "NRI_Table_CensusTracts.csv"
+        self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"

        self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
            "EAL_SCORE"
@@ -68,7 +68,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
        logger.info("Downloading 405MB National Risk Index Data")
        super().extract(
            source_url=self.SOURCE_URL,
-            extract_path=self.TMP_PATH,
+            extract_path=self.get_tmp_path(),
        )

    def transform(self) -> None: