Run ETL processes in parallel (#1253)

* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
2025-07-23 15:10:16 -07:00 · 2022-02-11 14:04:53 -05:00 · 2022-02-11 14:04:53 -05:00 · a0d6e55f0a
commit a0d6e55f0a
parent 389eb59ac4
30 changed files with 286 additions and 160 deletions
--- a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py
@@ -75,12 +75,12 @@ class PersistentPovertyETL(ExtractTransformLoad):
    def extract(self) -> None:
        logger.info("Starting to download 86MB persistent poverty file.")

-        unzipped_file_path = self.TMP_PATH / "persistent_poverty"
+        unzipped_file_path = self.get_tmp_path() / "persistent_poverty"

        unzip_file_from_url(
            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
            + "/LTDB_Std_All_Sample.zip",
-            download_path=self.TMP_PATH,
+            download_path=self.get_tmp_path(),
            unzipped_file_path=unzipped_file_path,
        )

@@ -93,7 +93,6 @@ class PersistentPovertyETL(ExtractTransformLoad):
        temporary_input_dfs = []

        for file_name in file_names:
-            print(file_name)
            temporary_input_df = pd.read_csv(
                filepath_or_buffer=unzipped_file_path
                / f"ltdb_std_all_sample/{file_name}",