Run ETL processes in parallel (#1253)

* WIP on parallelizing

* switching to get_tmp_path for nri

* switching to get_tmp_path everywhere necessary

* fixing linter errors

* moving heavy ETLs to front of line

* add hold

* moving cdc places up

* removing unnecessary print

* moving h&t up

* adding parallel to geo post

* better census labels

* switching to concurrent futures

* fixing output
This commit is contained in:
Lucas Merrill Brown 2022-02-11 14:04:53 -05:00 committed by GitHub
commit a0d6e55f0a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 286 additions and 160 deletions

View file

@ -101,6 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
logger.info("Starting Census 2010 ACS Transform")
# Define the variables to retrieve
variables = (
self.UNEMPLOYED_FIELDS
@ -118,7 +119,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
)
def transform(self) -> None:
logger.info("Starting Census ACS Transform")
logger.info("Starting Census 2010 ACS Transform")
df = self.df
@ -184,7 +185,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df = output_df
def load(self) -> None:
logger.info("Saving Census ACS Data")
logger.info("Saving Census 2010 ACS Data")
# mkdir census
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)