Run ETL processes in parallel (#1253)

* WIP on parallelizing

* switching to get_tmp_path for nri

* switching to get_tmp_path everywhere necessary

* fixing linter errors

* moving heavy ETLs to front of line

* add hold

* moving cdc places up

* removing unnecessary print

* moving h&t up

* adding parallel to geo post

* better census labels

* switching to concurrent futures

* fixing output
This commit is contained in:
Lucas Merrill Brown 2022-02-11 14:04:53 -05:00 committed by GitHub
commit a0d6e55f0a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 286 additions and 160 deletions

View file

@@ -16,7 +16,7 @@ class EJSCREENETL(ExtractTransformLoad):
     def __init__(self):
         self.EJSCREEN_FTP_URL = "https://edap-arcgiscloud-data-commons.s3.amazonaws.com/EJSCREEN2020/EJSCREEN_Tract_2020_USPR.csv.zip"
-        self.EJSCREEN_CSV = self.TMP_PATH / "EJSCREEN_Tract_2020_USPR.csv"
+        self.EJSCREEN_CSV = self.get_tmp_path() / "EJSCREEN_Tract_2020_USPR.csv"
         self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen_2019"
         self.df: pd.DataFrame
@@ -45,7 +45,7 @@ class EJSCREENETL(ExtractTransformLoad):
         logger.info("Downloading EJScreen Data")
         super().extract(
             self.EJSCREEN_FTP_URL,
-            self.TMP_PATH,
+            self.get_tmp_path(),
             verify=False,  # EPA EJScreen end point has certificate issues often
         )