Run ETL processes in parallel (#1253)

* WIP on parallelizing

* switching to get_tmp_path for nri

* switching to get_tmp_path everywhere necessary

* fixing linter errors

* moving heavy ETLs to front of line

* add hold

* moving cdc places up

* removing unnecessary print

* moving h&t up

* adding parallel to geo post

* better census labels

* switching to concurrent futures

* fixing output
This commit is contained in:
Lucas Merrill Brown 2022-02-11 14:04:53 -05:00 committed by GitHub
commit a0d6e55f0a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
30 changed files with 286 additions and 160 deletions

View file

@ -1,4 +1,6 @@
import concurrent.futures
import math
import pandas as pd
import geopandas as gpd
@ -204,14 +206,28 @@ class GeoScoreETL(ExtractTransformLoad):
return compressed
def load(self) -> None:
logger.info("Writing usa-high (~9 minutes)")
self.geojson_score_usa_high.to_file(
self.SCORE_HIGH_GEOJSON, driver="GeoJSON"
)
logger.info("Completed writing usa-high")
# Create separate threads to run each write to disk.
def write_high_to_file():
logger.info("Writing usa-high (~9 minutes)")
self.geojson_score_usa_high.to_file(
filename=self.SCORE_HIGH_GEOJSON, driver="GeoJSON"
)
logger.info("Completed writing usa-high")
logger.info("Writing usa-low (~9 minutes)")
self.geojson_score_usa_low.to_file(
self.SCORE_LOW_GEOJSON, driver="GeoJSON"
)
logger.info("Completed writing usa-low")
def write_low_to_file():
logger.info("Writing usa-low (~9 minutes)")
self.geojson_score_usa_low.to_file(
filename=self.SCORE_LOW_GEOJSON, driver="GeoJSON"
)
logger.info("Completed writing usa-low")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(task)
for task in [write_high_to_file, write_low_to_file]
}
for fut in concurrent.futures.as_completed(futures):
# Calling result() re-raises any exception that occurred inside the task.
# Without this call, such exceptions would be silently ignored.
fut.result()

View file

@ -48,7 +48,7 @@ def check_score_data_source(
# check if score data is found locally
if not os.path.isfile(TILE_SCORE_CSV):
logger.info(
"No local score tiles data found. Please use '-d aws` to fetch from AWS"
"No local score tiles data found. Please use '-s aws` to fetch from AWS"
)
sys.exit()