mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 02:41:16 -07:00
Run ETL processes in parallel (#1253)
* WIP on parallelizing
* switching to get_tmp_path for nri
* switching to get_tmp_path everywhere necessary
* fixing linter errors
* moving heavy ETLs to front of line
* add hold
* moving cdc places up
* removing unnecessary print
* moving h&t up
* adding parallel to geo post
* better census labels
* switching to concurrent futures
* fixing output
This commit is contained in:
parent
389eb59ac4
commit
a0d6e55f0a
30 changed files with 286 additions and 160 deletions
|
@@ -1,4 +1,6 @@
|
|||
import concurrent.futures
|
||||
import math
|
||||
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
|
||||
|
@@ -204,14 +206,28 @@ class GeoScoreETL(ExtractTransformLoad):
|
|||
return compressed
|
||||
|
||||
def load(self) -> None:
|
||||
logger.info("Writing usa-high (~9 minutes)")
|
||||
self.geojson_score_usa_high.to_file(
|
||||
self.SCORE_HIGH_GEOJSON, driver="GeoJSON"
|
||||
)
|
||||
logger.info("Completed writing usa-high")
|
||||
# Create separate threads to run each write to disk.
|
||||
def write_high_to_file():
|
||||
logger.info("Writing usa-high (~9 minutes)")
|
||||
self.geojson_score_usa_high.to_file(
|
||||
filename=self.SCORE_HIGH_GEOJSON, driver="GeoJSON"
|
||||
)
|
||||
logger.info("Completed writing usa-high")
|
||||
|
||||
logger.info("Writing usa-low (~9 minutes)")
|
||||
self.geojson_score_usa_low.to_file(
|
||||
self.SCORE_LOW_GEOJSON, driver="GeoJSON"
|
||||
)
|
||||
logger.info("Completed writing usa-low")
|
||||
def write_low_to_file():
|
||||
logger.info("Writing usa-low (~9 minutes)")
|
||||
self.geojson_score_usa_low.to_file(
|
||||
filename=self.SCORE_LOW_GEOJSON, driver="GeoJSON"
|
||||
)
|
||||
logger.info("Completed writing usa-low")
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
futures = {
|
||||
executor.submit(task)
|
||||
for task in [write_high_to_file, write_low_to_file]
|
||||
}
|
||||
|
||||
for fut in concurrent.futures.as_completed(futures):
|
||||
# Calling result will raise an exception if one occurred.
|
||||
# Otherwise, the exceptions are silently ignored.
|
||||
fut.result()
|
||||
|
|
|
@@ -48,7 +48,7 @@ def check_score_data_source(
|
|||
# check if score data is found locally
|
||||
if not os.path.isfile(TILE_SCORE_CSV):
|
||||
logger.info(
|
||||
"No local score tiles data found. Please use '-d aws` to fetch from AWS"
|
||||
"No local score tiles data found. Please use '-s aws` to fetch from AWS"
|
||||
)
|
||||
sys.exit()
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue