mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-07-27 17:01:17 -07:00)
Add abandoned mine lands data (#1824)
* Add notebook to generate test data (#1780)
* Add Abandoned Mine Land data (#1780)

  Using a similar structure but a simpler approach compared to FUDS, add an indicator for whether a tract has an abandoned mine.

* Adding some detail to dataset readmes

  Just a thought!

* Apply feedback from review (#1780)
* Fixup bad string that broke test (#1780)
* Update a string that I should have renamed (#1780)
* Reduce number of threads to reduce memory pressure (#1780)
* Try not running geo data (#1780)
* Run the high-memory sets separately (#1780)
* Actually deduplicate (#1780)
* Add flag for memory intensive ETLs (#1780) (see the sketch below)
* Document new flag for datasets (#1780)
* Add flag for new datasets for rebase (#1780)

Co-authored-by: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com>
This commit is contained in:
parent 5e378aea81
commit 49623e4da0
13 changed files with 2815 additions and 1 deletion
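The commit message mentions a new per-dataset flag ("Add flag for memory intensive ETLs"). The diff below only shows the runner consuming that flag, not how it is declared, so here is a hedged sketch of what a flagged dataset entry could look like; the key names other than is_memory_intensive, and the dataset names themselves, are illustrative placeholders rather than the repository's actual constants.py schema.

# Hypothetical dataset registry entries; only the is_memory_intensive key
# mirrors the flag added in this commit -- everything else is illustrative.
DATASET_LIST = [
    {"name": "abandoned_mine_land", "is_memory_intensive": False},
    {"name": "census_tract_geometries", "is_memory_intensive": True},
]

Entries flagged True are held out of the thread pool and run one at a time, which is the split the runner change below implements.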
@@ -77,10 +77,27 @@ def etl_runner(dataset_to_run: str = None) -> None:
     """
     dataset_list = _get_datasets_to_run(dataset_to_run)
 
+    # Because we are memory constrained on our infrastructure,
+    # we split datasets into those that are not memory intensive
+    # (is_memory_intensive == False) and thereby can be safely
+    # run in parallel, and those that require more RAM and thus
+    # should be run sequentially. The is_memory_intensive_flag is
+    # set manually in constants.py based on experience running
+    # the pipeline
+    concurrent_datasets = [
+        dataset
+        for dataset in dataset_list
+        if not dataset["is_memory_intensive"]
+    ]
+    high_memory_datasets = [
+        dataset for dataset in dataset_list if dataset["is_memory_intensive"]
+    ]
+
+    logger.info("Running concurrent jobs")
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = {
             executor.submit(_run_one_dataset, dataset=dataset)
-            for dataset in dataset_list
+            for dataset in concurrent_datasets
         }
 
         for fut in concurrent.futures.as_completed(futures):
@@ -88,6 +105,10 @@ def etl_runner(dataset_to_run: str = None) -> None:
             # Otherwise, the exceptions are silently ignored.
             fut.result()
 
+    logger.info("Running high-memory jobs")
+    for dataset in high_memory_datasets:
+        _run_one_dataset(dataset=dataset)
+
 
 def score_generate() -> None:
     """Generates the score and saves it on the local data directory
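Read together, the two hunks give etl_runner a two-phase shape: datasets not flagged as memory intensive are submitted to a thread pool, the flagged ones run sequentially afterwards, and fut.result() re-raises any exception from a worker so failures are not silently swallowed. A minimal, self-contained sketch of that pattern follows, assuming a toy run_etl stand-in for the pipeline's _run_one_dataset and made-up dataset names.

import concurrent.futures
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def run_etl(dataset: dict) -> None:
    # Toy stand-in for _run_one_dataset; the real runner dispatches to an ETL class.
    logger.info("Running ETL for %s", dataset["name"])


datasets = [
    {"name": "abandoned_mine_land", "is_memory_intensive": False},
    {"name": "another_small_dataset", "is_memory_intensive": False},
    {"name": "huge_geo_dataset", "is_memory_intensive": True},
]

concurrent_datasets = [d for d in datasets if not d["is_memory_intensive"]]
high_memory_datasets = [d for d in datasets if d["is_memory_intensive"]]

logger.info("Running concurrent jobs")
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(run_etl, dataset) for dataset in concurrent_datasets
    }
    for fut in concurrent.futures.as_completed(futures):
        # result() re-raises any exception from the worker thread;
        # without it, failures would be ignored.
        fut.result()

logger.info("Running high-memory jobs")
for dataset in high_memory_datasets:
    run_etl(dataset)

The real runner submits with a keyword argument (dataset=dataset) and reads the flag from constants.py rather than from an inline list, but the control flow is the same.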