Add abandoned mine lands data (#1824)

* Add notebook to generate test data (#1780)

* Add Abandoned Mine Land data (#1780)

Using a similar structure but a simpler approach compared to FUDs, add an
indicator for whether a tract has an abandoned mine.

* Adding some detail to dataset readmes

Just a thought!

* Apply feedback from review (#1780)

* Fixup bad string that broke test (#1780)

* Update a string that I should have renamed (#1780)

* Reduce number of threads to reduce memory pressure (#1780)

* Try not running geo data (#1780)

* Run the high-memory sets separately (#1780)

* Actually deduplicate (#1780)

* Add flag for memory intensive ETLs (#1780)

* Document new flag for datasets (#1780)

* Add flag for new datasets for rebase (#1780)

Co-authored-by: Emma Nechamkin <97977170+emma-nechamkin@users.noreply.github.com>
This commit is contained in:
Matt Bowen 2022-08-17 11:33:59 -04:00 committed by GitHub
commit 49623e4da0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 2815 additions and 1 deletion


@@ -77,10 +77,27 @@ def etl_runner(dataset_to_run: str = None) -> None:
     """
     dataset_list = _get_datasets_to_run(dataset_to_run)

+    # Because we are memory constrained on our infrastructure,
+    # we split datasets into those that are not memory intensive
+    # (is_memory_intensive == False) and thereby can safely be
+    # run in parallel, and those that require more RAM and thus
+    # should be run sequentially. The is_memory_intensive flag is
+    # set manually in constants.py based on experience running
+    # the pipeline.
+    concurrent_datasets = [
+        dataset
+        for dataset in dataset_list
+        if not dataset["is_memory_intensive"]
+    ]
+    high_memory_datasets = [
+        dataset for dataset in dataset_list if dataset["is_memory_intensive"]
+    ]
+
+    logger.info("Running concurrent jobs")
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = {
             executor.submit(_run_one_dataset, dataset=dataset)
-            for dataset in dataset_list
+            for dataset in concurrent_datasets
         }

         for fut in concurrent.futures.as_completed(futures):
@@ -88,6 +105,10 @@ def etl_runner(dataset_to_run: str = None) -> None:
             # Otherwise, the exceptions are silently ignored.
             fut.result()

+    logger.info("Running high-memory jobs")
+    for dataset in high_memory_datasets:
+        _run_one_dataset(dataset=dataset)
+

 def score_generate() -> None:
     """Generates the score and saves it on the local data directory
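The memory-aware scheduling in the diff above can be sketched as a standalone script. Note the dataset names and the `_run_one_dataset` body below are illustrative stand-ins, not the project's actual registry; only the split-then-run pattern matches the change:

```python
import concurrent.futures

# Illustrative dataset registry; in the real pipeline the
# is_memory_intensive flag is set manually in constants.py.
DATASETS = [
    {"name": "census", "is_memory_intensive": False},
    {"name": "fuds", "is_memory_intensive": False},
    {"name": "geo", "is_memory_intensive": True},
]


def _run_one_dataset(dataset):
    # Placeholder for the real ETL step.
    return f"ran {dataset['name']}"


def etl_runner(dataset_list):
    # Low-memory datasets can safely share the machine, so they
    # run in parallel threads.
    concurrent_datasets = [
        d for d in dataset_list if not d["is_memory_intensive"]
    ]
    # High-memory datasets run sequentially to avoid exhausting RAM.
    high_memory_datasets = [
        d for d in dataset_list if d["is_memory_intensive"]
    ]

    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(_run_one_dataset, dataset=d)
            for d in concurrent_datasets
        }
        for fut in concurrent.futures.as_completed(futures):
            # .result() re-raises any exception from the worker;
            # without it, failures would be silently ignored.
            results.append(fut.result())

    for d in high_memory_datasets:
        results.append(_run_one_dataset(dataset=d))
    return results
```

Running the high-memory jobs only after the executor's `with` block exits guarantees all concurrent workers have finished, so peak memory is never the sum of both groups.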