Adding census back to the list of potential datasets, but keeping it separate from the standard list

Error this addresses (raised when the requested dataset, e.g. census, is not found in the dataset list, so the lookup yields None): File "/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/etl/runner.py", line 71, in etl_runner: f"data_pipeline.etl.sources.{dataset['module_dir']}.etl" -> TypeError: 'NoneType' object is not subscriptable
2025-02-22 17:44:20 -08:00 · 2021-08-06 18:21:37 -04:00 · 2021-08-06 18:21:37 -04:00 · 61d0624966
commit 61d0624966
parent f51b0d69d9
5 changed files with 80 additions and 51 deletions
--- a/data/data-pipeline/data_pipeline/etl/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@ -0,0 +1,42 @@
+# Datasets returned by runner.get_datasets_to_run when no dataset name is given.
+# "name" is the key matched against the requested dataset name; "module_dir" is
+# interpolated into the data_pipeline.etl.sources.<module_dir>.etl module path.
+# NOTE(review): "class_name" presumably names the ETL class in that module -- confirm in runner.
+DATASET_LIST = [
+    {
+        "name": "tree_equity_score",
+        "module_dir": "tree_equity_score",
+        "class_name": "TreeEquityScoreETL",
+    },
+    {
+        "name": "census_acs",
+        "module_dir": "census_acs",
+        "class_name": "CensusACSETL",
+    },
+    {
+        "name": "ejscreen",
+        "module_dir": "ejscreen",
+        "class_name": "EJScreenETL",
+    },
+    {
+        "name": "housing_and_transportation",
+        "module_dir": "housing_and_transportation",
+        "class_name": "HousingTransportationETL",
+    },
+    {
+        "name": "hud_housing",
+        "module_dir": "hud_housing",
+        "class_name": "HudHousingETL",
+    },
+    {
+        "name": "calenviroscreen",
+        "module_dir": "calenviroscreen",
+        "class_name": "CalEnviroScreenETL",
+    },
+    {
+        "name": "hud_recap",
+        "module_dir": "hud_recap",
+        "class_name": "HudRecapETL",
+    },
+]
+# Census dataset definition, kept out of DATASET_LIST so that it is only
+# selected when requested explicitly by its name ("census").
+CENSUS_INFO = {
+    "name": "census",
+    "module_dir": "census",
+    "class_name": "CensusETL",
+}
--- a/data/data-pipeline/data_pipeline/etl/runner.py
+++ b/data/data-pipeline/data_pipeline/etl/runner.py
@ -4,6 +4,33 @@ from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL

+from . import constants
+
+
+def get_datasets_to_run(dataset_to_run: str):
+    """Returns a list of appropriate datasets to run given input args
+
+    Args:
+        dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
+
+    Returns:
+        list: constants.DATASET_LIST when dataset_to_run is falsy, otherwise a
+            one-element list holding the dataset dict whose "name" matches
+
+    Raises:
+        ValueError: if dataset_to_run does not match any known dataset name
+    """
+    dataset_list = constants.DATASET_LIST
+    # census is not part of the default list, but can still be requested by name
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
+
+    if dataset_to_run:
+        dataset_element = next(
+            (item for item in etls_to_search if item["name"] == dataset_to_run),
+            None,
+        )
+        if not dataset_element:
+            raise ValueError("Invalid dataset name")
+        else:
+            # reset the list to just the dataset
+            dataset_list = [dataset_element]
+    return dataset_list
+

 def etl_runner(dataset_to_run: str = None) -> None:
    """Runs all etl processes or a specific one
@ -14,56 +41,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
    Returns:
        None
    """
-
-    # this list comes from YAMLs
-    dataset_list = [
-        {
-            "name": "tree_equity_score",
-            "module_dir": "tree_equity_score",
-            "class_name": "TreeEquityScoreETL",
-        },
-        {
-            "name": "census_acs",
-            "module_dir": "census_acs",
-            "class_name": "CensusACSETL",
-        },
-        {
-            "name": "ejscreen",
-            "module_dir": "ejscreen",
-            "class_name": "EJScreenETL",
-        },
-        {
-            "name": "housing_and_transportation",
-            "module_dir": "housing_and_transportation",
-            "class_name": "HousingTransportationETL",
-        },
-        {
-            "name": "hud_housing",
-            "module_dir": "hud_housing",
-            "class_name": "HudHousingETL",
-        },
-        {
-            "name": "calenviroscreen",
-            "module_dir": "calenviroscreen",
-            "class_name": "CalEnviroScreenETL",
-        },
-        {
-            "name": "hud_recap",
-            "module_dir": "hud_recap",
-            "class_name": "HudRecapETL",
-        },
-    ]
-
-    if dataset_to_run:
-        dataset_element = next(
-            (item for item in dataset_list if item["name"] == dataset_to_run),
-            None,
-        )
-        if not dataset_list:
-            raise ValueError("Invalid dataset name")
-        else:
-            # reset the list to just the dataset
-            dataset_list = [dataset_element]
+    dataset_list = get_datasets_to_run(dataset_to_run)

    # Run the ETLs for the dataset_list
    for dataset in dataset_list:
--- a/data/data-pipeline/data_pipeline/etl/tests/init.py
+++ b/data/data-pipeline/data_pipeline/etl/tests/init.py
--- a/data/data-pipeline/data_pipeline/etl/tests/test_etl.py
+++ b/data/data-pipeline/data_pipeline/etl/tests/test_etl.py
@ -0,0 +1,9 @@
+import pytest
+from data_pipeline.etl import constants, runner
+
+
+def test_get_datasets_to_run():
+    """Checks the default list, selecting census by name, and an unknown name"""
+    # no dataset name given: the standard list comes back unchanged
+    assert runner.get_datasets_to_run(None) == constants.DATASET_LIST
+    # census is not in the standard list but must still be selectable by name
+    assert runner.get_datasets_to_run("census") == [constants.CENSUS_INFO]
+    # a name found in neither place is rejected
+    with pytest.raises(ValueError):
+        runner.get_datasets_to_run("doesnt_exist")