From 61d0624966300dc445e75b351cc6c5567250874f Mon Sep 17 00:00:00 2001
From: Nat Hillard <Nathaniel.K.Hillard@omb.eop.gov>
Date: Fri, 6 Aug 2021 18:21:37 -0400
Subject: [PATCH] Adding back census to list of potential datasets, but
 separating out from standard list

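The old lookup in etl_runner checked `dataset_list` rather than
`dataset_element`, so a dataset name that was not in the list (e.g.
"census") produced `[None]` instead of raising ValueError.

Error this addresses: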
  File "/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/etl/runner.py", line 71, in etl_runner
    f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
TypeError: 'NoneType' object is not subscriptable
---
 .../data_pipeline/etl/constants.py            | 42 ++++++++++
 .../data-pipeline/data_pipeline/etl/runner.py | 78 +++++++------------
 .../data_pipeline/etl/sources/census/etl.py   |  2 +-
 .../data_pipeline/etl/tests/__init__.py       |  0
 .../data_pipeline/etl/tests/test_etl.py       |  9 +++
 5 files changed, 80 insertions(+), 51 deletions(-)
 create mode 100644 data/data-pipeline/data_pipeline/etl/constants.py
 create mode 100644 data/data-pipeline/data_pipeline/etl/tests/__init__.py
 create mode 100644 data/data-pipeline/data_pipeline/etl/tests/test_etl.py

diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py
new file mode 100644
index 00000000..cef7c80c
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@@ -0,0 +1,42 @@
+DATASET_LIST = [
+    {
+        "name": "tree_equity_score",
+        "module_dir": "tree_equity_score",
+        "class_name": "TreeEquityScoreETL",
+    },
+    {
+        "name": "census_acs",
+        "module_dir": "census_acs",
+        "class_name": "CensusACSETL",
+    },
+    {
+        "name": "ejscreen",
+        "module_dir": "ejscreen",
+        "class_name": "EJScreenETL",
+    },
+    {
+        "name": "housing_and_transportation",
+        "module_dir": "housing_and_transportation",
+        "class_name": "HousingTransportationETL",
+    },
+    {
+        "name": "hud_housing",
+        "module_dir": "hud_housing",
+        "class_name": "HudHousingETL",
+    },
+    {
+        "name": "calenviroscreen",
+        "module_dir": "calenviroscreen",
+        "class_name": "CalEnviroScreenETL",
+    },
+    {
+        "name": "hud_recap",
+        "module_dir": "hud_recap",
+        "class_name": "HudRecapETL",
+    },
+]
+CENSUS_INFO = {
+    "name": "census",
+    "module_dir": "census",
+    "class_name": "CensusETL",
+}
diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py
index 093012de..71e2b5a9 100644
--- a/data/data-pipeline/data_pipeline/etl/runner.py
+++ b/data/data-pipeline/data_pipeline/etl/runner.py
@@ -4,6 +4,33 @@ from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL
 
+from . import constants
+
+
+def get_datasets_to_run(dataset_to_run: str):
+    """Returns a list of appropriate datasets to run given input args
+
+    Args:
+        dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
+
+    Returns:
+        list: dataset dicts to run (all of DATASET_LIST, or only the named dataset)
+    """
+    dataset_list = constants.DATASET_LIST
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
+
+    if dataset_to_run:
+        dataset_element = next(
+            (item for item in etls_to_search if item["name"] == dataset_to_run),
+            None,
+        )
+        if not dataset_element:
+            raise ValueError("Invalid dataset name")
+        else:
+            # reset the list to just the dataset
+            dataset_list = [dataset_element]
+    return dataset_list
+
 
 def etl_runner(dataset_to_run: str = None) -> None:
     """Runs all etl processes or a specific one
@@ -14,56 +41,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
     Returns:
         None
     """
-
-    # this list comes from YAMLs
-    dataset_list = [
-        {
-            "name": "tree_equity_score",
-            "module_dir": "tree_equity_score",
-            "class_name": "TreeEquityScoreETL",
-        },
-        {
-            "name": "census_acs",
-            "module_dir": "census_acs",
-            "class_name": "CensusACSETL",
-        },
-        {
-            "name": "ejscreen",
-            "module_dir": "ejscreen",
-            "class_name": "EJScreenETL",
-        },
-        {
-            "name": "housing_and_transportation",
-            "module_dir": "housing_and_transportation",
-            "class_name": "HousingTransportationETL",
-        },
-        {
-            "name": "hud_housing",
-            "module_dir": "hud_housing",
-            "class_name": "HudHousingETL",
-        },
-        {
-            "name": "calenviroscreen",
-            "module_dir": "calenviroscreen",
-            "class_name": "CalEnviroScreenETL",
-        },
-        {
-            "name": "hud_recap",
-            "module_dir": "hud_recap",
-            "class_name": "HudRecapETL",
-        },
-    ]
-
-    if dataset_to_run:
-        dataset_element = next(
-            (item for item in dataset_list if item["name"] == dataset_to_run),
-            None,
-        )
-        if not dataset_list:
-            raise ValueError("Invalid dataset name")
-        else:
-            # reset the list to just the dataset
-            dataset_list = [dataset_element]
+    dataset_list = get_datasets_to_run(dataset_to_run)
 
     # Run the ETLs for the dataset_list
     for dataset in dataset_list:
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
index 5a482ee0..929fc76d 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
@@ -43,7 +43,7 @@ class CensusETL(ExtractTransformLoad):
         Returns:
             Path on disk to the file_type file corresponding to this FIPS
         """
-        file_path : Path
+        file_path: Path
         if file_type == GeoFileType.SHP:
             file_path = Path(
                 self.SHP_BASE_PATH / fips_code / f"tl_2010_{fips_code}_bg10.shp"
diff --git a/data/data-pipeline/data_pipeline/etl/tests/__init__.py b/data/data-pipeline/data_pipeline/etl/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/tests/test_etl.py b/data/data-pipeline/data_pipeline/etl/tests/test_etl.py
new file mode 100644
index 00000000..0698ee50
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/tests/test_etl.py
@@ -0,0 +1,9 @@
+import pytest
+from data_pipeline.etl import constants, runner
+
+
+def test_get_datasets_to_run():
+    assert runner.get_datasets_to_run(None) == constants.DATASET_LIST
+    assert runner.get_datasets_to_run("census") == [constants.CENSUS_INFO]
+    with pytest.raises(ValueError):
+        runner.get_datasets_to_run("doesnt_exist")