From 61d0624966300dc445e75b351cc6c5567250874f Mon Sep 17 00:00:00 2001
From: Nat Hillard
Date: Fri, 6 Aug 2021 18:21:37 -0400
Subject: [PATCH] Adding back census to list of potential datasets, but separating out from standard list

Error this addresses:
  File "/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/etl/runner.py", line 71, in etl_runner
    f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
TypeError: 'NoneType' object is not subscriptable
---
 .../data_pipeline/etl/constants.py            | 42 ++++++++++
 .../data-pipeline/data_pipeline/etl/runner.py | 78 +++++++------------
 .../data_pipeline/etl/sources/census/etl.py   |  2 +-
 .../data_pipeline/etl/tests/__init__.py       |  0
 .../data_pipeline/etl/tests/test_etl.py       |  9 +++
 5 files changed, 80 insertions(+), 51 deletions(-)
 create mode 100644 data/data-pipeline/data_pipeline/etl/constants.py
 create mode 100644 data/data-pipeline/data_pipeline/etl/tests/__init__.py
 create mode 100644 data/data-pipeline/data_pipeline/etl/tests/test_etl.py

diff --git a/data/data-pipeline/data_pipeline/etl/constants.py b/data/data-pipeline/data_pipeline/etl/constants.py
new file mode 100644
index 00000000..cef7c80c
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/constants.py
@@ -0,0 +1,42 @@
+DATASET_LIST = [
+    {
+        "name": "tree_equity_score",
+        "module_dir": "tree_equity_score",
+        "class_name": "TreeEquityScoreETL",
+    },
+    {
+        "name": "census_acs",
+        "module_dir": "census_acs",
+        "class_name": "CensusACSETL",
+    },
+    {
+        "name": "ejscreen",
+        "module_dir": "ejscreen",
+        "class_name": "EJScreenETL",
+    },
+    {
+        "name": "housing_and_transportation",
+        "module_dir": "housing_and_transportation",
+        "class_name": "HousingTransportationETL",
+    },
+    {
+        "name": "hud_housing",
+        "module_dir": "hud_housing",
+        "class_name": "HudHousingETL",
+    },
+    {
+        "name": "calenviroscreen",
+        "module_dir": "calenviroscreen",
+        "class_name": "CalEnviroScreenETL",
+    },
+    {
+        "name": "hud_recap",
+        "module_dir": "hud_recap",
+        "class_name": "HudRecapETL",
+    },
+]
+CENSUS_INFO = {
+    "name": "census",
+    "module_dir": "census",
+    "class_name": "CensusETL",
+}
diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py
index 093012de..71e2b5a9 100644
--- a/data/data-pipeline/data_pipeline/etl/runner.py
+++ b/data/data-pipeline/data_pipeline/etl/runner.py
@@ -4,6 +4,33 @@ from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL
 
+from . import constants
+
+
+def get_datasets_to_run(dataset_to_run: str):
+    """Returns a list of appropriate datasets to run given input args
+
+    Args:
+        dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
+
+    Returns:
+        list: all standard datasets, or only the dataset matching the given name
+    """
+    dataset_list = constants.DATASET_LIST
+    etls_to_search = dataset_list + [constants.CENSUS_INFO]
+
+    if dataset_to_run:
+        dataset_element = next(
+            (item for item in etls_to_search if item["name"] == dataset_to_run),
+            None,
+        )
+        if not dataset_element:
+            raise ValueError("Invalid dataset name")
+        else:
+            # reset the list to just the dataset
+            dataset_list = [dataset_element]
+    return dataset_list
+
 
 def etl_runner(dataset_to_run: str = None) -> None:
     """Runs all etl processes or a specific one
@@ -14,56 +41,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
     Returns:
         None
     """
-
-    # this list comes from YAMLs
-    dataset_list = [
-        {
-            "name": "tree_equity_score",
-            "module_dir": "tree_equity_score",
-            "class_name": "TreeEquityScoreETL",
-        },
-        {
-            "name": "census_acs",
-            "module_dir": "census_acs",
-            "class_name": "CensusACSETL",
-        },
-        {
-            "name": "ejscreen",
-            "module_dir": "ejscreen",
-            "class_name": "EJScreenETL",
-        },
-        {
-            "name": "housing_and_transportation",
-            "module_dir": "housing_and_transportation",
-            "class_name": "HousingTransportationETL",
-        },
-        {
-            "name": "hud_housing",
-            "module_dir": "hud_housing",
-            "class_name": "HudHousingETL",
-        },
-        {
-            "name": "calenviroscreen",
-            "module_dir": "calenviroscreen",
-            "class_name": "CalEnviroScreenETL",
-        },
-        {
-            "name": "hud_recap",
-            "module_dir": "hud_recap",
-            "class_name": "HudRecapETL",
-        },
-    ]
-
-    if dataset_to_run:
-        dataset_element = next(
-            (item for item in dataset_list if item["name"] == dataset_to_run),
-            None,
-        )
-        if not dataset_list:
-            raise ValueError("Invalid dataset name")
-        else:
-            # reset the list to just the dataset
-            dataset_list = [dataset_element]
+    dataset_list = get_datasets_to_run(dataset_to_run)
 
     # Run the ETLs for the dataset_list
     for dataset in dataset_list:
diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
index 5a482ee0..929fc76d 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py
@@ -43,7 +43,7 @@ class CensusETL(ExtractTransformLoad):
         Returns:
             Path on disk to the file_type file corresponding to this FIPS
         """
-        file_path : Path
+        file_path: Path
         if file_type == GeoFileType.SHP:
             file_path = Path(
                 self.SHP_BASE_PATH / fips_code / f"tl_2010_{fips_code}_bg10.shp"
diff --git a/data/data-pipeline/data_pipeline/etl/tests/__init__.py b/data/data-pipeline/data_pipeline/etl/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/data/data-pipeline/data_pipeline/etl/tests/test_etl.py b/data/data-pipeline/data_pipeline/etl/tests/test_etl.py
new file mode 100644
index 00000000..0698ee50
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/tests/test_etl.py
@@ -0,0 +1,9 @@
+import pytest
+from data_pipeline.etl import constants, runner
+
+
+def test_get_datasets_to_run():
+    assert runner.get_datasets_to_run(None) == constants.DATASET_LIST
+    assert runner.get_datasets_to_run("census") == [constants.CENSUS_INFO]
+    with pytest.raises(ValueError):
+        runner.get_datasets_to_run("doesnt_exist")