Add census back to the list of runnable datasets, but keep it separate from the standard list

Error this addresses:
  File "/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/etl/runner.py", line 71, in etl_runner
    f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
TypeError: 'NoneType' object is not subscriptable
This commit is contained in:
Nat Hillard 2021-08-06 18:21:37 -04:00
parent f51b0d69d9
commit 61d0624966
5 changed files with 80 additions and 51 deletions

View file

@ -0,0 +1,42 @@
def _etl_entry(name, class_name, module_dir=None):
    """Build one dataset definition dict.

    ``module_dir`` falls back to ``name`` because every entry defined below
    uses the same string for both fields.
    """
    return {
        "name": name,
        "module_dir": name if module_dir is None else module_dir,
        "class_name": class_name,
    }


# Standard datasets: this is the list that is run when no specific dataset
# name is requested.
DATASET_LIST = [
    _etl_entry("tree_equity_score", "TreeEquityScoreETL"),
    _etl_entry("census_acs", "CensusACSETL"),
    _etl_entry("ejscreen", "EJScreenETL"),
    _etl_entry("housing_and_transportation", "HousingTransportationETL"),
    _etl_entry("hud_housing", "HudHousingETL"),
    _etl_entry("calenviroscreen", "CalEnviroScreenETL"),
    _etl_entry("hud_recap", "HudRecapETL"),
]

# Census is intentionally kept out of DATASET_LIST; it is a separate entry
# that can still be looked up by its name.
CENSUS_INFO = _etl_entry("census", "CensusETL")

View file

@ -4,6 +4,33 @@ from data_pipeline.etl.score.etl_score import ScoreETL
from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
from data_pipeline.etl.score.etl_score_post import PostScoreETL
from . import constants
def get_datasets_to_run(dataset_to_run: str):
    """Select the dataset definitions that an ETL run should process.

    Args:
        dataset_to_run (str): Name of a single dataset to run. When falsy
            (e.g. ``None``), the standard dataset list is selected (optional)

    Returns:
        list: ``constants.DATASET_LIST`` when no name is given; otherwise a
        one-element list holding the dataset definition whose ``"name"``
        matches. ``constants.CENSUS_INFO`` is not part of the standard list
        and is only returned when requested by name.

    Raises:
        ValueError: If ``dataset_to_run`` matches no known dataset name.
    """
    standard_datasets = constants.DATASET_LIST
    # Census can be requested by name even though it is kept out of the
    # standard list, so it is added to the lookup candidates only.
    candidates = standard_datasets + [constants.CENSUS_INFO]

    # No specific dataset requested: run the standard list as-is.
    if not dataset_to_run:
        return standard_datasets

    # First definition whose name matches wins.
    for candidate in candidates:
        if candidate["name"] == dataset_to_run:
            return [candidate]

    raise ValueError("Invalid dataset name")
def etl_runner(dataset_to_run: str = None) -> None:
"""Runs all etl processes or a specific one
@ -14,56 +41,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
Returns:
None
"""
# this list comes from YAMLs
dataset_list = [
{
"name": "tree_equity_score",
"module_dir": "tree_equity_score",
"class_name": "TreeEquityScoreETL",
},
{
"name": "census_acs",
"module_dir": "census_acs",
"class_name": "CensusACSETL",
},
{
"name": "ejscreen",
"module_dir": "ejscreen",
"class_name": "EJScreenETL",
},
{
"name": "housing_and_transportation",
"module_dir": "housing_and_transportation",
"class_name": "HousingTransportationETL",
},
{
"name": "hud_housing",
"module_dir": "hud_housing",
"class_name": "HudHousingETL",
},
{
"name": "calenviroscreen",
"module_dir": "calenviroscreen",
"class_name": "CalEnviroScreenETL",
},
{
"name": "hud_recap",
"module_dir": "hud_recap",
"class_name": "HudRecapETL",
},
]
if dataset_to_run:
dataset_element = next(
(item for item in dataset_list if item["name"] == dataset_to_run),
None,
)
if not dataset_list:
raise ValueError("Invalid dataset name")
else:
# reset the list to just the dataset
dataset_list = [dataset_element]
dataset_list = get_datasets_to_run(dataset_to_run)
# Run the ETLs for the dataset_list
for dataset in dataset_list:

View file

@ -0,0 +1,9 @@
import pytest
from data_pipeline.etl import constants, runner
def test_get_datasets_to_run():
    """Check dataset selection for no name, the census name, and a bad name."""
    # No dataset requested: the standard list comes back.
    selected_by_default = runner.get_datasets_to_run(None)
    assert selected_by_default == constants.DATASET_LIST

    # Census is selectable by name and comes back as a single-element list.
    selected_census = runner.get_datasets_to_run("census")
    assert selected_census == [constants.CENSUS_INFO]

    # Unknown names are rejected.
    with pytest.raises(ValueError):
        runner.get_datasets_to_run("doesnt_exist")