mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 01:54:18 -08:00
Adding back census to list of potential datasets, but separating out from standard list
This fixes the following error: File "/Users/lucas/Documents/usds/repos/justice40-tool/data/data-pipeline/data_pipeline/etl/runner.py", line 71, in etl_runner: `f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"` raised `TypeError: 'NoneType' object is not subscriptable`.
This commit is contained in:
parent
f51b0d69d9
commit
61d0624966
5 changed files with 80 additions and 51 deletions
42
data/data-pipeline/data_pipeline/etl/constants.py
Normal file
42
data/data-pipeline/data_pipeline/etl/constants.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
# Standard ETL datasets, one (name, class_name) pair per entry. Every standard
# dataset uses the same string for its "name" and its "module_dir", so the
# module directory is derived from the name when the dicts are built below.
_STANDARD_DATASETS = (
    ("tree_equity_score", "TreeEquityScoreETL"),
    ("census_acs", "CensusACSETL"),
    ("ejscreen", "EJScreenETL"),
    ("housing_and_transportation", "HousingTransportationETL"),
    ("hud_housing", "HudHousingETL"),
    ("calenviroscreen", "CalEnviroScreenETL"),
    ("hud_recap", "HudRecapETL"),
)

# Each entry carries the keys "name", "module_dir" and "class_name".
DATASET_LIST = [
    {
        "name": dataset_name,
        "module_dir": dataset_name,
        "class_name": etl_class_name,
    }
    for dataset_name, etl_class_name in _STANDARD_DATASETS
]

# The census dataset uses the same three keys but is deliberately kept
# separate from DATASET_LIST.
CENSUS_INFO = dict(
    name="census",
    module_dir="census",
    class_name="CensusETL",
)
|
|
@ -4,6 +4,33 @@ from data_pipeline.etl.score.etl_score import ScoreETL
|
||||||
from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
|
from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
|
||||||
from data_pipeline.etl.score.etl_score_post import PostScoreETL
|
from data_pipeline.etl.score.etl_score_post import PostScoreETL
|
||||||
|
|
||||||
|
from . import constants
|
||||||
|
|
||||||
|
|
||||||
|
def get_datasets_to_run(dataset_to_run: str = None) -> list:
    """Return the dataset descriptors that should be run for the given input.

    Args:
        dataset_to_run (str): Name of a single dataset to run. If missing
            or empty, every dataset in ``constants.DATASET_LIST`` is
            selected (optional).

    Returns:
        list: The dataset descriptor dicts to run. With no
            ``dataset_to_run`` this is a copy of ``constants.DATASET_LIST``;
            otherwise it is a one-element list holding the matching entry,
            which may be ``constants.CENSUS_INFO``.

    Raises:
        ValueError: If ``dataset_to_run`` does not match the ``"name"`` of
            any known dataset.
    """
    if not dataset_to_run:
        # Hand back a copy so callers cannot mutate the shared constant.
        return list(constants.DATASET_LIST)

    # Census is not part of the standard list, but it can still be
    # requested explicitly by name.
    etls_to_search = constants.DATASET_LIST + [constants.CENSUS_INFO]
    dataset_element = next(
        (item for item in etls_to_search if item["name"] == dataset_to_run),
        None,
    )
    if dataset_element is None:
        raise ValueError("Invalid dataset name")

    return [dataset_element]
|
||||||
|
|
||||||
|
|
||||||
def etl_runner(dataset_to_run: str = None) -> None:
|
def etl_runner(dataset_to_run: str = None) -> None:
|
||||||
"""Runs all etl processes or a specific one
|
"""Runs all etl processes or a specific one
|
||||||
|
@ -14,56 +41,7 @@ def etl_runner(dataset_to_run: str = None) -> None:
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
"""
|
"""
|
||||||
|
dataset_list = get_datasets_to_run(dataset_to_run)
|
||||||
# this list comes from YAMLs
|
|
||||||
dataset_list = [
|
|
||||||
{
|
|
||||||
"name": "tree_equity_score",
|
|
||||||
"module_dir": "tree_equity_score",
|
|
||||||
"class_name": "TreeEquityScoreETL",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "census_acs",
|
|
||||||
"module_dir": "census_acs",
|
|
||||||
"class_name": "CensusACSETL",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "ejscreen",
|
|
||||||
"module_dir": "ejscreen",
|
|
||||||
"class_name": "EJScreenETL",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "housing_and_transportation",
|
|
||||||
"module_dir": "housing_and_transportation",
|
|
||||||
"class_name": "HousingTransportationETL",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "hud_housing",
|
|
||||||
"module_dir": "hud_housing",
|
|
||||||
"class_name": "HudHousingETL",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "calenviroscreen",
|
|
||||||
"module_dir": "calenviroscreen",
|
|
||||||
"class_name": "CalEnviroScreenETL",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "hud_recap",
|
|
||||||
"module_dir": "hud_recap",
|
|
||||||
"class_name": "HudRecapETL",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
if dataset_to_run:
|
|
||||||
dataset_element = next(
|
|
||||||
(item for item in dataset_list if item["name"] == dataset_to_run),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
if not dataset_list:
|
|
||||||
raise ValueError("Invalid dataset name")
|
|
||||||
else:
|
|
||||||
# reset the list to just the dataset
|
|
||||||
dataset_list = [dataset_element]
|
|
||||||
|
|
||||||
# Run the ETLs for the dataset_list
|
# Run the ETLs for the dataset_list
|
||||||
for dataset in dataset_list:
|
for dataset in dataset_list:
|
||||||
|
|
|
@ -43,7 +43,7 @@ class CensusETL(ExtractTransformLoad):
|
||||||
Returns:
|
Returns:
|
||||||
Path on disk to the file_type file corresponding to this FIPS
|
Path on disk to the file_type file corresponding to this FIPS
|
||||||
"""
|
"""
|
||||||
file_path : Path
|
file_path: Path
|
||||||
if file_type == GeoFileType.SHP:
|
if file_type == GeoFileType.SHP:
|
||||||
file_path = Path(
|
file_path = Path(
|
||||||
self.SHP_BASE_PATH / fips_code / f"tl_2010_{fips_code}_bg10.shp"
|
self.SHP_BASE_PATH / fips_code / f"tl_2010_{fips_code}_bg10.shp"
|
||||||
|
|
0
data/data-pipeline/data_pipeline/etl/tests/__init__.py
Normal file
0
data/data-pipeline/data_pipeline/etl/tests/__init__.py
Normal file
9
data/data-pipeline/data_pipeline/etl/tests/test_etl.py
Normal file
9
data/data-pipeline/data_pipeline/etl/tests/test_etl.py
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
import pytest
|
||||||
|
from data_pipeline.etl import constants, runner
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_datasets_to_run():
    """Check dataset selection for no name, the census name, and a bad name."""
    # No dataset requested: the standard list comes back.
    selected_by_default = runner.get_datasets_to_run(None)
    assert selected_by_default == constants.DATASET_LIST

    # Census is selectable by name even though it is not in the standard list.
    selected_for_census = runner.get_datasets_to_run("census")
    assert selected_for_census == [constants.CENSUS_INFO]

    # An unknown dataset name is rejected.
    with pytest.raises(ValueError):
        runner.get_datasets_to_run("doesnt_exist")
|
Loading…
Add table
Reference in a new issue