mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-03 22:14:18 -07:00
Add ETL Contract Checks (#619)
* Adds dev dependencies to requirements.txt and re-runs black on codebase
* Adds test and code for national risk index etl, still in progress
* Removes test_data from .gitignore
* Adds test data to national_risk_index tests
* Creates tests and ETL class for NRI data
* Adds tests for load() and transform() methods of NationalRiskIndexETL
* Updates README.md with info about the NRI dataset
* Adds to-dos
* Moves tests and test data into a tests/ dir in national_risk_index
* Moves tmp_dir for tests into data/tmp/tests/
* Promotes fixtures to conftest and relocates national_risk_index tests: The relocation of national_risk_index tests is necessary because tests can only use fixtures specified in conftests within the same package
* Fixes issue with df.equals() in test_transform()
* Files reformatted by black
* Commit changes to other files after re-running black
* Fixes unused import that caused lint checks to fail
* Moves tests/ directory to app root for data_pipeline
* Adds new methods to ExtractTransformLoad base class:
  - __init__() Initializes class attributes
  - _get_census_fips_codes() Loads a dataframe with the fips codes for census block group and tract
  - validate_init() Checks that the class was initialized correctly
  - validate_output() Checks that the output was loaded correctly
* Adds test for ExtractTransformLoad.__init__() and base.py
* Fixes failing flake8 test
* Changes geo_col to geoid_col and changes is_dataset to is_census in yaml
* Adds test for validate_output()
* Adds remaining tests
* Removes is_dataset from init method
* Makes CENSUS_CSV a class attribute instead of a class global: This ensures that CENSUS_CSV is only set when the ETL class is for a non-census dataset and removes the need to overwrite the value in mock_etl fixture
* Re-formats files with black and fixes broken tox tests
This commit is contained in:
parent
1f78920f63
commit
d1273b63c5
13 changed files with 358 additions and 32 deletions
10
data/data-pipeline/data_pipeline/tests/base/config.yaml
Normal file
10
data/data-pipeline/data_pipeline/tests/base/config.yaml
Normal file
|
@ -0,0 +1,10 @@
|
|||
name: Template
|
||||
year: null
|
||||
is_census: false
|
||||
source_url: https://github.com/usds/justice40-tool/
|
||||
geo_level: Census Block Group
|
||||
geoid_col: GEO COL
|
||||
score_cols:
|
||||
- COL 1
|
||||
- COL 2
|
||||
- COL 3
|
11
data/data-pipeline/data_pipeline/tests/base/data/census.csv
Normal file
11
data/data-pipeline/data_pipeline/tests/base/data/census.csv
Normal file
|
@ -0,0 +1,11 @@
|
|||
GEOID10,POPULATION
|
||||
050070403001,1000
|
||||
050070403002,1500
|
||||
050010201001,1000
|
||||
050010201002,1500
|
||||
150070405001,2000
|
||||
150070405002,2250
|
||||
150010210101,2000
|
||||
150010210102,1500
|
||||
150010211011,1750
|
||||
150010211012,1500
|
|
11
data/data-pipeline/data_pipeline/tests/base/data/output.csv
Normal file
11
data/data-pipeline/data_pipeline/tests/base/data/output.csv
Normal file
|
@ -0,0 +1,11 @@
|
|||
GEOID10,GEOID10_TRACT,COL 1,COL 2,COL 3
|
||||
050070403001,05007040300,10,10,10
|
||||
050070403002,05007040300,20,20,20
|
||||
050010201001,05001020100,30,30,30
|
||||
050010201002,05001020100,40,40,40
|
||||
150070405001,15007040500,50,50,50
|
||||
150070405002,15007040500,60,60,60
|
||||
150010210101,15001021010,70,70,70
|
||||
150010210102,15001021010,80,80,80
|
||||
150010211011,15001021101,90,90,90
|
||||
150010211012,15001021101,100,100,100
|
|
|
@ -0,0 +1,10 @@
|
|||
name = Template # uses equal sign instead of colon
|
||||
year: null
|
||||
is_dataset: true
|
||||
source_url: https://github.com/usds/justice40-tool/
|
||||
geo_level: Census Block Group
|
||||
geoid_col: GEO COL
|
||||
score_cols:
|
||||
- COL 1
|
||||
- COL 2
|
||||
- COL 3
|
161
data/data-pipeline/data_pipeline/tests/base/test_base.py
Normal file
161
data/data-pipeline/data_pipeline/tests/base/test_base.py
Normal file
|
@ -0,0 +1,161 @@
|
|||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
import pytest
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
|
||||
TEST_DIR = settings.APP_ROOT / "tests" / "base"
|
||||
DATA_DIR = TEST_DIR / "data"
|
||||
CONFIG_PATH = TEST_DIR / "config.yaml"
|
||||
OUTPUT_SRC = DATA_DIR / "output.csv"
|
||||
|
||||
|
||||
def remove_output(etl):
    """Clears the output csv of the given ETL instance if it exists

    Args
        etl: ETL instance whose OUTPUT_PATH attribute (a pathlib.Path) points
            to the output file that should be removed.

    Returns
        None

    Raises
        AssertionError: if the file is still present after the removal attempt
    """
    # Work on the instance the caller passed in instead of building a second
    # TemplateETL, so the helper clears exactly the file the test will check.
    if etl.OUTPUT_PATH.exists():
        etl.OUTPUT_PATH.unlink()
    assert etl.OUTPUT_PATH.exists() is False
|
||||
|
||||
|
||||
def load_output_source(etl):
    """Reads the sample output csv into a dataframe that a test can modify

    The two GEOID columns named on the ETL instance are read with the
    "string" dtype, so codes such as 050070403001 are kept as text.

    Args
        etl: ETL instance providing GEOID_FIELD_NAME and
            GEOID_TRACT_FIELD_NAME.

    Returns
        pandas.DataFrame with the contents of OUTPUT_SRC
    """
    geoid_dtypes = {
        etl.GEOID_FIELD_NAME: "string",
        etl.GEOID_TRACT_FIELD_NAME: "string",
    }
    return pd.read_csv(OUTPUT_SRC, dtype=geoid_dtypes)
|
||||
|
||||
|
||||
class TemplateETL(ExtractTransformLoad):
    """Stand-in ETL used by the tests; subclasses the base ETL class"""

    def __init__(self, config_path: Path) -> None:
        # The base class is initialised from the config file first; the
        # attributes below are then attached to the instance for test use.
        super().__init__(config_path)
        self.df: pd.DataFrame = None
        self.EXTRACTED_CSV: Path = DATA_DIR / "output.csv"
|
||||
|
||||
|
||||
class TestInit:
    """Exercises ExtractTransformLoad.__init__() through a subclass
    (TemplateETL) that calls super().__init__()"""

    def test_init(self, mock_paths, mock_etl):
        """Checks that instantiation with a valid config succeeds

        Validates the following conditions:
        - The class is instantiated without raising
        - The attributes asserted below hold the expected values
        """
        # setup
        data_path, _ = mock_paths
        expected_output = data_path / "dataset" / "template" / "usa.csv"
        template = TemplateETL(CONFIG_PATH)
        # validation
        assert template.NAME == "Template"
        assert template.SOURCE_URL == "https://github.com/usds/justice40-tool/"
        assert template.GEOID_COL == "GEO COL"
        assert template.GEO_LEVEL == "Census Block Group"
        assert template.SCORE_COLS == ["COL 1", "COL 2", "COL 3"]
        assert template.OUTPUT_PATH == expected_output
        assert template.CENSUS_CSV.exists()

    def test_init_missing_config(self, mock_etl):
        """Checks that FileNotFoundError is raised when the config path passed
        to the constructor does not exist
        """
        # setup - a path that must not be present on disk
        missing_config = settings.APP_ROOT / "fake_path"
        assert not missing_config.exists()
        # execute
        with pytest.raises(FileNotFoundError):
            TemplateETL(missing_config)

    def test_init_bad_config(self, mock_etl):
        """Checks that yaml.YAMLError is raised when the config file passed to
        the constructor contains invalid yaml
        """
        # setup - the malformed fixture must be present on disk
        broken_config = TEST_DIR / "invalid_config.yaml"
        assert broken_config.exists()
        # execute
        with pytest.raises(yaml.YAMLError):
            TemplateETL(broken_config)
|
||||
|
||||
|
||||
class TestValidateOutput:
    """Tests the ExtractTransformLoad.validate_output() method

    Each failure test starts from the sample output csv, applies exactly one
    change, writes the result to etl.OUTPUT_PATH and checks the exception
    raised by validate_output().
    """

    def test_validate_output_success(self, mock_etl):
        """Tests that validate_output() runs successfully with valid output"""
        # setup - instantiate etl class
        etl = TemplateETL(CONFIG_PATH)
        # setup - copy the unmodified sample output to the output location
        shutil.copyfile(OUTPUT_SRC, etl.OUTPUT_PATH)
        # validation - the test passes if no exception is raised
        etl.validate_output()

    def test_validate_output_missing_output(self, mock_etl):
        """Tests that validate_output() fails if the output isn't written to
        the location at self.OUTPUT_PATH
        """
        # setup - instantiate etl class and remove output file
        etl = TemplateETL(CONFIG_PATH)
        remove_output(etl)
        # validation
        with pytest.raises(AssertionError):
            etl.validate_output()

    def test_validate_missing_geoid_col(self, mock_etl):
        """Tests that validate_output() fails if the output is missing one of
        the census fips codes columns
        """
        # setup - instantiate etl class and remove output file
        etl = TemplateETL(CONFIG_PATH)
        remove_output(etl)
        # setup - delete GEOID10 col from output
        df = load_output_source(etl)
        df.drop(etl.GEOID_FIELD_NAME, axis=1, inplace=True)
        assert etl.GEOID_FIELD_NAME not in df.columns
        # index=False keeps the dataframe index out of the file, so the
        # dropped column is the only difference from the sample output
        df.to_csv(etl.OUTPUT_PATH, index=False)
        # validation
        with pytest.raises(KeyError):
            etl.validate_output()

    def test_validate_missing_census_block_group(self, mock_etl):
        """Tests that validate_output() fails if the output is missing one of
        the census block group rows
        """
        # setup - instantiate etl class and remove output file
        etl = TemplateETL(CONFIG_PATH)
        remove_output(etl)
        # setup - remove the first Census Block Group
        df = load_output_source(etl)
        df.drop(index=df.index[0], axis=0, inplace=True)  # delete row 1
        assert len(df) == 9
        # index=False keeps the dataframe index out of the file, so the
        # dropped row is the only difference from the sample output
        df.to_csv(etl.OUTPUT_PATH, index=False)
        # validation
        with pytest.raises(AssertionError):
            etl.validate_output()

    def test_validate_missing_score_col(self, mock_etl):
        """Tests that validate_output() fails if the output is missing one of
        the columns used in the score
        """
        # setup - instantiate etl class and remove output file
        etl = TemplateETL(CONFIG_PATH)
        remove_output(etl)
        # setup - delete one of the score columns
        df = load_output_source(etl)
        df.drop("COL 1", axis=1, inplace=True)
        assert "COL 1" not in df.columns
        # index=False keeps the dataframe index out of the file, so the
        # dropped column is the only difference from the sample output
        df.to_csv(etl.OUTPUT_PATH, index=False)
        # validation
        with pytest.raises(AssertionError):
            etl.validate_output()
|
|
@ -1,4 +1,6 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
from shutil import copyfile
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -8,6 +10,22 @@ from data_pipeline.etl.base import ExtractTransformLoad
|
|||
TMP_DIR = settings.APP_ROOT / "data" / "tmp" / "tests"
|
||||
|
||||
|
||||
def copy_data_files(src: Path, dst: Path) -> None:
    """Copies a test data file into place unless the destination is present

    Args
        src: pathlib.Path instance. The file to copy from.
        dst: pathlib.Path instance. The location to copy the file to. Missing
            parent directories are created before copying.

    Returns
        None. This is a void function
    """
    # Nothing to do when the destination is already in place
    if dst.exists():
        return
    dst.parent.mkdir(parents=True, exist_ok=True)
    copyfile(src, dst)
    assert dst.exists()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def mock_paths(tmp_path_factory) -> tuple:
|
||||
"""Creates new DATA_PATH and TMP_PATH that point to a temporary local
|
||||
|
@ -23,8 +41,17 @@ def mock_paths(tmp_path_factory) -> tuple:
|
|||
return data_path, tmp_path
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def mock_census(mock_paths) -> Path:
    """Copies the sample census csv under the mock data path and returns the
    location of the copy
    """
    data_path, _ = mock_paths
    source = settings.APP_ROOT / "tests" / "base" / "data" / "census.csv"
    target = data_path / "census" / "csv" / "us.csv"
    copy_data_files(source, target)
    return target
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_etl(monkeypatch, mock_paths) -> None:
|
||||
def mock_etl(monkeypatch, mock_paths, mock_census) -> None:
|
||||
"""Creates a mock version of the base ExtractTransformLoad class and resets
|
||||
global the variables for DATA_PATH and TMP_PATH to the local mock_paths
|
||||
"""
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
from pathlib import Path
|
||||
from shutil import copyfile
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.tests.conftest import copy_data_files
|
||||
from data_pipeline.etl.sources.national_risk_index.etl import (
|
||||
NationalRiskIndexETL,
|
||||
)
|
||||
|
@ -13,22 +11,6 @@ DATA_DIR = (
|
|||
)
|
||||
|
||||
|
||||
def copy_data_files(src: Path, dst: Path) -> None:
|
||||
"""Copies test data from src Path to dst Path for use in testing
|
||||
|
||||
Args
|
||||
src: pathlib.Path instance. The location of the source data file.
|
||||
dst: pathlib.Path instance. Where to copy the source data file to.
|
||||
|
||||
Returns
|
||||
None. This is a void function
|
||||
"""
|
||||
if not dst.exists():
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
copyfile(src, dst)
|
||||
assert dst.exists()
|
||||
|
||||
|
||||
class TestNationalRiskIndexETL:
|
||||
def test_init(self, mock_etl, mock_paths):
|
||||
"""Tests that the mock NationalRiskIndexETL class instance was
|
||||
|
@ -45,6 +27,7 @@ class TestNationalRiskIndexETL:
|
|||
data_path, tmp_path = mock_paths
|
||||
input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
|
||||
output_dir = data_path / "dataset" / "national_risk_index_2020"
|
||||
print(input_csv)
|
||||
# validation
|
||||
assert etl.DATA_PATH == data_path
|
||||
assert etl.TMP_PATH == tmp_path
|
||||
|
@ -66,7 +49,7 @@ class TestNationalRiskIndexETL:
|
|||
input_src = DATA_DIR / "input.csv"
|
||||
input_dst = etl.INPUT_CSV
|
||||
acs_src = DATA_DIR / "acs.csv"
|
||||
acs_dst = DATA_DIR / etl.BLOCK_GROUP_CSV
|
||||
acs_dst = etl.BLOCK_GROUP_CSV
|
||||
for src, dst in [(input_src, input_dst), (acs_src, acs_dst)]:
|
||||
copy_data_files(src, dst)
|
||||
# setup - read in sample output as dataframe
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue