Add ETL Contract Checks (#619)

* Adds dev dependencies to requirements.txt and re-runs black on codebase

* Adds test and code for national risk index etl, still in progress

* Removes test_data from .gitignore

* Adds test data to national_risk_index tests

* Creates tests and ETL class for NRI data

* Adds tests for load() and transform() methods of NationalRiskIndexETL

* Updates README.md with info about the NRI dataset

* Adds to-dos

* Moves tests and test data into a tests/ dir in national_risk_index

* Moves tmp_dir for tests into data/tmp/tests/

* Promotes fixtures to conftest and relocates national_risk_index tests:
The relocation of national_risk_index tests is necessary because tests 
can only use fixtures specified in conftests within the same package

* Fixes issue with df.equals() in test_transform()

* Files reformatted by black

* Commit changes to other files after re-running black

* Fixes unused import that caused lint checks to fail

* Moves tests/ directory to app root for data_pipeline

* Adds new methods to ExtractTransformLoad base class:
- __init__(): Initializes class attributes
- _get_census_fips_codes(): Loads a dataframe with the FIPS codes for
census block group and tract
- validate_init(): Checks that the class was initialized correctly
- validate_output(): Checks that the output was loaded correctly

* Adds test for ExtractTransformLoad.__init__() and base.py

* Fixes failing flake8 test

* Changes geo_col to geoid_col and changes is_dataset to is_census in yaml

* Adds test for validate_output()

* Adds remaining tests

* Removes is_dataset from init method

* Makes CENSUS_CSV a class attribute instead of a class global:
This ensures that CENSUS_CSV is only set when the ETL class is for a 
non-census dataset and removes the need to overwrite the value in 
mock_etl fixture

* Re-formats files with black and fixes broken tox tests
This commit is contained in:
Billy Daly 2021-10-13 15:54:15 -04:00 committed by GitHub
commit d1273b63c5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 358 additions and 32 deletions

View file

@ -1,9 +1,7 @@
from pathlib import Path
from shutil import copyfile
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.tests.conftest import copy_data_files
from data_pipeline.etl.sources.national_risk_index.etl import (
NationalRiskIndexETL,
)
@ -13,22 +11,6 @@ DATA_DIR = (
)
def copy_data_files(src: Path, dst: Path) -> None:
    """Copy a test data file from ``src`` to ``dst`` if ``dst`` is missing.

    An existing ``dst`` is left untouched, so repeated calls do not
    overwrite a file that is already in place.

    Args:
        src: pathlib.Path instance. The location of the source data file.
        dst: pathlib.Path instance. Where to copy the source data file to.

    Returns:
        None. This is a void function.
    """
    # Nothing to do when the destination file is already present.
    if dst.exists():
        return
    # Create any missing parent directories before copying.
    dst.parent.mkdir(parents=True, exist_ok=True)
    copyfile(src, dst)
    # Sanity check that the copy actually produced the destination file.
    assert dst.exists()
class TestNationalRiskIndexETL:
def test_init(self, mock_etl, mock_paths):
"""Tests that the mock NationalRiskIndexETL class instance was
@ -45,6 +27,7 @@ class TestNationalRiskIndexETL:
data_path, tmp_path = mock_paths
input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
output_dir = data_path / "dataset" / "national_risk_index_2020"
print(input_csv)
# validation
assert etl.DATA_PATH == data_path
assert etl.TMP_PATH == tmp_path
@ -66,7 +49,7 @@ class TestNationalRiskIndexETL:
input_src = DATA_DIR / "input.csv"
input_dst = etl.INPUT_CSV
acs_src = DATA_DIR / "acs.csv"
acs_dst = DATA_DIR / etl.BLOCK_GROUP_CSV
acs_dst = etl.BLOCK_GROUP_CSV
for src, dst in [(input_src, input_dst), (acs_src, acs_dst)]:
copy_data_files(src, dst)
# setup - read in sample output as dataframe