Add ETL Contract Checks (#619)

* Adds dev dependencies to requirements.txt and re-runs black on codebase
* Adds test and code for national risk index etl, still in progress
* Removes test_data from .gitignore
* Adds test data to national_risk_index tests
* Creates tests and ETL class for NRI data
* Adds tests for load() and transform() methods of NationalRiskIndexETL
* Updates README.md with info about the NRI dataset
* Adds TODOs
* Moves tests and test data into a tests/ dir in national_risk_index
* Moves tmp_dir for tests into data/tmp/tests/
* Promotes fixtures to conftest and relocates national_risk_index tests: The relocation of national_risk_index tests is necessary because tests can only use fixtures specified in conftests within the same package
* Fixes issue with df.equals() in test_transform()
* Files reformatted by black
* Commit changes to other files after re-running black
* Fixes unused import that caused lint checks to fail
* Moves tests/ directory to app root for data_pipeline
* Adds new methods to ExtractTransformLoad base class:
  - __init__() Initializes class attributes
  - _get_census_fips_codes() Loads a dataframe with the fips codes for census block group and tract
  - validate_init() Checks that the class was initialized correctly
  - validate_output() Checks that the output was loaded correctly
* Adds test for ExtractTransformLoad.__init__() and base.py
* Fixes failing flake8 test
* Changes geo_col to geoid_col and changes is_dataset to is_census in yaml
* Adds test for validate_output()
* Adds remaining tests
* Removes is_dataset from init method
* Makes CENSUS_CSV a class attribute instead of a class global: This ensures that CENSUS_CSV is only set when the ETL class is for a non-census dataset and removes the need to overwrite the value in mock_etl fixture
* Re-formats files with black and fixes broken tox tests
2025-09-11 20:28:19 -07:00 · 2021-10-13 15:54:15 -04:00 · 2021-10-13 15:54:15 -04:00 · d1273b63c5
commit d1273b63c5
parent 1f78920f63
13 changed files with 358 additions and 32 deletions
--- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py
@@ -1,9 +1,7 @@
-from pathlib import Path
-from shutil import copyfile
-
 import pandas as pd

 from data_pipeline.config import settings
+from data_pipeline.tests.conftest import copy_data_files
 from data_pipeline.etl.sources.national_risk_index.etl import (
    NationalRiskIndexETL,
 )
@@ -13,22 +11,6 @@ DATA_DIR = (
 )


-def copy_data_files(src: Path, dst: Path) -> None:
-    """Copies test data from src Path to dst Path for use in testing
-
-    Args
-        src: pathlib.Path instance. The location of the source data file.
-        dst: pathlib.Path instance. Where to copy the source data file to.
-
-    Returns
-        None. This is a void function
-    """
-    if not dst.exists():
-        dst.parent.mkdir(parents=True, exist_ok=True)
-        copyfile(src, dst)
-        assert dst.exists()
-
-
 class TestNationalRiskIndexETL:
    def test_init(self, mock_etl, mock_paths):
        """Tests that the mock NationalRiskIndexETL class instance was
@@ -45,6 +27,7 @@ class TestNationalRiskIndexETL:
        data_path, tmp_path = mock_paths
        input_csv = tmp_path / "NRI_Table_CensusTracts.csv"
        output_dir = data_path / "dataset" / "national_risk_index_2020"
+        print(input_csv)
        # validation
        assert etl.DATA_PATH == data_path
        assert etl.TMP_PATH == tmp_path
@@ -66,7 +49,7 @@ class TestNationalRiskIndexETL:
        input_src = DATA_DIR / "input.csv"
        input_dst = etl.INPUT_CSV
        acs_src = DATA_DIR / "acs.csv"
-        acs_dst = DATA_DIR / etl.BLOCK_GROUP_CSV
+        acs_dst = etl.BLOCK_GROUP_CSV
        for src, dst in [(input_src, input_dst), (acs_src, acs_dst)]:
            copy_data_files(src, dst)
        # setup - read in sample output as dataframe