Updating testing to include pytest-snapshot (#1355)

In this commit, we slightly change the testing to use `pytest-snapshot`. This is for `ETL`s only.
This commit is contained in:
Emma Nechamkin 2022-03-11 21:34:07 -05:00 committed by GitHub
commit 9d920d4db4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 328 additions and 415 deletions

View file

@ -0,0 +1,16 @@
GEOID10_TRACT,Input Field 1
06007040300,2.00000
06001020100,6.10000
06007040500,-7.80000
15001021010,12.00000
15001021101,12.05525
15007040603,13.51418
15007040700,13.11989
15009030100,13.60947
15009030201,13.73235
15001021402,14.73305
15001021800,16.60834
15009030402,16.00254
15009030800,15.34818
15003010201,14.58789
15007040604,14.27705
1 GEOID10_TRACT Input Field 1
2 06007040300 2.00000
3 06001020100 6.10000
4 06007040500 -7.80000
5 15001021010 12.00000
6 15001021101 12.05525
7 15007040603 13.51418
8 15007040700 13.11989
9 15009030100 13.60947
10 15009030201 13.73235
11 15001021402 14.73305
12 15001021800 16.60834
13 15009030402 16.00254
14 15009030800 15.34818
15 15003010201 14.58789
16 15007040604 14.27705

View file

@ -1,16 +0,0 @@
GEOID10_TRACT,Input Field 1
06007040300,2
06001020100,6.1
06007040500,-7.8
15001021010,12
15001021101,12.05524783
15007040603,13.51417578
15007040700,13.11988976
15009030100,13.60946983
15009030201,13.73235164
15001021402,14.73305116
15001021800,16.60833857
15009030402,16.002535
15009030800,15.34818251
15003010201,14.58788769
15007040604,14.27704917
1 GEOID10_TRACT Input Field 1
2 06007040300 2
3 06001020100 6.1
4 06007040500 -7.8
5 15001021010 12
6 15001021101 12.05524783
7 15007040603 13.51417578
8 15007040700 13.11988976
9 15009030100 13.60946983
10 15009030201 13.73235164
11 15001021402 14.73305116
12 15001021800 16.60833857
13 15009030402 16.002535
14 15009030800 15.34818251
15 15003010201 14.58788769
16 15007040604 14.27704917

View file

@ -1,16 +1,16 @@
GEOID10_TRACT,Example Field 1
06007040300,4.0
06001020100,12.2
06007040500,-15.6
15001021010,24.0
15001021101,24.11049566
15007040603,27.02835156
15007040700,26.23977952
15009030100,27.21893966
15009030201,27.46470328
15001021402,29.46610232
15001021800,33.21667714
06007040300,4.00000
06001020100,12.20000
06007040500,-15.60000
15001021010,24.00000
15001021101,24.11050
15007040603,27.02835
15007040700,26.23978
15009030100,27.21894
15009030201,27.46470
15001021402,29.46610
15001021800,33.21668
15009030402,32.00507
15009030800,30.69636502
15003010201,29.17577538
15007040604,28.55409834
15009030800,30.69637
15003010201,29.17578
15007040604,28.55410

1 GEOID10_TRACT Example Field 1
2 06007040300 4.0 4.00000
3 06001020100 12.2 12.20000
4 06007040500 -15.6 -15.60000
5 15001021010 24.0 24.00000
6 15001021101 24.11049566 24.11050
7 15007040603 27.02835156 27.02835
8 15007040700 26.23977952 26.23978
9 15009030100 27.21893966 27.21894
10 15009030201 27.46470328 27.46470
11 15001021402 29.46610232 29.46610
12 15001021800 33.21667714 33.21668
13 15009030402 32.00507
14 15009030800 30.69636502 30.69637
15 15003010201 29.17577538 29.17578
16 15007040604 28.55409834 28.55410

View file

@ -1,16 +1,16 @@
GEOID10_TRACT,Input Field 1,Example Field 1
06007040300,2.0,4.0
06001020100,6.1,12.2
06007040500,-7.8,-15.6
15001021010,12.0,24.0
15001021101,12.05524783,24.11049566
15007040603,13.51417578,27.02835156
15007040700,13.11988976,26.23977952
15009030100,13.60946983,27.21893966
15009030201,13.73235164,27.46470328
15001021402,14.73305116,29.46610232
15001021800,16.60833857,33.21667714
15009030402,16.002535,32.00507
15009030800,15.34818251,30.69636502
15003010201,14.58788769,29.17577538
15007040604,14.27704917,28.55409834
06007040300,2.00000,4.00000
06001020100,6.10000,12.20000
06007040500,-7.80000,-15.60000
15001021010,12.00000,24.00000
15001021101,12.05525,24.11050
15007040603,13.51418,27.02835
15007040700,13.11989,26.23978
15009030100,13.60947,27.21894
15009030201,13.73235,27.46470
15001021402,14.73305,29.46610
15001021800,16.60834,33.21668
15009030402,16.00254,32.00507
15009030800,15.34818,30.69637
15003010201,14.58789,29.17578
15007040604,14.27705,28.55410

1 GEOID10_TRACT Input Field 1 Example Field 1
2 06007040300 2.0 2.00000 4.0 4.00000
3 06001020100 6.1 6.10000 12.2 12.20000
4 06007040500 -7.8 -7.80000 -15.6 -15.60000
5 15001021010 12.0 12.00000 24.0 24.00000
6 15001021101 12.05524783 12.05525 24.11049566 24.11050
7 15007040603 13.51417578 13.51418 27.02835156 27.02835
8 15007040700 13.11988976 13.11989 26.23977952 26.23978
9 15009030100 13.60946983 13.60947 27.21893966 27.21894
10 15009030201 13.73235164 13.73235 27.46470328 27.46470
11 15001021402 14.73305116 14.73305 29.46610232 29.46610
12 15001021800 16.60833857 16.60834 33.21667714 33.21668
13 15009030402 16.002535 16.00254 32.00507
14 15009030800 15.34818251 15.34818 30.69636502 30.69637
15 15003010201 14.58788769 14.58789 29.17577538 29.17578
16 15007040604 14.27704917 14.27705 28.55409834 28.55410

View file

@ -2,13 +2,13 @@
import copy
import os
import pathlib
from typing import Type
import pytest
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.tests.conftest import copy_data_files
from data_pipeline.tests.sources.example.etl import ExampleETL
from data_pipeline.utils import get_module_logger
@ -17,19 +17,30 @@ logger = get_module_logger(__name__)
class TestETL:
"""A base class that can be inherited by all other ETL tests.
Note: every method that does *not* need to be reimplemented by child classes has
the test name pattern of `test_*_base`. All other tests need to be reimplemented.
This uses pytest-snapshot.
To update individual snapshots: $ poetry run pytest
data_pipeline/tests/sources/national_risk_index/test_etl.py::TestClassNameETL::<testname>
--snapshot-update
"""
# In every child test class, change this to the class of the ETL being tested.
_ETL_CLASS = ExampleETL
# The following constants do not need to be updated in child class.
_INPUT_CSV_FILE_NAME = "input.csv"
_EXTRACT_CSV_FILE_NAME = "extract.csv"
_TRANSFORM_CSV_FILE_NAME = "transform.csv"
_OUTPUT_CSV_FILE_NAME = "output.csv"
# This *does* need to be updated in the child class. It specifies where the "sample data" is
# so that we do not have to manually copy the "sample data" when we run the tests.
_SAMPLE_DATA_PATH = pathlib.Path(__file__).parents[0] / "data"
_SAMPLE_DATA_FILE_NAME = "input.csv"
_SAMPLE_DATA_ZIP_FILE_NAME = "input.zip"
_EXTRACT_TMP_FOLDER_NAME = "ExampleETL"
# Note: We used shared census tract IDs so that later our tests can join all the
# ETL results together and generate a test score. This join is only possible if
# we use the same tract IDs across fixtures.
@ -56,19 +67,14 @@ class TestETL:
def setup_method(self, _method, filename=__file__):
"""Before every test, set the data directory for the test.
Uses the directory of the test class to infer the data directory.
pytest does not support classes with an `__init__`. Instead, we use this
`setup_method` which pytest will run before every test method is run.
For now, all child classes inheriting this need to reimplement this, but can
use the same line of code regardless of the child class:
```
def setup_method(self, _method, filename=__file__):
'''Invoke `setup_method` from Parent, but using the current file name
This code can be copied identically between all child classes.
'''
super().setup_method(_method=_method, filename=filename)
@ -76,18 +82,17 @@ class TestETL:
"""
self._DATA_DIRECTORY_FOR_TEST = pathlib.Path(filename).parent / "data"
def _get_instance_of_etl_class(self) -> type(ExtractTransformLoad):
def _get_instance_of_etl_class(self) -> Type[ExtractTransformLoad]:
return self._ETL_CLASS()
def _setup_etl_instance_and_run_extract(
self, mock_etl, mock_paths
) -> ExtractTransformLoad:
"""Method to setup an ETL instance with proper upstream mocks to run extract.
This must be re-implemented in every child class.
This method can be used by multiple tests that need to run the same fixtures
that need these same mocks, and by `test_update_test_fixtures`.
that need these same mocks.
In order to re-implement this method, usually it will involve a
decent amount of work to monkeypatch `requests` or another method that's
@ -107,22 +112,8 @@ class TestETL:
return etl
def test_existence_of_test_fixtures_base(self):
    """Every ETL test must ship with its transform and output fixture files.

    Can be run without modification for all child classes.
    """
    expected_fixtures = (
        self._TRANSFORM_CSV_FILE_NAME,
        self._OUTPUT_CSV_FILE_NAME,
    )
    for fixture_name in expected_fixtures:
        fixture_path = self._DATA_DIRECTORY_FOR_TEST / fixture_name
        assert fixture_path.exists()
def test_init_base(self, mock_etl, mock_paths):
"""Test whether class has appropriate parameters set.
Can be run without modification for all child classes.
"""
# Setup
@ -150,8 +141,8 @@ class TestETL:
def test_get_output_file_path_base(self, mock_etl, mock_paths):
"""Test file path method.
Can be run without modification for all child classes.
Can be run without modification for all child classes,
except those that do not produce usa.csv files.
"""
etl = self._get_instance_of_etl_class()
data_path, tmp_path = mock_paths
@ -171,11 +162,9 @@ class TestETL:
def test_fixtures_contain_shared_tract_ids_base(self, mock_etl, mock_paths):
"""Check presence of necessary shared tract IDs.
Note: We used shared census tract IDs so that later our tests can join all the
ETL results together and generate a test score. This join is only possible if
we use the same tract IDs across fixtures.
Can be run without modification for all child classes.
"""
etl = self._setup_etl_instance_and_run_extract(
@ -198,9 +187,68 @@ class TestETL:
else:
raise NotImplementedError("This geo level not tested yet.")
def test_sample_data_exists(self):
    """Verify the sample-data zip file is present in its expected location.

    Per conversation with Jorge, checking that the zip file exists is
    sufficient here.
    """
    zip_path = self._SAMPLE_DATA_PATH / self._SAMPLE_DATA_ZIP_FILE_NAME
    assert zip_path.exists()
def test_extract_unzips_base(self, mock_etl, mock_paths):
    """Verify extract unzips the "fake" downloaded zip into the tmp path.

    Per conversation with Jorge, this no longer uses a snapshot; it only
    checks that the expected file was produced by unzipping the archive
    (located in the test's data directory) into a temporary path.
    """
    _, tmp_path = mock_paths
    self._setup_etl_instance_and_run_extract(
        mock_etl=mock_etl,
        mock_paths=mock_paths,
    )
    extracted_file = (
        tmp_path
        / self._EXTRACT_TMP_FOLDER_NAME
        / self._SAMPLE_DATA_FILE_NAME
    )
    assert extracted_file.exists()
def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
    """Verify the extracted data is readable and matches its snapshot.

    Reads the extracted CSV back in with pandas (tract IDs as strings so
    leading zeros survive) and compares the rendered CSV to the stored
    extract snapshot.
    """
    etl = self._setup_etl_instance_and_run_extract(
        mock_etl=mock_etl,
        mock_paths=mock_paths,
    )
    extracted_path = etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME
    tmp_df = pd.read_csv(
        extracted_path,
        dtype={etl.GEOID_TRACT_FIELD_NAME: str},
    )
    snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
    rendered_csv = tmp_df.to_csv(index=False, float_format="%.5f")
    snapshot.assert_match(rendered_csv, self._EXTRACT_CSV_FILE_NAME)
def test_transform_base(self, snapshot, mock_etl, mock_paths):
    """Run transform after extract and compare `output_df` to its snapshot.

    This verifies that once the data has been extracted, it can be
    transformed and read back.
    """
    # Setup: run extract so the sample data is in the tmp directory.
    etl = self._setup_etl_instance_and_run_extract(mock_etl, mock_paths)
    etl.transform()
    snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
    transformed_csv = etl.output_df.to_csv(index=False, float_format="%.5f")
    snapshot.assert_match(transformed_csv, self._TRANSFORM_CSV_FILE_NAME)
def test_transform_sets_output_df_base(self, mock_etl, mock_paths):
"""This test ensures that the transform step sets its results to `output_df`.
Can be run without modification for all child classes.
"""
etl = self._setup_etl_instance_and_run_extract(
@ -217,33 +265,8 @@ class TestETL:
for col in etl.COLUMNS_TO_KEEP:
assert col in etl.output_df.columns, f"{col} is missing from output"
def test_transform_base(self, mock_etl):
    """Tests the transform method against the expected transform fixture.

    Can be run without modification for all child classes.
    """
    # Setup: sample data was copied into tmp_dir by the mock fixtures.
    etl = self._get_instance_of_etl_class()
    etl.transform()

    expected_csv_path = (
        self._DATA_DIRECTORY_FOR_TEST / self._TRANSFORM_CSV_FILE_NAME
    )
    # GEOID columns are read as strings so leading zeros are preserved.
    expected = pd.read_csv(
        filepath_or_buffer=expected_csv_path,
        dtype={
            ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: "string",
            ExtractTransformLoad.GEOID_FIELD_NAME: "string",
        },
    )
    # Compare to expected.
    pd.testing.assert_frame_equal(etl.output_df, expected)
def test_load_base(self, mock_etl):
def test_load_base(self, snapshot, mock_etl, mock_paths):
"""Test load method.
Can be run without modification for all child classes.
"""
# setup - input variables
@ -263,29 +286,23 @@ class TestETL:
actual_output_path = etl._get_output_file_path()
assert actual_output_path.exists()
# Check COLUMNS_TO_KEEP remain
actual_output = pd.read_csv(
actual_output_path, dtype={etl.GEOID_TRACT_FIELD_NAME: str}
)
expected_output_csv_path = (
self._DATA_DIRECTORY_FOR_TEST / self._OUTPUT_CSV_FILE_NAME
)
# setup - load expected output
expected_output = pd.read_csv(
filepath_or_buffer=expected_output_csv_path,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
# check that the `COLUMNS_TO_KEEP` are in the output
for col in etl.COLUMNS_TO_KEEP:
assert col in actual_output.columns, f"{col} is missing from output"
# validation
pd.testing.assert_frame_equal(actual_output, expected_output)
# Check the snapshots
snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
snapshot.assert_match(
actual_output.to_csv(index=False, float_format="%.5f"),
self._OUTPUT_CSV_FILE_NAME,
)
def test_validate_base(self, mock_etl, mock_paths):
"""Every ETL class should have proper validation.
Can be run without modification for all child classes.
"""
etl = self._setup_etl_instance_and_run_extract(
@ -436,9 +453,7 @@ class TestETL:
def test_full_etl_base(self, mock_etl, mock_paths):
"""Every ETL class should be able to run end-to-end.
Run extract, transform, validate, load, and get without error.
Can be run without modification for all child classes.
"""
etl = self._setup_etl_instance_and_run_extract(
@ -451,7 +466,6 @@ class TestETL:
def test_get_data_frame_base(self, mock_etl, mock_paths):
"""Every ETL class should be able to return its data frame.
Can be run without modification for all child classes.
"""
etl = self._setup_etl_instance_and_run_extract(
@ -500,71 +514,3 @@ class TestETL:
else:
raise NotImplementedError("This geo level not tested yet.")
# This "test" is only run when the corresponding flag is passed to pytest,
# for instance by running `pytest . -rsx --update_snapshots`.
@pytest.mark.update_snapshots
def test_update_test_fixtures(self, mock_etl, mock_paths):
    """Regenerate the test fixtures (the data files) used by the tests.

    Every child class must reimplement this. There are no strict contracts
    on the outputs of the `extract` step, so each class has to define
    explicitly how the `input` fixture that comes after extract is updated.

    Regenerating fixtures this way is helpful when you expect the results
    to change because you changed the ETL logic. Before using it, note:

    1. Do *not* update these fixtures if you did not expect the ETL results to
    change!

    2. If the source data itself changes (e.g., the external source renames a
    column), update the "furthest upstream" test fixture which, in many cases,
    is a .zip file. Then running this method will update all subsequent files.

    If you're confused by any of this, ask for help, it's confusing :).
    """
    # Child classes must re-implement this method; only `ExampleETL` is
    # handled below.
    if self._ETL_CLASS is not ExampleETL:
        raise NotImplementedError(
            "Update fixtures method not defined for this class."
        )

    etl = self._setup_etl_instance_and_run_extract(
        mock_etl=mock_etl, mock_paths=mock_paths
    )

    # Persist the extract results as the "input.csv" fixture in the test
    # directory.
    logger.info(
        f"Writing data to {self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME}"
    )
    copy_data_files(
        src=etl.get_tmp_path() / "input.csv",
        dst=self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME,
    )

    # Persist the transform results as the "transform.csv" fixture.
    etl.transform()
    etl.output_df.to_csv(
        path_or_buf=self._DATA_DIRECTORY_FOR_TEST
        / self._TRANSFORM_CSV_FILE_NAME,
        index=False,
    )

    # Run validate as a sanity check before writing the output fixture.
    etl.validate()

    # Persist the load results as the "output.csv" fixture.
    etl.load()
    copy_data_files(
        src=etl._get_output_file_path(),
        dst=self._DATA_DIRECTORY_FOR_TEST / self._OUTPUT_CSV_FILE_NAME,
    )