mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-28 00:01:16 -07:00
Issue 1075: Add refactored ETL tests to NRI (#1088)
* Adds a substantially refactored ETL test to the National Risk Index, to be used as a model for other tests
This commit is contained in:
parent f5fe8d90e2
commit 43e005cc10
41 changed files with 1155 additions and 619 deletions
@@ -0,0 +1,568 @@
# pylint: disable=protected-access,unsubscriptable-object
import copy
import os
import pathlib
import pytest

import numpy as np
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.tests.conftest import copy_data_files
from data_pipeline.tests.sources.example.etl import ExampleETL
from data_pipeline.utils import get_module_logger

logger = get_module_logger(__name__)


class TestETL:
    """A base class that can be inherited by all other ETL tests.

    Note: every method that does *not* need to be reimplemented by child classes has
    the test name pattern of `test_*_base`. All other tests need to be reimplemented.
    """

    # In every child test class, change this to the class of the ETL being tested.
    _ETL_CLASS = ExampleETL
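
    # For example, a child class usually needs little more than its own
    # `_ETL_CLASS` and a `setup_method` that forwards its own `__file__`.
    # A minimal sketch (the class names are illustrative):
    #
    #     class TestNationalRiskIndexETL(TestETL):
    #         _ETL_CLASS = NationalRiskIndexETL
    #
    #         def setup_method(self, _method, filename=__file__):
    #             super().setup_method(_method=_method, filename=filename)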

    # The following constants do not need to be updated in child classes.
    _INPUT_CSV_FILE_NAME = "input.csv"
    _TRANSFORM_CSV_FILE_NAME = "transform.csv"
    _OUTPUT_CSV_FILE_NAME = "output.csv"

    # Note: We use shared census tract IDs so that later our tests can join all the
    # ETL results together and generate a test score. This join is only possible if
    # we use the same tract IDs across fixtures.
    # The test fixtures may also contain other tract IDs that are not on this list.
    _FIXTURES_SHARED_TRACT_IDS = [
        "06007040300",
        "06001020100",
        "06007040500",
        "15001021010",
        "15001021101",
        "15007040603",
        "15007040700",
        "15009030100",
        "15009030201",
        "15001021402",
        "15001021800",
        "15009030402",
        "15009030800",
        "15003010201",
        "15007040604",
    ]
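
    # For example, a later scoring step could join two fixture outputs on the
    # shared tract ID along these lines (the data frame names are illustrative):
    #
    #     joined = nri_df.merge(
    #         other_df,
    #         on=ExtractTransformLoad.GEOID_TRACT_FIELD_NAME,
    #     )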

    _DATA_DIRECTORY_FOR_TEST: pathlib.Path

    def setup_method(self, _method, filename=__file__):
        """Before every test, set the data directory for the test.

        Uses the directory of the test class to infer the data directory.

        pytest does not support classes with an `__init__`. Instead, we use this
        `setup_method`, which pytest runs before every test method.

        For now, all child classes inheriting this need to reimplement this, but they
        can use the same line of code regardless of the child class:

        ```
        def setup_method(self, _method, filename=__file__):
            '''Invoke `setup_method` from the parent class, but using the current
            file name.

            This code can be copied identically between all child classes.
            '''
            super().setup_method(_method=_method, filename=filename)
        ```
        """
        self._DATA_DIRECTORY_FOR_TEST = pathlib.Path(filename).parent / "data"

    def _get_instance_of_etl_class(self) -> ExtractTransformLoad:
        return self._ETL_CLASS()

    def _setup_etl_instance_and_run_extract(
        self, mock_etl, mock_paths
    ) -> ExtractTransformLoad:
        """Method to set up an ETL instance with proper upstream mocks to run extract.

        This must be re-implemented in every child class.

        This method can be reused by multiple tests that rely on the same fixtures
        and mocks, and by `test_update_test_fixtures`.

        Re-implementing this method usually involves a decent amount of work to
        monkeypatch `requests` (or whatever method the ETL uses to retrieve data)
        so that it returns the fixture data instead. See the commented sketch after
        this method.
        """
        # When running this in child classes, make sure the child class re-implements
        # this method.
        if self._ETL_CLASS is not ExampleETL:
            raise NotImplementedError(
                "Prepare and run extract method not defined for this class."
            )

        # The rest of this method applies to `ExampleETL` only.
        etl = self._get_instance_of_etl_class()
        etl.extract()

        return etl
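
    # A child class re-implementation might look roughly like the sketch below
    # (a hedged illustration: the patch target, the fixture zip name, and the
    # use of `unittest.mock` are assumptions, not confirmed details of this repo):
    #
    #     def _setup_etl_instance_and_run_extract(self, mock_etl, mock_paths):
    #         with mock.patch("data_pipeline.utils.requests") as requests_mock:
    #             # Serve the zipped fixture instead of downloading source data.
    #             with open(
    #                 self._DATA_DIRECTORY_FOR_TEST / "input.zip", "rb"
    #             ) as file:
    #                 requests_mock.get.return_value.content = file.read()
    #             etl = self._get_instance_of_etl_class()
    #             etl.extract()
    #         return etl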

    def test_existence_of_test_fixtures_base(self):
        """Every ETL test should have these two test fixture files.

        Can be run without modification for all child classes.
        """
        assert (
            self._DATA_DIRECTORY_FOR_TEST / self._TRANSFORM_CSV_FILE_NAME
        ).exists()

        assert (
            self._DATA_DIRECTORY_FOR_TEST / self._OUTPUT_CSV_FILE_NAME
        ).exists()

    def test_init_base(self, mock_etl, mock_paths):
        """Test whether class has appropriate parameters set.

        Can be run without modification for all child classes.
        """
        # Setup
        etl = self._get_instance_of_etl_class()
        data_path, tmp_path = mock_paths

        assert etl.DATA_PATH == data_path
        assert etl.TMP_PATH == tmp_path

        # Also make sure all parameters that need to be non-null are non-null
        assert etl.NAME is not None
        assert etl.LAST_UPDATED_YEAR is not None
        assert etl.SOURCE_URL is not None
        assert etl.GEO_LEVEL is not None
        assert etl.COLUMNS_TO_KEEP is not None
        assert len(etl.COLUMNS_TO_KEEP) > 0
        # No duplicate columns to keep
        assert len(etl.COLUMNS_TO_KEEP) == len(set(etl.COLUMNS_TO_KEEP))

        # Check certain parameters are set.
        assert etl.EXPECTED_MAX_CENSUS_BLOCK_GROUPS == 250000
        assert etl.EXPECTED_MAX_CENSUS_TRACTS == 74160
        assert etl.EXPECTED_CENSUS_TRACTS_CHARACTER_LENGTH == 11
        assert etl.EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH == 13

    def test_get_output_file_path_base(self, mock_etl, mock_paths):
        """Test file path method.

        Can be run without modification for all child classes.
        """
        etl = self._get_instance_of_etl_class()
        data_path, tmp_path = mock_paths

        actual_file_path = etl._get_output_file_path()

        expected_file_path = (
            data_path
            / "dataset"
            / f"{etl.NAME}_{etl.LAST_UPDATED_YEAR}"
            / "usa.csv"
        )

        logger.info(f"Expected: {expected_file_path}")

        assert actual_file_path == expected_file_path
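
        # For instance, an ETL with `NAME = "example"` and
        # `LAST_UPDATED_YEAR = 2021` (hypothetical values) would write to:
        #
        #     {data_path}/dataset/example_2021/usa.csv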

    def test_fixtures_contain_shared_tract_ids_base(self, mock_etl, mock_paths):
        """Check presence of necessary shared tract IDs.

        Note: We use shared census tract IDs so that later our tests can join all the
        ETL results together and generate a test score. This join is only possible if
        we use the same tract IDs across fixtures.

        Can be run without modification for all child classes.
        """
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )
        etl.transform()

        # These tests work differently based on the ValidGeoLevel of the ETL class.
        if etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT:
            missing_tract_ids = np.setdiff1d(
                self._FIXTURES_SHARED_TRACT_IDS,
                etl.output_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME],
            )

            if len(missing_tract_ids) > 0:
                assert False, (
                    "Fixture data is missing the following necessary tract "
                    f"IDs: {missing_tract_ids}"
                )
        else:
            raise NotImplementedError("This geo level is not tested yet.")

    def test_transform_sets_output_df_base(self, mock_etl, mock_paths):
        """This test ensures that the transform step sets its results to `output_df`.

        Can be run without modification for all child classes.
        """
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )
        etl.transform()

        assert etl.output_df is not None

        # Assert it has some rows
        assert etl.output_df.shape[0] > 0

        # Check that it has all columns
        for col in etl.COLUMNS_TO_KEEP:
            assert col in etl.output_df.columns, f"{col} is missing from output"

    def test_transform_base(self, mock_etl):
        """Tests the transform method.

        Can be run without modification for all child classes.
        """
        # setup - copy sample data into tmp_dir
        etl = self._get_instance_of_etl_class()
        etl.transform()

        transform_csv_path = (
            self._DATA_DIRECTORY_FOR_TEST / self._TRANSFORM_CSV_FILE_NAME
        )

        # Compare to expected.
        expected = pd.read_csv(
            filepath_or_buffer=transform_csv_path,
            dtype={
                ExtractTransformLoad.GEOID_TRACT_FIELD_NAME: "string",
                ExtractTransformLoad.GEOID_FIELD_NAME: "string",
            },
        )

        pd.testing.assert_frame_equal(etl.output_df, expected)

    def test_load_base(self, mock_etl):
        """Test load method.

        Can be run without modification for all child classes.
        """
        # setup - input variables
        etl = self._get_instance_of_etl_class()

        # setup - mock transform step
        df_transform = pd.read_csv(
            self._DATA_DIRECTORY_FOR_TEST / self._TRANSFORM_CSV_FILE_NAME,
            dtype={etl.GEOID_TRACT_FIELD_NAME: "string"},
        )
        etl.output_df = df_transform

        # execution
        etl.load()

        # Make sure it creates the file.
        actual_output_path = etl._get_output_file_path()
        assert actual_output_path.exists()

        actual_output = pd.read_csv(
            actual_output_path, dtype={etl.GEOID_TRACT_FIELD_NAME: str}
        )
        expected_output_csv_path = (
            self._DATA_DIRECTORY_FOR_TEST / self._OUTPUT_CSV_FILE_NAME
        )

        # setup - load expected output
        expected_output = pd.read_csv(
            filepath_or_buffer=expected_output_csv_path,
            dtype={etl.GEOID_TRACT_FIELD_NAME: str},
        )

        # check that the `COLUMNS_TO_KEEP` are in the output
        for col in etl.COLUMNS_TO_KEEP:
            assert col in actual_output.columns, f"{col} is missing from output"

        # validation
        pd.testing.assert_frame_equal(actual_output, expected_output)

    def test_validate_base(self, mock_etl, mock_paths):
        """Every ETL class should have proper validation.

        Can be run without modification for all child classes.
        """
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )
        etl.transform()

        # Transform is guaranteed to set a dataframe on etl.output_df.
        # We can modify this data frame to test validation steps.
        actual_output_df = etl.output_df

        # These tests work differently based on the ValidGeoLevel of the ETL class.
        if etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT:
            # Remove geo field and make sure an error occurs.
            etl_without_geo_field = copy.deepcopy(etl)
            columns_to_keep = [
                column_to_keep
                for column_to_keep in actual_output_df.columns
                if column_to_keep != ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
            ]
            etl_without_geo_field.output_df = actual_output_df[columns_to_keep]

            with pytest.raises(ValueError) as error:
                etl_without_geo_field.validate()
            assert str(error.value).startswith("Missing column:")

            # Make sure too many rows throw an error.
            etl_with_too_many_rows = copy.deepcopy(etl)
            etl_with_too_many_rows.EXPECTED_MAX_CENSUS_TRACTS = (
                actual_output_df.shape[0] - 1
            )
            with pytest.raises(ValueError) as error:
                etl_with_too_many_rows.validate()
            assert str(error.value).startswith("Too many rows:")

            # Make sure multiple geo field character lengths throw an error.
            etl_with_multiple_char_lengths = copy.deepcopy(etl)
            etl_with_multiple_char_lengths.output_df = actual_output_df.copy(
                deep=True
            )
            etl_with_multiple_char_lengths.output_df.loc[
                0, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
            ] = "060070403001"

            with pytest.raises(ValueError) as error:
                etl_with_multiple_char_lengths.validate()
            assert str(error.value).startswith("Multiple character lengths")

            # Make sure the wrong geo field character length throws an error.
            etl_with_wrong_geo_field_character_length = copy.deepcopy(etl)
            etl_with_wrong_geo_field_character_length.output_df = (
                actual_output_df.copy(deep=True)
            )
            etl_with_wrong_geo_field_character_length.output_df[
                ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
            ] = "060070403001"

            with pytest.raises(ValueError) as error:
                etl_with_wrong_geo_field_character_length.validate()
            assert str(error.value).startswith("Wrong character length")

            # Make sure duplicate tract IDs throw an error.
            etl_with_duplicate_geo_field = copy.deepcopy(etl)
            etl_with_duplicate_geo_field.output_df = actual_output_df.copy(
                deep=True
            )
            etl_with_duplicate_geo_field.output_df.loc[
                0:1, ExtractTransformLoad.GEOID_TRACT_FIELD_NAME
            ] = "06007040300"
            with pytest.raises(ValueError) as error:
                etl_with_duplicate_geo_field.validate()
            assert str(error.value).startswith("Duplicate values:")

        elif etl.GEO_LEVEL == ValidGeoLevel.CENSUS_BLOCK_GROUP:
            # Remove geo field and make sure an error occurs.
            etl_without_geo_field = copy.deepcopy(etl)
            columns_to_keep = [
                column_to_keep
                for column_to_keep in actual_output_df.columns
                if column_to_keep != ExtractTransformLoad.GEOID_FIELD_NAME
            ]
            etl_without_geo_field.output_df = actual_output_df[columns_to_keep]

            with pytest.raises(ValueError) as error:
                etl_without_geo_field.validate()
            assert str(error.value).startswith("Missing column:")

            # Make sure too many rows throw an error.
            etl_with_too_many_rows = copy.deepcopy(etl)
            etl_with_too_many_rows.EXPECTED_MAX_CENSUS_BLOCK_GROUPS = (
                actual_output_df.shape[0] - 1
            )
            with pytest.raises(ValueError) as error:
                etl_with_too_many_rows.validate()
            assert str(error.value).startswith("Too many rows:")

            # Make sure multiple geo field character lengths throw an error.
            etl_with_multiple_char_lengths = copy.deepcopy(etl)
            etl_with_multiple_char_lengths.output_df = actual_output_df.copy(
                deep=True
            )
            etl_with_multiple_char_lengths.output_df.loc[
                0, ExtractTransformLoad.GEOID_FIELD_NAME
            ] = "06007040300123"

            with pytest.raises(ValueError) as error:
                etl_with_multiple_char_lengths.validate()
            assert str(error.value).startswith("Multiple character lengths")

            # Make sure the wrong geo field character length throws an error.
            etl_with_wrong_geo_field_character_length = copy.deepcopy(etl)
            etl_with_wrong_geo_field_character_length.output_df = (
                actual_output_df.copy(deep=True)
            )
            etl_with_wrong_geo_field_character_length.output_df[
                ExtractTransformLoad.GEOID_FIELD_NAME
            ] = "06007040300123"

            with pytest.raises(ValueError) as error:
                etl_with_wrong_geo_field_character_length.validate()
            assert str(error.value).startswith("Wrong character length")

            # Make sure duplicate block group IDs throw an error.
            etl_with_duplicate_geo_field = copy.deepcopy(etl)
            etl_with_duplicate_geo_field.output_df = actual_output_df.copy(
                deep=True
            )
            etl_with_duplicate_geo_field.output_df.loc[
                0:1, ExtractTransformLoad.GEOID_FIELD_NAME
            ] = "0600704030012"
            with pytest.raises(ValueError) as error:
                etl_with_duplicate_geo_field.validate()
            assert str(error.value).startswith("Duplicate values:")

        else:
            raise NotImplementedError("This geo level is not tested yet.")

        # Remove another column to keep and make sure an error occurs.
        etl_with_missing_column = copy.deepcopy(etl)
        columns_to_keep = actual_output_df.columns[:-1]
        etl_with_missing_column.output_df = actual_output_df[columns_to_keep]
        with pytest.raises(ValueError) as error:
            etl_with_missing_column.validate()
        assert str(error.value).startswith("Missing column:")

        # Test that validation on the original ETL works fine.
        etl.validate()
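
        # Taken together, these assertions imply that `validate()` performs
        # roughly the following checks (a sketch inferred from the error
        # prefixes above, not the actual `ExtractTransformLoad` implementation):
        #
        #     def validate(self):
        #         for column in self.COLUMNS_TO_KEEP:
        #             if column not in self.output_df.columns:
        #                 raise ValueError(f"Missing column: {column}")
        #         if self.output_df.shape[0] > self.EXPECTED_MAX_CENSUS_TRACTS:
        #             raise ValueError("Too many rows: ...")
        #         # ...character-length and duplicate-ID checks follow the
        #         # same pattern.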

    def test_full_etl_base(self, mock_etl, mock_paths):
        """Every ETL class should be able to run end-to-end.

        Run extract, transform, validate, load, and get without error.

        Can be run without modification for all child classes.
        """
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )
        etl.transform()
        etl.validate()
        etl.load()
        etl.get_data_frame()

    def test_get_data_frame_base(self, mock_etl, mock_paths):
        """Every ETL class should be able to return its data frame.

        Can be run without modification for all child classes.
        """
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )

        # TODO: look into moving this file deletion to a setup/teardown method that
        # applies to all methods. I struggled to get that to work because I couldn't
        # pass `mock_etl` and `mock_paths` to it.
        # Delete output file.
        output_file_path = etl._get_output_file_path()
        if os.path.exists(output_file_path):
            logger.info("Deleting output file created by other tests.")
            os.remove(output_file_path)

        # Run more steps to generate test data.
        etl.transform()
        etl.validate()

        # At this point, `get_data_frame` should error since the file hasn't been
        # written.
        with pytest.raises(ValueError) as error:
            etl.get_data_frame()
        assert str(error.value).startswith("Make sure to run ETL")

        # Run the `load` step to write it to disk.
        etl.load()

        output_df = etl.get_data_frame()

        # Check that all columns are present
        for column_to_keep in etl.COLUMNS_TO_KEEP:
            assert (
                column_to_keep in output_df.columns
            ), f"Missing column: `{column_to_keep}` is missing from output"

        # Make sure geo fields are read in as strings:
        if etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT:
            assert pd.api.types.is_string_dtype(
                output_df[ExtractTransformLoad.GEOID_TRACT_FIELD_NAME]
            )

        elif etl.GEO_LEVEL == ValidGeoLevel.CENSUS_BLOCK_GROUP:
            assert pd.api.types.is_string_dtype(
                output_df[ExtractTransformLoad.GEOID_FIELD_NAME]
            )

        else:
            raise NotImplementedError("This geo level is not tested yet.")

    # TODO: Add a flag to make this run only when pytest is run with an argument.
    def test_update_test_fixtures(self, mock_etl, mock_paths):
        """Update the test fixtures (the data files) used by the test.

        This needs to be reimplemented for every child class. This is because there
        are no strict contracts on the outputs of the `extract` step, so this method
        needs to explicitly define how to update the `input` fixture that comes after
        the extract step.

        Using this method to update fixtures can be helpful if you expect the
        results to change because you changed the logic of the ETL class and need to
        quickly regenerate the fixtures.

        However, note a few things first:

        1. Do *not* update these fixtures if you did not expect the ETL results to
        change!

        2. If the source data itself changes (e.g., the external source renames a
        column), update the "furthest upstream" test fixture, which, in many cases,
        is a .zip file. Then running this method will update all subsequent files.

        If you're confused by any of this, ask for help; it's confusing :).
        """
        # When running this in child classes, make sure the child class re-implements
        # this method.
        if self._ETL_CLASS is not ExampleETL:
            raise NotImplementedError(
                "Update fixtures method not defined for this class."
            )

        # The rest of this method applies to `ExampleETL` only.
        etl = self._setup_etl_instance_and_run_extract(
            mock_etl=mock_etl, mock_paths=mock_paths
        )

        # After running extract, write the results as the "input.csv" in the test
        # directory.
        logger.info(
            f"Writing data to {self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME}"
        )
        copy_data_files(
            src=etl.TMP_PATH / "input.csv",
            dst=self._DATA_DIRECTORY_FOR_TEST / self._INPUT_CSV_FILE_NAME,
        )

        # After running transform, write the results as the "transform.csv" in the
        # test directory.
        etl.transform()
        etl.output_df.to_csv(
            path_or_buf=self._DATA_DIRECTORY_FOR_TEST
            / self._TRANSFORM_CSV_FILE_NAME,
            index=False,
        )

        # Run validate, just to check.
        etl.validate()

        # After running load, write the results as the "output.csv" in the test
        # directory.
        etl.load()
        copy_data_files(
            src=etl._get_output_file_path(),
            dst=self._DATA_DIRECTORY_FOR_TEST / self._OUTPUT_CSV_FILE_NAME,
        )