mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-04 02:24:19 -07:00
Add ability to cache ETL data sources (#2169)
* Add a rough prototype allowing a developer to pre-download data sources for all ETLs
* Update code to be more production-ish
* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source
* Format source files with black
* Fix issues from pylint and get the tests working with the new folder structure
* Clean up files with black
* Fix unzip test
* Add caching notes to README
* Fix tests (linting and case sensitivity bug)
* Address PR comments and add API keys for census where missing
* Merging comparator changes from main into this branch for the sake of the PR
* Add note on using cache (-u) during pipeline
This commit is contained in:
parent
4d9c1dd11e
commit
6f39033dde
52 changed files with 1787 additions and 686 deletions
|
@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
|
|||
data. A basic version of that patching is included here for classes that can use it.
|
||||
"""
|
||||
|
||||
data_path, tmp_path = mock_paths
|
||||
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
|
||||
sources_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with mock.patch(
|
||||
"data_pipeline.utils.requests"
|
||||
"data_pipeline.etl.downloader.requests"
|
||||
) as requests_mock, mock.patch(
|
||||
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
|
||||
) as sources_mock, mock.patch(
|
||||
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
|
||||
) as mock_get_state_fips_codes:
|
||||
tmp_path = mock_paths[1]
|
||||
|
||||
# requests mock
|
||||
def fake_get(url, *args, **kwargs):
|
||||
file_path = url.split("/")[-1]
|
||||
with open(
|
||||
|
@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
|
|||
return response_mock
|
||||
|
||||
requests_mock.get = fake_get
|
||||
|
||||
# fips codes mock
|
||||
mock_get_state_fips_codes.return_value = [
|
||||
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
|
||||
]
|
||||
|
||||
# sources mock
|
||||
sources_mock.return_value = sources_path
|
||||
|
||||
# Instantiate the ETL class.
|
||||
etl = self._get_instance_of_etl_class()
|
||||
|
||||
# Monkey-patch the temporary directory to the one used in the test
|
||||
etl.TMP_PATH = tmp_path
|
||||
etl.SOURCES_PATH = data_path / "sources"
|
||||
|
||||
# Run the extract method.
|
||||
etl.extract()
|
||||
|
||||
def fake_get_sources_path() -> pathlib.PosixPath:
|
||||
return sources_path
|
||||
|
||||
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
|
||||
|
||||
return etl
|
||||
|
||||
def test_init(self, mock_etl, mock_paths):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue