Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case-sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
This commit is contained in:
Travis Newby 2023-03-03 12:26:24 -06:00 committed by GitHub
commit 6f39033dde
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
52 changed files with 1787 additions and 686 deletions

View file

@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
"data_pipeline.utils.requests"
"data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
tmp_path = mock_paths[1]
# requests mock
def fake_get(url, *args, **kwargs):
file_path = url.split("/")[-1]
with open(
@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
return response_mock
requests_mock.get = fake_get
# fips codes mock
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init(self, mock_etl, mock_paths):