Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
This commit is contained in:
Travis Newby 2023-03-03 12:26:24 -06:00 committed by GitHub
commit 6f39033dde
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
52 changed files with 1787 additions and 686 deletions

View file

@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
"data_pipeline.utils.requests"
"data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
tmp_path = mock_paths[1]
# requests mock
def fake_get(url, *args, **kwargs):
file_path = url.split("/")[-1]
with open(
@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
return response_mock
requests_mock.get = fake_get
# fips codes mock
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init(self, mock_etl, mock_paths):

View file

@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
mock_paths=mock_paths,
)
df = gpd.read_file(
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert df.shape[0] == 30

View file

@ -5,6 +5,7 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
self.EXAMPLE_FIELD_NAME,
]
def get_data_sources(self) -> list["DataSource"]:
    """Return the external data sources this ETL depends on.

    ExampleETL fabricates its input locally in ``extract`` (it unzips a
    bundled fixture), so it has nothing to download and returns an empty
    list.

    Note: the original annotation ``[DataSource]`` was a list *literal*,
    not a type; ``list["DataSource"]`` is the valid generic form (quoted
    so the name need not be resolvable at annotation-evaluation time).
    """
    return []
def extract(self):
# Pretend to download zip from external URL, write it to CSV.
zip_file_path = (
@ -42,11 +46,11 @@ class ExampleETL(ExtractTransformLoad):
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(self.get_tmp_path())
zip_ref.extractall(self.get_sources_path())
def transform(self):
df: pd.DataFrame = pd.read_csv(
self.get_tmp_path() / "input.csv",
self.get_sources_path() / "input.csv",
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)

View file

@ -124,12 +124,18 @@ class TestETL:
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
"data_pipeline.utils.requests"
"data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
tmp_path = mock_paths[1]
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
zip_file_fixture_src = (
self._DATA_DIRECTORY_FOR_TEST
@ -145,6 +151,7 @@ class TestETL:
"rb",
) as file:
file_contents = file.read()
response_mock = requests.Response()
response_mock.status_code = 200
# pylint: disable=protected-access
@ -154,15 +161,25 @@ class TestETL:
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init_base(self, mock_etl, mock_paths):
@ -263,17 +280,12 @@ class TestETL:
file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
"""
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
tmp_path = mock_paths[1]
_ = self._setup_etl_instance_and_run_extract(
etl = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl,
mock_paths=mock_paths,
)
assert (
tmp_path
/ self._EXTRACT_TMP_FOLDER_NAME
/ self._SAMPLE_DATA_FILE_NAME
).exists()
assert (etl.get_sources_path()).exists()
def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
"""Tests the extract method.
@ -285,8 +297,11 @@ class TestETL:
mock_etl=mock_etl,
mock_paths=mock_paths,
)
data_path, tmp_path = mock_paths
tmp_df = pd.read_csv(
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST

View file

@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
mock_paths=mock_paths,
)
tmp_df = pd.read_excel(
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert tmp_df.shape == (15, 5)

View file

@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):
def test_init(self, mock_etl, mock_paths):
"""Tests that the mock NationalRiskIndexETL class instance was
initialized correctly.
Validates the following conditions:
- self.DATA_PATH points to the "data" folder in the temp directory
- self.TMP_PATH points to the "data/tmp" folder in the temp directory
- self.INPUT_PATH points to the correct path in the temp directory
- self.OUTPUT_PATH points to the correct path in the temp directory
initialized correctly.
"""
# setup
etl = NationalRiskIndexETL()
data_path, tmp_path = mock_paths
input_csv = (
tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
)
# validation
assert etl.INPUT_CSV == input_csv
assert etl.GEOID_FIELD_NAME == "GEOID10"
assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
assert etl.NAME == "national_risk_index"