mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-29 02:31:17 -07:00
Add ability to cache ETL data sources (#2169)
* Add a rough prototype allowing a developer to pre-download data sources for all ETLs
* Update code to be more production-ish
* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source
* Format source files with black
* Fix issues from pylint and get the tests working with the new folder structure
* Clean up files with black
* Fix unzip test
* Add caching notes to README
* Fix tests (linting and case sensitivity bug)
* Address PR comments and add API keys for census where missing
* Merging comparator changes from main into this branch for the sake of the PR
* Add note on using cache (-u) during pipeline
This commit is contained in:
parent
4d9c1dd11e
commit
6f39033dde
52 changed files with 1787 additions and 686 deletions
|
@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
|
|||
data. A basic version of that patching is included here for classes that can use it.
|
||||
"""
|
||||
|
||||
data_path, tmp_path = mock_paths
|
||||
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
|
||||
sources_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with mock.patch(
|
||||
"data_pipeline.utils.requests"
|
||||
"data_pipeline.etl.downloader.requests"
|
||||
) as requests_mock, mock.patch(
|
||||
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
|
||||
) as sources_mock, mock.patch(
|
||||
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
|
||||
) as mock_get_state_fips_codes:
|
||||
tmp_path = mock_paths[1]
|
||||
|
||||
# requests mock
|
||||
def fake_get(url, *args, **kwargs):
|
||||
file_path = url.split("/")[-1]
|
||||
with open(
|
||||
|
@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
|
|||
return response_mock
|
||||
|
||||
requests_mock.get = fake_get
|
||||
|
||||
# fips codes mock
|
||||
mock_get_state_fips_codes.return_value = [
|
||||
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
|
||||
]
|
||||
|
||||
# sources mock
|
||||
sources_mock.return_value = sources_path
|
||||
|
||||
# Instantiate the ETL class.
|
||||
etl = self._get_instance_of_etl_class()
|
||||
|
||||
# Monkey-patch the temporary directory to the one used in the test
|
||||
etl.TMP_PATH = tmp_path
|
||||
etl.SOURCES_PATH = data_path / "sources"
|
||||
|
||||
# Run the extract method.
|
||||
etl.extract()
|
||||
|
||||
def fake_get_sources_path() -> pathlib.PosixPath:
|
||||
return sources_path
|
||||
|
||||
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
|
||||
|
||||
return etl
|
||||
|
||||
def test_init(self, mock_etl, mock_paths):
|
||||
|
|
|
@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
|
|||
mock_paths=mock_paths,
|
||||
)
|
||||
df = gpd.read_file(
|
||||
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
|
||||
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
|
||||
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
|
||||
)
|
||||
assert df.shape[0] == 30
|
||||
|
|
|
@ -5,6 +5,7 @@ from data_pipeline.config import settings
|
|||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.base import ValidGeoLevel
|
||||
from data_pipeline.utils import get_module_logger
|
||||
from data_pipeline.etl.datasource import DataSource
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
|
@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
|
|||
self.EXAMPLE_FIELD_NAME,
|
||||
]
|
||||
|
||||
def get_data_sources(self) -> [DataSource]:
|
||||
return []
|
||||
|
||||
def extract(self):
|
||||
# Pretend to download zip from external URL, write it to CSV.
|
||||
zip_file_path = (
|
||||
|
@ -42,11 +46,11 @@ class ExampleETL(ExtractTransformLoad):
|
|||
)
|
||||
|
||||
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
|
||||
zip_ref.extractall(self.get_tmp_path())
|
||||
zip_ref.extractall(self.get_sources_path())
|
||||
|
||||
def transform(self):
|
||||
df: pd.DataFrame = pd.read_csv(
|
||||
self.get_tmp_path() / "input.csv",
|
||||
self.get_sources_path() / "input.csv",
|
||||
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
|
||||
low_memory=False,
|
||||
)
|
||||
|
|
|
@ -124,12 +124,18 @@ class TestETL:
|
|||
data. A basic version of that patching is included here for classes that can use it.
|
||||
"""
|
||||
|
||||
data_path, tmp_path = mock_paths
|
||||
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
|
||||
sources_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with mock.patch(
|
||||
"data_pipeline.utils.requests"
|
||||
"data_pipeline.etl.downloader.requests"
|
||||
) as requests_mock, mock.patch(
|
||||
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
|
||||
) as sources_mock, mock.patch(
|
||||
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
|
||||
) as mock_get_state_fips_codes:
|
||||
tmp_path = mock_paths[1]
|
||||
|
||||
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
|
||||
zip_file_fixture_src = (
|
||||
self._DATA_DIRECTORY_FOR_TEST
|
||||
|
@ -145,6 +151,7 @@ class TestETL:
|
|||
"rb",
|
||||
) as file:
|
||||
file_contents = file.read()
|
||||
|
||||
response_mock = requests.Response()
|
||||
response_mock.status_code = 200
|
||||
# pylint: disable=protected-access
|
||||
|
@ -154,15 +161,25 @@ class TestETL:
|
|||
mock_get_state_fips_codes.return_value = [
|
||||
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
|
||||
]
|
||||
|
||||
# sources mock
|
||||
sources_mock.return_value = sources_path
|
||||
|
||||
# Instantiate the ETL class.
|
||||
etl = self._get_instance_of_etl_class()
|
||||
|
||||
# Monkey-patch the temporary directory to the one used in the test
|
||||
etl.TMP_PATH = tmp_path
|
||||
etl.SOURCES_PATH = data_path / "sources"
|
||||
|
||||
# Run the extract method.
|
||||
etl.extract()
|
||||
|
||||
def fake_get_sources_path() -> pathlib.PosixPath:
|
||||
return sources_path
|
||||
|
||||
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
|
||||
|
||||
return etl
|
||||
|
||||
def test_init_base(self, mock_etl, mock_paths):
|
||||
|
@ -263,17 +280,12 @@ class TestETL:
|
|||
file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
|
||||
"""
|
||||
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
|
||||
tmp_path = mock_paths[1]
|
||||
|
||||
_ = self._setup_etl_instance_and_run_extract(
|
||||
etl = self._setup_etl_instance_and_run_extract(
|
||||
mock_etl=mock_etl,
|
||||
mock_paths=mock_paths,
|
||||
)
|
||||
assert (
|
||||
tmp_path
|
||||
/ self._EXTRACT_TMP_FOLDER_NAME
|
||||
/ self._SAMPLE_DATA_FILE_NAME
|
||||
).exists()
|
||||
assert (etl.get_sources_path()).exists()
|
||||
|
||||
def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
|
||||
"""Tests the extract method.
|
||||
|
@ -285,8 +297,11 @@ class TestETL:
|
|||
mock_etl=mock_etl,
|
||||
mock_paths=mock_paths,
|
||||
)
|
||||
|
||||
data_path, tmp_path = mock_paths
|
||||
|
||||
tmp_df = pd.read_csv(
|
||||
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
|
||||
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
|
||||
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
|
||||
)
|
||||
snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
|
||||
|
|
|
@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
|
|||
mock_paths=mock_paths,
|
||||
)
|
||||
tmp_df = pd.read_excel(
|
||||
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
|
||||
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
|
||||
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
|
||||
)
|
||||
assert tmp_df.shape == (15, 5)
|
||||
|
|
|
@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):
|
|||
|
||||
def test_init(self, mock_etl, mock_paths):
|
||||
"""Tests that the mock NationalRiskIndexETL class instance was
|
||||
initiliazed correctly.
|
||||
|
||||
Validates the following conditions:
|
||||
- self.DATA_PATH points to the "data" folder in the temp directory
|
||||
- self.TMP_PATH points to the "data/tmp" folder in the temp directory
|
||||
- self.INPUT_PATH points to the correct path in the temp directory
|
||||
- self.OUTPUT_PATH points to the correct path in the temp directory
|
||||
initialized correctly.
|
||||
"""
|
||||
# setup
|
||||
etl = NationalRiskIndexETL()
|
||||
data_path, tmp_path = mock_paths
|
||||
input_csv = (
|
||||
tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
|
||||
)
|
||||
|
||||
# validation
|
||||
assert etl.INPUT_CSV == input_csv
|
||||
assert etl.GEOID_FIELD_NAME == "GEOID10"
|
||||
assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
|
||||
assert etl.NAME == "national_risk_index"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue