Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
This commit is contained in:
Travis Newby 2023-03-03 12:26:24 -06:00 committed by GitHub
commit 6f39033dde
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
52 changed files with 1787 additions and 686 deletions

View file

@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
"data_pipeline.utils.requests"
"data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
tmp_path = mock_paths[1]
# requests mock
def fake_get(url, *args, **kwargs):
file_path = url.split("/")[-1]
with open(
@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
return response_mock
requests_mock.get = fake_get
# fips codes mock
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init(self, mock_etl, mock_paths):

View file

@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
mock_paths=mock_paths,
)
df = gpd.read_file(
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert df.shape[0] == 30

View file

@ -5,6 +5,7 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
self.EXAMPLE_FIELD_NAME,
]
def get_data_sources(self) -> list["DataSource"]:
    """Return the external data sources this ETL depends on.

    ExampleETL fabricates its input locally in ``extract`` (it unzips a
    bundled fixture), so it has nothing to download and returns an empty
    list.

    Note: the original annotation ``[DataSource]`` was a list *literal*,
    not a type; ``list["DataSource"]`` is the valid generic form (quoted
    so the name need not be resolvable at annotation-evaluation time).
    """
    return []
def extract(self):
# Pretend to download zip from external URL, write it to CSV.
zip_file_path = (
@ -42,11 +46,11 @@ class ExampleETL(ExtractTransformLoad):
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(self.get_tmp_path())
zip_ref.extractall(self.get_sources_path())
def transform(self):
df: pd.DataFrame = pd.read_csv(
self.get_tmp_path() / "input.csv",
self.get_sources_path() / "input.csv",
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)

View file

@ -124,12 +124,18 @@ class TestETL:
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
sources_path = data_path / "sources" / self._ETL_CLASS.__name__
sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
"data_pipeline.utils.requests"
"data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
"data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
tmp_path = mock_paths[1]
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
zip_file_fixture_src = (
self._DATA_DIRECTORY_FOR_TEST
@ -145,6 +151,7 @@ class TestETL:
"rb",
) as file:
file_contents = file.read()
response_mock = requests.Response()
response_mock.status_code = 200
# pylint: disable=protected-access
@ -154,15 +161,25 @@ class TestETL:
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init_base(self, mock_etl, mock_paths):
@ -263,17 +280,12 @@ class TestETL:
file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
"""
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
tmp_path = mock_paths[1]
_ = self._setup_etl_instance_and_run_extract(
etl = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl,
mock_paths=mock_paths,
)
assert (
tmp_path
/ self._EXTRACT_TMP_FOLDER_NAME
/ self._SAMPLE_DATA_FILE_NAME
).exists()
assert (etl.get_sources_path()).exists()
def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
"""Tests the extract method.
@ -285,8 +297,11 @@ class TestETL:
mock_etl=mock_etl,
mock_paths=mock_paths,
)
data_path, tmp_path = mock_paths
tmp_df = pd.read_csv(
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST

View file

@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
mock_paths=mock_paths,
)
tmp_df = pd.read_excel(
etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert tmp_df.shape == (15, 5)

View file

@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):
def test_init(self, mock_etl, mock_paths):
"""Tests that the mock NationalRiskIndexETL class instance was
initialized correctly.
Validates the following conditions:
- self.DATA_PATH points to the "data" folder in the temp directory
- self.TMP_PATH points to the "data/tmp" folder in the temp directory
- self.INPUT_PATH points to the correct path in the temp directory
- self.OUTPUT_PATH points to the correct path in the temp directory
initialized correctly.
"""
# setup
etl = NationalRiskIndexETL()
data_path, tmp_path = mock_paths
input_csv = (
tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
)
# validation
assert etl.INPUT_CSV == input_csv
assert etl.GEOID_FIELD_NAME == "GEOID10"
assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
assert etl.NAME == "national_risk_index"