Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
Commit 6f39033dde by Travis Newby, 2023-03-03 12:26:24 -06:00 (committed by GitHub)
52 changed files with 1787 additions and 686 deletions
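
Taken together, the changes below give every ETL a get_data_sources() hook that declares its remote inputs and thread a use_cached_data_sources flag through extract(), so a pre-downloaded cache can be reused instead of re-fetching on every run. A minimal sketch of the pattern, assuming an illustrative ExampleETL, a hypothetical URL and destination path, and a FileDataSource with source/destination fields; only DataSource, the method signatures, and DATA_PATH actually appear in this diff:

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource, FileDataSource  # FileDataSource is assumed


class ExampleETL(ExtractTransformLoad):
    """Illustrative ETL showing the caching hooks introduced by this commit."""

    def get_data_sources(self) -> [DataSource]:
        # Declare this ETL's remote inputs so they can be pre-downloaded and
        # cached up front; the score ETLs in this diff return [] because their
        # inputs are produced by earlier steps of the pipeline.
        return [
            FileDataSource(
                source="https://example.gov/some_dataset.csv",  # hypothetical URL
                destination=self.DATA_PATH / "sources" / "example" / "some_dataset.csv",
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # The base-class extract fetches the declared sources, or reuses the
        # previously downloaded copies when use_cached_data_sources is True.
        super().extract(use_cached_data_sources)
        # ...then read the now-local files into dataframes as usual.

Per the caching notes added to the README, the cache is then presumably opted into with the -u flag when running the pipeline.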

View file

@@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
from data_pipeline.score import field_names
from data_pipeline.score.score_runner import ScoreRunner
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of running the ETLs
def extract(self, use_cached_data_sources: bool = False) -> None:
# EJSCreen csv Load
ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
self.ejscreen_df = pd.read_csv(

View file

@@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_high: gpd.GeoDataFrame
self.geojson_score_usa_low: gpd.GeoDataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
def extract(self, use_cached_data_sources: bool = False) -> None:
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",

View file

@@ -2,7 +2,9 @@ import json
from pathlib import Path
import numpy as np
from numpy import float64
import pandas as pd
from data_pipeline.content.schemas.download_schemas import CodebookConfig
from data_pipeline.content.schemas.download_schemas import CSVConfig
from data_pipeline.content.schemas.download_schemas import ExcelConfig
@@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from numpy import float64
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.downloader import Downloader
from . import constants
@@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
self.yaml_global_config_sort_by_label = "sort_by_label"
# End YAML definition constants
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the score
def _extract_counties(self, county_path: Path) -> pd.DataFrame:
logger.debug("Reading Counties CSV")
return pd.read_csv(
@@ -97,17 +105,23 @@
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",
census_data_source=self.DATA_SOURCE,
)
super().extract(
constants.CENSUS_COUNTIES_ZIP_URL,
constants.TMP_PATH,
# TODO we could probably add this to the data sources for this file
Downloader.download_zip_file_from_url(
constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
)
self.input_counties_df = self._extract_counties(
constants.CENSUS_COUNTIES_FILE_NAME
)
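
The calls above, and the one in the next file, suggest the new Downloader exposes its operations as class-level helpers. A usage sketch with illustrative URLs and paths; only the method names and keyword arguments come from this diff:

from pathlib import Path

from data_pipeline.etl.downloader import Downloader

tmp_path = Path("/tmp/data_pipeline")  # illustrative working directory

# Download (and presumably extract) a ZIP archive into the working
# directory, mirroring the counties call above.
Downloader.download_zip_file_from_url(
    "https://example.gov/national_county.zip",  # hypothetical URL
    tmp_path,
)

# Download a single file to a specific local name, mirroring the
# score-tile fetch in the next file.
Downloader.download_file_from_url(
    file_url="https://example-bucket.s3.amazonaws.com/tile_score.csv",  # hypothetical URL
    download_file_name=tmp_path / "tile_score.csv",
)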

View file

@@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.etl.downloader import Downloader
from data_pipeline.utils import get_module_logger
from . import constants
@@ -48,7 +48,7 @@ def check_score_data_source(
# download from s3 if census_data_source is aws
if score_data_source == "aws":
logger.debug("Fetching Score Tile data from AWS S3")
download_file_from_url(
Downloader.download_file_from_url(
file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
)
else: