Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
Travis Newby 2023-03-03 12:26:24 -06:00 committed by GitHub
parent 4d9c1dd11e
commit 6f39033dde
52 changed files with 1787 additions and 686 deletions

View file

@ -92,7 +92,6 @@ If you want to run specific data tasks, you can open a terminal window, navigate
- Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles`
To learn more about these commands and when they should be run, refer to [Running for Local Development](#running-for-local-development).
</details>
---
@ -136,6 +135,9 @@ Once you've downloaded the census data, run the following commands in order
Many commands have options. For example, you can run a single dataset with `etl-run` by passing the command line parameter `-d name-of-dataset-to-run`. Please use the `--help` option to find out more.
> :bulb: **NOTE**
> One important command line option is enabling cached data sources. Pass the command line parameter `-u` to many commands (e.g. `etl-run`) to use locally cached data sources within the ETL portion of the pipeline. This will ensure that you don't download many GB of data with each run of the data pipeline.
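> For example, an invocation along the lines of `etl-run -d name-of-dataset-to-run -u` (the dataset name is a placeholder) runs a single ETL while reusing any data sources already in the local cache.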
## How Scoring Works
Scores are generated by running the `score-run` command via Poetry or Docker. This command executes [`data_pipeline/etl/score/etl_score.py`](data_pipeline/etl/score/etl_score.py). During execution,

View file

@ -7,6 +7,9 @@ from data_pipeline.etl.runner import etl_runner
from data_pipeline.etl.runner import score_generate
from data_pipeline.etl.runner import score_geo
from data_pipeline.etl.runner import score_post
from data_pipeline.etl.runner import get_data_sources
from data_pipeline.etl.runner import extract_data_sources as extract_ds
from data_pipeline.etl.runner import clear_data_source_cache as clear_ds_cache
from data_pipeline.etl.sources.census.etl_utils import check_census_data_source
from data_pipeline.etl.sources.census.etl_utils import (
reset_data_directories as census_reset,
@ -79,7 +82,14 @@ def data_cleanup():
is_flag=True,
help="Upload to AWS S3 a zipped archive of the census data.",
)
def census_data_download(zip_compress):
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def census_data_download(zip_compress, use_cache):
"""CLI command to download all census shape files from the Census FTP and extract the geojson
to generate national and by state Census Block Group CSVs"""
log_title("Download Census Data ")
@ -88,7 +98,7 @@ def census_data_download(zip_compress):
census_reset(data_path)
log_info("Downloading census data")
etl_runner("census")
etl_runner("census", use_cache)
if zip_compress:
log_info("Zipping census data")
@ -129,7 +139,14 @@ def pull_census_data(data_source: str):
type=str,
help=dataset_cli_help,
)
def etl_run(dataset: str):
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def etl_run(dataset: str, use_cache: bool):
"""Run a specific or all ETL processes
Args:
@ -141,7 +158,7 @@ def etl_run(dataset: str):
log_title("Run ETL")
log_info("Running dataset(s)")
etl_runner(dataset)
etl_runner(dataset, use_cache)
log_goodbye()
sys.exit()
@ -167,7 +184,14 @@ def score_run():
@cli.command(
help="Run ETL + Score Generation",
)
def score_full_run():
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def score_full_run(use_cache: bool):
"""CLI command to run ETL and generate the score in one command"""
log_title("Score Full Run", "Run ETL and Generate Score (no tiles)")
@ -177,7 +201,7 @@ def score_full_run():
temp_folder_cleanup()
log_info("Running all ETLs")
etl_runner()
etl_runner(use_cache=use_cache)
log_info("Generating score")
score_generate()
@ -297,7 +321,14 @@ def generate_map_tiles(generate_tribal_layer):
type=str,
help=dataset_cli_help,
)
def data_full_run(check: bool, data_source: str):
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def data_full_run(check: bool, data_source: str, use_cache: bool):
"""CLI command to run ETL, score, JSON combine and generate tiles in one command
Args:
@ -330,10 +361,10 @@ def data_full_run(check: bool, data_source: str):
if data_source == "local":
log_info("Downloading census data")
etl_runner("census")
etl_runner("census", use_cache)
log_info("Running all ETLs")
etl_runner()
etl_runner(use_cache=use_cache)
log_info("Generating score")
score_generate()
@ -357,6 +388,103 @@ def data_full_run(check: bool, data_source: str):
sys.exit()
@cli.command(
help="Print data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def print_data_sources(dataset: str):
"""Print data sources for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module to be run (optional)
Returns:
None
"""
log_title("Print ETL Datasources")
log_info("Retrieving dataset(s)")
sources = get_data_sources(dataset)
log_info(f"Discovered {len(sources)} files")
for s in sources:
log_info(s)
log_goodbye()
sys.exit()
@cli.command(
help="Fetch data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def extract_data_sources(dataset: str, use_cache: bool):
"""Extract and cache data source(s) for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose data sources you wish to fetch
use_cache (bool): Use this flag if you wish to use the cached data sources (if they exist)
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Fetching data source(s)")
extract_ds(dataset, use_cache)
log_goodbye()
sys.exit()
@cli.command(
help="Clear data source cache for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def clear_data_source_cache(dataset: str):
"""Clear data source(s) cache for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose cache you wish to clear
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Clear data source cache")
clear_ds_cache(dataset)
log_goodbye()
sys.exit()
def log_title(title: str, subtitle: str = None):
"""Logs a title in our fancy title format"""
logger.info("-" * LOG_LINE_WIDTH)
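A rough sketch of how the new commands could be exercised programmatically follows; it assumes the click group is exposed as `cli` in `data_pipeline.application` and that click applies its usual underscore-to-hyphen command naming (as it does for `etl-run`), and the dataset name is purely illustrative:

# Sketch only: command spellings and the dataset name are assumptions,
# not taken from this diff.
from click.testing import CliRunner
from data_pipeline.application import cli

runner = CliRunner()

# List the data sources every ETL (or a single ETL) depends on.
runner.invoke(cli, ["print-data-sources"])

# Pre-download and cache the data sources for one dataset; -u reuses
# anything already present in the cache.
runner.invoke(cli, ["extract-data-sources", "-d", "doe_energy_burden", "-u"])

# Remove that dataset's cached sources again.
runner.invoke(cli, ["clear-data-source-cache", "-d", "doe_energy_burden"])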

View file

@ -2,7 +2,9 @@ import enum
import pathlib
import sys
import typing
import shutil
from typing import Optional
from abc import ABC, abstractmethod
import pandas as pd
from data_pipeline.config import settings
@ -13,7 +15,7 @@ from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import remove_all_from_dir
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -25,7 +27,7 @@ class ValidGeoLevel(enum.Enum):
CENSUS_BLOCK_GROUP = enum.auto()
class ExtractTransformLoad:
class ExtractTransformLoad(ABC):
"""
A class used to instantiate an ETL object to retrieve and process data from
datasets.
@ -45,6 +47,7 @@ class ExtractTransformLoad:
# Directories
DATA_PATH: pathlib.Path = settings.DATA_PATH
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
SOURCES_PATH: pathlib.Path = DATA_PATH / "sources"
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
DATASET_CONFIG: Optional[dict] = None
@ -177,45 +180,60 @@ class ExtractTransformLoad:
output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
return output_file_path
def get_tmp_path(self) -> pathlib.Path:
"""Returns the temporary path associated with this ETL class."""
# Note: the temporary path will be defined on `init`, because it uses the class
# of the instance which is often a child class.
tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
def get_sources_path(self) -> pathlib.Path:
"""Returns the sources path associated with this ETL class. The sources path
is the home for cached data sources used by this ETL."""
sources_path = self.SOURCES_PATH / str(self.__class__.__name__)
# Create directory if it doesn't exist
tmp_path.mkdir(parents=True, exist_ok=True)
sources_path.mkdir(parents=True, exist_ok=True)
return tmp_path
return sources_path
def extract(
self,
source_url: str = None,
extract_path: pathlib.Path = None,
verify: Optional[bool] = True,
) -> None:
"""Extract the data from a remote source. By default it provides code
to get the file from a source url, unzips it and stores it on an
extract_path."""
@abstractmethod
def get_data_sources(self) -> [DataSource]:
pass
if source_url is None:
source_url = self.SOURCE_URL
def _fetch(self) -> None:
"""Fetch all data sources for this ETL. When data sources are fetched, they
are stored in a cache directory for consistency between runs."""
for ds in self.get_data_sources():
ds.fetch()
if extract_path is None:
extract_path = self.get_tmp_path()
def clear_data_source_cache(self) -> None:
"""Clears the cache for this ETLs data source(s)"""
shutil.rmtree(self.get_sources_path())
unzip_file_from_url(
file_url=source_url,
download_path=self.get_tmp_path(),
unzipped_file_path=extract_path,
verify=verify,
)
def extract(self, use_cached_data_sources: bool = False) -> None:
"""Extract (download) data from a remote source, and validate
that data. By default, this method fetches data from the set of
data sources returned by get_data_sources.
If use_cached_data_sources is true, this method attempts to use cached data
rather than re-downloading from the original source. The cache algorithm is very
simple: it just looks to see if the directory has any contents. If so, it uses
that content. If not, it downloads all data sources.
Subclasses should call super() before performing any work if they wish to take
advantage of the automatic downloading and caching ability of this superclass.
"""
if use_cached_data_sources and any(self.get_sources_path().iterdir()):
logger.info(
f"Using cached data sources for {self.__class__.__name__}"
)
else:
self.clear_data_source_cache()
self._fetch()
# the rest of the work should be performed here
@abstractmethod
def transform(self) -> None:
"""Transform the data extracted into a format that can be consumed by the
score generator"""
raise NotImplementedError
pass
def validate(self) -> None:
"""Validates the output.
@ -380,3 +398,14 @@ class ExtractTransformLoad:
def cleanup(self) -> None:
"""Clears out any files stored in the TMP folder"""
remove_all_from_dir(self.get_tmp_path())
def get_tmp_path(self) -> pathlib.Path:
"""Returns the temporary path associated with this ETL class."""
# Note: the temporary path will be defined on `init`, because it uses the class
# of the instance which is often a child class.
tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
# Create directory if it doesn't exist
tmp_path.mkdir(parents=True, exist_ok=True)
return tmp_path
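For reference, the pattern the refactored ETLs follow against this base class looks roughly like the sketch below; the class name, URL, and file name are hypothetical, and the structure mirrors the ETLs changed in this commit.

# Hypothetical ETL sketch: declare sources, let the base class fetch them
# (or reuse the cache), then read from the sources path.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleETL(ExtractTransformLoad):
    def __init__(self):
        self.example_url = "https://example.com/example.zip"  # hypothetical
        self.example_source = self.get_sources_path() / "example.csv"
        self.df: pd.DataFrame

    # return type annotation matches the style used throughout this commit
    def get_data_sources(self) -> [DataSource]:
        return [
            ZIPDataSource(
                source=self.example_url,
                destination=self.get_sources_path(),
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # Downloads the declared sources, or reuses the cache when asked to.
        super().extract(use_cached_data_sources)
        self.df = pd.read_csv(self.example_source)

    def transform(self) -> None:
        pass

    def load(self) -> None:
        pass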

View file

@ -0,0 +1,124 @@
"""This module defines a set of classes that can be used to fetch data
from a remote source. They are meant to be used in conjunction with ETLs
or other classes that require downloading data.
There are three types of data sources defined in this file:
FileDataSource meant to be used when you have a single file to
retrieve from a remote location and save to a destination.
ZIPDataSource used when you need to fetch and unzip a file, and save
the contents of that file to a destination.
CensusDataSource used to download data from the Census API and store
the contents to a destination.
DataSource subclasses must implement the fetch method to define how
they will reach out to a remote source, download the data, and save
that data to the destination.
"""
from pathlib import Path
from typing import List
from dataclasses import dataclass
from abc import ABC, abstractmethod
from data_pipeline.etl.downloader import Downloader
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
@dataclass
class DataSource(ABC):
"""A data source represents any source of data that is fetchable
from a remote location.
Attributes:
source : str
the location of this data source, as a url
destination : Path
the Path where the data source should be saved locally upon being fetched
"""
source: str
destination: Path
@abstractmethod
def fetch(self) -> None:
pass
@dataclass
class FileDataSource(DataSource):
"""A data source representing a single file.
This single file will be fetched from the source and saved to a single
destination.
"""
def fetch(self) -> None:
"""Fetches a single file from a source and saves it to a destination."""
self.destination.parent.mkdir(parents=True, exist_ok=True)
Downloader.download_file_from_url(
file_url=self.source,
download_file_name=self.destination,
verify=True,
)
def __str__(self):
return f"File {self.source}"
@dataclass
class ZIPDataSource(DataSource):
"""A data source representing ZIP files.
Zip files will be fetched and placed in the destination folder, then unzipped.
"""
def fetch(self) -> None:
self.destination.mkdir(parents=True, exist_ok=True)
Downloader.download_zip_file_from_url(
file_url=self.source,
unzipped_file_path=self.destination,
verify=True,
)
def __str__(self):
return f"Zip {self.source}"
@dataclass
class CensusDataSource(DataSource):
"""A data source representing census data.
Data will be fetched using the Census API and saved to the destination file. Source is ignored.
"""
acs_year: int
variables: List[str]
tract_output_field_name: str
data_path_for_fips_codes: Path
acs_type: str
def fetch(self) -> None:
df = retrieve_census_acs_data(
acs_year=self.acs_year,
variables=self.variables,
tract_output_field_name=self.tract_output_field_name,
data_path_for_fips_codes=self.data_path_for_fips_codes,
acs_type=self.acs_type,
)
self.destination.parent.mkdir(parents=True, exist_ok=True)
# Write CSV representation of census data
df.to_csv(self.destination, index=False)
def __str__(self):
return f"Census {self.acs_type}, {self.acs_year}"

View file

@ -0,0 +1,95 @@
import uuid
import urllib3
import requests
import zipfile
import shutil
from pathlib import Path
from data_pipeline.config import settings
class Downloader:
"""A simple class to encapsulate the download capabilities of the application"""
@classmethod
def download_file_from_url(
cls,
file_url: str,
download_file_name: Path,
verify: bool = True,
) -> str:
"""Downloads a file from a remote URL location and returns the file location.
Args:
file_url (str): URL where the file is located
download_file_name (pathlib.Path): file path where the file will be saved once downloaded
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, defaults to True)
Returns:
download_file_name (pathlib.Path): the path of the downloaded file
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
download_file_name.parent.mkdir(parents=True, exist_ok=True)
response = requests.get(
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
)
if response.status_code == 200:
file_contents = response.content
else:
raise Exception(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
# Write the contents to disk.
file = open(download_file_name, "wb")
file.write(file_contents)
file.close()
return download_file_name
@classmethod
def download_zip_file_from_url(
cls,
file_url: str,
unzipped_file_path: Path,
verify: bool = True,
) -> None:
"""Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after
Args:
file_url (str): URL where the zip file is located
unzipped_file_path (pathlib.Path): directory and name of the extracted file
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, defaults to True)
Returns:
None
"""
# dir_id allows us to evade race conditions on parallel ETLs
dir_id = uuid.uuid4()
zip_download_path = (
settings.DATA_PATH
/ "tmp"
/ "downloads"
/ f"{dir_id}"
/ "download.zip"
)
zip_file_path = Downloader.download_file_from_url(
file_url=file_url,
download_file_name=zip_download_path,
verify=verify,
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(unzipped_file_path)
# cleanup temporary file and directory
shutil.rmtree(zip_download_path.parent)
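A hedged usage sketch of the downloader's two class methods; the URLs and destination paths here are hypothetical.

# Illustrative only: URLs and destination paths are hypothetical.
from pathlib import Path

from data_pipeline.etl.downloader import Downloader

# Fetch a single file; parent directories are created as needed.
Downloader.download_file_from_url(
    file_url="https://example.com/data/US_A.CSV",
    download_file_name=Path("data/sources/example/US_A.CSV"),
)

# Fetch a zip archive and unpack it into a directory; the temporary
# download under DATA_PATH/tmp/downloads/<uuid>/ is removed afterwards.
Downloader.download_zip_file_from_url(
    file_url="https://example.com/data/archive.zip",
    unzipped_file_path=Path("data/sources/example"),
)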

View file

@ -2,10 +2,14 @@ import concurrent.futures
import importlib
import typing
from functools import reduce
from data_pipeline.etl.score.etl_score import ScoreETL
from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
from data_pipeline.etl.score.etl_score_post import PostScoreETL
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from . import constants
@ -40,20 +44,26 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
return dataset_list
def _run_one_dataset(dataset: dict) -> None:
"""Runs one etl process."""
logger.info(f"Running ETL for {dataset['name']}")
def _get_dataset(dataset: dict) -> ExtractTransformLoad:
"""Instantiates a dataset object from a dictionary description of that object's class"""
etl_module = importlib.import_module(
f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
)
etl_class = getattr(etl_module, dataset["class_name"])
etl_instance = etl_class()
return etl_instance
def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
"""Runs one etl process."""
logger.info(f"Running ETL for {dataset['name']}")
etl_instance = _get_dataset(dataset)
# run extract
logger.debug(f"Extracting {dataset['name']}")
etl_instance.extract()
etl_instance.extract(use_cache)
# run transform
logger.debug(f"Transforming {dataset['name']}")
@ -74,11 +84,12 @@ def _run_one_dataset(dataset: dict) -> None:
logger.info(f"Finished ETL for dataset {dataset['name']}")
def etl_runner(dataset_to_run: str = None) -> None:
def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
"""Runs all etl processes or a specific one
Args:
dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
use_cache (bool): Use the cached data sources if they exist rather than downloading them all from scratch
Returns:
None
@ -105,7 +116,9 @@ def etl_runner(dataset_to_run: str = None) -> None:
logger.info("Running concurrent ETL jobs")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(_run_one_dataset, dataset=dataset)
executor.submit(
_run_one_dataset, dataset=dataset, use_cache=use_cache
)
for dataset in concurrent_datasets
}
@ -119,7 +132,50 @@ def etl_runner(dataset_to_run: str = None) -> None:
if high_memory_datasets:
logger.info("Running high-memory ETL jobs")
for dataset in high_memory_datasets:
_run_one_dataset(dataset=dataset)
_run_one_dataset(dataset=dataset, use_cache=use_cache)
def get_data_sources(dataset_to_run: str = None) -> [DataSource]:
dataset_list = _get_datasets_to_run(dataset_to_run)
sources = []
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
sources.append(etl_instance.get_data_sources())
sources = reduce(
list.__add__, sources
) # flatten the list of lists into a single list
return sources
def extract_data_sources(
dataset_to_run: str = None, use_cache: bool = False
) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Extracting data set for {etl_instance.__class__.__name__}"
)
etl_instance.extract(use_cache)
def clear_data_source_cache(dataset_to_run: str = None) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Clearing data set cache for {etl_instance.__class__.__name__}"
)
etl_instance.clear_data_source_cache()
def score_generate() -> None:
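Taken together, the runner-level helpers compose along these lines; the dataset name below is illustrative, while the functions are the ones defined in this module.

# Sketch of composing the helpers defined in this module.
from data_pipeline.etl.runner import etl_runner
from data_pipeline.etl.runner import extract_data_sources
from data_pipeline.etl.runner import get_data_sources

# Inspect which remote sources a single ETL depends on.
for source in get_data_sources("doe_energy_burden"):
    print(source)

# Warm the cache for that ETL without running transform/load.
extract_data_sources("doe_energy_burden", use_cache=False)

# Later runs reuse the cached sources instead of re-downloading them.
etl_runner("doe_energy_burden", use_cache=True)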

View file

@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
from data_pipeline.score import field_names
from data_pipeline.score.score_runner import ScoreRunner
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of running the ETLs
def extract(self, use_cached_data_sources: bool = False) -> None:
# EJSCreen csv Load
ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
self.ejscreen_df = pd.read_csv(

View file

@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_high: gpd.GeoDataFrame
self.geojson_score_usa_low: gpd.GeoDataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
def extract(self, use_cached_data_sources: bool = False) -> None:
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",

View file

@ -2,7 +2,9 @@ import json
from pathlib import Path
import numpy as np
from numpy import float64
import pandas as pd
from data_pipeline.content.schemas.download_schemas import CodebookConfig
from data_pipeline.content.schemas.download_schemas import CSVConfig
from data_pipeline.content.schemas.download_schemas import ExcelConfig
@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from numpy import float64
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.downloader import Downloader
from . import constants
@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
self.yaml_global_config_sort_by_label = "sort_by_label"
# End YAML definition constants
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the score
def _extract_counties(self, county_path: Path) -> pd.DataFrame:
logger.debug("Reading Counties CSV")
return pd.read_csv(
@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad):
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",
census_data_source=self.DATA_SOURCE,
)
super().extract(
constants.CENSUS_COUNTIES_ZIP_URL,
constants.TMP_PATH,
# TODO: we could probably add this to the data sources for this file
Downloader.download_zip_file_from_url(
constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
)
self.input_counties_df = self._extract_counties(
constants.CENSUS_COUNTIES_FILE_NAME
)

View file

@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.etl.downloader import Downloader
from data_pipeline.utils import get_module_logger
from . import constants
@ -48,7 +48,7 @@ def check_score_data_source(
# download from s3 if census_data_source is aws
if score_data_source == "aws":
logger.debug("Fetching Score Tile data from AWS S3")
download_file_from_url(
Downloader.download_file_from_url(
file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
)
else:

View file

@ -1,23 +1,36 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
"""California environmental screen
TODO: Need good description
"""
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = (
# fetch
self.calenviroscreen_ftp_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/CalEnviroScreen_4.0_2021.zip"
)
self.CALENVIROSCREEN_CSV = (
self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Definining some variable names
# input
self.calenviroscreen_source = (
self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Defining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
"calenviroscreen_percentile"
@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.calenviroscreen_ftp_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
self.CALENVIROSCREEN_FTP_URL,
self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.calenviroscreen_source, dtype={"Census Tract": "string"}
)
def transform(self) -> None:
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(
self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
)
self.df.rename(
columns={
@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad):
def load(self) -> None:
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False)

View file

@ -7,8 +7,9 @@ from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.score.etl_utils import (
compare_to_list_of_expected_state_fips_codes,
)
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -17,59 +18,74 @@ logger = get_module_logger(__name__)
class CDCLifeExpectancy(ExtractTransformLoad):
"""#TODO: create description"""
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
NAME = "cdc_life_expectancy"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
LOAD_YAML_CONFIG: bool = False
LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
STATES_MISSING_FROM_USA_FILE = ["23", "55"]
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
# USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
TRACT_INPUT_COLUMN_NAME = "Tract ID"
STATE_INPUT_COLUMN_NAME = "STATE2KX"
raw_df: pd.DataFrame
output_df: pd.DataFrame
raw_df: pd.DataFrame # result of extraction
output_df: pd.DataFrame # result of transformation
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
# For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
# input
self.usa_source = self.get_sources_path() / "US_A.CSV"
self.maine_source = self.get_sources_path() / "ME_A.CSV"
self.wisconsin_source = self.get_sources_path() / "WI_A.CSV"
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "cdc_life_expectancy"
)
# Constants for output
self.COLUMNS_TO_KEEP = [
self.COLUMNS_TO_KEEP = [ # the columns to save on output
self.GEOID_TRACT_FIELD_NAME,
field_names.LIFE_EXPECTANCY_FIELD,
]
def _download_and_prep_data(
self, file_url: str, download_file_name: pathlib.Path
) -> pd.DataFrame:
download_file_from_url(
file_url=file_url,
download_file_name=download_file_name,
verify=True,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.usa_file_url, destination=self.usa_source
),
FileDataSource(
source=self.maine_file_url, destination=self.maine_source
),
FileDataSource(
source=self.wisconsin_file_url,
destination=self.wisconsin_source,
),
]
def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame:
df = pd.read_csv(
filepath_or_buffer=download_file_name,
filepath_or_buffer=file_name,
dtype={
# The following need to remain as strings for all of their digits, not get converted to numbers.
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
all_usa_raw_df = self._download_and_prep_data(
file_url=self.USA_FILE_URL,
download_file_name=self.get_tmp_path() / "US_A.CSV",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
all_usa_raw_df = self._read_data(self.usa_source)
# Check which states are missing
states_in_life_expectancy_usa_file = list(
@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
)
logger.debug("Downloading data for Maine")
maine_raw_df = self._download_and_prep_data(
file_url=self.MAINE_FILE_URL,
download_file_name=self.get_tmp_path() / "maine.csv",
maine_raw_df = self._read_data(
self.maine_source,
)
logger.debug("Downloading data for Wisconsin")
wisconsin_raw_df = self._download_and_prep_data(
file_url=self.WISCONSIN_FILE_URL,
download_file_name=self.get_tmp_path() / "wisconsin.csv",
)
wisconsin_raw_df = self._read_data(self.wisconsin_source)
combined_df = pd.concat(
objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],

View file

@ -4,14 +4,17 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
class CDCPlacesETL(ExtractTransformLoad):
"""#TODO: Need description"""
NAME = "cdc_places"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad):
CDC_MEASURE_FIELD_NAME = "Measure"
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_PLACES_URL = (
self.cdc_places_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
)
else:
self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
# input
self.places_source = self.get_sources_path() / "census_tract.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
self.COLUMNS_TO_KEEP: typing.List[str] = [
self.GEOID_TRACT_FIELD_NAME,
@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
file_path = download_file_from_url(
file_url=self.CDC_PLACES_URL,
download_file_name=self.get_tmp_path() / "census_tract.csv",
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_places_url, destination=self.places_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=file_path,
filepath_or_buffer=self.places_source,
dtype={self.CDC_GEOID_FIELD_NAME: "string"},
low_memory=False,
)
def transform(self) -> None:
# Rename GEOID field
self.df.rename(
columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},

View file

@ -1,6 +1,8 @@
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -11,22 +13,28 @@ logger = get_module_logger(__name__)
class CDCSVIIndex(ExtractTransformLoad):
"""CDC SVI Index class ingests 2018 dataset located
here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html
Please see the README in this module for further details.
"""
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_SVI_INDEX_URL = (
self.cdc_svi_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_svi_index/SVI2018_US.csv"
)
else:
self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
# input
self.svi_source = self.get_sources_path() / "SVI2018_US.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
self.CDC_RPL_THEMES_THRESHOLD = 0.90
self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS"
self.COLUMNS_TO_KEEP = [
@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_svi_index_url, destination=self.svi_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.CDC_SVI_INDEX_URL,
filepath_or_buffer=self.svi_source,
dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
low_memory=False,
)
@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad):
)
def load(self) -> None:
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)

View file

@ -8,7 +8,8 @@ import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -20,7 +21,7 @@ class GeoFileType(Enum):
class CensusETL(ExtractTransformLoad):
SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
# SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def __init__(self):
self.shape_file_path = self.get_sources_path() / "shp"
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
file_path: Path
if file_type == GeoFileType.SHP:
file_path = Path(
self.SHP_BASE_PATH
self.shape_file_path
/ fips_code
/ f"tl_2010_{fips_code}_tract10.shp"
)
@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
return file_path
def _extract_shp(self, fips_code: str) -> None:
"""Download the SHP file for the provided FIPS code
def get_data_sources(self) -> [DataSource]:
Args:
fips_code (str): the FIPS code for the region of interest
sources = []
Returns:
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
for fips_code in self.STATE_FIPS_CODES:
# check if file exists
if not shp_file_path.is_file():
tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
unzip_file_from_url(
tract_state_url,
self.TMP_PATH,
self.DATA_PATH / "census" / "shp" / fips_code,
destination_path = self.shape_file_path / fips_code
sources.append(
ZIPDataSource(
source=tract_state_url, destination=destination_path
)
)
def extract(self) -> None:
logger.debug("Extracting census data")
for index, fips_code in enumerate(self.STATE_FIPS_CODES):
logger.debug(
f"Extracting shape for FIPS {fips_code} {index+1} of {len(self.STATE_FIPS_CODES)}"
)
self._extract_shp(fips_code)
return sources
def _transform_to_geojson(self, fips_code: str) -> None:
"""Convert the downloaded SHP file for the associated FIPS to geojson

View file

@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
else:
fips = row[0].strip()
fips_state_list.append(fips)
return fips_state_list

View file

@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_imputations import (
calculate_income_measures,
)
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad):
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs.csv"
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
self.EMPLOYMENT_FIELDS = [
@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5",
)
]
# pylint: disable=too-many-arguments
def _merge_geojson(
self,
@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad):
)
)
def extract(self) -> None:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
def extract(self, use_cached_data_sources: bool = False) -> None:
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
)
def transform(self) -> None:

View file

@ -1,10 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad):
"""
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs_2010.csv"
self.ACS_YEAR = 2010
self.ACS_TYPE = "acs5"
self.OUTPUT_PATH = (
@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
self.UNEMPLOYED_FIELDS
@ -107,13 +109,26 @@ class CensusACS2010ETL(ExtractTransformLoad):
+ self.POVERTY_FIELDS
)
# Use the method defined on CensusACSETL to reduce coding redundancy.
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source, dtype={"GEOID10_TRACT": "string"}
)
def transform(self) -> None:

View file

@ -1,14 +1,16 @@
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
/ f"census_acs_median_income_{self.ACS_YEAR}"
)
self.GEOCORR_ALL_STATES_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip"
)
self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr"
self.GEOCORR_ALL_STATES_SOURCE = (
self.GEOCORR_ALL_STATES_PATH
/ "geocorr2014_all_states_tracts_only.csv"
)
# Set constants for Geocorr MSAs data.
self.PLACE_FIELD_NAME: str = "Census Place Name"
self.COUNTY_FIELD_NAME: str = "County Name"
@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E"
+ "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area"
)
self.MSA_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "msa" / "msa_median_income.json"
)
self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)"
# Set constants for state median incomes
self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state"
self.STATE_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "state" / "state_median_income.json"
)
self.STATE_GEOID_FIELD_NAME: str = "GEOID2"
self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)"
@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.PUERTO_RICO_S3_LINK: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv"
)
self.PUERTO_RICO_ALL_STATES_SOURCE = (
self.get_sources_path() / "pr_tracts" / "pr_tracts.csv"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.MSA_MEDIAN_INCOME_URL = (
self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
self.STATE_MEDIAN_INCOME_URL = (
self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
# Constants for output
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.state_median_incomes: dict
self.pr_tracts: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.GEOCORR_ALL_STATES_URL,
destination=self.GEOCORR_ALL_STATES_PATH,
),
FileDataSource(
source=self.PUERTO_RICO_S3_LINK,
destination=self.PUERTO_RICO_ALL_STATES_SOURCE,
),
FileDataSource(
source=self.MSA_MEDIAN_INCOME_URL,
destination=self.MSA_MEDIAN_INCOME_SOURCE,
),
FileDataSource(
source=self.STATE_MEDIAN_INCOME_URL,
destination=self.STATE_MEDIAN_INCOME_SOURCE,
),
]
def _transform_geocorr(self) -> pd.DataFrame:
# Transform the geocorr data
geocorr_df = self.raw_geocorr_df
@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
)
return state_median_incomes_df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
# Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run:
@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro)
logger.debug("Starting download of 1.5MB Geocorr information.")
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "geocorr",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_geocorr_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "geocorr"
/ "geocorr2014_all_states_tracts_only.csv",
filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE,
# Skip second row, which has descriptions.
skiprows=[1],
# The following need to remain as strings for all of their digits, not get converted to numbers.
@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False,
)
logger.debug("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(
file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
)
self.pr_tracts = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "pr_tracts"
/ "pr_tracts.csv",
filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={"GEOID10_TRACT": str},
low_memory=False,
)
self.pr_tracts["State Abbreviation"] = "PR"
# Download MSA median incomes
logger.debug("Starting download of MSA median incomes.")
download = requests.get(
self.MSA_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.msa_median_incomes = json.loads(download.content)
with self.MSA_MEDIAN_INCOME_SOURCE.open() as source:
self.msa_median_incomes = json.load(source)
# Download state median incomes
logger.debug("Starting download of state median incomes.")
download_state = requests.get(
self.STATE_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.state_median_incomes = json.loads(download_state.content)
with self.STATE_MEDIAN_INCOME_SOURCE.open() as source:
self.state_median_incomes = json.load(source)
## NOTE we already have PR's MI here
def transform(self) -> None:

View file

@ -1,13 +1,14 @@
import json
from typing import List
import os
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
pd.options.mode.chained_assignment = "raise"
@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad):
+ "&for=tract:*&in=state:{}%20county:{}"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.API_URL = self.API_URL + f"&key={census_api_key}"
self.final_race_fields: List[str] = []
self.df: pd.DataFrame
self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame
def extract(self) -> None:
dfs = []
dfs_vi = []
def get_data_sources(self) -> [DataSource]:
sources = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
api_url = self.API_URL.format(
self.DECENNIAL_YEAR,
island["state_abbreviation"],
@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad):
island["fips"],
county,
)
logger.debug(f"CENSUS: Requesting {api_url}")
download = requests.get(
api_url,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
sources.append(
FileDataSource(
source=api_url,
destination=self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json",
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
dfs = []
dfs_vi = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
try:
df = json.loads(download.content)
filepath = (
self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json"
)
df = json.load(filepath.open())
except ValueError as e:
logger.error(
f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
f"Could not load content in census decennial ETL because {e}."
)
# First row is the header

View file

@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.SOURCE_URL = (
self.child_opportunity_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"child_opportunity_index/raw.zip"
)
else:
self.SOURCE_URL = (
self.child_opportunity_url = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv"
)
# input
self.child_opportunity_index_source = (
self.get_sources_path() / "raw.csv"
)
# output
# TODO: Decide about nixing this
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
self.READING_INPUT_FIELD = "ED_READING"
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.child_opportunity_url,
destination=self.get_sources_path(),
)
]
def transform(self) -> None:
raw_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "raw.csv",
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.child_opportunity_index_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
dtype={
@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad):
low_memory=False,
)
output_df = raw_df.rename(
def transform(self) -> None:
output_df = self.raw_df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD,

View file

@ -5,22 +5,35 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class DOEEnergyBurden(ExtractTransformLoad):
NAME = "doe_energy_burden"
SOURCE_URL: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
REVISED_ENERGY_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
self.doe_energy_burden_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
# input
self.doe_energy_burden_source = (
self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden"
)
@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def transform(self) -> None:
raw_df: pd.DataFrame = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.doe_energy_burden_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.doe_energy_burden_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad):
low_memory=False,
)
def transform(self) -> None:
logger.debug("Renaming columns and ensuring output format is correct")
output_df = raw_df.rename(
output_df = self.raw_df.rename(
columns={
self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
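
The `DataSource` classes imported in these files are defined in `data_pipeline/etl/datasource.py`, which is not part of this excerpt. From the call sites alone, each one records a remote `source` URL and a local `destination` under the ETL's sources directory. Purely as a reading aid, a hypothetical sketch of that shape — the `fetch` method name and the docstrings are guesses, not the committed implementation:

```python
from dataclasses import dataclass
from pathlib import Path


@dataclass
class DataSource:
    """Hypothetical base: a remote file plus the local path it should land at."""

    source: str        # remote URL
    destination: Path  # local path under data/sources/<ETL class name>/

    def fetch(self) -> None:  # method name assumed; the real API may differ
        raise NotImplementedError


@dataclass
class FileDataSource(DataSource):
    """Single file: download `source` directly to the `destination` file path."""


@dataclass
class ZIPDataSource(DataSource):
    """Archive: download `source` and unzip its contents into the `destination` directory."""
```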

View file

@ -3,6 +3,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad):
NAME = "travel_composite"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad):
TRAVEL_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.travel_composite_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
# input
# define the full path for the input CSV file
self.INPUT_SHP = (
self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
self.disadvantage_layer_shape_source = (
self.get_sources_path()
/ "DOT_Disadvantage_Layer_Final_April2022.shp"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_dot: pd.DataFrame
# Start dataset-specific vars here
## Average of Transportation Indicator Percentiles (calculated)
## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad):
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.travel_composite_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad):
- Converts to CSV
"""
# read in the unzipped shapefile from data source
# reformat it to be standard df, remove unassigned rows, and
# then rename the Census Tract column for merging
df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
df_dot = df_dot.rename(
self.df_dot = self.df_dot.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
}
).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
# Assign the final df to the class' output_df for the load method
self.output_df = df_dot
self.output_df = self.df_dot

View file

@ -1,12 +1,15 @@
from pathlib import Path
import geopandas as gpd
import pandas as pd
import geopandas as gpd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
logger = get_module_logger(__name__)
@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad):
"55",
]
# Define these for easy code completion
def __init__(self):
self.SOURCE_URL = (
# fetch
self.eamlis_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/eAMLIS export of all data.tsv.zip"
)
# input
self.eamlis_source = (
self.get_sources_path() / "eAMLIS export of all data.tsv"
)
# output
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
self.OUTPUT_PATH: Path = (
@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad):
]
self.output_df: pd.DataFrame
self.df: pd.DataFrame
def transform(self) -> None:
df = pd.read_csv(
self.get_tmp_path() / "eAMLIS export of all data.tsv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.eamlis_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.eamlis_source,
sep="\t",
low_memory=False,
)
def transform(self) -> None:
gdf = gpd.GeoDataFrame(
df,
self.df,
geometry=gpd.points_from_xy(
x=df["Longitude"],
y=df["Latitude"],
x=self.df["Longitude"],
y=self.df["Latitude"],
),
crs="epsg:4326",
)
@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad):
gdf_tracts = add_tracts_for_geometries(gdf)
gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
gdf_tracts[self.AML_BOOLEAN] = True
self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]
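
The eAMLIS change is the one ETL in this set that starts from point coordinates rather than tract identifiers: the extracted TSV is turned into a `GeoDataFrame` and then joined to tracts with the project's `add_tracts_for_geometries` helper (not reproduced here). A tiny self-contained illustration of the coordinate-to-geometry step, with made-up coordinates:

```python
import geopandas as gpd
import pandas as pd

# Two made-up mine locations, stored the way the eAMLIS TSV stores them.
df = pd.DataFrame(
    {
        "Longitude": [-81.0, -110.5],
        "Latitude": [37.8, 44.2],
    }
)

# Build point geometries from the coordinate columns; EPSG:4326 is plain lat/long.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(x=df["Longitude"], y=df["Latitude"]),
    crs="epsg:4326",
)

print(gdf.geometry.iloc[0])  # a shapely Point, ready for a spatial join to tracts
```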

View file

@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad):
INPUT_GEOID_TRACT_FIELD_NAME: str = "ID"
def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
self.EJSCREEN_CSV = (
self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv"
# fetch
self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
# input
self.ejscreen_source = (
self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen"
self.df: pd.DataFrame
self.COLUMNS_TO_KEEP = [
@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad):
field_names.UST_FIELD,
]
def extract(self) -> None:
super().extract(
self.EJSCREEN_FTP_URL,
self.get_tmp_path(),
verify=False, # EPA EJScreen end point has certificate issues often
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.ejscreen_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
self.df = pd.read_csv(
self.EJSCREEN_CSV,
self.ejscreen_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
# EJSCREEN writes the word "None" for NA data.
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
# rename ID to Tract ID
self.output_df = self.df.rename(
columns={

View file

@ -1,5 +1,6 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# Note: while we normally set these properties in `__init__`,
# we are setting them as class properties here so they can be accessed by the
# class method `ejscreen_areas_of_concern_data_exists`.
LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
EJSCREEN_AREAS_OF_CONCERN_SOURCE = (
ExtractTransformLoad.DATA_PATH
/ "sources"
/ "EJSCREENAreasOfConcernETL"
/ "ejscreen_areas_of_concerns_indicators.csv"
)
def __init__(self):
# output
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
)
@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
"""The source for this must be downloaded and saved manually. It is not publicly available"""
return []
@classmethod
def ejscreen_areas_of_concern_data_exists(cls):
"""Check whether or not the EJSCREEN areas of concern data exists.
@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
not reference this data.
"""
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file()
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file()
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE)
if self.ejscreen_areas_of_concern_data_exists():
logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
self.df = pd.read_csv(
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE,
dtype={
self.GEOID_FIELD_NAME: "string",
},

View file

@ -5,18 +5,27 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
def __init__(self):
self.DEFINITION_ALTERNATIVE_FILE_URL = (
# fetch
self.definition_alternative_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/alternative DAC definition.csv.zip"
)
# input
self.definition_alternative_source = (
self.get_sources_path() / "J40 alternative DAC definition.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
)
@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path()
/ "energy_definition_alternative_draft",
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.definition_alternative_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "energy_definition_alternative_draft"
/ "J40 alternative DAC definition.csv",
filepath_or_buffer=self.definition_alternative_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
)
def transform(self) -> None:
self.df = self.df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,8 +4,9 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
)
else:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
"http://abt-rsei.s3.amazonaws.com/microdata2019/"
"census_agg/CensusMicroTracts2019_2019_aggregated.zip"
)
# input
self.aggregated_rsei_score_source = (
self.get_sources_path()
/ "CensusMicroTracts2019_2019_aggregated.csv"
)
# output
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.aggregated_rsei_score_file_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# the column headers from the above dataset are actually a census tract's data at this point
# We will use this data structure later to specify the column names
input_columns = [
@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.NCSCORE_INPUT_FIELD,
]
unzip_file_from_url(
file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "epa_rsei",
)
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "epa_rsei"
/ "CensusMicroTracts2019_2019_aggregated.csv",
filepath_or_buffer=self.aggregated_rsei_score_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
low_memory=False,

View file

@ -5,6 +5,8 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad):
NAME = "fsf_flood_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv"
# fetch
self.flood_tract_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
)
# this is the main dataframe
self.df: pd.DataFrame
# input
self.flood_tract_source = (
self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv"
)
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad):
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
self.CLIP_PROPERTIES_COUNT = 250
self.df_fsf_flood: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.flood_tract_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv data source then rename the
# Census Tract column for merging
self.df_fsf_flood = pd.read_csv(
self.flood_tract_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad):
- Calculates share of properties at risk, left-clipping number of properties at 250
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_flood: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[
self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[
self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY
] = (
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
df_fsf_flood[
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_flood.rename(
self.output_df = self.df_fsf_flood.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,
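
Both First Street transforms (flood here, wildfire below) divide the at-risk property count by a denominator that is left-clipped at `CLIP_PROPERTIES_COUNT = 250`, so a tract with only a handful of properties cannot produce an outsized share. A small worked example of that clip — the column names and counts are illustrative:

```python
import pandas as pd

CLIP_PROPERTIES_COUNT = 250  # same lower bound the ETL uses

df = pd.DataFrame(
    {
        "count_properties": [40, 400],  # total properties per tract (made up)
        "at_risk_today": [40, 100],     # at-risk properties per tract (made up)
    }
)

# Left-clip the denominator, so the 40-property tract is treated as having 250.
clipped_denominator = df["count_properties"].clip(lower=CLIP_PROPERTIES_COUNT)
df["share_at_risk_today"] = df["at_risk_today"] / clipped_denominator

print(df["share_at_risk_today"].tolist())  # [0.16, 0.25]
```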

View file

@ -4,6 +4,8 @@ import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
NAME = "fsf_wildfire_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv"
# fetch
self.fsf_fire_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
)
# input
self.fsf_fire_source = (
self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_fsf_fire: pd.DataFrame
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag"
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag"
self.CLIP_PROPERTIES_COUNT = 250
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.fsf_fire_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_fsf_fire = pd.read_csv(
self.fsf_fire_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad):
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_fire: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[
self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[
self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS
] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_fire.rename(
self.output_df = self.df_fsf_fire.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS,

View file

@ -3,17 +3,33 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class GeoCorrETL(ExtractTransformLoad):
NAME = "geocorr"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.geocorr_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip"
)
# input
self.geocorr_source = (
self.get_sources_path() / "geocorr_urban_rural.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"
# Need to change hyperlink to S3
@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad):
# The source data for this notebook was downloaded from GeoCorr;
# the instructions for generating the source data is here:
# https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787
self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
# self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.COLUMNS_TO_KEEP = [
@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.geocorr_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv",
filepath_or_buffer=self.geocorr_source,
dtype={
self.GEOCORR_GEOID_FIELD_NAME: "string",
},

View file

@ -3,12 +3,16 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HistoricRedliningETL(ExtractTransformLoad):
NAME = "historic_redlining"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
EXPECTED_MISSING_STATES = [
"10",
@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad):
]
PUERTO_RICO_EXPECTED_IN_DATA = False
ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"
self.HISTORIC_REDLINING_FILE_PATH = (
self.get_tmp_path() / "HRS_2010.xlsx"
)
# fetch
self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
# input
self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx"
self.REDLINING_SCALAR = "Tract-level redlining score"
@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad):
self.GEOID_TRACT_FIELD_NAME,
self.REDLINING_SCALAR,
]
self.df: pd.DataFrame
self.historic_redlining_data: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.hrs_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.historic_redlining_data = pd.read_excel(self.hrs_source)
def transform(self) -> None:
# this is obviously temporary
historic_redlining_data = pd.read_excel(
self.HISTORIC_REDLINING_FILE_PATH
self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data = historic_redlining_data.rename(
self.historic_redlining_data = self.historic_redlining_data.rename(
columns={"HRS2010": self.REDLINING_SCALAR}
)
logger.debug(f"{historic_redlining_data.columns}")
logger.debug(f"{self.historic_redlining_data.columns}")
# Calculate lots of different score thresholds for convenience
for threshold in [3.25, 3.5, 3.75]:
historic_redlining_data[
self.historic_redlining_data[
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
] = (historic_redlining_data[self.REDLINING_SCALAR] >= threshold)
] = (
self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold
)
## NOTE We add to columns to keep here
self.COLUMNS_TO_KEEP.append(
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
)
self.output_df = historic_redlining_data
self.output_df = self.historic_redlining_data

View file

@ -1,8 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from pandas.errors import EmptyDataError
logger = get_module_logger(__name__)
@ -10,36 +11,46 @@ logger = get_module_logger(__name__)
class HousingTransportationETL(ExtractTransformLoad):
def __init__(self):
self.HOUSING_FTP_URL = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "housing_and_transportation_index"
)
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
housing_url = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
sources = []
for fips in get_state_fips_codes(self.DATA_PATH):
sources.append(
ZIPDataSource(
source=f"{housing_url}{fips}",
destination=self.get_sources_path(),
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Download each state / territory individually
dfs = []
zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
for fips in get_state_fips_codes(self.DATA_PATH):
logger.debug(
f"Downloading housing data for state/territory with FIPS code {fips}"
)
unzip_file_from_url(
f"{self.HOUSING_FTP_URL}{fips}",
self.get_tmp_path(),
zip_file_dir,
)
# New file name:
tmp_csv_file_path = (
zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv"
csv_source = (
self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv"
)
try:
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
tmp_df = pd.read_csv(filepath_or_buffer=csv_source)
except EmptyDataError:
logger.error(
f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"

View file

@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HudHousingETL(ExtractTransformLoad):
NAME = "hud_housing"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
def __init__(self):
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HOUSING_FTP_URL = (
self.housing_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_housing/2014thru2018-140-csv.zip"
)
else:
self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
# source
# output
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.HOUSING_FTP_URL,
self.HOUSING_ZIP_FILE_DIR,
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.housing_url, destination=self.get_sources_path()
)
]
def _read_chas_table(self, file_name):
# New file name:
tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name
tmp_csv_file_path = self.get_sources_path() / "140" / file_name
tmp_df = pd.read_csv(
filepath_or_buffer=tmp_csv_file_path,
encoding="latin-1",
@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad):
return tmp_df
def transform(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
table_8 = self._read_chas_table("Table8.csv")
table_3 = self._read_chas_table("Table3.csv")
@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad):
table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME
)
def transform(self) -> None:
# Calculate share that lacks indoor plumbing or kitchen
# This is computed as
# (

View file

@ -1,7 +1,9 @@
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.utils import get_module_logger
@ -11,44 +13,51 @@ logger = get_module_logger(__name__)
class HudRecapETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
else:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
"https://opendata.arcgis.com/api/v3/datasets/"
"56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
)
self.HUD_RECAP_CSV = (
self.get_tmp_path()
# input
self.hud_recap_source = (
self.get_sources_path()
/ "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
# Definining some variable names
# Defining some variable names
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
"hud_recap_priority_community"
)
self.df: pd.DataFrame
def extract(self) -> None:
download = requests.get(
self.HUD_RECAP_CSV_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
file_contents = download.content
csv_file = open(self.HUD_RECAP_CSV, "wb")
csv_file.write(file_contents)
csv_file.close()
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.hud_recap_csv_url, destination=self.hud_recap_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})
def transform(self) -> None:
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
self.df.rename(
columns={
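
HudRecapETL is the first ETL in this diff to use `FileDataSource` rather than `ZIPDataSource`: the remote object is a plain CSV, so `destination` names the exact file to write, while the ZIP variant is handed a directory to unzip into. Side by side, assuming the constructors simply record the URL and target path (the URLs and paths below are illustrative):

```python
from pathlib import Path

from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.datasource import ZIPDataSource

sources_dir = Path("data/sources/SomeETL")  # illustrative sources directory

# Plain file: the destination is the file itself.
csv_source = FileDataSource(
    source="https://example.com/data.csv",
    destination=sources_dir / "data.csv",
)

# Archive: the destination is the directory the archive is unzipped into.
zip_source = ZIPDataSource(
    source="https://example.com/data.csv.zip",
    destination=sources_dir,
)
```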

View file

@ -2,6 +2,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -10,16 +12,25 @@ logger = get_module_logger(__name__)
class MappingForEJETL(ExtractTransformLoad):
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
self.MAPPING_FOR_EJ_VA_URL = (
# fetch
self.mapping_for_ej_va_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
)
self.MAPPING_FOR_EJ_CO_URL = (
self.mapping_for_ej_co_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
)
self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"
# input
self.va_shp_file_source = (
self.get_sources_path() / "mej_virginia_7_1.shp"
)
self.co_shp_file_source = (
self.get_sources_path() / "mej_colorado_final.shp"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
# Defining variables
self.COLUMNS_TO_KEEP = [
@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.MAPPING_FOR_EJ_VA_URL,
self.get_tmp_path(),
)
super().extract(
self.MAPPING_FOR_EJ_CO_URL,
self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.mapping_for_ej_va_url,
destination=self.get_sources_path(),
),
ZIPDataSource(
source=self.mapping_for_ej_co_url,
destination=self.get_sources_path(),
),
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
# Join (here, it's just concatenating) the two dataframes from
# CO and VA
self.df = pd.concat(
[
gpd.read_file(self.VA_SHP_FILE_PATH),
gpd.read_file(self.CO_SHP_FILE_PATH),
gpd.read_file(self.va_shp_file_source),
gpd.read_file(self.co_shp_file_source),
]
)
def transform(self) -> None:
# Fill Census tract to get it to be 11 digits, incl. leading 0s
# Note that VA and CO should never have leading 0s, so this isn't
# strictly necessary, but if in the future, there are more states

View file

@ -3,8 +3,9 @@ import pathlib
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad):
Information on the mapping of this data to census tracts is available at
https://github.com/americanpanorama/Census_HOLC_Research.
"""
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"mapping_inequality/holc_tract_lookup.csv"
)
else:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
"https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
"main/2010_Census_Tracts/holc_tract_lookup.csv"
)
self.MAPPING_INEQUALITY_CSV = (
self.get_tmp_path() / "holc_tract_lookup.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
self.HOLC_MANUAL_MAPPING_CSV_PATH = (
# input
self.mapping_inequality_source = (
self.get_sources_path() / "holc_tract_lookup.csv"
)
self.holc_manual_mapping_source = (  # here be dragons: this file is pulled from a different place than most
pathlib.Path(__file__).parent
/ "data"
/ "holc_grades_manually_mapped.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
# Some input field names. From documentation: 'Census Tracts were intersected
# with HOLC Polygons. Census information can be joined via the "geoid" field.
# There are two field "holc_prop" and "tract_prop" which give the proportion
@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.holc_manually_mapped_df: pd.DataFrame
def extract(self) -> None:
download_file_from_url(
file_url=self.MAPPING_INEQUALITY_CSV_URL,
download_file_name=self.MAPPING_INEQUALITY_CSV,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.mapping_inequality_csv_url,
destination=self.mapping_inequality_source,
)
]
def transform(self) -> None:
df: pd.DataFrame = pd.read_csv(
self.MAPPING_INEQUALITY_CSV,
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.mapping_inequality_source,
dtype={self.TRACT_INPUT_FIELD: "string"},
low_memory=False,
)
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
self.holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.holc_manual_mapping_source,
low_memory=False,
)
def transform(self) -> None:
# rename Tract ID
df.rename(
self.df.rename(
columns={
self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
},
@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad):
# Keep the first character, which is the HOLC grade (A, B, C, D).
# TODO: investigate why this dataframe triggers these pylint errors.
# pylint: disable=unsupported-assignment-operation, unsubscriptable-object
df[self.HOLC_GRADE_DERIVED_FIELD] = df[
self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[
self.HOLC_GRADE_AND_ID_FIELD
].str[0:1]
# Remove nonsense when the field has no grade or invalid grades.
valid_grades = ["A", "B", "C", "D"]
df.loc[
self.df.loc[
# pylint: disable=unsubscriptable-object
~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
self.HOLC_GRADE_DERIVED_FIELD,
] = None
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
low_memory=False,
)
# Join on the existing data
merged_df = df.merge(
right=holc_manually_mapped_df,
merged_df = self.df.merge(
right=self.holc_manually_mapped_df,
on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
how="left",
)
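
MappingInequality is a useful edge case for the new caching scheme: only the HOLC tract lookup is a remote `FileDataSource`, while `holc_grades_manually_mapped.csv` ships inside the repository next to the ETL module and therefore never appears in `get_data_sources()`. A hypothetical skeleton of that split (class, URL, and file names are illustrative):

```python
import pathlib

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource


class ExampleMixedSourcesETL(ExtractTransformLoad):
    """Hypothetical ETL mixing a cached remote file with a repo-local file."""

    def __init__(self):
        # fetch: remote lookup table, downloaded/cached via the DataSource machinery
        self.lookup_url = "https://example.com/lookup.csv"

        # input: the cached copy of the remote file...
        self.lookup_source = self.get_sources_path() / "lookup.csv"
        # ...plus a reference CSV checked into the repo, which needs no DataSource
        self.manual_fixes_source = (
            pathlib.Path(__file__).parent / "data" / "manual_fixes.csv"
        )

    def get_data_sources(self) -> [DataSource]:
        return [
            FileDataSource(
                source=self.lookup_url, destination=self.lookup_source
            )
        ]
```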

View file

@ -4,6 +4,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MARYLAND_EJSCREEN_URL = (
# fetch
self.maryland_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
)
self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
# input
self.shape_files_source = self.get_sources_path() / "mdejscreen"
# output
self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"
self.COLUMNS_TO_KEEP = [
@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.dfs_list: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.maryland_ejscreen_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
def extract(self) -> None:
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
super().extract(
self.MARYLAND_EJSCREEN_URL,
self.get_tmp_path(),
)
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))
# Ignore counties becauses this is not the level of measurement
# Ignore counties because this is not the level of measurement
# that is consistent with our current scoring and ranking methodology.
dfs_list = [
self.dfs_list = [
gpd.read_file(f)
for f in list_of_files
if not f.endswith("CountiesEJScore.shp")
]
def transform(self) -> None:
# Set the Census tract as the index and drop the geometry column
# that produces the census tract boundaries.
# The latter is because Geopandas raises an exception if there
# are duplicate geometry columns.
# Moreover, since the unit of measurement is at the tract level
# we can consistently merge this with other datasets
dfs_list = [
self.dfs_list = [
df.set_index("Census_Tra").drop("geometry", axis=1)
for df in dfs_list
for df in self.dfs_list
]
# pylint: disable=unsubscriptable-object
self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))
# Reset index so that we no longer have the tract as our index
self.df = self.df.reset_index()

View file

@ -1,6 +1,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MICHIGAN_EJSCREEN_S3_URL = (
# fetch
self.michigan_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/michigan_ejscore_12212021.csv"
)
# input
self.michigan_ejscreen_source = (
self.get_sources_path() / "michigan_ejscore_12212021.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"
self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75
self.COLUMNS_TO_KEEP = [
@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.michigan_ejscreen_url,
destination=self.michigan_ejscreen_source,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
filepath_or_buffer=self.michigan_ejscreen_source,
dtype={"GEO_ID": "string"},
low_memory=False,
)
def transform(self) -> None:
self.df.rename(
columns={
"GEO_ID": self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,6 +4,8 @@
# pylint: disable=unsupported-assignment-operation
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
NAME = "national_risk_index"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
SOURCE_URL = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad):
AGRIVALUE_LOWER_BOUND = 408000
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.risk_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
self.risk_index_url = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
# source
self.risk_index_source = (
self.get_sources_path() / "NRI_Table_CensusTracts.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_nri: pd.DataFrame
# Start dataset-specific vars here
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad):
self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"
def extract(self) -> None:
"""Unzips NRI dataset from the FEMA data source and writes the files
to the temporary data folder for use in the transform() method
"""
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.risk_index_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
self.df_nri = pd.read_csv(
self.risk_index_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
Groups inside of that Tract
"""
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
df_nri: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
df_nri.rename(
self.df_nri.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad):
agriculture_columns = [
f"{x}_EALA"
for x in disaster_categories
if f"{x}_EALA" in list(df_nri.columns)
if f"{x}_EALA" in list(self.df_nri.columns)
]
population_columns = [
f"{x}_EALP"
for x in disaster_categories
if f"{x}_EALP" in list(df_nri.columns)
if f"{x}_EALP" in list(self.df_nri.columns)
]
buildings_columns = [
f"{x}_EALB"
for x in disaster_categories
if f"{x}_EALB" in list(df_nri.columns)
if f"{x}_EALB" in list(self.df_nri.columns)
]
disaster_population_sum_series = df_nri[population_columns].sum(axis=1)
disaster_agriculture_sum_series = df_nri[agriculture_columns].sum(
disaster_population_sum_series = self.df_nri[population_columns].sum(
axis=1
)
disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1)
disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum(
axis=1
)
disaster_buildings_sum_series = self.df_nri[buildings_columns].sum(
axis=1
)
# Population EAL Rate = Eal Valp / Population
df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
disaster_population_sum_series
/ df_nri[self.POPULATION_INPUT_FIELD_NAME]
/ self.df_nri[self.POPULATION_INPUT_FIELD_NAME]
)
# Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000)
## FORMULA ADJUSTMENT 2/17
## Because AGRIVALUE contains a lot of 0s, we are going to consider
## 90th percentile only for places that have some agrivalue at all
df_nri[
self.df_nri[
self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
] = disaster_agriculture_sum_series / df_nri[
] = disaster_agriculture_sum_series / self.df_nri[
self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME
].clip(
lower=self.AGRIVALUE_LOWER_BOUND
@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad):
## Check that this clip worked -- that the only place the value has changed is when the clip took effect
base_expectation = (
disaster_agriculture_sum_series
/ df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
)
assert (
df_nri[
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
<= self.AGRIVALUE_LOWER_BOUND
@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
)
assert (
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
).sum() > 0, "Clipping the agrivalue did nothing!"
# This produces a boolean that is True in the case of non-zero agricultural value
df_nri[self.CONTAINS_AGRIVALUE] = (
df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
self.df_nri[self.CONTAINS_AGRIVALUE] = (
self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
)
# divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)).
df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
disaster_buildings_sum_series
/ df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
)
# Round all float columns to just 10 digits.
# Note: `round` is smart enough to only apply to float columns.
df_nri = df_nri.round(10)
self.df_nri = self.df_nri.round(10)
# Assign the final df to the class' output_df for the load method
self.output_df = df_nri
self.output_df = self.df_nri
def load(self) -> None:
# Suppress scientific notation.

View file

@ -3,6 +3,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
"""ETL class for the Nature Deprived Communities dataset"""
NAME = "nlcd_nature_deprived"
SOURCE_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad):
TRACT_PERCENT_CROPLAND_FIELD_NAME: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
# fetch
self.nature_deprived_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
# source
# define the full path for the input CSV file
self.nature_deprived_source = (
self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_ncld: pd.DataFrame
# Start dataset-specific vars here
self.PERCENT_NATURAL_FIELD_NAME = "PctNatural"
self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv"
@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad):
# for area. This does indeed remove tracts from the 90th+ percentile later on
self.TRACT_ACRES_LOWER_BOUND = 35
def transform(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.nature_deprived_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
- Renames columns as needed
"""
df_ncld: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_ncld = pd.read_csv(
self.nature_deprived_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND
def transform(self) -> None:
self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
self.df_ncld[self.TRACT_ACRES_FIELD_NAME]
>= self.TRACT_ACRES_LOWER_BOUND
)
df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_ncld.rename(
self.output_df = self.df_ncld.rename(
columns={
self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME,
self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME,

View file

@ -3,9 +3,10 @@ import functools
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.poverty_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip"
)
# source
self.poverty_sources = [
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_1990_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2000_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2010_sample.csv",
]
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
# Need to change hyperlink to S3
@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.poverty_url, destination=self.get_sources_path()
)
]
def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
df = functools.reduce(
lambda df_a, df_b: pd.merge(
@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad):
return df
def extract(self) -> None:
unzipped_file_path = self.get_tmp_path()
def extract(self, use_cached_data_sources: bool = False) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/LTDB_Std_All_Sample.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=unzipped_file_path,
)
file_names = [
"ltdb_std_1990_sample.csv",
"ltdb_std_2000_sample.csv",
"ltdb_std_2010_sample.csv",
]
super().extract(
use_cached_data_sources
) # download and extract data sources
temporary_input_dfs = []
for file_name in file_names:
for file_name in self.poverty_sources:
temporary_input_df = pd.read_csv(
filepath_or_buffer=unzipped_file_path
/ f"ltdb_std_all_sample/{file_name}",
filepath_or_buffer=file_name,
dtype={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",

View file

@ -1,6 +1,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"""
def __init__(self):
- self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
- self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
+ # input
+ self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv"
+ # output
self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
self.df: gpd.GeoDataFrame
+ self.tes_state_dfs = []
+ # config
self.states = [
"al",
"az",
@@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"wy",
]
- def extract(self) -> None:
+ def get_data_sources(self) -> [DataSource]:
+ tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
+ sources = []
for state in self.states:
- super().extract(
- f"{self.TES_URL}{state}.zip.zip",
- f"{self.get_tmp_path()}/{state}",
+ sources.append(
+ ZIPDataSource(
+ source=f"{tes_url}{state}.zip.zip",
+ destination=self.get_sources_path() / state,
+ )
)
+ return sources
+ def extract(self, use_cached_data_sources: bool = False) -> None:
+ super().extract(
+ use_cached_data_sources
+ ) # download and extract data sources
+ for state in self.states:
+ self.tes_state_dfs.append(
+ gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp")
+ )
def transform(self) -> None:
- tes_state_dfs = []
- for state in self.states:
- tes_state_dfs.append(
- gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
- )
self.df = gpd.GeoDataFrame(
- pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
+ pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs
)
# rename ID to Tract ID

View file

@ -4,63 +4,57 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
+ from data_pipeline.etl.datasource import DataSource
+ from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
- from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
class TribalETL(ExtractTransformLoad):
def __init__(self):
- self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.GEOGRAPHIC_BASE_PATH = (
self.DATA_PATH / "tribal" / "geographic_data"
)
+ self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.NATIONAL_TRIBAL_GEOJSON_PATH = (
self.GEOGRAPHIC_BASE_PATH / "usa.json"
)
self.USA_TRIBAL_DF_LIST = []
- def extract(self) -> None:
- """Extract the tribal geojson zip files from Justice40 S3 data folder
+ def get_data_sources(self) -> [DataSource]:
- Returns:
- None
- """
- bia_shapefile_zip_url = (
+ national_lar_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_National_LAR_updated_20220929.zip"
)
- tsa_and_aian_geojson_zip_url = (
+ tsa_and_aian_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_TSA_and_AIAN_json.zip"
)
- alaska_geojson_url = (
+ alaska_native_villages_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/Alaska_Native_Villages_json.zip"
)
- unzip_file_from_url(
- bia_shapefile_zip_url,
- self.TMP_PATH,
- self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
- )
- unzip_file_from_url(
- tsa_and_aian_geojson_zip_url,
- self.TMP_PATH,
- self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
- )
- unzip_file_from_url(
- alaska_geojson_url,
- self.TMP_PATH,
- self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
- )
+ return [
+ ZIPDataSource(
+ national_lar_url,
+ destination=self.get_sources_path() / "bia_national_lar",
+ ),
+ ZIPDataSource(
+ source=tsa_and_aian_url,
+ destination=self.get_sources_path() / "tsa_and_aian",
+ ),
+ ZIPDataSource(
+ source=alaska_native_villages_url,
+ destination=self.get_sources_path() / "alaska_native_villages",
+ ),
+ ]
def _transform_bia_national_lar(self, path: Path) -> None:
"""Transform the Tribal BIA National Lar Geodataframe and appends it to the
@@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad):
"""
# Set the filepaths:
bia_national_lar_shapefile = (
- self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
+ self.get_sources_path() / "bia_national_lar"
)
bia_aian_supplemental_geojson = (
- self.GEOGRAPHIC_BASE_PATH
+ self.get_sources_path()
/ "tsa_and_aian"
/ "BIA_AIAN_Supplemental.json"
)
bia_tsa_geojson = (
- self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
+ self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json"
)
alaska_native_villages_geojson = (
- self.GEOGRAPHIC_BASE_PATH
+ self.get_sources_path()
/ "alaska_native_villages"
/ "AlaskaNativeVillages.gdb.geojson"
)
@@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad):
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
# note this works a little different than many of the ETLs. The file
# being written here is used again downstream, so it's placed in a
# special directory.
logger.debug("Writing national geojson file")
self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True)
usa_tribal_df.to_file(
self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
)

View file

@ -4,6 +4,7 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
@@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad):
self.census_tract_gdf: gpd.GeoDataFrame
self.tribal_gdf: gpd.GeoDataFrame
def get_data_sources(self) -> [DataSource]:
return [] # this uses already retrieved / calculated data
@staticmethod
def _create_string_from_list(series: pd.Series) -> str:
"""Helper method that creates a sorted string list (for tribal names)."""
@@ -89,7 +93,12 @@
return percentage_float
- def extract(self) -> None:
+ def extract(self, use_cached_data_sources: bool = False) -> None:
+ super().extract(
+ use_cached_data_sources
+ ) # download and extract data sources
self.census_tract_gdf = get_tract_geojson()
self.tribal_gdf = get_tribal_geojson()

View file

@ -4,9 +4,10 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
+ from data_pipeline.etl.datasource import DataSource
+ from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
- from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad):
def __init__(self):
- if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
- self.FILE_URL = (
- f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
- "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
- "all_data_reported_to_Congress_in_FY2020.geojson"
- )
- else:
- self.FILE_URL: str = (
- "https://opendata.arcgis.com/api/v3/datasets/"
- "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
- "data?format=geojson&spatialRefId=4326&where=1%3D1"
- )
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"
# Constants for output
@@ -50,17 +38,27 @@
self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
]
- self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"
+ self.fuds_source = self.get_sources_path() / "fuds.geojson"
self.raw_df: gpd.GeoDataFrame
self.output_df: pd.DataFrame
- def extract(self) -> None:
- download_file_from_url(
- file_url=self.FILE_URL,
- download_file_name=self.DOWNLOAD_FILE_NAME,
- verify=True,
- )
+ def get_data_sources(self) -> [DataSource]:
+ if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+ fuds_url = (
+ f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+ "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
+ "all_data_reported_to_Congress_in_FY2020.geojson"
+ )
+ else:
+ fuds_url: str = (
+ "https://opendata.arcgis.com/api/v3/datasets/"
+ "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
+ "data?format=geojson&spatialRefId=4326&where=1%3D1"
+ )
+ return [FileDataSource(source=fuds_url, destination=self.fuds_source)]
def transform(self) -> None:
# before we try to do any transformation, get the tract data
@@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad):
logger.debug("Loading FUDS data as GeoDataFrame for transform")
raw_df = gpd.read_file(
- filename=self.DOWNLOAD_FILE_NAME,
+ filename=self.fuds_source,
low_memory=False,
)
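`FileDataSource` appears to be the single-file counterpart of `ZIPDataSource`: one URL downloaded to one destination path. A hedged sketch (again, not the repository's actual class) of what its `fetch()` might look like:

```python
# Hedged sketch of a single-file data source; not the real FileDataSource.
from pathlib import Path

import requests


class SketchFileDataSource:
    def __init__(self, source: str, destination: Path):
        self.source = source
        self.destination = destination

    def fetch(self) -> None:
        # Write the downloaded bytes to the exact path that transform() reads.
        self.destination.parent.mkdir(parents=True, exist_ok=True)
        response = requests.get(self.source, timeout=300)
        response.raise_for_status()
        self.destination.write_bytes(response.content)
```

Because `transform()` reads `self.fuds_source` either way, cached and freshly downloaded runs go through identical code after extraction.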

View file

@@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
+ sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+ sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
- "data_pipeline.utils.requests"
+ "data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
+ "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+ ) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
- tmp_path = mock_paths[1]
+ # requests mock
def fake_get(url, *args, **kwargs):
file_path = url.split("/")[-1]
with open(
@@ -77,17 +83,30 @@
return response_mock
requests_mock.get = fake_get
# fips codes mock
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init(self, mock_etl, mock_paths):

View file

@@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
mock_paths=mock_paths,
)
df = gpd.read_file(
- etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+ etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert df.shape[0] == 30

View file

@ -5,6 +5,7 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
self.EXAMPLE_FIELD_NAME,
]
def get_data_sources(self) -> [DataSource]:
return []
def extract(self):
# Pretend to download zip from external URL, write it to CSV.
zip_file_path = (
@@ -42,11 +46,11 @@
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
- zip_ref.extractall(self.get_tmp_path())
+ zip_ref.extractall(self.get_sources_path())
def transform(self):
df: pd.DataFrame = pd.read_csv(
- self.get_tmp_path() / "input.csv",
+ self.get_sources_path() / "input.csv",
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)

View file

@@ -124,12 +124,18 @@ class TestETL:
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
+ sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+ sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
- "data_pipeline.utils.requests"
+ "data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
+ "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+ ) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
- tmp_path = mock_paths[1]
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
zip_file_fixture_src = (
self._DATA_DIRECTORY_FOR_TEST
@@ -145,6 +151,7 @@
"rb",
) as file:
file_contents = file.read()
response_mock = requests.Response()
response_mock.status_code = 200
# pylint: disable=protected-access
@@ -154,15 +161,25 @@
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init_base(self, mock_etl, mock_paths):
@@ -263,17 +280,12 @@
file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
"""
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
- tmp_path = mock_paths[1]
- _ = self._setup_etl_instance_and_run_extract(
+ etl = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl,
mock_paths=mock_paths,
)
- assert (
- tmp_path
- / self._EXTRACT_TMP_FOLDER_NAME
- / self._SAMPLE_DATA_FILE_NAME
- ).exists()
+ assert (etl.get_sources_path()).exists()
def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
"""Tests the extract method.
@@ -285,8 +297,11 @@
mock_etl=mock_etl,
mock_paths=mock_paths,
)
- data_path, tmp_path = mock_paths
tmp_df = pd.read_csv(
- etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+ etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
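The test fixtures above patch `data_pipeline.etl.downloader.requests` and `ExtractTransformLoad.get_sources_path` so that `extract()` serves local fixture files instead of hitting the network. A compressed, self-contained illustration of that pattern (the helper below is hypothetical and assumes the `data_pipeline` package is importable; it is not the project's actual test code):

```python
# Hedged illustration of the mocking pattern used in these tests.
from pathlib import Path
from unittest import mock

import requests


def run_extract_against_fixtures(etl, fixture_dir: Path, sources_path: Path):
    """Run etl.extract() with downloads redirected to local fixture files."""

    def fake_get(url, *args, **kwargs):
        # Serve the fixture whose name matches the last path segment of the URL.
        response = requests.Response()
        response.status_code = 200
        # pylint: disable=protected-access
        response._content = (fixture_dir / url.split("/")[-1]).read_bytes()
        return response

    with mock.patch(
        "data_pipeline.etl.downloader.requests"
    ) as requests_mock, mock.patch.object(
        etl, "get_sources_path", return_value=sources_path
    ):
        requests_mock.get = fake_get
        etl.extract()
    return etl
```

Note that `mock.patch` and `mock.patch.object` only take effect while used as a context manager or decorator (or after an explicit `.start()`), which is why the patches here wrap the `etl.extract()` call.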

View file

@@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
mock_paths=mock_paths,
)
tmp_df = pd.read_excel(
- etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+ etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert tmp_df.shape == (15, 5)

View file

@@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):
def test_init(self, mock_etl, mock_paths):
"""Tests that the mock NationalRiskIndexETL class instance was
- initiliazed correctly.
- Validates the following conditions:
- - self.DATA_PATH points to the "data" folder in the temp directory
- - self.TMP_PATH points to the "data/tmp" folder in the temp directory
- - self.INPUT_PATH points to the correct path in the temp directory
- - self.OUTPUT_PATH points to the correct path in the temp directory
+ initialized correctly.
"""
# setup
etl = NationalRiskIndexETL()
- data_path, tmp_path = mock_paths
- input_csv = (
- tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
- )
# validation
- assert etl.INPUT_CSV == input_csv
assert etl.GEOID_FIELD_NAME == "GEOID10"
assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
assert etl.NAME == "national_risk_index"