From 6f39033ddee026354d455537ced127e6eae18cf7 Mon Sep 17 00:00:00 2001 From: Travis Newby <83976412+travis-newby@users.noreply.github.com> Date: Fri, 3 Mar 2023 12:26:24 -0600 Subject: [PATCH] Add ability to cache ETL data sources (#2169) * Add a rough prototype allowing a developer to pre-download data sources for all ETLs * Update code to be more production-ish * Move fetch to Extract part of ETL * Create a downloader to house all downloading operations * Remove unnecessary "name" in data source * Format source files with black * Fix issues from pylint and get the tests working with the new folder structure * Clean up files with black * Fix unzip test * Add caching notes to README * Fix tests (linting and case sensitivity bug) * Address PR comments and add API keys for census where missing * Merging comparator changes from main into this branch for the sake of the PR * Add note on using cache (-u) during pipeline --- data/data-pipeline/README.md | 4 +- .../data_pipeline/application.py | 146 ++++++++++++++++-- data/data-pipeline/data_pipeline/etl/base.py | 89 +++++++---- .../data_pipeline/etl/datasource.py | 124 +++++++++++++++ .../data_pipeline/etl/downloader.py | 95 ++++++++++++ .../data-pipeline/data_pipeline/etl/runner.py | 74 +++++++-- .../data_pipeline/etl/score/etl_score.py | 10 +- .../data_pipeline/etl/score/etl_score_geo.py | 9 +- .../data_pipeline/etl/score/etl_score_post.py | 24 ++- .../data_pipeline/etl/score/etl_utils.py | 4 +- .../etl/sources/calenviroscreen/etl.py | 50 ++++-- .../etl/sources/cdc_life_expectancy/etl.py | 95 +++++++----- .../etl/sources/cdc_places/etl.py | 37 +++-- .../etl/sources/cdc_svi_index/etl.py | 34 +++- .../data_pipeline/etl/sources/census/etl.py | 41 ++--- .../etl/sources/census/etl_utils.py | 1 + .../etl/sources/census_acs/etl.py | 64 +++++--- .../etl/sources/census_acs_2010/etl.py | 37 +++-- .../sources/census_acs_median_income/etl.py | 104 ++++++++----- .../etl/sources/census_decennial/etl.py | 63 ++++++-- .../sources/child_opportunity_index/etl.py | 43 ++++-- .../etl/sources/doe_energy_burden/etl.py | 45 ++++-- .../etl/sources/dot_travel_composite/etl.py | 53 +++++-- .../data_pipeline/etl/sources/eamlis/etl.py | 49 ++++-- .../data_pipeline/etl/sources/ejscreen/etl.py | 38 +++-- .../sources/ejscreen_areas_of_concern/etl.py | 28 +++- .../etl.py | 38 +++-- .../data_pipeline/etl/sources/epa_rsei/etl.py | 40 +++-- .../etl/sources/fsf_flood_risk/etl.py | 68 +++++--- .../etl/sources/fsf_wildfire_risk/etl.py | 63 ++++++-- .../data_pipeline/etl/sources/geocorr/etl.py | 41 +++-- .../etl/sources/historic_redlining/etl.py | 51 ++++-- .../sources/housing_and_transportation/etl.py | 49 +++--- .../etl/sources/hud_housing/etl.py | 39 +++-- .../etl/sources/hud_recap/etl.py | 45 +++--- .../etl/sources/mapping_for_ej/etl.py | 54 +++++-- .../etl/sources/mapping_inequality/etl.py | 75 +++++---- .../etl/sources/maryland_ejscreen/etl.py | 45 ++++-- .../etl/sources/michigan_ejscreen/etl.py | 31 +++- .../etl/sources/national_risk_index/etl.py | 115 ++++++++------ .../etl/sources/nlcd_nature_deprived/etl.py | 55 +++++-- .../etl/sources/persistent_poverty/etl.py | 53 ++++--- .../etl/sources/tree_equity_score/etl.py | 48 ++++-- .../data_pipeline/etl/sources/tribal/etl.py | 66 ++++---- .../etl/sources/tribal_overlap/etl.py | 11 +- .../etl/sources/us_army_fuds/etl.py | 42 +++-- .../sources/cdc_life_expectancy/test_etl.py | 23 ++- .../sources/dot_travel_composite/test_etl.py | 2 +- .../tests/sources/example/etl.py | 8 +- .../tests/sources/example/test_etl.py | 35 +++-- 
.../sources/historic_redlining/test_etl.py | 2 +- .../sources/national_risk_index/test_etl.py | 13 +- 52 files changed, 1787 insertions(+), 686 deletions(-) create mode 100644 data/data-pipeline/data_pipeline/etl/datasource.py create mode 100644 data/data-pipeline/data_pipeline/etl/downloader.py diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md index bd5d68e2..5372882b 100644 --- a/data/data-pipeline/README.md +++ b/data/data-pipeline/README.md @@ -92,7 +92,6 @@ If you want to run specific data tasks, you can open a terminal window, navigate - Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles` To learn more about these commands and when they should be run, refer to [Running for Local Development](#running-for-local-development). - --- @@ -136,6 +135,9 @@ Once you've downloaded the census data, run the following commands – in order Many commands have options. For example, you can run a single dataset with `etl-run` by passing the command line parameter `-d name-of-dataset-to-run`. Please use the `--help` option to find out more. +> :bulb: **NOTE** +> One important command line option is enabling cached data sources. Pass the command line parameter `-u` to many commands (e.g. `etl-run`) to use locally cached data sources within the ETL portion of the pipeline. This will ensure that you don't download many GB of data with each run of the data pipeline. + ## How Scoring Works Scores are generated by running the `score-run` command via Poetry or Docker. This command executes [`data_pipeline/etl/score/etl_score.py`](data_pipeline/etl/score/etl_score.py). During execution, diff --git a/data/data-pipeline/data_pipeline/application.py b/data/data-pipeline/data_pipeline/application.py index ad621894..a1c10865 100644 --- a/data/data-pipeline/data_pipeline/application.py +++ b/data/data-pipeline/data_pipeline/application.py @@ -7,6 +7,9 @@ from data_pipeline.etl.runner import etl_runner from data_pipeline.etl.runner import score_generate from data_pipeline.etl.runner import score_geo from data_pipeline.etl.runner import score_post +from data_pipeline.etl.runner import get_data_sources +from data_pipeline.etl.runner import extract_data_sources as extract_ds +from data_pipeline.etl.runner import clear_data_source_cache as clear_ds_cache from data_pipeline.etl.sources.census.etl_utils import check_census_data_source from data_pipeline.etl.sources.census.etl_utils import ( reset_data_directories as census_reset, @@ -79,7 +82,14 @@ def data_cleanup(): is_flag=True, help="Upload to AWS S3 a zipped archive of the census data.", ) -def census_data_download(zip_compress): +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def census_data_download(zip_compress, use_cache): """CLI command to download all census shape files from the Census FTP and extract the geojson to generate national and by state Census Block Group CSVs""" log_title("Download Census Data ") @@ -88,7 +98,7 @@ def census_data_download(zip_compress): census_reset(data_path) log_info("Downloading census data") - etl_runner("census") + etl_runner("census", use_cache) if zip_compress: log_info("Zipping census data") @@ -129,7 +139,14 @@ def pull_census_data(data_source: str): type=str, help=dataset_cli_help, ) -def etl_run(dataset: str): +@click.option( 
+ "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def etl_run(dataset: str, use_cache: bool): """Run a specific or all ETL processes Args: @@ -141,7 +158,7 @@ def etl_run(dataset: str): log_title("Run ETL") log_info("Running dataset(s)") - etl_runner(dataset) + etl_runner(dataset, use_cache) log_goodbye() sys.exit() @@ -167,7 +184,14 @@ def score_run(): @cli.command( help="Run ETL + Score Generation", ) -def score_full_run(): +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def score_full_run(use_cache: bool): """CLI command to run ETL and generate the score in one command""" log_title("Score Full Run", "Run ETL and Generate Score (no tiles)") @@ -177,7 +201,7 @@ def score_full_run(): temp_folder_cleanup() log_info("Running all ETLs") - etl_runner() + etl_runner(use_cache=use_cache) log_info("Generating score") score_generate() @@ -297,7 +321,14 @@ def generate_map_tiles(generate_tribal_layer): type=str, help=dataset_cli_help, ) -def data_full_run(check: bool, data_source: str): +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def data_full_run(check: bool, data_source: str, use_cache: bool): """CLI command to run ETL, score, JSON combine and generate tiles in one command Args: @@ -330,10 +361,10 @@ def data_full_run(check: bool, data_source: str): if data_source == "local": log_info("Downloading census data") - etl_runner("census") + etl_runner("census", use_cache) log_info("Running all ETLs") - etl_runner() + etl_runner(use_cache=use_cache) log_info("Generating score") score_generate() @@ -357,6 +388,103 @@ def data_full_run(check: bool, data_source: str): sys.exit() +@cli.command( + help="Print data sources for all ETL processes (or a specific one)", +) +@click.option( + "-d", + "--dataset", + required=False, + type=str, + help=dataset_cli_help, +) +def print_data_sources(dataset: str): + """Print data sources for all ETL processes (or a specific one) + + Args: + dataset (str): Name of the ETL module to be run (optional) + + Returns: + None + """ + log_title("Print ETL Datasources") + + log_info("Retrieving dataset(s)") + sources = get_data_sources(dataset) + + log_info(f"Discovered {len(sources)} files") + + for s in sources: + log_info(s) + + log_goodbye() + sys.exit() + + +@cli.command( + help="Fetch data sources for all ETL processes (or a specific one)", +) +@click.option( + "-d", + "--dataset", + required=False, + type=str, + help=dataset_cli_help, +) +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def extract_data_sources(dataset: str, use_cache: bool): + """Extract and cache data source(s) for all ETL processes (or a specific one) + + Args: + dataset (str): Name of the ETL module whose data sources you wish to fetch + use_cache (bool): Use this flag if you wish to use the cached data sources (if they exist) + + Returns: + None + """ + log_title("Fetch ETL Datasources") + + log_info("Fetching data source(s)") + extract_ds(dataset, use_cache) + + log_goodbye() + sys.exit() + + +@cli.command( + help="Clear data source 
cache for all ETL processes (or a specific one)", +) +@click.option( + "-d", + "--dataset", + required=False, + type=str, + help=dataset_cli_help, +) +def clear_data_source_cache(dataset: str): + """Clear data source(s) cache for all ETL processes (or a specific one) + + Args: + dataset (str): Name of the ETL module whose cache you wish to clear + + Returns: + None + """ + log_title("Fetch ETL Datasources") + + log_info("Clear data source cache") + clear_ds_cache(dataset) + + log_goodbye() + sys.exit() + + def log_title(title: str, subtitle: str = None): """Logs a title in our fancy title format""" logger.info("-" * LOG_LINE_WIDTH) diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index c15f0240..945b6ccb 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -2,7 +2,9 @@ import enum import pathlib import sys import typing +import shutil from typing import Optional +from abc import ABC, abstractmethod import pandas as pd from data_pipeline.config import settings @@ -13,7 +15,7 @@ from data_pipeline.etl.score.schemas.datasets import DatasetsConfig from data_pipeline.utils import get_module_logger from data_pipeline.utils import load_yaml_dict_from_file from data_pipeline.utils import remove_all_from_dir -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource logger = get_module_logger(__name__) @@ -25,7 +27,7 @@ class ValidGeoLevel(enum.Enum): CENSUS_BLOCK_GROUP = enum.auto() -class ExtractTransformLoad: +class ExtractTransformLoad(ABC): """ A class used to instantiate an ETL object to retrieve and process data from datasets. @@ -45,6 +47,7 @@ class ExtractTransformLoad: # Directories DATA_PATH: pathlib.Path = settings.DATA_PATH TMP_PATH: pathlib.Path = DATA_PATH / "tmp" + SOURCES_PATH: pathlib.Path = DATA_PATH / "sources" CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config" DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config" DATASET_CONFIG: Optional[dict] = None @@ -177,45 +180,60 @@ class ExtractTransformLoad: output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv" return output_file_path - def get_tmp_path(self) -> pathlib.Path: - """Returns the temporary path associated with this ETL class.""" - # Note: the temporary path will be defined on `init`, because it uses the class - # of the instance which is often a child class. - tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__) + def get_sources_path(self) -> pathlib.Path: + """Returns the sources path associated with this ETL class. The sources path + is the home for cached data sources used by this ETL.""" + + sources_path = self.SOURCES_PATH / str(self.__class__.__name__) # Create directory if it doesn't exist - tmp_path.mkdir(parents=True, exist_ok=True) + sources_path.mkdir(parents=True, exist_ok=True) - return tmp_path + return sources_path - def extract( - self, - source_url: str = None, - extract_path: pathlib.Path = None, - verify: Optional[bool] = True, - ) -> None: - """Extract the data from a remote source. By default it provides code - to get the file from a source url, unzips it and stores it on an - extract_path.""" + @abstractmethod + def get_data_sources(self) -> [DataSource]: + pass - if source_url is None: - source_url = self.SOURCE_URL + def _fetch(self) -> None: + """Fetch all data sources for this ETL. 
When data sources are fetched, they + are stored in a cache directory for consistency between runs.""" + for ds in self.get_data_sources(): + ds.fetch() - if extract_path is None: - extract_path = self.get_tmp_path() + def clear_data_source_cache(self) -> None: + """Clears the cache for this ETL's data source(s)""" + shutil.rmtree(self.get_sources_path()) - unzip_file_from_url( - file_url=source_url, - download_path=self.get_tmp_path(), - unzipped_file_path=extract_path, - verify=verify, - ) + def extract(self, use_cached_data_sources: bool = False) -> None: + """Extract (download) data from a remote source, and validate + that data. By default, this method fetches data from the set of + data sources returned by get_data_sources. + If use_cached_data_sources is true, this method attempts to use cached data + rather than re-downloading from the original source. The cache algorithm is very + simple: it just looks to see if the directory has any contents. If so, it uses + that content. If not, it downloads all data sources. + + Subclasses should call super() before performing any work if they wish to take + advantage of the automatic downloading and caching ability of this superclass. + """ + + if use_cached_data_sources and any(self.get_sources_path().iterdir()): + logger.info( + f"Using cached data sources for {self.__class__.__name__}" + ) + else: + self.clear_data_source_cache() + self._fetch() + + # the rest of the work should be performed here + + @abstractmethod def transform(self) -> None: """Transform the data extracted into a format that can be consumed by the score generator""" - - raise NotImplementedError + pass def validate(self) -> None: """Validates the output. @@ -380,3 +398,14 @@ class ExtractTransformLoad: def cleanup(self) -> None: """Clears out any files stored in the TMP folder""" remove_all_from_dir(self.get_tmp_path()) + + def get_tmp_path(self) -> pathlib.Path: + """Returns the temporary path associated with this ETL class.""" + # Note: the temporary path will be defined on `init`, because it uses the class + # of the instance which is often a child class. + tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__) + + # Create directory if it doesn't exist + tmp_path.mkdir(parents=True, exist_ok=True) + + return tmp_path diff --git a/data/data-pipeline/data_pipeline/etl/datasource.py b/data/data-pipeline/data_pipeline/etl/datasource.py new file mode 100644 index 00000000..3d299207 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/datasource.py @@ -0,0 +1,124 @@ +"""This module defines a set of classes that can be used to fetch data +from a remote source. They are meant to be used in conjunction with ETLs +or other classes that require downloading data. + +There are three types of data sources defined in this file: +
+FileDataSource – meant to be used when you have a single file to +retrieve from a remote location and save to a destination. + +ZIPDataSource – used when you need to fetch and unzip a file, and save +the contents of that file to a destination. + +CensusDataSource – used to download data from the Census API and store +the contents to a destination. + +DataSource subclasses must implement the fetch method to define how +they will reach out to a remote source, download the data, and save +that data to the destination.
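A minimal usage sketch (the URLs and destinations below are purely illustrative
and are not taken from any real ETL in this repository):

    from pathlib import Path

    from data_pipeline.etl.datasource import FileDataSource, ZIPDataSource

    # A single file fetched to an explicit file path (hypothetical URL/path).
    FileDataSource(
        source="https://example.com/tracts.csv",
        destination=Path("data/sources/ExampleETL/tracts.csv"),
    ).fetch()

    # A zip archive fetched and unpacked into a directory (hypothetical URL/path).
    ZIPDataSource(
        source="https://example.com/shapes.zip",
        destination=Path("data/sources/ExampleETL"),
    ).fetch()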
+""" + +from pathlib import Path +from typing import List +from dataclasses import dataclass +from abc import ABC, abstractmethod + +from data_pipeline.etl.downloader import Downloader +from data_pipeline.etl.sources.census_acs.etl_utils import ( + retrieve_census_acs_data, +) + + +@dataclass +class DataSource(ABC): + """A data source represents any source of data that is fetchable + from a remote location. + + Attributes: + source : str + the location of this data source, as a url + destination : Path + the Path where the data source should be saved locally upon being fetched + + """ + + source: str + destination: Path + + @abstractmethod + def fetch(self) -> None: + pass + + +@dataclass +class FileDataSource(DataSource): + """A data source representing a single file. + + This single file will be fetched from the source and saved to a single + destination. + """ + + def fetch(self) -> None: + """Fetches a single file from a source and saves it to a destination.""" + + self.destination.parent.mkdir(parents=True, exist_ok=True) + Downloader.download_file_from_url( + file_url=self.source, + download_file_name=self.destination, + verify=True, + ) + + def __str__(self): + return f"File – {self.source}" + + +@dataclass +class ZIPDataSource(DataSource): + """A data source representing ZIP files. + + Zip files will be fetched and placed in the destination folder, then unzipped. + """ + + def fetch(self) -> None: + + self.destination.mkdir(parents=True, exist_ok=True) + Downloader.download_zip_file_from_url( + file_url=self.source, + unzipped_file_path=self.destination, + verify=True, + ) + + def __str__(self): + return f"Zip – {self.source}" + + +@dataclass +class CensusDataSource(DataSource): + """A data source representing census data. + + Data will be fetched using the Census API and saved to the destination file. Source is ignored. + """ + + acs_year: int + variables: List[str] + tract_output_field_name: str + data_path_for_fips_codes: Path + acs_type: str + + def fetch(self) -> None: + + df = retrieve_census_acs_data( + acs_year=self.acs_year, + variables=self.variables, + tract_output_field_name=self.tract_output_field_name, + data_path_for_fips_codes=self.data_path_for_fips_codes, + acs_type=self.acs_type, + ) + + self.destination.parent.mkdir(parents=True, exist_ok=True) + + # Write CSV representation of census data + df.to_csv(self.destination, index=False) + + def __str__(self): + return f"Census – {self.acs_type}, {self.acs_year}" diff --git a/data/data-pipeline/data_pipeline/etl/downloader.py b/data/data-pipeline/data_pipeline/etl/downloader.py new file mode 100644 index 00000000..53ea2a38 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/downloader.py @@ -0,0 +1,95 @@ +import uuid +import urllib3 +import requests +import zipfile +import shutil + +from pathlib import Path +from data_pipeline.config import settings + + +class Downloader: + """A simple class to encapsulate the download capabilities of the application""" + + @classmethod + def download_file_from_url( + cls, + file_url: str, + download_file_name: Path, + verify: bool = True, + ) -> str: + """Downloads a file from a remote URL location and returns the file location. + + Args: + file_url (str): URL where the zip file is located + download_file_name (pathlib.Path): file path where the file will be downloaded (called downloaded.zip by default) + verify (bool): A flag to check if the certificate is valid. 
If truthy, an invalid certificate will throw an + error (optional, default to False) + + Returns: + None + + """ + # disable https warning + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + download_file_name.parent.mkdir(parents=True, exist_ok=True) + + response = requests.get( + file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT + ) + if response.status_code == 200: + file_contents = response.content + else: + raise Exception( + f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}" + ) + + # Write the contents to disk. + file = open(download_file_name, "wb") + file.write(file_contents) + file.close() + + return download_file_name + + @classmethod + def download_zip_file_from_url( + cls, + file_url: str, + unzipped_file_path: Path, + verify: bool = True, + ) -> None: + """Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after + + Args: + file_url (str): URL where the zip file is located + unzipped_file_path (pathlib.Path): directory and name of the extracted file + verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an + error (optional, default to False) + + Returns: + None + + """ + # dir_id allows us to evade race conditions on parallel ETLs + dir_id = uuid.uuid4() + + zip_download_path = ( + settings.DATA_PATH + / "tmp" + / "downloads" + / f"{dir_id}" + / "download.zip" + ) + + zip_file_path = Downloader.download_file_from_url( + file_url=file_url, + download_file_name=zip_download_path, + verify=verify, + ) + + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(unzipped_file_path) + + # cleanup temporary file and directory + shutil.rmtree(zip_download_path.parent) diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py index 8d896ded..5014771a 100644 --- a/data/data-pipeline/data_pipeline/etl/runner.py +++ b/data/data-pipeline/data_pipeline/etl/runner.py @@ -2,10 +2,14 @@ import concurrent.futures import importlib import typing +from functools import reduce + from data_pipeline.etl.score.etl_score import ScoreETL from data_pipeline.etl.score.etl_score_geo import GeoScoreETL from data_pipeline.etl.score.etl_score_post import PostScoreETL from data_pipeline.utils import get_module_logger +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource from . 
import constants @@ -40,20 +44,26 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]: return dataset_list -def _run_one_dataset(dataset: dict) -> None: - """Runs one etl process.""" - - logger.info(f"Running ETL for {dataset['name']}") - +def _get_dataset(dataset: dict) -> ExtractTransformLoad: + """Instantiates a dataset object from a dictionary description of that object's class""" etl_module = importlib.import_module( f"data_pipeline.etl.sources.{dataset['module_dir']}.etl" ) etl_class = getattr(etl_module, dataset["class_name"]) etl_instance = etl_class() + return etl_instance + + +def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None: + """Runs one etl process.""" + + logger.info(f"Running ETL for {dataset['name']}") + etl_instance = _get_dataset(dataset) + # run extract logger.debug(f"Extracting {dataset['name']}") - etl_instance.extract() + etl_instance.extract(use_cache) # run transform logger.debug(f"Transforming {dataset['name']}") @@ -74,11 +84,12 @@ def _run_one_dataset(dataset: dict) -> None: logger.info(f"Finished ETL for dataset {dataset['name']}") -def etl_runner(dataset_to_run: str = None) -> None: +def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None: """Runs all etl processes or a specific one Args: dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional) + use_cache (bool): Use the cached data sources – if they exist – rather than downloading them all from scratch Returns: None @@ -105,7 +116,9 @@ def etl_runner(dataset_to_run: str = None) -> None: logger.info("Running concurrent ETL jobs") with concurrent.futures.ThreadPoolExecutor() as executor: futures = { - executor.submit(_run_one_dataset, dataset=dataset) + executor.submit( + _run_one_dataset, dataset=dataset, use_cache=use_cache + ) for dataset in concurrent_datasets } @@ -119,7 +132,50 @@ def etl_runner(dataset_to_run: str = None) -> None: if high_memory_datasets: logger.info("Running high-memory ETL jobs") for dataset in high_memory_datasets: - _run_one_dataset(dataset=dataset) + _run_one_dataset(dataset=dataset, use_cache=use_cache) + + +def get_data_sources(dataset_to_run: str = None) -> [DataSource]: + + dataset_list = _get_datasets_to_run(dataset_to_run) + + sources = [] + + for dataset in dataset_list: + etl_instance = _get_dataset(dataset) + sources.append(etl_instance.get_data_sources()) + + sources = reduce( + list.__add__, sources + ) # flatten the list of lists into a single list + + return sources + + +def extract_data_sources( + dataset_to_run: str = None, use_cache: bool = False +) -> None: + + dataset_list = _get_datasets_to_run(dataset_to_run) + + for dataset in dataset_list: + etl_instance = _get_dataset(dataset) + logger.info( + f"Extracting data set for {etl_instance.__class__.__name__}" + ) + etl_instance.extract(use_cache) + + +def clear_data_source_cache(dataset_to_run: str = None) -> None: + + dataset_list = _get_datasets_to_run(dataset_to_run) + + for dataset in dataset_list: + etl_instance = _get_dataset(dataset) + logger.info( + f"Clearing data set cache for {etl_instance.__class__.__name__}" + ) + etl_instance.clear_data_source_cache() def score_generate() -> None: diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index cf6c4366..0314512b 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -22,6 +22,8 @@ from 
data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS from data_pipeline.score import field_names from data_pipeline.score.score_runner import ScoreRunner from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource + logger = get_module_logger(__name__) @@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad): self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = [] - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return ( + [] + ) # we have all prerequisite sources locally as a result of running the ETLs + + def extract(self, use_cached_data_sources: bool = False) -> None: + # EJSCreen csv Load ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv" self.ejscreen_df = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index b7937272..75544e45 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger from data_pipeline.utils import load_dict_from_yaml_object_fields from data_pipeline.utils import load_yaml_dict_from_file from data_pipeline.utils import zip_files +from data_pipeline.etl.datasource import DataSource logger = get_module_logger(__name__) @@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad): self.geojson_score_usa_high: gpd.GeoDataFrame self.geojson_score_usa_low: gpd.GeoDataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return ( + [] + ) # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline + + def extract(self, use_cached_data_sources: bool = False) -> None: + # check census data check_census_data_source( census_data_path=self.DATA_PATH / "census", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 0111bb04..85ce1ba5 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -2,7 +2,9 @@ import json from pathlib import Path import numpy as np +from numpy import float64 import pandas as pd + from data_pipeline.content.schemas.download_schemas import CodebookConfig from data_pipeline.content.schemas.download_schemas import CSVConfig from data_pipeline.content.schemas.download_schemas import ExcelConfig @@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger from data_pipeline.utils import load_dict_from_yaml_object_fields from data_pipeline.utils import load_yaml_dict_from_file from data_pipeline.utils import zip_files -from numpy import float64 +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.downloader import Downloader from . 
import constants @@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad): self.yaml_global_config_sort_by_label = "sort_by_label" # End YAML definition constants + def get_data_sources(self) -> [DataSource]: + return ( + [] + ) # we have all prerequisite sources locally as a result of generating the score + def _extract_counties(self, county_path: Path) -> pd.DataFrame: logger.debug("Reading Counties CSV") return pd.read_csv( @@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad): return df - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + # check census data check_census_data_source( census_data_path=self.DATA_PATH / "census", census_data_source=self.DATA_SOURCE, ) - super().extract( - constants.CENSUS_COUNTIES_ZIP_URL, - constants.TMP_PATH, + # TODO would could probably add this to the data sources for this file + Downloader.download_zip_file_from_url( + constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH ) + self.input_counties_df = self._extract_counties( constants.CENSUS_COUNTIES_FILE_NAME ) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py index bc0f45ac..7de96a42 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py @@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url +from data_pipeline.etl.downloader import Downloader from data_pipeline.utils import get_module_logger from . 
import constants @@ -48,7 +48,7 @@ def check_score_data_source( # download from s3 if census_data_source is aws if score_data_source == "aws": logger.debug("Fetching Score Tile data from AWS S3") - download_file_from_url( + Downloader.download_file_from_url( file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV ) else: diff --git a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py index 9e3b2db4..68fc010f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py @@ -1,23 +1,36 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) class CalEnviroScreenETL(ExtractTransformLoad): + """California environmental screen + + TODO: Need good description + """ + def __init__(self): - self.CALENVIROSCREEN_FTP_URL = ( + + # fetch + self.calenviroscreen_ftp_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/CalEnviroScreen_4.0_2021.zip" ) - self.CALENVIROSCREEN_CSV = ( - self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv" - ) - self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4" - # Definining some variable names + # input + self.calenviroscreen_source = ( + self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv" + ) + + # output + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4" + + # Defining some variable names self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score" self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = ( "calenviroscreen_percentile" @@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.calenviroscreen_ftp_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + super().extract( - self.CALENVIROSCREEN_FTP_URL, - self.get_tmp_path(), + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.calenviroscreen_source, dtype={"Census Tract": "string"} ) def transform(self) -> None: # Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically: # https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip # Load comparison index (CalEnviroScreen 4) - self.df = pd.read_csv( - self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"} - ) self.df.rename( columns={ @@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad): def load(self) -> None: # write nationwide csv - self.CSV_PATH.mkdir(parents=True, exist_ok=True) - self.df.to_csv(self.CSV_PATH / "data06.csv", index=False) + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 8c2da2e9..14908281 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -7,8 +7,9 @@ from data_pipeline.etl.base import 
ValidGeoLevel from data_pipeline.etl.score.etl_utils import ( compare_to_list_of_expected_state_fips_codes, ) +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -17,59 +18,74 @@ logger = get_module_logger(__name__) class CDCLifeExpectancy(ExtractTransformLoad): + """#TODO: create description""" + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False NAME = "cdc_life_expectancy" - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV" - else: - USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" - LOAD_YAML_CONFIG: bool = False LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)" INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID" STATES_MISSING_FROM_USA_FILE = ["23", "55"] - # For some reason, LEEP does not include Maine or Wisconsin in its "All of - # USA" file. Load these separately. - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV" - MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV" - else: - WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV" - MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV" - TRACT_INPUT_COLUMN_NAME = "Tract ID" STATE_INPUT_COLUMN_NAME = "STATE2KX" - raw_df: pd.DataFrame - output_df: pd.DataFrame + raw_df: pd.DataFrame # result of extraction + output_df: pd.DataFrame # result of transformation def __init__(self): + + # fetch + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV" + else: + self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" + + # For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately. 
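(A rough sketch of how the three separately cached files can be recombined after
extraction; the cache directory below is an assumption, and the real ETL keys its
dtypes off class constants rather than literals:)

    from pathlib import Path

    import pandas as pd

    sources_dir = Path("data/sources/CDCLifeExpectancy")  # assumed cache location

    # Read the national file plus the separately fetched Maine and Wisconsin files,
    # keeping tract IDs as strings, then stack them into one frame.
    frames = [
        pd.read_csv(sources_dir / name, dtype={"Tract ID": "string"})
        for name in ("US_A.CSV", "ME_A.CSV", "WI_A.CSV")
    ]
    combined = pd.concat(frames, ignore_index=True)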
+ if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV" + self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV" + else: + self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV" + self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV" + + # input + self.usa_source = self.get_sources_path() / "US_A.CSV" + self.maine_source = self.get_sources_path() / "ME_A.CSV" + self.wisconsin_source = self.get_sources_path() / "WI_A.CSV" + + # output self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "cdc_life_expectancy" ) - # Constants for output - self.COLUMNS_TO_KEEP = [ + self.COLUMNS_TO_KEEP = [ # the columns to save on output self.GEOID_TRACT_FIELD_NAME, field_names.LIFE_EXPECTANCY_FIELD, ] - def _download_and_prep_data( - self, file_url: str, download_file_name: pathlib.Path - ) -> pd.DataFrame: - download_file_from_url( - file_url=file_url, - download_file_name=download_file_name, - verify=True, - ) + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.usa_file_url, destination=self.usa_source + ), + FileDataSource( + source=self.maine_file_url, destination=self.maine_source + ), + FileDataSource( + source=self.wisconsin_file_url, + destination=self.wisconsin_source, + ), + ] + + def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame: df = pd.read_csv( - filepath_or_buffer=download_file_name, + filepath_or_buffer=file_name, dtype={ # The following need to remain as strings for all of their digits, not get converted to numbers. 
self.TRACT_INPUT_COLUMN_NAME: "string", @@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad): return df - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: - all_usa_raw_df = self._download_and_prep_data( - file_url=self.USA_FILE_URL, - download_file_name=self.get_tmp_path() / "US_A.CSV", - ) + super().extract( + use_cached_data_sources + ) # download and extract data sources + + all_usa_raw_df = self._read_data(self.usa_source) # Check which states are missing states_in_life_expectancy_usa_file = list( @@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad): additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE, ) - logger.debug("Downloading data for Maine") - maine_raw_df = self._download_and_prep_data( - file_url=self.MAINE_FILE_URL, - download_file_name=self.get_tmp_path() / "maine.csv", + maine_raw_df = self._read_data( + self.maine_source, ) - logger.debug("Downloading data for Wisconsin") - wisconsin_raw_df = self._download_and_prep_data( - file_url=self.WISCONSIN_FILE_URL, - download_file_name=self.get_tmp_path() / "wisconsin.csv", - ) + wisconsin_raw_df = self._read_data(self.wisconsin_source) combined_df = pd.concat( objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df], diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py index d940cec9..87f79396 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py @@ -4,14 +4,17 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource logger = get_module_logger(__name__) class CDCPlacesETL(ExtractTransformLoad): + """#TODO: Need description""" + NAME = "cdc_places" GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False @@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad): CDC_MEASURE_FIELD_NAME = "Measure" def __init__(self): - self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places" + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.CDC_PLACES_URL = ( + self.cdc_places_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv" ) else: - self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD" + self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD" + + # input + self.places_source = self.get_sources_path() / "census_tract.csv" + + # output + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places" self.COLUMNS_TO_KEEP: typing.List[str] = [ self.GEOID_TRACT_FIELD_NAME, @@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - file_path = download_file_from_url( - file_url=self.CDC_PLACES_URL, - download_file_name=self.get_tmp_path() / "census_tract.csv", - ) + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.cdc_places_url, destination=self.places_source + ) + ] + + def 
extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources self.df = pd.read_csv( - filepath_or_buffer=file_path, + filepath_or_buffer=self.places_source, dtype={self.CDC_GEOID_FIELD_NAME: "string"}, low_memory=False, ) def transform(self) -> None: + # Rename GEOID field self.df.rename( columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME}, diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py index 7f725e91..87c29000 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py @@ -1,6 +1,8 @@ import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -11,22 +13,28 @@ logger = get_module_logger(__name__) class CDCSVIIndex(ExtractTransformLoad): """CDC SVI Index class ingests 2018 dataset located here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html + Please see the README in this module for further details. """ def __init__(self): - self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index" + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.CDC_SVI_INDEX_URL = ( + self.cdc_svi_index_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "cdc_svi_index/SVI2018_US.csv" ) else: - self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv" + self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv" + + # input + self.svi_source = self.get_sources_path() / "SVI2018_US.csv" + + # output + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index" self.CDC_RPL_THEMES_THRESHOLD = 0.90 - self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS" self.COLUMNS_TO_KEEP = [ @@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.cdc_svi_index_url, destination=self.svi_source + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + self.df = pd.read_csv( - filepath_or_buffer=self.CDC_SVI_INDEX_URL, + filepath_or_buffer=self.svi_source, dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"}, low_memory=False, ) @@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad): ) def load(self) -> None: - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py index 407b83fc..1f4b260a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py @@ -8,7 +8,8 @@ import geopandas as gpd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import 
get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -20,7 +21,7 @@ class GeoFileType(Enum): class CensusETL(ExtractTransformLoad): - SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp" + # SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp" GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson" CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv" GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson" @@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad): GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT" def __init__(self): + + self.shape_file_path = self.get_sources_path() / "shp" + # the fips_states_2010.csv is generated from data here # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH) @@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad): file_path: Path if file_type == GeoFileType.SHP: file_path = Path( - self.SHP_BASE_PATH + self.shape_file_path / fips_code / f"tl_2010_{fips_code}_tract10.shp" ) @@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad): file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv") return file_path - def _extract_shp(self, fips_code: str) -> None: - """Download the SHP file for the provided FIPS code + def get_data_sources(self) -> [DataSource]: - Args: - fips_code (str): the FIPS code for the region of interest + sources = [] - Returns: - None - """ - shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP) + for fips_code in self.STATE_FIPS_CODES: - # check if file exists - if not shp_file_path.is_file(): tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip" - unzip_file_from_url( - tract_state_url, - self.TMP_PATH, - self.DATA_PATH / "census" / "shp" / fips_code, + destination_path = self.shape_file_path / fips_code + + sources.append( + ZIPDataSource( + source=tract_state_url, destination=destination_path + ) ) - def extract(self) -> None: - logger.debug("Extracting census data") - for index, fips_code in enumerate(self.STATE_FIPS_CODES): - logger.debug( - f"Extracting shape for FIPS {fips_code} – {index+1} of {len(self.STATE_FIPS_CODES)}" - ) - self._extract_shp(fips_code) + return sources def _transform_to_geojson(self, fips_code: str) -> None: """Convert the downloaded SHP file for the associated FIPS to geojson diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py index 67a9b32e..9806aa97 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py @@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list: else: fips = row[0].strip() fips_state_list.append(fips) + return fips_state_list diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index c2965493..d67a2bc3 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.sources.census_acs.etl_imputations import ( calculate_income_measures, ) -from 
data_pipeline.etl.sources.census_acs.etl_utils import ( - retrieve_census_acs_data, -) from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import CensusDataSource logger = get_module_logger(__name__) @@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad): MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1 def __init__(self): + + self.census_acs_source = self.get_sources_path() / "acs.csv" + self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E" self.TOTAL_IN_LABOR_FORCE = "B23025_003E" self.EMPLOYMENT_FIELDS = [ @@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad): self.df: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + # Define the variables to retrieve + variables = ( + [ + self.MEDIAN_INCOME_FIELD, + self.MEDIAN_HOUSE_VALUE_FIELD, + ] + + self.EMPLOYMENT_FIELDS + + self.LINGUISTIC_ISOLATION_FIELDS + + self.POVERTY_FIELDS + + self.EDUCATIONAL_FIELDS + + self.RE_FIELDS + + self.COLLEGE_ATTENDANCE_FIELDS + + self.AGE_INPUT_FIELDS + ) + + return [ + CensusDataSource( + source=None, + destination=self.census_acs_source, + acs_year=self.ACS_YEAR, + variables=variables, + tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, + data_path_for_fips_codes=self.DATA_PATH, + acs_type="acs5", + ) + ] + # pylint: disable=too-many-arguments def _merge_geojson( self, @@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad): ) ) - def extract(self) -> None: - # Define the variables to retrieve - variables = ( - [ - self.MEDIAN_INCOME_FIELD, - self.MEDIAN_HOUSE_VALUE_FIELD, - ] - + self.EMPLOYMENT_FIELDS - + self.LINGUISTIC_ISOLATION_FIELDS - + self.POVERTY_FIELDS - + self.EDUCATIONAL_FIELDS - + self.RE_FIELDS - + self.COLLEGE_ATTENDANCE_FIELDS - + self.AGE_INPUT_FIELDS - ) + def extract(self, use_cached_data_sources: bool = False) -> None: - self.df = retrieve_census_acs_data( - acs_year=self.ACS_YEAR, - variables=variables, - tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, - data_path_for_fips_codes=self.DATA_PATH, + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.census_acs_source, + dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, ) def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py index a6dc5869..50cfef76 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py @@ -1,10 +1,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.etl.sources.census_acs.etl_utils import ( - retrieve_census_acs_data, -) from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import CensusDataSource logger = get_module_logger(__name__) @@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad): """ def __init__(self): + + self.census_acs_source = self.get_sources_path() / "acs_2010.csv" + self.ACS_YEAR = 2010 self.ACS_TYPE = "acs5" self.OUTPUT_PATH = ( @@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: # Define the variables to retrieve variables = ( 
self.UNEMPLOYED_FIELDS @@ -107,13 +109,26 @@ class CensusACS2010ETL(ExtractTransformLoad): + self.POVERTY_FIELDS ) - # Use the method defined on CensusACSETL to reduce coding redundancy. - self.df = retrieve_census_acs_data( - acs_year=self.ACS_YEAR, - variables=variables, - tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, - data_path_for_fips_codes=self.DATA_PATH, - acs_type=self.ACS_TYPE, + return [ + CensusDataSource( + source=None, + destination=self.census_acs_source, + acs_year=self.ACS_YEAR, + variables=variables, + tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, + data_path_for_fips_codes=self.DATA_PATH, + acs_type=self.ACS_TYPE, + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.census_acs_source, dtype={"GEOID10_TRACT": "string"} ) def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py index f8abc7c4..2a1bf962 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py @@ -1,14 +1,16 @@ +import os import json from pathlib import Path import numpy as np import pandas as pd -import requests + from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource +from data_pipeline.etl.datasource import FileDataSource logger = get_module_logger(__name__) @@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): / f"census_acs_median_income_{self.ACS_YEAR}" ) + self.GEOCORR_ALL_STATES_URL = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/geocorr2014_all_states_tracts_only.csv.zip" + ) + self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr" + self.GEOCORR_ALL_STATES_SOURCE = ( + self.GEOCORR_ALL_STATES_PATH + / "geocorr2014_all_states_tracts_only.csv" + ) + # Set constants for Geocorr MSAs data. 
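(Before the MSA constants that follow, a quick sketch of the zip-source pattern
configured just above; the cache directory is an assumption, while the URL suffix
and CSV name come from this diff:)

    from pathlib import Path

    from data_pipeline.config import settings
    from data_pipeline.etl.datasource import ZIPDataSource

    geocorr_dir = Path("data/sources/CensusACSMedianIncomeETL/geocorr")  # assumed

    # Fetch and unzip the geocorr archive into the cache directory.
    ZIPDataSource(
        source=settings.AWS_JUSTICE40_DATASOURCES_URL
        + "/geocorr2014_all_states_tracts_only.csv.zip",
        destination=geocorr_dir,
    ).fetch()

    # The unzipped CSV is then read from inside that directory.
    geocorr_csv = geocorr_dir / "geocorr2014_all_states_tracts_only.csv"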
self.PLACE_FIELD_NAME: str = "Census Place Name" self.COUNTY_FIELD_NAME: str = "County Name" @@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E" + "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area" ) + self.MSA_MEDIAN_INCOME_SOURCE = ( + self.get_sources_path() / "msa" / "msa_median_income.json" + ) self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)" # Set constants for state median incomes self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state" + self.STATE_MEDIAN_INCOME_SOURCE = ( + self.get_sources_path() / "state" / "state_median_income.json" + ) self.STATE_GEOID_FIELD_NAME: str = "GEOID2" self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)" @@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.PUERTO_RICO_S3_LINK: str = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv" ) + self.PUERTO_RICO_ALL_STATES_SOURCE = ( + self.get_sources_path() / "pr_tracts" / "pr_tracts.csv" + ) + + census_api_key = os.environ.get("CENSUS_API_KEY") + if census_api_key: + self.MSA_MEDIAN_INCOME_URL = ( + self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}" + ) + self.STATE_MEDIAN_INCOME_URL = ( + self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}" + ) # Constants for output self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference" @@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.state_median_incomes: dict self.pr_tracts: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + + return [ + ZIPDataSource( + source=self.GEOCORR_ALL_STATES_URL, + destination=self.GEOCORR_ALL_STATES_PATH, + ), + FileDataSource( + source=self.PUERTO_RICO_S3_LINK, + destination=self.PUERTO_RICO_ALL_STATES_SOURCE, + ), + FileDataSource( + source=self.MSA_MEDIAN_INCOME_URL, + destination=self.MSA_MEDIAN_INCOME_SOURCE, + ), + FileDataSource( + source=self.STATE_MEDIAN_INCOME_URL, + destination=self.STATE_MEDIAN_INCOME_SOURCE, + ), + ] + def _transform_geocorr(self) -> pd.DataFrame: # Transform the geocorr data geocorr_df = self.raw_geocorr_df @@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): ) return state_median_incomes_df - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + # Load and clean GEOCORR data # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census. # The specific query used is the following, which takes a couple of minutes to run: @@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): # - Core based statistical area (CBSA) # - CBSA Type (Metro or Micro) logger.debug("Starting download of 1.5MB Geocorr information.") - - unzip_file_from_url( - file_url=settings.AWS_JUSTICE40_DATASOURCES_URL - + "/geocorr2014_all_states_tracts_only.csv.zip", - download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() / "geocorr", - ) + super().extract( + use_cached_data_sources + ) # download and extract data sources self.raw_geocorr_df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "geocorr" - / "geocorr2014_all_states_tracts_only.csv", + filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE, # Skip second row, which has descriptions. 
skiprows=[1], # The following need to remain as strings for all of their digits, not get converted to numbers. @@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): low_memory=False, ) - logger.debug("Pulling PR tract list down.") - # This step is necessary because PR is not in geocorr at the level that gets joined - pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv" - download_file_from_url( - file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file - ) self.pr_tracts = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "pr_tracts" - / "pr_tracts.csv", + filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE, # The following need to remain as strings for all of their digits, not get converted to numbers. dtype={"GEOID10_TRACT": str}, low_memory=False, ) self.pr_tracts["State Abbreviation"] = "PR" - # Download MSA median incomes - logger.debug("Starting download of MSA median incomes.") - download = requests.get( - self.MSA_MEDIAN_INCOME_URL, - verify=None, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, - ) - self.msa_median_incomes = json.loads(download.content) + with self.MSA_MEDIAN_INCOME_SOURCE.open() as source: + self.msa_median_incomes = json.load(source) - # Download state median incomes - logger.debug("Starting download of state median incomes.") - download_state = requests.get( - self.STATE_MEDIAN_INCOME_URL, - verify=None, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, - ) - self.state_median_incomes = json.loads(download_state.content) + with self.STATE_MEDIAN_INCOME_SOURCE.open() as source: + self.state_median_incomes = json.load(source) ## NOTE we already have PR's MI here def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 395697fc..4fe26249 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -1,13 +1,14 @@ import json from typing import List +import os import numpy as np import pandas as pd -import requests -from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource pd.options.mode.chained_assignment = "raise" @@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad): + "&for=tract:*&in=state:{}%20county:{}" ) + census_api_key = os.environ.get("CENSUS_API_KEY") + if census_api_key: + self.API_URL = self.API_URL + f"&key={census_api_key}" + self.final_race_fields: List[str] = [] self.df: pd.DataFrame self.df_vi: pd.DataFrame self.df_all: pd.DataFrame - def extract(self) -> None: - dfs = [] - dfs_vi = [] + def get_data_sources(self) -> [DataSource]: + + sources = [] + for island in self.ISLAND_TERRITORIES: - logger.debug( - f"Downloading data for state/territory {island['state_abbreviation']}" - ) for county in island["county_fips"]: + api_url = self.API_URL.format( self.DECENNIAL_YEAR, island["state_abbreviation"], @@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad): island["fips"], county, ) - logger.debug(f"CENSUS: Requesting {api_url}") - download = requests.get( - api_url, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, + + sources.append( + FileDataSource( + source=api_url, + destination=self.get_sources_path() + / 
str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json",
+                    )
                 )
+
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        dfs = []
+        dfs_vi = []
+        for island in self.ISLAND_TERRITORIES:
+            logger.debug(
+                f"Loading data for state/territory {island['state_abbreviation']}"
+            )
+            for county in island["county_fips"]:
+
                 try:
-                    df = json.loads(download.content)
+                    filepath = (
+                        self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json"
+                    )
+                    df = json.load(filepath.open())
                 except ValueError as e:
                     logger.error(
-                        f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
+                        f"Could not load content from {filepath} in census decennial ETL because {e}."
                     )
 
                 # First row is the header
diff --git a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
index 5f9a10b8..c9b95ecb 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
@@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 
 logger = get_module_logger(__name__)
 
@@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False
 
     def __init__(self):
+
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.SOURCE_URL = (
+            self.child_opportunity_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "child_opportunity_index/raw.zip"
             )
         else:
-            self.SOURCE_URL = (
+            self.child_opportunity_url = (
                 "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
                 "3a0ededa30a0?format=csv"
             )
 
+        # input
+        self.child_opportunity_index_source = (
+            self.get_sources_path() / "raw.csv"
+        )
+
+        # output
+
         # TODO: Decide about nixing this
         self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
 
@@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
         self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
         self.READING_INPUT_FIELD = "ED_READING"
 
+        self.raw_df: pd.DataFrame
         self.output_df: pd.DataFrame
 
-    def extract(self) -> None:
-        super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.child_opportunity_url,
+                destination=self.get_sources_path(),
+            )
+        ]
 
-    def transform(self) -> None:
-        raw_df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path() / "raw.csv",
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.raw_df = pd.read_csv(
+            filepath_or_buffer=self.child_opportunity_index_source,
             # The following need to remain as strings for all of their digits, not get
             # converted to numbers.
dtype={ @@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad): low_memory=False, ) - output_df = raw_df.rename( + def transform(self) -> None: + + output_df = self.raw_df.rename( columns={ self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD, diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py index 0056be9a..39c1ba6c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py @@ -5,22 +5,35 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class DOEEnergyBurden(ExtractTransformLoad): + NAME = "doe_energy_burden" - SOURCE_URL: str = ( - settings.AWS_JUSTICE40_DATASOURCES_URL - + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" - ) + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT LOAD_YAML_CONFIG: bool = True REVISED_ENERGY_BURDEN_FIELD_NAME: str def __init__(self): + + # fetch + self.doe_energy_burden_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" + ) + + # input + self.doe_energy_burden_source = ( + self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv" + ) + + # output self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "doe_energy_burden" ) @@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad): self.raw_df: pd.DataFrame self.output_df: pd.DataFrame - def transform(self) -> None: - raw_df: pd.DataFrame = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "DOE_LEAD_AMI_TRACT_2018_ALL.csv", + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.doe_energy_burden_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.raw_df = pd.read_csv( + filepath_or_buffer=self.doe_energy_burden_source, # The following need to remain as strings for all of their digits, not get converted to numbers. 
dtype={ self.INPUT_GEOID_TRACT_FIELD_NAME: "string", @@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad): low_memory=False, ) + def transform(self) -> None: + logger.debug("Renaming columns and ensuring output format is correct") - output_df = raw_df.rename( + output_df = self.raw_df.rename( columns={ self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME, self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py index 3329ec6a..794ee97e 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py @@ -3,6 +3,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad): NAME = "travel_composite" - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - SOURCE_URL = ( - f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" - "dot_travel_composite/Shapefile_and_Metadata.zip" - ) - else: - SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip" - GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad): TRAVEL_BURDEN_FIELD_NAME: str def __init__(self): + + # fetch + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.travel_composite_url = ( + f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" + "dot_travel_composite/Shapefile_and_Metadata.zip" + ) + else: + self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip" + + # input # define the full path for the input CSV file - self.INPUT_SHP = ( - self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp" + self.disadvantage_layer_shape_source = ( + self.get_sources_path() + / "DOT_Disadvantage_Layer_Final_April2022.shp" ) + # output # this is the main dataframe self.df: pd.DataFrame + self.df_dot: pd.DataFrame + # Start dataset-specific vars here ## Average of Transportation Indicator Percentiles (calculated) ## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS @@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad): self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH" self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS" + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.travel_composite_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source) + def transform(self) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: @@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad): - Converts to CSV """ - # read in the unzipped shapefile from data source # reformat it to be standard df, remove unassigned rows, and # 
then rename the Census Tract column for merging - df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP) - df_dot = df_dot.rename( + + self.df_dot = self.df_dot.rename( columns={ self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME, } ).dropna(subset=[self.GEOID_TRACT_FIELD_NAME]) + # Assign the final df to the class' output_df for the load method - self.output_df = df_dot + self.output_df = self.df_dot diff --git a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py index 3162c637..b5bb163f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py @@ -1,12 +1,15 @@ from pathlib import Path -import geopandas as gpd import pandas as pd +import geopandas as gpd + from data_pipeline.config import settings -from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel -from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries logger = get_module_logger(__name__) @@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad): "55", ] - # Define these for easy code completion def __init__(self): - self.SOURCE_URL = ( + + # fetch + self.eamlis_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/eAMLIS export of all data.tsv.zip" ) + # input + self.eamlis_source = ( + self.get_sources_path() / "eAMLIS export of all data.tsv" + ) + + # output self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME self.OUTPUT_PATH: Path = ( @@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad): ] self.output_df: pd.DataFrame + self.df: pd.DataFrame - def transform(self) -> None: - df = pd.read_csv( - self.get_tmp_path() / "eAMLIS export of all data.tsv", + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.eamlis_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.eamlis_source, sep="\t", low_memory=False, ) + + def transform(self) -> None: + gdf = gpd.GeoDataFrame( - df, + self.df, geometry=gpd.points_from_xy( - x=df["Longitude"], - y=df["Latitude"], + x=self.df["Longitude"], + y=self.df["Latitude"], ), crs="epsg:4326", ) @@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad): gdf_tracts = add_tracts_for_geometries(gdf) gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME) gdf_tracts[self.AML_BOOLEAN] = True + self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP] diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py index 0db8e648..44962156 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py @@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger +from 
data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad): INPUT_GEOID_TRACT_FIELD_NAME: str = "ID" def __init__(self): - self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip" - self.EJSCREEN_CSV = ( - self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv" + + # fetch + self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip" + + # input + self.ejscreen_source = ( + self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv" ) + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen" + self.df: pd.DataFrame self.COLUMNS_TO_KEEP = [ @@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad): field_names.UST_FIELD, ] - def extract(self) -> None: - super().extract( - self.EJSCREEN_FTP_URL, - self.get_tmp_path(), - verify=False, # EPA EJScreen end point has certificate issues often - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.ejscreen_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources - def transform(self) -> None: self.df = pd.read_csv( - self.EJSCREEN_CSV, + self.ejscreen_source, dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, # EJSCREEN writes the word "None" for NA data. na_values=["None"], low_memory=False, ) + def transform(self) -> None: + # rename ID to Tract ID self.output_df = self.df.rename( columns={ diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py index 8c18034d..f8e09cb1 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py @@ -1,5 +1,6 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) @@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): # Note: while we normally set these properties in `__init__`, # we are setting them as class properties here so they can be accessed by the # class method `ejscreen_areas_of_concern_data_exists`. - LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local" - EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = ( - LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv" + + EJSCREEN_AREAS_OF_CONCERN_SOURCE = ( + ExtractTransformLoad.DATA_PATH + / "sources" + / "EJSCREENAreasOfConcernETL" + / "ejscreen_areas_of_concerns_indicators.csv" ) def __init__(self): + + # output self.OUTPUT_PATH = ( self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern" ) @@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): # TO DO: Load from actual source; the issue is that this dataset is not public for now self.df: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + """The source for this must be downloaded and saved manually. It is not publicly available""" + return [] + @classmethod def ejscreen_areas_of_concern_data_exists(cls): """Check whether or not the EJSCREEN areas of concern data exists. @@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): not reference this data. 
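+
+        The file is expected at EJSCREEN_AREAS_OF_CONCERN_SOURCE
+        (DATA_PATH / "sources" / "EJSCREENAreasOfConcernETL"); it must be
+        placed there manually because get_data_sources() does not download it.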
""" - return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file() + return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file() - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE) if self.ejscreen_areas_of_concern_data_exists(): logger.debug("Loading EJSCREEN Areas of Concern Data Locally") self.df = pd.read_csv( - filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA, + filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE, dtype={ self.GEOID_FIELD_NAME: "string", }, diff --git a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py index 3f27898e..136eaa54 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py @@ -5,18 +5,27 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): def __init__(self): - self.DEFINITION_ALTERNATIVE_FILE_URL = ( + + # fetch + self.definition_alternative_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/alternative DAC definition.csv.zip" ) + # input + self.definition_alternative_source = ( + self.get_sources_path() / "J40 alternative DAC definition.csv" + ) + + # output self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "energy_definition_alternative_draft" ) @@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - unzip_file_from_url( - file_url=self.DEFINITION_ALTERNATIVE_FILE_URL, - download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() - / "energy_definition_alternative_draft", - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.definition_alternative_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources self.df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "energy_definition_alternative_draft" - / "J40 alternative DAC definition.csv", + filepath_or_buffer=self.definition_alternative_source, # The following need to remain as strings for all of their digits, not get converted to numbers. 
dtype={ self.TRACT_INPUT_COLUMN_NAME: "string", @@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): ) def transform(self) -> None: + self.df = self.df.rename( columns={ self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py index 56f8bcc4..199ed9ff 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py @@ -4,8 +4,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url from data_pipeline.config import settings +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): def __init__(self): + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.AGGREGATED_RSEI_SCORE_FILE_URL = ( + self.aggregated_rsei_score_file_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip" ) else: - self.AGGREGATED_RSEI_SCORE_FILE_URL = ( + self.aggregated_rsei_score_file_url = ( "http://abt-rsei.s3.amazonaws.com/microdata2019/" "census_agg/CensusMicroTracts2019_2019_aggregated.zip" ) + # input + self.aggregated_rsei_score_source = ( + self.get_sources_path() + / "CensusMicroTracts2019_2019_aggregated.csv" + ) + + # output self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei" self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75 self.TRACT_INPUT_COLUMN_NAME = "GEOID10" @@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.aggregated_rsei_score_file_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + # the column headers from the above dataset are actually a census tract's data at this point # We will use this data structure later to specify the column names input_columns = [ @@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): self.NCSCORE_INPUT_FIELD, ] - unzip_file_from_url( - file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL, - download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() / "epa_rsei", - ) - self.df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "epa_rsei" - / "CensusMicroTracts2019_2019_aggregated.csv", + filepath_or_buffer=self.aggregated_rsei_score_source, # The following need to remain as strings for all of their digits, not get # converted to numbers. 
low_memory=False, diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py index 60534daa..55001436 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py @@ -5,6 +5,8 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad): NAME = "fsf_flood_risk" # These data were emailed to the J40 team while first street got # their official data sharing channels setup. - SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip" + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT LOAD_YAML_CONFIG: bool = True @@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad): SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = ( - self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv" + + # fetch + self.flood_tract_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip" ) - # this is the main dataframe - self.df: pd.DataFrame + # input + self.flood_tract_source = ( + self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv" + ) # Start dataset-specific vars here self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" @@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad): self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30" self.CLIP_PROPERTIES_COUNT = 250 + self.df_fsf_flood: pd.DataFrame + + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.flood_tract_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + # read in the unzipped csv data source then rename the + # Census Tract column for merging + self.df_fsf_flood = pd.read_csv( + self.flood_tract_source, + dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, + low_memory=False, + ) + def transform(self) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: @@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad): - Calculates share of properties at risk, left-clipping number of properties at 250 """ - # read in the unzipped csv data source then rename the - # Census Tract column for merging - df_fsf_flood: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, - dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, - low_memory=False, - ) - - df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[ + self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[ self.INPUT_GEOID_TRACT_FIELD_NAME ].str.zfill(11) - df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[ + self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[ self.COUNT_PROPERTIES_NATIVE_FIELD_NAME ].clip(lower=self.CLIP_PROPERTIES_COUNT) - df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = ( - df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY] - / df_fsf_flood[self.COUNT_PROPERTIES] + self.df_fsf_flood[ + 
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY + ] = ( + self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY] + / self.df_fsf_flood[self.COUNT_PROPERTIES] ) - df_fsf_flood[ + self.df_fsf_flood[ self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS ] = ( - df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] - / df_fsf_flood[self.COUNT_PROPERTIES] + self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] + / self.df_fsf_flood[self.COUNT_PROPERTIES] ) # Assign the final df to the class' output_df for the load method with rename - self.output_df = df_fsf_flood.rename( + self.output_df = self.df_fsf_flood.rename( columns={ self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS, diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py index 2680eaf3..ebb88b73 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py @@ -4,6 +4,8 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) @@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad): NAME = "fsf_wildfire_risk" # These data were emailed to the J40 team while first street got # their official data sharing channels setup. - SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip" + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad): SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv" + # fetch + self.fsf_fire_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip" + ) + + # input + self.fsf_fire_source = ( + self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv" + ) + + # output # this is the main dataframe self.df: pd.DataFrame + self.df_fsf_fire: pd.DataFrame + # Start dataset-specific vars here self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag" self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag" self.CLIP_PROPERTIES_COUNT = 250 + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.fsf_fire_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df_fsf_fire = pd.read_csv( + self.fsf_fire_source, + dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, + low_memory=False, + ) + def transform(self) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: @@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad): """ # read in the unzipped csv data source then rename the # Census Tract column for merging - df_fsf_fire: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, - 
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, - low_memory=False, - ) - df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[ + self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[ self.INPUT_GEOID_TRACT_FIELD_NAME ].str.zfill(11) - df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[ + self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[ self.COUNT_PROPERTIES_NATIVE_FIELD_NAME ].clip(lower=self.CLIP_PROPERTIES_COUNT) - df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = ( - df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY] - / df_fsf_fire[self.COUNT_PROPERTIES] + self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = ( + self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY] + / self.df_fsf_fire[self.COUNT_PROPERTIES] ) - df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = ( - df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] - / df_fsf_fire[self.COUNT_PROPERTIES] + self.df_fsf_fire[ + self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS + ] = ( + self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] + / self.df_fsf_fire[self.COUNT_PROPERTIES] ) # Assign the final df to the class' output_df for the load method with rename - self.output_df = df_fsf_fire.rename( + self.output_df = self.df_fsf_fire.rename( columns={ self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS, diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py index 223f0b09..16b719c2 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py @@ -3,17 +3,33 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class GeoCorrETL(ExtractTransformLoad): + NAME = "geocorr" + GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False def __init__(self): + + # fetch + self.geocorr_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/geocorr_urban_rural.csv.zip" + ) + + # input + self.geocorr_source = ( + self.get_sources_path() / "geocorr_urban_rural.csv" + ) + + # output self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr" # Need to change hyperlink to S3 @@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad): # The source data for this notebook was downloaded from GeoCorr; # the instructions for generating the source data is here: # https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787 - self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" + # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT" self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" self.COLUMNS_TO_KEEP = [ @@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - unzip_file_from_url( - file_url=settings.AWS_JUSTICE40_DATASOURCES_URL - + "/geocorr_urban_rural.csv.zip", - download_path=self.get_tmp_path(), - 
unzipped_file_path=self.get_tmp_path(), - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.geocorr_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources self.df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv", + filepath_or_buffer=self.geocorr_source, dtype={ self.GEOCORR_GEOID_FIELD_NAME: "string", }, diff --git a/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py b/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py index a65ed126..fb56b3cc 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py @@ -3,12 +3,16 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class HistoricRedliningETL(ExtractTransformLoad): + NAME = "historic_redlining" + GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT EXPECTED_MISSING_STATES = [ "10", @@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad): ] PUERTO_RICO_EXPECTED_IN_DATA = False ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False - SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip" def __init__(self): - self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining" - self.HISTORIC_REDLINING_FILE_PATH = ( - self.get_tmp_path() / "HRS_2010.xlsx" - ) + # fetch + self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip" + + # input + self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx" self.REDLINING_SCALAR = "Tract-level redlining score" @@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad): self.GEOID_TRACT_FIELD_NAME, self.REDLINING_SCALAR, ] + self.df: pd.DataFrame + self.historic_redlining_data: pd.DataFrame + + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.hrs_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.historic_redlining_data = pd.read_excel(self.hrs_source) def transform(self) -> None: # this is obviously temporary - historic_redlining_data = pd.read_excel( - self.HISTORIC_REDLINING_FILE_PATH + + self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = ( + self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11) ) - historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = ( - historic_redlining_data["GEOID10"].astype(str).str.zfill(11) - ) - historic_redlining_data = historic_redlining_data.rename( + self.historic_redlining_data = self.historic_redlining_data.rename( columns={"HRS2010": self.REDLINING_SCALAR} ) - logger.debug(f"{historic_redlining_data.columns}") + logger.debug(f"{self.historic_redlining_data.columns}") # Calculate lots of different score thresholds for convenience for threshold in [3.25, 3.5, 3.75]: - historic_redlining_data[ + self.historic_redlining_data[ f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" - ] = (historic_redlining_data[self.REDLINING_SCALAR] >= 
threshold) + ] = ( + self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold + ) ## NOTE We add to columns to keep here self.COLUMNS_TO_KEEP.append( f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" ) - self.output_df = historic_redlining_data + self.output_df = self.historic_redlining_data diff --git a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py index b5e5a875..74e6623b 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py @@ -1,8 +1,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url from pandas.errors import EmptyDataError logger = get_module_logger(__name__) @@ -10,36 +11,46 @@ logger = get_module_logger(__name__) class HousingTransportationETL(ExtractTransformLoad): def __init__(self): - self.HOUSING_FTP_URL = ( - "https://htaindex.cnt.org/download/download.php?focus=tract&geoid=" - ) + self.OUTPUT_PATH = ( self.DATA_PATH / "dataset" / "housing_and_transportation_index" ) self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + + housing_url = ( + "https://htaindex.cnt.org/download/download.php?focus=tract&geoid=" + ) + + sources = [] + + for fips in get_state_fips_codes(self.DATA_PATH): + sources.append( + ZIPDataSource( + source=f"{housing_url}{fips}", + destination=self.get_sources_path(), + ) + ) + + return sources + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + # Download each state / territory individually dfs = [] - zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index" for fips in get_state_fips_codes(self.DATA_PATH): - logger.debug( - f"Downloading housing data for state/territory with FIPS code {fips}" - ) - unzip_file_from_url( - f"{self.HOUSING_FTP_URL}{fips}", - self.get_tmp_path(), - zip_file_dir, - ) - - # New file name: - tmp_csv_file_path = ( - zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv" + csv_source = ( + self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv" ) try: - tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path) + tmp_df = pd.read_csv(filepath_or_buffer=csv_source) except EmptyDataError: logger.error( f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}" diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py index 4cf0ee7d..9ca02cf8 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py @@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger from data_pipeline.config import settings +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class HudHousingETL(ExtractTransformLoad): 
+ NAME = "hud_housing" GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT def __init__(self): - self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT" + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.HOUSING_FTP_URL = ( + self.housing_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "hud_housing/2014thru2018-140-csv.zip" ) else: - self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip" + self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip" + + # source + + # output + + self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT" self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path() @@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - super().extract( - self.HOUSING_FTP_URL, - self.HOUSING_ZIP_FILE_DIR, - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.housing_url, destination=self.get_sources_path() + ) + ] def _read_chas_table(self, file_name): - # New file name: - tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name + + tmp_csv_file_path = self.get_sources_path() / "140" / file_name tmp_df = pd.read_csv( filepath_or_buffer=tmp_csv_file_path, encoding="latin-1", @@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad): return tmp_df - def transform(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + table_8 = self._read_chas_table("Table8.csv") table_3 = self._read_chas_table("Table3.csv") @@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad): table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME ) + def transform(self) -> None: + # Calculate share that lacks indoor plumbing or kitchen # This is computed as # ( diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py index ddf476b6..f6c61bfa 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py @@ -1,7 +1,9 @@ import pandas as pd -import requests + from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.utils import get_module_logger @@ -11,44 +13,51 @@ logger = get_module_logger(__name__) class HudRecapETL(ExtractTransformLoad): def __init__(self): + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.HUD_RECAP_CSV_URL = ( + self.hud_recap_csv_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv" ) else: - self.HUD_RECAP_CSV_URL = ( + self.hud_recap_csv_url = ( "https://opendata.arcgis.com/api/v3/datasets/" "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326" ) - self.HUD_RECAP_CSV = ( - self.get_tmp_path() + # input + self.hud_recap_source = ( + self.get_sources_path() / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv" ) + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap" - # Definining some variable names + # Defining some variable names self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = ( "hud_recap_priority_community" ) self.df: pd.DataFrame - def extract(self) -> None: - download = requests.get( - 
self.HUD_RECAP_CSV_URL, - verify=None, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, - ) - file_contents = download.content - csv_file = open(self.HUD_RECAP_CSV, "wb") - csv_file.write(file_contents) - csv_file.close() + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.hud_recap_csv_url, destination=self.hud_recap_source + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + # Load comparison index (CalEnviroScreen 4) + self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"}) def transform(self) -> None: - # Load comparison index (CalEnviroScreen 4) - self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"}) self.df.rename( columns={ diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py index 7b4879f3..68e01824 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py @@ -2,6 +2,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger @@ -10,16 +12,25 @@ logger = get_module_logger(__name__) class MappingForEJETL(ExtractTransformLoad): def __init__(self): - self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej" - self.MAPPING_FOR_EJ_VA_URL = ( + # fetch + self.mapping_for_ej_va_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip" ) - self.MAPPING_FOR_EJ_CO_URL = ( + self.mapping_for_ej_co_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip" ) - self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp" - self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp" + + # input + self.va_shp_file_source = ( + self.get_sources_path() / "mej_virginia_7_1.shp" + ) + self.co_shp_file_source = ( + self.get_sources_path() / "mej_colorado_final.shp" + ) + + # output + self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej" # Defining variables self.COLUMNS_TO_KEEP = [ @@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - super().extract( - self.MAPPING_FOR_EJ_VA_URL, - self.get_tmp_path(), - ) - super().extract( - self.MAPPING_FOR_EJ_CO_URL, - self.get_tmp_path(), - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.mapping_for_ej_va_url, + destination=self.get_sources_path(), + ), + ZIPDataSource( + source=self.mapping_for_ej_co_url, + destination=self.get_sources_path(), + ), + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources - def transform(self) -> None: # Join (here, it's just concatenating) the two dataframes from # CO and VA self.df = pd.concat( [ - gpd.read_file(self.VA_SHP_FILE_PATH), - gpd.read_file(self.CO_SHP_FILE_PATH), + gpd.read_file(self.va_shp_file_source), + gpd.read_file(self.co_shp_file_source), ] ) + def transform(self) -> None: + # Fill Census tract to get it to be 11 digits, incl. 
leading 0s # Note that VA and CO should never have leading 0s, so this isn't # strictly necessary, but if in the future, there are more states diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py index 05ff0593..e983efb6 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py @@ -3,8 +3,9 @@ import pathlib import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad): Information on the mapping of this data to census tracts is available at https://github.com/americanpanorama/Census_HOLC_Research. - """ def __init__(self): + + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.MAPPING_INEQUALITY_CSV_URL = ( + self.mapping_inequality_csv_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "mapping_inequality/holc_tract_lookup.csv" ) else: - self.MAPPING_INEQUALITY_CSV_URL = ( + self.mapping_inequality_csv_url = ( "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/" "main/2010_Census_Tracts/holc_tract_lookup.csv" ) - self.MAPPING_INEQUALITY_CSV = ( - self.get_tmp_path() / "holc_tract_lookup.csv" - ) - self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality" - self.HOLC_MANUAL_MAPPING_CSV_PATH = ( + # input + self.mapping_inequality_source = ( + self.get_sources_path() / "holc_tract_lookup.csv" + ) + self.holc_manual_mapping_source = ( # here be dragons – this file is pulled from a different place than most pathlib.Path(__file__).parent / "data" / "holc_grades_manually_mapped.csv" ) + # output + self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality" + # Some input field names. From documentation: 'Census Tracts were intersected # with HOLC Polygons. Census information can be joined via the "geoid" field. # There are two field "holc_prop" and "tract_prop" which give the proportion @@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad): ] self.df: pd.DataFrame + self.holc_manually_mapped_df: pd.DataFrame - def extract(self) -> None: - download_file_from_url( - file_url=self.MAPPING_INEQUALITY_CSV_URL, - download_file_name=self.MAPPING_INEQUALITY_CSV, - ) + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.mapping_inequality_csv_url, + destination=self.mapping_inequality_source, + ) + ] - def transform(self) -> None: - df: pd.DataFrame = pd.read_csv( - self.MAPPING_INEQUALITY_CSV, + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.mapping_inequality_source, dtype={self.TRACT_INPUT_FIELD: "string"}, low_memory=False, ) + # Some data needs to be manually mapped to its grade. + # TODO: Investigate more data that may need to be manually mapped. 
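+        # Note: the manually mapped CSV ships with the repository (see
+        # self.holc_manual_mapping_source above), so it is read directly here
+        # rather than being declared as a data source in get_data_sources().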
+ self.holc_manually_mapped_df = pd.read_csv( + filepath_or_buffer=self.holc_manual_mapping_source, + low_memory=False, + ) + + def transform(self) -> None: + # rename Tract ID - df.rename( + self.df.rename( columns={ self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME, }, @@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad): # Keep the first character, which is the HOLC grade (A, B, C, D). # TODO: investigate why this dataframe triggers these pylint errors. # pylint: disable=unsupported-assignment-operation, unsubscriptable-object - df[self.HOLC_GRADE_DERIVED_FIELD] = df[ + self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[ self.HOLC_GRADE_AND_ID_FIELD ].str[0:1] # Remove nonsense when the field has no grade or invalid grades. valid_grades = ["A", "B", "C", "D"] - df.loc[ + self.df.loc[ # pylint: disable=unsubscriptable-object - ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades), + ~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades), self.HOLC_GRADE_DERIVED_FIELD, ] = None - # Some data needs to be manually mapped to its grade. - # TODO: Investigate more data that may need to be manually mapped. - holc_manually_mapped_df = pd.read_csv( - filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH, - low_memory=False, - ) - # Join on the existing data - merged_df = df.merge( - right=holc_manually_mapped_df, + merged_df = self.df.merge( + right=self.holc_manually_mapped_df, on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD], how="left", ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py index 8f714c81..2f066525 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py @@ -4,6 +4,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger @@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad): """ def __init__(self): - self.MARYLAND_EJSCREEN_URL = ( + + # fetch + self.maryland_ejscreen_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip" ) - self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen" + # input + self.shape_files_source = self.get_sources_path() / "mdejscreen" + + # output self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen" self.COLUMNS_TO_KEEP = [ @@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad): ] self.df: pd.DataFrame + self.dfs_list: pd.DataFrame + + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.maryland_ejscreen_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: - def extract(self) -> None: - logger.debug("Downloading 207MB Maryland EJSCREEN Data") super().extract( - self.MARYLAND_EJSCREEN_URL, - self.get_tmp_path(), - ) + use_cached_data_sources + ) # download and extract data sources - def transform(self) -> None: - list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp")) + logger.debug("Downloading 207MB Maryland EJSCREEN Data") + list_of_files = list(glob(str(self.shape_files_source) + "/*.shp")) - # Ignore counties becauses this is not the level of measurement + # Ignore 
counties because this is not the level of measurement # that is consistent with our current scoring and ranking methodology. - dfs_list = [ + self.dfs_list = [ gpd.read_file(f) for f in list_of_files if not f.endswith("CountiesEJScore.shp") ] + def transform(self) -> None: + # Set the Census tract as the index and drop the geometry column # that produces the census tract boundaries. # The latter is because Geopandas raises an exception if there # are duplicate geometry columns. # Moreover, since the unit of measurement is at the tract level # we can consistantly merge this with other datasets - dfs_list = [ + self.dfs_list = [ df.set_index("Census_Tra").drop("geometry", axis=1) - for df in dfs_list + for df in self.dfs_list ] # pylint: disable=unsubscriptable-object - self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1)) + self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1)) # Reset index so that we no longer have the tract as our index self.df = self.df.reset_index() diff --git a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py index efde123c..2c33f888 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py @@ -1,6 +1,8 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger @@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad): """ def __init__(self): - self.MICHIGAN_EJSCREEN_S3_URL = ( + + # fetch + self.michigan_ejscreen_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/michigan_ejscore_12212021.csv" ) + # input + self.michigan_ejscreen_source = ( + self.get_sources_path() / "michigan_ejscore_12212021.csv" + ) + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen" + self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75 self.COLUMNS_TO_KEEP = [ @@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.michigan_ejscreen_url, + destination=self.michigan_ejscreen_source, + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + self.df = pd.read_csv( - filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL, + filepath_or_buffer=self.michigan_ejscreen_source, dtype={"GEO_ID": "string"}, low_memory=False, ) def transform(self) -> None: + self.df.rename( columns={ "GEO_ID": self.GEOID_TRACT_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index bced98f5..b58d8f30 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -4,6 +4,8 @@ # pylint: disable=unsupported-assignment-operation import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base 
import ValidGeoLevel from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad): NAME = "national_risk_index" - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - SOURCE_URL = ( - f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" - "national_risk_index/NRI_Table_CensusTracts.zip" - ) - else: - SOURCE_URL = ( - "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/" - "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip" - ) - GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad): AGRIVALUE_LOWER_BOUND = 408000 def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv" + # fetch + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.risk_index_url = ( + f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" + "national_risk_index/NRI_Table_CensusTracts.zip" + ) + else: + self.risk_index_url = ( + "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/" + "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip" + ) + + # source + self.risk_index_source = ( + self.get_sources_path() / "NRI_Table_CensusTracts.csv" + ) + + # output # this is the main dataframe self.df: pd.DataFrame + self.df_nri: pd.DataFrame # Start dataset-specific vars here self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = ( @@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad): self.POPULATION_INPUT_FIELD_NAME = "POPULATION" self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE" - def extract(self) -> None: - """Unzips NRI dataset from the FEMA data source and writes the files - to the temporary data folder for use in the transform() method - """ + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.risk_index_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: super().extract( - source_url=self.SOURCE_URL, - extract_path=self.get_tmp_path(), + use_cached_data_sources + ) # download and extract data sources + + # read in the unzipped csv from NRI data source then rename the + # Census Tract column for merging + self.df_nri = pd.read_csv( + self.risk_index_source, + dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"}, + na_values=["None"], + low_memory=False, ) def transform(self) -> None: @@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad): Groups inside of that Tract """ - # read in the unzipped csv from NRI data source then rename the - # Census Tract column for merging - df_nri: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, - dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"}, - na_values=["None"], - low_memory=False, - ) - - df_nri.rename( + self.df_nri.rename( columns={ self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, @@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad): agriculture_columns = [ f"{x}_EALA" for x in disaster_categories - if f"{x}_EALA" in list(df_nri.columns) + if f"{x}_EALA" in list(self.df_nri.columns) ] population_columns = [ f"{x}_EALP" for x in disaster_categories - if f"{x}_EALP" in list(df_nri.columns) + if f"{x}_EALP" in list(self.df_nri.columns) ] buildings_columns = [ f"{x}_EALB" for 
x in disaster_categories - if f"{x}_EALB" in list(df_nri.columns) + if f"{x}_EALB" in list(self.df_nri.columns) ] - disaster_population_sum_series = df_nri[population_columns].sum(axis=1) - - disaster_agriculture_sum_series = df_nri[agriculture_columns].sum( + disaster_population_sum_series = self.df_nri[population_columns].sum( axis=1 ) - disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1) + disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum( + axis=1 + ) + + disaster_buildings_sum_series = self.df_nri[buildings_columns].sum( + axis=1 + ) # Population EAL Rate = Eal Valp / Population - df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = ( + self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = ( disaster_population_sum_series - / df_nri[self.POPULATION_INPUT_FIELD_NAME] + / self.df_nri[self.POPULATION_INPUT_FIELD_NAME] ) # Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000) ## FORMULA ADJUSTMENT 2/17 ## Because AGRIVALUE contains a lot of 0s, we are going to consider ## 90th percentile only for places that have some agrivalue at all - df_nri[ + self.df_nri[ self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME - ] = disaster_agriculture_sum_series / df_nri[ + ] = disaster_agriculture_sum_series / self.df_nri[ self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME ].clip( lower=self.AGRIVALUE_LOWER_BOUND @@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad): ## Check that this clip worked -- that the only place the value has changed is when the clip took effect base_expectation = ( disaster_agriculture_sum_series - / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] + / self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] ) assert ( - df_nri[ - df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] + self.df_nri[ + self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] != base_expectation ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max() <= self.AGRIVALUE_LOWER_BOUND @@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad): ) assert ( - df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] + self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] != base_expectation ).sum() > 0, "Clipping the agrivalue did nothing!" # This produces a boolean that is True in the case of non-zero agricultural value - df_nri[self.CONTAINS_AGRIVALUE] = ( - df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0 + self.df_nri[self.CONTAINS_AGRIVALUE] = ( + self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0 ) # divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)). - df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = ( + self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = ( disaster_buildings_sum_series - / df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME] + / self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME] ) # Round all float columns to just 10 digits. # Note: `round` is smart enough to only apply to float columns. - df_nri = df_nri.round(10) + self.df_nri = self.df_nri.round(10) # Assign the final df to the class' output_df for the load method - self.output_df = df_nri + self.output_df = self.df_nri def load(self) -> None: # Suppress scientific notation. 
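
Every converted ETL in this patch follows the same shape: declare remote files in `get_data_sources()`, let `super().extract(use_cached_data_sources)` download (or reuse) them under `get_sources_path()`, and read only local files from that point on. The sketch below is illustrative and not part of this patch: the class name, URL, and file names are placeholders, and only the `ZIPDataSource(source=..., destination=...)` constructor, the `get_data_sources()` hook, and the `extract(use_cached_data_sources)` signature mirror what the diff actually introduces.

```python
# Hypothetical example (not part of this patch): a minimal ETL written against
# the cached-data-source pattern used throughout this diff.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleCachedETL(ExtractTransformLoad):
    """Illustrative ETL: declare sources once, let the base class fetch them."""

    def __init__(self):
        # fetch: remote archive (placeholder URL)
        self.example_url = "https://example.com/example_data.csv.zip"

        # source: where the unzipped CSV lands after extract()
        self.example_source = self.get_sources_path() / "example_data.csv"

        # output: dataframe populated during extract(), used by transform()
        self.df: pd.DataFrame

    def get_data_sources(self) -> [DataSource]:
        # Declaring the source here lets the pipeline pre-download or cache it.
        return [
            ZIPDataSource(
                source=self.example_url,
                destination=self.get_sources_path(),
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # Download (or reuse cached) sources, then read the local file.
        super().extract(use_cached_data_sources)
        self.df = pd.read_csv(self.example_source, low_memory=False)
```

Keeping all network access behind `get_data_sources()`/`extract()` is also why the converted classes above move their `pd.read_csv` and `gpd.read_file` calls out of `transform()` and into `extract()`.
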
diff --git a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py index 39b12af0..782e824f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py @@ -3,6 +3,8 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger @@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad): """ETL class for the Nature Deprived Communities dataset""" NAME = "nlcd_nature_deprived" - SOURCE_URL = ( - settings.AWS_JUSTICE40_DATASOURCES_URL - + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" - ) + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad): TRACT_PERCENT_CROPLAND_FIELD_NAME: str def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = ( - self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv" + + # fetch + self.nature_deprived_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" ) + # source + # define the full path for the input CSV file + self.nature_deprived_source = ( + self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv" + ) + + # output # this is the main dataframe self.df: pd.DataFrame + self.df_ncld: pd.DataFrame + # Start dataset-specific vars here self.PERCENT_NATURAL_FIELD_NAME = "PctNatural" self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv" @@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad): # for area. 
This does indeed remove tracts from the 90th+ percentile later on self.TRACT_ACRES_LOWER_BOUND = 35 - def transform(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.nature_deprived_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: - Renames columns as needed """ - df_ncld: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df_ncld = pd.read_csv( + self.nature_deprived_source, dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, low_memory=False, ) - df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = ( - df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND + def transform(self) -> None: + + self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = ( + self.df_ncld[self.TRACT_ACRES_FIELD_NAME] + >= self.TRACT_ACRES_LOWER_BOUND ) - df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = ( - 100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME] + self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = ( + 100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME] ) # Assign the final df to the class' output_df for the load method with rename - self.output_df = df_ncld.rename( + self.output_df = self.df_ncld.rename( columns={ self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME, self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py index b797c418..7bdb7b55 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py @@ -3,9 +3,10 @@ import functools import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url logger = get_module_logger(__name__) @@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad): PUERTO_RICO_EXPECTED_IN_DATA = False def __init__(self): + + # fetch + self.poverty_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip" + ) + + # source + self.poverty_sources = [ + self.get_sources_path() + / "ltdb_std_all_sample" + / "ltdb_std_1990_sample.csv", + self.get_sources_path() + / "ltdb_std_all_sample" + / "ltdb_std_2000_sample.csv", + self.get_sources_path() + / "ltdb_std_all_sample" + / "ltdb_std_2010_sample.csv", + ] + + # output self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty" # Need to change hyperlink to S3 @@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad): self.df: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.poverty_url, destination=self.get_sources_path() + ) + ] + def _join_input_dfs(self, dfs: list) -> pd.DataFrame: df = functools.reduce( lambda df_a, df_b: pd.merge( @@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad): return df - def extract(self) -> None: - unzipped_file_path = 
self.get_tmp_path() + def extract(self, use_cached_data_sources: bool = False) -> None: - unzip_file_from_url( - file_url=settings.AWS_JUSTICE40_DATASOURCES_URL - + "/LTDB_Std_All_Sample.zip", - download_path=self.get_tmp_path(), - unzipped_file_path=unzipped_file_path, - ) - - file_names = [ - "ltdb_std_1990_sample.csv", - "ltdb_std_2000_sample.csv", - "ltdb_std_2010_sample.csv", - ] + super().extract( + use_cached_data_sources + ) # download and extract data sources temporary_input_dfs = [] - for file_name in file_names: + for file_name in self.poverty_sources: temporary_input_df = pd.read_csv( - filepath_or_buffer=unzipped_file_path - / f"ltdb_std_all_sample/{file_name}", + filepath_or_buffer=file_name, dtype={ self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string", self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string", diff --git a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py index 0b99b01a..56f803bd 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py @@ -1,6 +1,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) @@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad): """ def __init__(self): - self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/" - self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv" + + # input + self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv" + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score" self.df: gpd.GeoDataFrame + + self.tes_state_dfs = [] + + # config self.states = [ "al", "az", @@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad): "wy", ] - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + + tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/" + + sources = [] for state in self.states: - super().extract( - f"{self.TES_URL}{state}.zip.zip", - f"{self.get_tmp_path()}/{state}", + sources.append( + ZIPDataSource( + source=f"{tes_url}{state}.zip.zip", + destination=self.get_sources_path() / state, + ) + ) + + return sources + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + for state in self.states: + self.tes_state_dfs.append( + gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp") ) def transform(self) -> None: - tes_state_dfs = [] - for state in self.states: - tes_state_dfs.append( - gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp") - ) + self.df = gpd.GeoDataFrame( - pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs + pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs ) # rename ID to Tract ID diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py index f8bd9df7..25f73366 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py @@ -4,63 +4,57 @@ import geopandas as gpd import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad 
+from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url logger = get_module_logger(__name__) class TribalETL(ExtractTransformLoad): def __init__(self): + + self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv" + self.GEOGRAPHIC_BASE_PATH = ( self.DATA_PATH / "tribal" / "geographic_data" ) - self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv" self.NATIONAL_TRIBAL_GEOJSON_PATH = ( self.GEOGRAPHIC_BASE_PATH / "usa.json" ) + self.USA_TRIBAL_DF_LIST = [] - def extract(self) -> None: - """Extract the tribal geojson zip files from Justice40 S3 data folder + def get_data_sources(self) -> [DataSource]: - Returns: - None - """ - - bia_shapefile_zip_url = ( + national_lar_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/BIA_National_LAR_updated_20220929.zip" ) - - tsa_and_aian_geojson_zip_url = ( + tsa_and_aian_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/BIA_TSA_and_AIAN_json.zip" ) - - alaska_geojson_url = ( + alaska_native_villages_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/Alaska_Native_Villages_json.zip" ) - unzip_file_from_url( - bia_shapefile_zip_url, - self.TMP_PATH, - self.GEOGRAPHIC_BASE_PATH / "bia_national_lar", - ) - - unzip_file_from_url( - tsa_and_aian_geojson_zip_url, - self.TMP_PATH, - self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian", - ) - - unzip_file_from_url( - alaska_geojson_url, - self.TMP_PATH, - self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages", - ) + return [ + ZIPDataSource( + national_lar_url, + destination=self.get_sources_path() / "bia_national_lar", + ), + ZIPDataSource( + source=tsa_and_aian_url, + destination=self.get_sources_path() / "tsa_and_aian", + ), + ZIPDataSource( + source=alaska_native_villages_url, + destination=self.get_sources_path() / "alaska_native_villages", + ), + ] def _transform_bia_national_lar(self, path: Path) -> None: """Transform the Tribal BIA National Lar Geodataframe and appends it to the @@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad): """ # Set the filepaths: bia_national_lar_shapefile = ( - self.GEOGRAPHIC_BASE_PATH / "bia_national_lar" + self.get_sources_path() / "bia_national_lar" ) bia_aian_supplemental_geojson = ( - self.GEOGRAPHIC_BASE_PATH + self.get_sources_path() / "tsa_and_aian" / "BIA_AIAN_Supplemental.json" ) bia_tsa_geojson = ( - self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json" + self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json" ) alaska_native_villages_geojson = ( - self.GEOGRAPHIC_BASE_PATH + self.get_sources_path() / "alaska_native_villages" / "AlaskaNativeVillages.gdb.geojson" ) @@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad): "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" ) + # note – this works a little different than many of the ETLs. The file + # being written here is used again downstream, so it's placed in a + # special directory. 
logger.debug("Writing national geojson file") + self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True) usa_tribal_df.to_file( self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON" ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py index ba2e2226..602e6005 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py @@ -4,6 +4,7 @@ import geopandas as gpd import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries from data_pipeline.etl.sources.geo_utils import get_tract_geojson @@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad): self.census_tract_gdf: gpd.GeoDataFrame self.tribal_gdf: gpd.GeoDataFrame + def get_data_sources(self) -> [DataSource]: + return [] # this uses already retrieved / calculated data + @staticmethod def _create_string_from_list(series: pd.Series) -> str: """Helper method that creates a sorted string list (for tribal names).""" @@ -89,7 +93,12 @@ class TribalOverlapETL(ExtractTransformLoad): return percentage_float - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + self.census_tract_gdf = get_tract_geojson() self.tribal_gdf = get_tribal_geojson() diff --git a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py index 7f692603..3ad58a2a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py @@ -4,9 +4,10 @@ import geopandas as gpd import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad): def __init__(self): - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.FILE_URL = ( - f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" - "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_" - "all_data_reported_to_Congress_in_FY2020.geojson" - ) - else: - self.FILE_URL: str = ( - "https://opendata.arcgis.com/api/v3/datasets/" - "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/" - "data?format=geojson&spatialRefId=4326&where=1%3D1" - ) - self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds" # Constants for output @@ -50,17 +38,27 @@ class USArmyFUDS(ExtractTransformLoad): self.INELIGIBLE_FUDS_COUNT_FIELD_NAME, self.ELIGIBLE_FUDS_BINARY_FIELD_NAME, ] - self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson" + self.fuds_source = self.get_sources_path() / "fuds.geojson" self.raw_df: gpd.GeoDataFrame self.output_df: pd.DataFrame - def extract(self) -> None: - download_file_from_url( - file_url=self.FILE_URL, - download_file_name=self.DOWNLOAD_FILE_NAME, - verify=True, - ) + def 
get_data_sources(self) -> [DataSource]: + + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + fuds_url = ( + f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" + "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_" + "all_data_reported_to_Congress_in_FY2020.geojson" + ) + else: + fuds_url: str = ( + "https://opendata.arcgis.com/api/v3/datasets/" + "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/" + "data?format=geojson&spatialRefId=4326&where=1%3D1" + ) + + return [FileDataSource(source=fuds_url, destination=self.fuds_source)] def transform(self) -> None: # before we try to do any transformation, get the tract data @@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad): logger.debug("Loading FUDS data as GeoDataFrame for transform") raw_df = gpd.read_file( - filename=self.DOWNLOAD_FILE_NAME, + filename=self.fuds_source, low_memory=False, ) diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py index 271ba800..7450d754 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py @@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL): data. A basic version of that patching is included here for classes that can use it. """ + data_path, tmp_path = mock_paths + sources_path = data_path / "sources" / self._ETL_CLASS.__name__ + sources_path.mkdir(parents=True, exist_ok=True) + with mock.patch( - "data_pipeline.utils.requests" + "data_pipeline.etl.downloader.requests" ) as requests_mock, mock.patch( + "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path" + ) as sources_mock, mock.patch( "data_pipeline.etl.score.etl_utils.get_state_fips_codes" ) as mock_get_state_fips_codes: - tmp_path = mock_paths[1] + # requests mock def fake_get(url, *args, **kwargs): file_path = url.split("/")[-1] with open( @@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL): return response_mock requests_mock.get = fake_get + + # fips codes mock mock_get_state_fips_codes.return_value = [ x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS ] + + # sources mock + sources_mock.return_value = sources_path + # Instantiate the ETL class. etl = self._get_instance_of_etl_class() # Monkey-patch the temporary directory to the one used in the test etl.TMP_PATH = tmp_path + etl.SOURCES_PATH = data_path / "sources" # Run the extract method. 
etl.extract() + + def fake_get_sources_path() -> pathlib.PosixPath: + return sources_path + + mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path) + return etl def test_init(self, mock_etl, mock_paths): diff --git a/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py index 602be901..f0200f62 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py @@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL): mock_paths=mock_paths, ) df = gpd.read_file( - etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME, + etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME, dtype={etl.GEOID_TRACT_FIELD_NAME: str}, ) assert df.shape[0] == 30 diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/etl.py index 7f78d3e4..d461d913 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/etl.py @@ -5,6 +5,7 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource logger = get_module_logger(__name__) @@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad): self.EXAMPLE_FIELD_NAME, ] + def get_data_sources(self) -> [DataSource]: + return [] + def extract(self): # Pretend to download zip from external URL, write it to CSV. zip_file_path = ( @@ -42,11 +46,11 @@ class ExampleETL(ExtractTransformLoad): ) with zipfile.ZipFile(zip_file_path, "r") as zip_ref: - zip_ref.extractall(self.get_tmp_path()) + zip_ref.extractall(self.get_sources_path()) def transform(self): df: pd.DataFrame = pd.read_csv( - self.get_tmp_path() / "input.csv", + self.get_sources_path() / "input.csv", dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False, ) diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py index 888cb5f1..cf019b17 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py @@ -124,12 +124,18 @@ class TestETL: data. A basic version of that patching is included here for classes that can use it. 
""" + data_path, tmp_path = mock_paths + sources_path = data_path / "sources" / self._ETL_CLASS.__name__ + sources_path.mkdir(parents=True, exist_ok=True) + with mock.patch( - "data_pipeline.utils.requests" + "data_pipeline.etl.downloader.requests" ) as requests_mock, mock.patch( + "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path" + ) as sources_mock, mock.patch( "data_pipeline.etl.score.etl_utils.get_state_fips_codes" ) as mock_get_state_fips_codes: - tmp_path = mock_paths[1] + if self._SAMPLE_DATA_ZIP_FILE_NAME is not None: zip_file_fixture_src = ( self._DATA_DIRECTORY_FOR_TEST @@ -145,6 +151,7 @@ class TestETL: "rb", ) as file: file_contents = file.read() + response_mock = requests.Response() response_mock.status_code = 200 # pylint: disable=protected-access @@ -154,15 +161,25 @@ class TestETL: mock_get_state_fips_codes.return_value = [ x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS ] + + # sources mock + sources_mock.return_value = sources_path + # Instantiate the ETL class. etl = self._get_instance_of_etl_class() # Monkey-patch the temporary directory to the one used in the test etl.TMP_PATH = tmp_path + etl.SOURCES_PATH = data_path / "sources" # Run the extract method. etl.extract() + def fake_get_sources_path() -> pathlib.PosixPath: + return sources_path + + mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path) + return etl def test_init_base(self, mock_etl, mock_paths): @@ -263,17 +280,12 @@ class TestETL: file was unzipped from a "fake" downloaded zip (located in data) in a temporary path. """ if self._SAMPLE_DATA_ZIP_FILE_NAME is not None: - tmp_path = mock_paths[1] - _ = self._setup_etl_instance_and_run_extract( + etl = self._setup_etl_instance_and_run_extract( mock_etl=mock_etl, mock_paths=mock_paths, ) - assert ( - tmp_path - / self._EXTRACT_TMP_FOLDER_NAME - / self._SAMPLE_DATA_FILE_NAME - ).exists() + assert (etl.get_sources_path()).exists() def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths): """Tests the extract method. 
@@ -285,8 +297,11 @@ class TestETL: mock_etl=mock_etl, mock_paths=mock_paths, ) + + data_path, tmp_path = mock_paths + tmp_df = pd.read_csv( - etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME, + etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME, dtype={etl.GEOID_TRACT_FIELD_NAME: str}, ) snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST diff --git a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py index de2c7f8f..51aedce5 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py @@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL): mock_paths=mock_paths, ) tmp_df = pd.read_excel( - etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME, + etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME, dtype={etl.GEOID_TRACT_FIELD_NAME: str}, ) assert tmp_df.shape == (15, 5) diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py index 493c0be2..26ef48bf 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py @@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL): def test_init(self, mock_etl, mock_paths): """Tests that the mock NationalRiskIndexETL class instance was - initiliazed correctly. - - Validates the following conditions: - - self.DATA_PATH points to the "data" folder in the temp directory - - self.TMP_PATH points to the "data/tmp" folder in the temp directory - - self.INPUT_PATH points to the correct path in the temp directory - - self.OUTPUT_PATH points to the correct path in the temp directory + initialized correctly. """ # setup etl = NationalRiskIndexETL() - data_path, tmp_path = mock_paths - input_csv = ( - tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv" - ) # validation - assert etl.INPUT_CSV == input_csv assert etl.GEOID_FIELD_NAME == "GEOID10" assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT" assert etl.NAME == "national_risk_index"
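
Taken together, the converted classes and tests imply a simple per-dataset flow: instantiate the ETL, extract with or without the local source cache, then transform and load. This is a hedged sketch, assuming only the methods shown in this diff (`extract(use_cached_data_sources)`, `transform()`, `load()`); it adds no behavior beyond what the patch defines.

```python
# Illustrative only: run one converted ETL end to end, reusing any source files
# already present under its get_sources_path() directory.
from data_pipeline.etl.sources.national_risk_index.etl import NationalRiskIndexETL

etl = NationalRiskIndexETL()
etl.extract(use_cached_data_sources=True)  # reuse locally cached sources when available
etl.transform()
etl.load()
```

This is also the flow the tests above exercise, with `data_pipeline.etl.downloader.requests` and `get_sources_path` patched so that `extract()` reads fixture files instead of hitting the network.
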