Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs
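As context, here is a minimal sketch of the data-source abstraction the diff below relies on, inferred from the classes it imports (`DataSource`, `FileDataSource`, `ZIPDataSource`). The `fetch()` method name and its bodies are assumptions for illustration only; the real implementations live in `data_pipeline/etl/datasource.py` and may differ.

```python
# Hypothetical sketch of the data-source classes referenced throughout the diff.
from dataclasses import dataclass
from pathlib import Path
import io
import zipfile

import requests


@dataclass
class DataSource:
    """A remote resource an ETL needs, plus where to store it locally."""

    source: str        # URL of the remote resource
    destination: Path  # local path under the ETL's sources directory

    def fetch(self) -> None:
        raise NotImplementedError


@dataclass
class FileDataSource(DataSource):
    """Downloads a single file to `destination`."""

    def fetch(self) -> None:
        self.destination.parent.mkdir(parents=True, exist_ok=True)
        response = requests.get(self.source, timeout=60)
        response.raise_for_status()
        self.destination.write_bytes(response.content)


@dataclass
class ZIPDataSource(DataSource):
    """Downloads a ZIP archive and extracts it into the `destination` directory."""

    def fetch(self) -> None:
        self.destination.mkdir(parents=True, exist_ok=True)
        response = requests.get(self.source, timeout=60)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(self.destination)
```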

* Update code to be more production-ready

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source
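A hedged sketch of what the new downloader could look like once every ETL declares its sources through `get_data_sources()`: collect each ETL class's declared sources and fetch them into their destinations under `get_sources_path()` ahead of a pipeline run. The function name `pre_download_all` and the `fetch()` call are hypothetical; only `get_data_sources()` and `get_sources_path()` appear in the diff.

```python
# Hypothetical pre-download driver; not the PR's actual downloader module.
from typing import Iterable, Type

from data_pipeline.etl.base import ExtractTransformLoad


def pre_download_all(etl_classes: Iterable[Type[ExtractTransformLoad]]) -> None:
    """Fetch every data source declared by the given ETL classes."""
    for etl_class in etl_classes:
        etl = etl_class()
        for data_source in etl.get_data_sources():
            # Each DataSource carries its own URL and local destination
            # (under etl.get_sources_path()), so fetching is uniform here.
            data_source.fetch()
```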

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add Census API keys where missing

* Merge comparator changes from main into this branch for the sake of the PR

* Add a note on using the cache flag (-u) during the pipeline
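One plausible reading of how the cache flag reaches the downloads, given that every converted ETL now calls `super().extract(use_cached_data_sources)` before reading from its sources path: skip the fetch when a cached copy already exists. This is a sketch under those assumptions, not the PR's actual base-class code; the -u pipeline flag mentioned above presumably toggles this boolean.

```python
# Illustrative only; the real logic lives in data_pipeline/etl/base.py.
from pathlib import Path
from typing import List


class CachingExtractMixin:
    """Sketch of how a base extract() might honor use_cached_data_sources."""

    def get_data_sources(self) -> List["DataSource"]:  # provided by each ETL subclass
        raise NotImplementedError

    def extract(self, use_cached_data_sources: bool = False) -> None:
        for data_source in self.get_data_sources():
            destination = Path(data_source.destination)
            if use_cached_data_sources and destination.exists():
                # A cached copy is already on disk; skip the download.
                continue
            data_source.fetch()
```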
Travis Newby, 2023-03-03 12:26:24 -06:00, committed by GitHub
commit 6f39033dde
52 changed files with 1787 additions and 686 deletions

View file

@ -1,23 +1,36 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
"""California environmental screen
TODO: Need good description
"""
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = (
# fetch
self.calenviroscreen_ftp_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/CalEnviroScreen_4.0_2021.zip"
)
self.CALENVIROSCREEN_CSV = (
self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Definining some variable names
# input
self.calenviroscreen_source = (
self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Defining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
"calenviroscreen_percentile"
@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.calenviroscreen_ftp_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
self.CALENVIROSCREEN_FTP_URL,
self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.calenviroscreen_source, dtype={"Census Tract": "string"}
)
def transform(self) -> None:
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(
self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
)
self.df.rename(
columns={
@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad):
def load(self) -> None:
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False)

View file

@ -7,8 +7,9 @@ from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.score.etl_utils import (
compare_to_list_of_expected_state_fips_codes,
)
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -17,59 +18,74 @@ logger = get_module_logger(__name__)
class CDCLifeExpectancy(ExtractTransformLoad):
"""#TODO: create description"""
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
NAME = "cdc_life_expectancy"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
LOAD_YAML_CONFIG: bool = False
LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
STATES_MISSING_FROM_USA_FILE = ["23", "55"]
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
# USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
TRACT_INPUT_COLUMN_NAME = "Tract ID"
STATE_INPUT_COLUMN_NAME = "STATE2KX"
raw_df: pd.DataFrame
output_df: pd.DataFrame
raw_df: pd.DataFrame # result of extraction
output_df: pd.DataFrame # result of transformation
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
# For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
# input
self.usa_source = self.get_sources_path() / "US_A.CSV"
self.maine_source = self.get_sources_path() / "ME_A.CSV"
self.wisconsin_source = self.get_sources_path() / "WI_A.CSV"
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "cdc_life_expectancy"
)
# Constants for output
self.COLUMNS_TO_KEEP = [
self.COLUMNS_TO_KEEP = [ # the columns to save on output
self.GEOID_TRACT_FIELD_NAME,
field_names.LIFE_EXPECTANCY_FIELD,
]
def _download_and_prep_data(
self, file_url: str, download_file_name: pathlib.Path
) -> pd.DataFrame:
download_file_from_url(
file_url=file_url,
download_file_name=download_file_name,
verify=True,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.usa_file_url, destination=self.usa_source
),
FileDataSource(
source=self.maine_file_url, destination=self.maine_source
),
FileDataSource(
source=self.wisconsin_file_url,
destination=self.wisconsin_source,
),
]
def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame:
df = pd.read_csv(
filepath_or_buffer=download_file_name,
filepath_or_buffer=file_name,
dtype={
# The following need to remain as strings for all of their digits, not get converted to numbers.
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
all_usa_raw_df = self._download_and_prep_data(
file_url=self.USA_FILE_URL,
download_file_name=self.get_tmp_path() / "US_A.CSV",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
all_usa_raw_df = self._read_data(self.usa_source)
# Check which states are missing
states_in_life_expectancy_usa_file = list(
@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
)
logger.debug("Downloading data for Maine")
maine_raw_df = self._download_and_prep_data(
file_url=self.MAINE_FILE_URL,
download_file_name=self.get_tmp_path() / "maine.csv",
maine_raw_df = self._read_data(
self.maine_source,
)
logger.debug("Downloading data for Wisconsin")
wisconsin_raw_df = self._download_and_prep_data(
file_url=self.WISCONSIN_FILE_URL,
download_file_name=self.get_tmp_path() / "wisconsin.csv",
)
wisconsin_raw_df = self._read_data(self.wisconsin_source)
combined_df = pd.concat(
objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],

View file

@ -4,14 +4,17 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
class CDCPlacesETL(ExtractTransformLoad):
"""#TODO: Need description"""
NAME = "cdc_places"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad):
CDC_MEASURE_FIELD_NAME = "Measure"
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_PLACES_URL = (
self.cdc_places_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
)
else:
self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
# input
self.places_source = self.get_sources_path() / "census_tract.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
self.COLUMNS_TO_KEEP: typing.List[str] = [
self.GEOID_TRACT_FIELD_NAME,
@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
file_path = download_file_from_url(
file_url=self.CDC_PLACES_URL,
download_file_name=self.get_tmp_path() / "census_tract.csv",
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_places_url, destination=self.places_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=file_path,
filepath_or_buffer=self.places_source,
dtype={self.CDC_GEOID_FIELD_NAME: "string"},
low_memory=False,
)
def transform(self) -> None:
# Rename GEOID field
self.df.rename(
columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},

View file

@ -1,6 +1,8 @@
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -11,22 +13,28 @@ logger = get_module_logger(__name__)
class CDCSVIIndex(ExtractTransformLoad):
"""CDC SVI Index class ingests 2018 dataset located
here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html
Please see the README in this module for further details.
"""
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_SVI_INDEX_URL = (
self.cdc_svi_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_svi_index/SVI2018_US.csv"
)
else:
self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
# input
self.svi_source = self.get_sources_path() / "SVI2018_US.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
self.CDC_RPL_THEMES_THRESHOLD = 0.90
self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS"
self.COLUMNS_TO_KEEP = [
@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_svi_index_url, destination=self.svi_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.CDC_SVI_INDEX_URL,
filepath_or_buffer=self.svi_source,
dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
low_memory=False,
)
@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad):
)
def load(self) -> None:
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)

View file

@ -8,7 +8,8 @@ import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -20,7 +21,7 @@ class GeoFileType(Enum):
class CensusETL(ExtractTransformLoad):
SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
# SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def __init__(self):
self.shape_file_path = self.get_sources_path() / "shp"
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
file_path: Path
if file_type == GeoFileType.SHP:
file_path = Path(
self.SHP_BASE_PATH
self.shape_file_path
/ fips_code
/ f"tl_2010_{fips_code}_tract10.shp"
)
@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
return file_path
def _extract_shp(self, fips_code: str) -> None:
"""Download the SHP file for the provided FIPS code
def get_data_sources(self) -> [DataSource]:
Args:
fips_code (str): the FIPS code for the region of interest
sources = []
Returns:
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
for fips_code in self.STATE_FIPS_CODES:
# check if file exists
if not shp_file_path.is_file():
tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
unzip_file_from_url(
tract_state_url,
self.TMP_PATH,
self.DATA_PATH / "census" / "shp" / fips_code,
destination_path = self.shape_file_path / fips_code
sources.append(
ZIPDataSource(
source=tract_state_url, destination=destination_path
)
)
def extract(self) -> None:
logger.debug("Extracting census data")
for index, fips_code in enumerate(self.STATE_FIPS_CODES):
logger.debug(
f"Extracting shape for FIPS {fips_code} {index+1} of {len(self.STATE_FIPS_CODES)}"
)
self._extract_shp(fips_code)
return sources
def _transform_to_geojson(self, fips_code: str) -> None:
"""Convert the downloaded SHP file for the associated FIPS to geojson

View file

@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
else:
fips = row[0].strip()
fips_state_list.append(fips)
return fips_state_list

View file

@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_imputations import (
calculate_income_measures,
)
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad):
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs.csv"
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
self.EMPLOYMENT_FIELDS = [
@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5",
)
]
# pylint: disable=too-many-arguments
def _merge_geojson(
self,
@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad):
)
)
def extract(self) -> None:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
def extract(self, use_cached_data_sources: bool = False) -> None:
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
)
def transform(self) -> None:

View file

@ -1,10 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad):
"""
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs_2010.csv"
self.ACS_YEAR = 2010
self.ACS_TYPE = "acs5"
self.OUTPUT_PATH = (
@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
self.UNEMPLOYED_FIELDS
@ -107,13 +109,26 @@ class CensusACS2010ETL(ExtractTransformLoad):
+ self.POVERTY_FIELDS
)
# Use the method defined on CensusACSETL to reduce coding redundancy.
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source, dtype={"GEOID10_TRACT": "string"}
)
def transform(self) -> None:

View file

@ -1,14 +1,16 @@
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
/ f"census_acs_median_income_{self.ACS_YEAR}"
)
self.GEOCORR_ALL_STATES_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip"
)
self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr"
self.GEOCORR_ALL_STATES_SOURCE = (
self.GEOCORR_ALL_STATES_PATH
/ "geocorr2014_all_states_tracts_only.csv"
)
# Set constants for Geocorr MSAs data.
self.PLACE_FIELD_NAME: str = "Census Place Name"
self.COUNTY_FIELD_NAME: str = "County Name"
@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E"
+ "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area"
)
self.MSA_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "msa" / "msa_median_income.json"
)
self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)"
# Set constants for state median incomes
self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state"
self.STATE_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "state" / "state_median_income.json"
)
self.STATE_GEOID_FIELD_NAME: str = "GEOID2"
self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)"
@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.PUERTO_RICO_S3_LINK: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv"
)
self.PUERTO_RICO_ALL_STATES_SOURCE = (
self.get_sources_path() / "pr_tracts" / "pr_tracts.csv"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.MSA_MEDIAN_INCOME_URL = (
self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
self.STATE_MEDIAN_INCOME_URL = (
self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
# Constants for output
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.state_median_incomes: dict
self.pr_tracts: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.GEOCORR_ALL_STATES_URL,
destination=self.GEOCORR_ALL_STATES_PATH,
),
FileDataSource(
source=self.PUERTO_RICO_S3_LINK,
destination=self.PUERTO_RICO_ALL_STATES_SOURCE,
),
FileDataSource(
source=self.MSA_MEDIAN_INCOME_URL,
destination=self.MSA_MEDIAN_INCOME_SOURCE,
),
FileDataSource(
source=self.STATE_MEDIAN_INCOME_URL,
destination=self.STATE_MEDIAN_INCOME_SOURCE,
),
]
def _transform_geocorr(self) -> pd.DataFrame:
# Transform the geocorr data
geocorr_df = self.raw_geocorr_df
@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
)
return state_median_incomes_df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
# Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run:
@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro)
logger.debug("Starting download of 1.5MB Geocorr information.")
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "geocorr",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_geocorr_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "geocorr"
/ "geocorr2014_all_states_tracts_only.csv",
filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE,
# Skip second row, which has descriptions.
skiprows=[1],
# The following need to remain as strings for all of their digits, not get converted to numbers.
@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False,
)
logger.debug("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(
file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
)
self.pr_tracts = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "pr_tracts"
/ "pr_tracts.csv",
filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={"GEOID10_TRACT": str},
low_memory=False,
)
self.pr_tracts["State Abbreviation"] = "PR"
# Download MSA median incomes
logger.debug("Starting download of MSA median incomes.")
download = requests.get(
self.MSA_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.msa_median_incomes = json.loads(download.content)
with self.MSA_MEDIAN_INCOME_SOURCE.open() as source:
self.msa_median_incomes = json.load(source)
# Download state median incomes
logger.debug("Starting download of state median incomes.")
download_state = requests.get(
self.STATE_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.state_median_incomes = json.loads(download_state.content)
with self.STATE_MEDIAN_INCOME_SOURCE.open() as source:
self.state_median_incomes = json.load(source)
## NOTE we already have PR's MI here
def transform(self) -> None:

View file

@ -1,13 +1,14 @@
import json
from typing import List
import os
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
pd.options.mode.chained_assignment = "raise"
@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad):
+ "&for=tract:*&in=state:{}%20county:{}"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.API_URL = self.API_URL + f"&key={census_api_key}"
self.final_race_fields: List[str] = []
self.df: pd.DataFrame
self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame
def extract(self) -> None:
dfs = []
dfs_vi = []
def get_data_sources(self) -> [DataSource]:
sources = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
api_url = self.API_URL.format(
self.DECENNIAL_YEAR,
island["state_abbreviation"],
@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad):
island["fips"],
county,
)
logger.debug(f"CENSUS: Requesting {api_url}")
download = requests.get(
api_url,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
sources.append(
FileDataSource(
source=api_url,
destination=self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json",
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
dfs = []
dfs_vi = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
try:
df = json.loads(download.content)
filepath = (
self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json"
)
df = json.load(filepath.open())
except ValueError as e:
logger.error(
f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
f"Could not load content in census decennial ETL because {e}."
)
# First row is the header

View file

@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.SOURCE_URL = (
self.child_opportunity_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"child_opportunity_index/raw.zip"
)
else:
self.SOURCE_URL = (
self.child_opportunity_url = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv"
)
# input
self.child_opportunity_index_source = (
self.get_sources_path() / "raw.csv"
)
# output
# TODO: Decide about nixing this
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
self.READING_INPUT_FIELD = "ED_READING"
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.child_opportunity_url,
destination=self.get_sources_path(),
)
]
def transform(self) -> None:
raw_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "raw.csv",
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.child_opportunity_index_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
dtype={
@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad):
low_memory=False,
)
output_df = raw_df.rename(
def transform(self) -> None:
output_df = self.raw_df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD,

View file

@ -5,22 +5,35 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class DOEEnergyBurden(ExtractTransformLoad):
NAME = "doe_energy_burden"
SOURCE_URL: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
REVISED_ENERGY_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
self.doe_energy_burden_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
# input
self.doe_energy_burden_source = (
self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden"
)
@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def transform(self) -> None:
raw_df: pd.DataFrame = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.doe_energy_burden_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.doe_energy_burden_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad):
low_memory=False,
)
def transform(self) -> None:
logger.debug("Renaming columns and ensuring output format is correct")
output_df = raw_df.rename(
output_df = self.raw_df.rename(
columns={
self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,

View file

@ -3,6 +3,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad):
NAME = "travel_composite"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad):
TRAVEL_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.travel_composite_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
# input
# define the full path for the input CSV file
self.INPUT_SHP = (
self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
self.disadvantage_layer_shape_source = (
self.get_sources_path()
/ "DOT_Disadvantage_Layer_Final_April2022.shp"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_dot: pd.DataFrame
# Start dataset-specific vars here
## Average of Transportation Indicator Percentiles (calculated)
## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad):
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.travel_composite_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad):
- Converts to CSV
"""
# read in the unzipped shapefile from data source
# reformat it to be standard df, remove unassigned rows, and
# then rename the Census Tract column for merging
df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
df_dot = df_dot.rename(
self.df_dot = self.df_dot.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
}
).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
# Assign the final df to the class' output_df for the load method
self.output_df = df_dot
self.output_df = self.df_dot

View file

@ -1,12 +1,15 @@
from pathlib import Path
import geopandas as gpd
import pandas as pd
import geopandas as gpd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
logger = get_module_logger(__name__)
@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad):
"55",
]
# Define these for easy code completion
def __init__(self):
self.SOURCE_URL = (
# fetch
self.eamlis_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/eAMLIS export of all data.tsv.zip"
)
# input
self.eamlis_source = (
self.get_sources_path() / "eAMLIS export of all data.tsv"
)
# output
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
self.OUTPUT_PATH: Path = (
@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad):
]
self.output_df: pd.DataFrame
self.df: pd.DataFrame
def transform(self) -> None:
df = pd.read_csv(
self.get_tmp_path() / "eAMLIS export of all data.tsv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.eamlis_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.eamlis_source,
sep="\t",
low_memory=False,
)
def transform(self) -> None:
gdf = gpd.GeoDataFrame(
df,
self.df,
geometry=gpd.points_from_xy(
x=df["Longitude"],
y=df["Latitude"],
x=self.df["Longitude"],
y=self.df["Latitude"],
),
crs="epsg:4326",
)
@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad):
gdf_tracts = add_tracts_for_geometries(gdf)
gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
gdf_tracts[self.AML_BOOLEAN] = True
self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]

View file

@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad):
INPUT_GEOID_TRACT_FIELD_NAME: str = "ID"
def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
self.EJSCREEN_CSV = (
self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv"
# fetch
self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
# input
self.ejscreen_source = (
self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen"
self.df: pd.DataFrame
self.COLUMNS_TO_KEEP = [
@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad):
field_names.UST_FIELD,
]
def extract(self) -> None:
super().extract(
self.EJSCREEN_FTP_URL,
self.get_tmp_path(),
verify=False, # EPA EJScreen end point has certificate issues often
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.ejscreen_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
self.df = pd.read_csv(
self.EJSCREEN_CSV,
self.ejscreen_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
# EJSCREEN writes the word "None" for NA data.
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
# rename ID to Tract ID
self.output_df = self.df.rename(
columns={

View file

@ -1,5 +1,6 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# Note: while we normally set these properties in `__init__`,
# we are setting them as class properties here so they can be accessed by the
# class method `ejscreen_areas_of_concern_data_exists`.
LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
EJSCREEN_AREAS_OF_CONCERN_SOURCE = (
ExtractTransformLoad.DATA_PATH
/ "sources"
/ "EJSCREENAreasOfConcernETL"
/ "ejscreen_areas_of_concerns_indicators.csv"
)
def __init__(self):
# output
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
)
@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
"""The source for this must be downloaded and saved manually. It is not publicly available"""
return []
@classmethod
def ejscreen_areas_of_concern_data_exists(cls):
"""Check whether or not the EJSCREEN areas of concern data exists.
@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
not reference this data.
"""
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file()
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file()
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE)
if self.ejscreen_areas_of_concern_data_exists():
logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
self.df = pd.read_csv(
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE,
dtype={
self.GEOID_FIELD_NAME: "string",
},

View file

@ -5,18 +5,27 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
def __init__(self):
self.DEFINITION_ALTERNATIVE_FILE_URL = (
# fetch
self.definition_alternative_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/alternative DAC definition.csv.zip"
)
# input
self.definition_alternative_source = (
self.get_sources_path() / "J40 alternative DAC definition.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
)
@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path()
/ "energy_definition_alternative_draft",
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.definition_alternative_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "energy_definition_alternative_draft"
/ "J40 alternative DAC definition.csv",
filepath_or_buffer=self.definition_alternative_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
)
def transform(self) -> None:
self.df = self.df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,8 +4,9 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
)
else:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
"http://abt-rsei.s3.amazonaws.com/microdata2019/"
"census_agg/CensusMicroTracts2019_2019_aggregated.zip"
)
# input
self.aggregated_rsei_score_source = (
self.get_sources_path()
/ "CensusMicroTracts2019_2019_aggregated.csv"
)
# output
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.aggregated_rsei_score_file_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# the column headers from the above dataset are actually a census tract's data at this point
# We will use this data structure later to specify the column names
input_columns = [
@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.NCSCORE_INPUT_FIELD,
]
unzip_file_from_url(
file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "epa_rsei",
)
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "epa_rsei"
/ "CensusMicroTracts2019_2019_aggregated.csv",
filepath_or_buffer=self.aggregated_rsei_score_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
low_memory=False,

View file

@ -5,6 +5,8 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad):
NAME = "fsf_flood_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv"
# fetch
self.flood_tract_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
)
# this is the main dataframe
self.df: pd.DataFrame
# input
self.flood_tract_source = (
self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv"
)
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad):
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
self.CLIP_PROPERTIES_COUNT = 250
self.df_fsf_flood: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.flood_tract_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv data source then rename the
# Census Tract column for merging
self.df_fsf_flood = pd.read_csv(
self.flood_tract_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad):
- Calculates share of properties at risk, left-clipping number of properties at 250
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_flood: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[
self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[
self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY
] = (
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
df_fsf_flood[
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_flood.rename(
self.output_df = self.df_fsf_flood.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,

View file

@ -4,6 +4,8 @@ import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
NAME = "fsf_wildfire_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv"
# fetch
self.fsf_fire_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
)
# input
self.fsf_fire_source = (
self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_fsf_fire: pd.DataFrame
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag"
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag"
self.CLIP_PROPERTIES_COUNT = 250
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.fsf_fire_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_fsf_fire = pd.read_csv(
self.fsf_fire_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad):
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_fire: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[
self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[
self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS
] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_fire.rename(
self.output_df = self.df_fsf_fire.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS,

View file

@ -3,17 +3,33 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class GeoCorrETL(ExtractTransformLoad):
NAME = "geocorr"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.geocorr_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip"
)
# input
self.geocorr_source = (
self.get_sources_path() / "geocorr_urban_rural.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"
# Need to change hyperlink to S3
@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad):
# The source data for this notebook was downloaded from GeoCorr;
# the instructions for generating the source data is here:
# https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787
self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
# self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.COLUMNS_TO_KEEP = [
@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.geocorr_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv",
filepath_or_buffer=self.geocorr_source,
dtype={
self.GEOCORR_GEOID_FIELD_NAME: "string",
},

View file

@ -3,12 +3,16 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HistoricRedliningETL(ExtractTransformLoad):
NAME = "historic_redlining"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
EXPECTED_MISSING_STATES = [
"10",
@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad):
]
PUERTO_RICO_EXPECTED_IN_DATA = False
ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"
self.HISTORIC_REDLINING_FILE_PATH = (
self.get_tmp_path() / "HRS_2010.xlsx"
)
# fetch
self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
# input
self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx"
self.REDLINING_SCALAR = "Tract-level redlining score"
@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad):
self.GEOID_TRACT_FIELD_NAME,
self.REDLINING_SCALAR,
]
self.df: pd.DataFrame
self.historic_redlining_data: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.hrs_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.historic_redlining_data = pd.read_excel(self.hrs_source)
def transform(self) -> None:
# this is obviously temporary
historic_redlining_data = pd.read_excel(
self.HISTORIC_REDLINING_FILE_PATH
self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data = historic_redlining_data.rename(
self.historic_redlining_data = self.historic_redlining_data.rename(
columns={"HRS2010": self.REDLINING_SCALAR}
)
logger.debug(f"{historic_redlining_data.columns}")
logger.debug(f"{self.historic_redlining_data.columns}")
# Calculate lots of different score thresholds for convenience
for threshold in [3.25, 3.5, 3.75]:
historic_redlining_data[
self.historic_redlining_data[
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
] = (historic_redlining_data[self.REDLINING_SCALAR] >= threshold)
] = (
self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold
)
## NOTE: We add to the columns to keep here
self.COLUMNS_TO_KEEP.append(
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
)
self.output_df = historic_redlining_data
self.output_df = self.historic_redlining_data

View file

@ -1,8 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from pandas.errors import EmptyDataError
logger = get_module_logger(__name__)
@ -10,36 +11,46 @@ logger = get_module_logger(__name__)
class HousingTransportationETL(ExtractTransformLoad):
def __init__(self):
self.HOUSING_FTP_URL = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "housing_and_transportation_index"
)
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
housing_url = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
sources = []
for fips in get_state_fips_codes(self.DATA_PATH):
sources.append(
ZIPDataSource(
source=f"{housing_url}{fips}",
destination=self.get_sources_path(),
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Download each state / territory individually
dfs = []
zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
for fips in get_state_fips_codes(self.DATA_PATH):
logger.debug(
f"Downloading housing data for state/territory with FIPS code {fips}"
)
unzip_file_from_url(
f"{self.HOUSING_FTP_URL}{fips}",
self.get_tmp_path(),
zip_file_dir,
)
# New file name:
tmp_csv_file_path = (
zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv"
csv_source = (
self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv"
)
try:
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
tmp_df = pd.read_csv(filepath_or_buffer=csv_source)
except EmptyDataError:
logger.error(
f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"

View file

@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HudHousingETL(ExtractTransformLoad):
NAME = "hud_housing"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
def __init__(self):
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HOUSING_FTP_URL = (
self.housing_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_housing/2014thru2018-140-csv.zip"
)
else:
self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
# source
# output
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.HOUSING_FTP_URL,
self.HOUSING_ZIP_FILE_DIR,
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.housing_url, destination=self.get_sources_path()
)
]
def _read_chas_table(self, file_name):
# New file name:
tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name
tmp_csv_file_path = self.get_sources_path() / "140" / file_name
tmp_df = pd.read_csv(
filepath_or_buffer=tmp_csv_file_path,
encoding="latin-1",
@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad):
return tmp_df
def transform(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
table_8 = self._read_chas_table("Table8.csv")
table_3 = self._read_chas_table("Table3.csv")
@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad):
table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME
)
def transform(self) -> None:
# Calculate share that lacks indoor plumbing or kitchen
# This is computed as
# (

View file

@ -1,7 +1,9 @@
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.utils import get_module_logger
@ -11,44 +13,51 @@ logger = get_module_logger(__name__)
class HudRecapETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
else:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
"https://opendata.arcgis.com/api/v3/datasets/"
"56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
)
self.HUD_RECAP_CSV = (
self.get_tmp_path()
# input
self.hud_recap_source = (
self.get_sources_path()
/ "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
# Definining some variable names
# Defining some variable names
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
"hud_recap_priority_community"
)
self.df: pd.DataFrame
def extract(self) -> None:
download = requests.get(
self.HUD_RECAP_CSV_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
file_contents = download.content
csv_file = open(self.HUD_RECAP_CSV, "wb")
csv_file.write(file_contents)
csv_file.close()
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.hud_recap_csv_url, destination=self.hud_recap_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})
def transform(self) -> None:
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
self.df.rename(
columns={
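# The two data-source flavors used in this commit, reduced to a sketch on a
# hypothetical ExtractTransformLoad subclass (URLs and file names are
# placeholders): a FileDataSource is written to the exact destination file
# path, while a ZIPDataSource is downloaded and then extracted into the
# destination directory.
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleMixedSourcesETL(ExtractTransformLoad):
    def get_data_sources(self) -> [DataSource]:
        return [
            FileDataSource(
                source="https://example.com/data.csv",  # placeholder
                destination=self.get_sources_path() / "data.csv",  # exact file path
            ),
            ZIPDataSource(
                source="https://example.com/archive.zip",  # placeholder
                destination=self.get_sources_path(),  # directory to extract into
            ),
        ]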

View file

@ -2,6 +2,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -10,16 +12,25 @@ logger = get_module_logger(__name__)
class MappingForEJETL(ExtractTransformLoad):
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
self.MAPPING_FOR_EJ_VA_URL = (
# fetch
self.mapping_for_ej_va_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
)
self.MAPPING_FOR_EJ_CO_URL = (
self.mapping_for_ej_co_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
)
self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"
# input
self.va_shp_file_source = (
self.get_sources_path() / "mej_virginia_7_1.shp"
)
self.co_shp_file_source = (
self.get_sources_path() / "mej_colorado_final.shp"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
# Defining variables
self.COLUMNS_TO_KEEP = [
@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.MAPPING_FOR_EJ_VA_URL,
self.get_tmp_path(),
)
super().extract(
self.MAPPING_FOR_EJ_CO_URL,
self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.mapping_for_ej_va_url,
destination=self.get_sources_path(),
),
ZIPDataSource(
source=self.mapping_for_ej_co_url,
destination=self.get_sources_path(),
),
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
# Join (here, it's just concatenating) the two dataframes from
# CO and VA
self.df = pd.concat(
[
gpd.read_file(self.VA_SHP_FILE_PATH),
gpd.read_file(self.CO_SHP_FILE_PATH),
gpd.read_file(self.va_shp_file_source),
gpd.read_file(self.co_shp_file_source),
]
)
def transform(self) -> None:
# Fill Census tract to get it to be 11 digits, incl. leading 0s
# Note that VA and CO should never have leading 0s, so this isn't
# strictly necessary, but if in the future, there are more states

View file

@ -3,8 +3,9 @@ import pathlib
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad):
Information on the mapping of this data to census tracts is available at
https://github.com/americanpanorama/Census_HOLC_Research.
"""
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"mapping_inequality/holc_tract_lookup.csv"
)
else:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
"https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
"main/2010_Census_Tracts/holc_tract_lookup.csv"
)
self.MAPPING_INEQUALITY_CSV = (
self.get_tmp_path() / "holc_tract_lookup.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
self.HOLC_MANUAL_MAPPING_CSV_PATH = (
# input
self.mapping_inequality_source = (
self.get_sources_path() / "holc_tract_lookup.csv"
)
self.holc_manual_mapping_source = (  # here be dragons: this file is pulled from a different place than most
pathlib.Path(__file__).parent
/ "data"
/ "holc_grades_manually_mapped.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
# Some input field names. From documentation: 'Census Tracts were intersected
# with HOLC Polygons. Census information can be joined via the "geoid" field.
# There are two field "holc_prop" and "tract_prop" which give the proportion
@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.holc_manually_mapped_df: pd.DataFrame
def extract(self) -> None:
download_file_from_url(
file_url=self.MAPPING_INEQUALITY_CSV_URL,
download_file_name=self.MAPPING_INEQUALITY_CSV,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.mapping_inequality_csv_url,
destination=self.mapping_inequality_source,
)
]
def transform(self) -> None:
df: pd.DataFrame = pd.read_csv(
self.MAPPING_INEQUALITY_CSV,
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.mapping_inequality_source,
dtype={self.TRACT_INPUT_FIELD: "string"},
low_memory=False,
)
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
self.holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.holc_manual_mapping_source,
low_memory=False,
)
def transform(self) -> None:
# rename Tract ID
df.rename(
self.df.rename(
columns={
self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
},
@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad):
# Keep the first character, which is the HOLC grade (A, B, C, D).
# TODO: investigate why this dataframe triggers these pylint errors.
# pylint: disable=unsupported-assignment-operation, unsubscriptable-object
df[self.HOLC_GRADE_DERIVED_FIELD] = df[
self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[
self.HOLC_GRADE_AND_ID_FIELD
].str[0:1]
# Remove nonsense when the field has no grade or invalid grades.
valid_grades = ["A", "B", "C", "D"]
df.loc[
self.df.loc[
# pylint: disable=unsubscriptable-object
~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
self.HOLC_GRADE_DERIVED_FIELD,
] = None
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
low_memory=False,
)
# Join on the existing data
merged_df = df.merge(
right=holc_manually_mapped_df,
merged_df = self.df.merge(
right=self.holc_manually_mapped_df,
on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
how="left",
)
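# The shape of the MappingInequality change above, sketched on a hypothetical
# ExtractTransformLoad subclass with placeholder attribute and column names:
# all file reads now live in extract() next to the download step, and
# transform() only reshapes DataFrames that are already in memory.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad


class ExampleSplitETL(ExtractTransformLoad):
    def extract(self, use_cached_data_sources: bool = False) -> None:
        super().extract(
            use_cached_data_sources
        )  # download and extract data sources
        self.df = pd.read_csv(self.primary_source, low_memory=False)  # placeholder path
        self.lookup_df = pd.read_csv(self.lookup_source, low_memory=False)  # placeholder path

    def transform(self) -> None:
        # in-memory work only from here on
        self.df = self.df.merge(self.lookup_df, on="join_key", how="left")  # placeholder key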

View file

@ -4,6 +4,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MARYLAND_EJSCREEN_URL = (
# fetch
self.maryland_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
)
self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
# input
self.shape_files_source = self.get_sources_path() / "mdejscreen"
# output
self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"
self.COLUMNS_TO_KEEP = [
@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.dfs_list: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.maryland_ejscreen_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
def extract(self) -> None:
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
super().extract(
self.MARYLAND_EJSCREEN_URL,
self.get_tmp_path(),
)
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))
# Ignore counties becauses this is not the level of measurement
# Ignore counties because this is not the level of measurement
# that is consistent with our current scoring and ranking methodology.
dfs_list = [
self.dfs_list = [
gpd.read_file(f)
for f in list_of_files
if not f.endswith("CountiesEJScore.shp")
]
def transform(self) -> None:
# Set the Census tract as the index and drop the geometry column
# that produces the census tract boundaries.
# The latter is because Geopandas raises an exception if there
# are duplicate geometry columns.
# Moreover, since the unit of measurement is at the tract level
# we can consistently merge this with other datasets
dfs_list = [
self.dfs_list = [
df.set_index("Census_Tra").drop("geometry", axis=1)
for df in dfs_list
for df in self.dfs_list
]
# pylint: disable=unsubscriptable-object
self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))
# Reset index so that we no longer have the tract as our index
self.df = self.df.reset_index()

View file

@ -1,6 +1,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MICHIGAN_EJSCREEN_S3_URL = (
# fetch
self.michigan_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/michigan_ejscore_12212021.csv"
)
# input
self.michigan_ejscreen_source = (
self.get_sources_path() / "michigan_ejscore_12212021.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"
self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75
self.COLUMNS_TO_KEEP = [
@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.michigan_ejscreen_url,
destination=self.michigan_ejscreen_source,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
filepath_or_buffer=self.michigan_ejscreen_source,
dtype={"GEO_ID": "string"},
low_memory=False,
)
def transform(self) -> None:
self.df.rename(
columns={
"GEO_ID": self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,6 +4,8 @@
# pylint: disable=unsupported-assignment-operation
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
NAME = "national_risk_index"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
SOURCE_URL = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad):
AGRIVALUE_LOWER_BOUND = 408000
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.risk_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
self.risk_index_url = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
# source
self.risk_index_source = (
self.get_sources_path() / "NRI_Table_CensusTracts.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_nri: pd.DataFrame
# Start dataset-specific vars here
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad):
self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"
def extract(self) -> None:
"""Unzips NRI dataset from the FEMA data source and writes the files
to the temporary data folder for use in the transform() method
"""
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.risk_index_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
self.df_nri = pd.read_csv(
self.risk_index_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
Groups inside of that Tract
"""
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
df_nri: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
df_nri.rename(
self.df_nri.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad):
agriculture_columns = [
f"{x}_EALA"
for x in disaster_categories
if f"{x}_EALA" in list(df_nri.columns)
if f"{x}_EALA" in list(self.df_nri.columns)
]
population_columns = [
f"{x}_EALP"
for x in disaster_categories
if f"{x}_EALP" in list(df_nri.columns)
if f"{x}_EALP" in list(self.df_nri.columns)
]
buildings_columns = [
f"{x}_EALB"
for x in disaster_categories
if f"{x}_EALB" in list(df_nri.columns)
if f"{x}_EALB" in list(self.df_nri.columns)
]
disaster_population_sum_series = df_nri[population_columns].sum(axis=1)
disaster_agriculture_sum_series = df_nri[agriculture_columns].sum(
disaster_population_sum_series = self.df_nri[population_columns].sum(
axis=1
)
disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1)
disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum(
axis=1
)
disaster_buildings_sum_series = self.df_nri[buildings_columns].sum(
axis=1
)
# Population EAL Rate = Eal Valp / Population
df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
disaster_population_sum_series
/ df_nri[self.POPULATION_INPUT_FIELD_NAME]
/ self.df_nri[self.POPULATION_INPUT_FIELD_NAME]
)
# Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000)
## FORMULA ADJUSTMENT 2/17
## Because AGRIVALUE contains a lot of 0s, we are going to consider
## 90th percentile only for places that have some agrivalue at all
df_nri[
self.df_nri[
self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
] = disaster_agriculture_sum_series / df_nri[
] = disaster_agriculture_sum_series / self.df_nri[
self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME
].clip(
lower=self.AGRIVALUE_LOWER_BOUND
@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad):
## Check that this clip worked -- that the only place the value has changed is when the clip took effect
base_expectation = (
disaster_agriculture_sum_series
/ df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
)
assert (
df_nri[
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
<= self.AGRIVALUE_LOWER_BOUND
@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
)
assert (
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
).sum() > 0, "Clipping the agrivalue did nothing!"
# This produces a boolean that is True in the case of non-zero agricultural value
df_nri[self.CONTAINS_AGRIVALUE] = (
df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
self.df_nri[self.CONTAINS_AGRIVALUE] = (
self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
)
# divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)).
df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
disaster_buildings_sum_series
/ df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
)
# Round all float columns to just 10 digits.
# Note: `round` is smart enough to only apply to float columns.
df_nri = df_nri.round(10)
self.df_nri = self.df_nri.round(10)
# Assign the final df to the class' output_df for the load method
self.output_df = df_nri
self.output_df = self.df_nri
def load(self) -> None:
# Suppress scientific notation.

View file

@ -3,6 +3,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
"""ETL class for the Nature Deprived Communities dataset"""
NAME = "nlcd_nature_deprived"
SOURCE_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad):
TRACT_PERCENT_CROPLAND_FIELD_NAME: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
# fetch
self.nature_deprived_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
# source
# define the full path for the input CSV file
self.nature_deprived_source = (
self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_ncld: pd.DataFrame
# Start dataset-specific vars here
self.PERCENT_NATURAL_FIELD_NAME = "PctNatural"
self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv"
@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad):
# for area. This does indeed remove tracts from the 90th+ percentile later on
self.TRACT_ACRES_LOWER_BOUND = 35
def transform(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.nature_deprived_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
- Renames columns as needed
"""
df_ncld: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_ncld = pd.read_csv(
self.nature_deprived_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND
def transform(self) -> None:
self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
self.df_ncld[self.TRACT_ACRES_FIELD_NAME]
>= self.TRACT_ACRES_LOWER_BOUND
)
df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_ncld.rename(
self.output_df = self.df_ncld.rename(
columns={
self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME,
self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME,

View file

@ -3,9 +3,10 @@ import functools
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.poverty_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip"
)
# source
self.poverty_sources = [
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_1990_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2000_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2010_sample.csv",
]
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
# Need to change hyperlink to S3
@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.poverty_url, destination=self.get_sources_path()
)
]
def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
df = functools.reduce(
lambda df_a, df_b: pd.merge(
@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad):
return df
def extract(self) -> None:
unzipped_file_path = self.get_tmp_path()
def extract(self, use_cached_data_sources: bool = False) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/LTDB_Std_All_Sample.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=unzipped_file_path,
)
file_names = [
"ltdb_std_1990_sample.csv",
"ltdb_std_2000_sample.csv",
"ltdb_std_2010_sample.csv",
]
super().extract(
use_cached_data_sources
) # download and extract data sources
temporary_input_dfs = []
for file_name in file_names:
for file_name in self.poverty_sources:
temporary_input_df = pd.read_csv(
filepath_or_buffer=unzipped_file_path
/ f"ltdb_std_all_sample/{file_name}",
filepath_or_buffer=file_name,
dtype={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",

View file

@ -1,6 +1,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"""
def __init__(self):
self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
# input
self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv"
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
self.df: gpd.GeoDataFrame
self.tes_state_dfs = []
# config
self.states = [
"al",
"az",
@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"wy",
]
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
sources = []
for state in self.states:
super().extract(
f"{self.TES_URL}{state}.zip.zip",
f"{self.get_tmp_path()}/{state}",
sources.append(
ZIPDataSource(
source=f"{tes_url}{state}.zip.zip",
destination=self.get_sources_path() / state,
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
for state in self.states:
self.tes_state_dfs.append(
gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp")
)
def transform(self) -> None:
tes_state_dfs = []
for state in self.states:
tes_state_dfs.append(
gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
)
self.df = gpd.GeoDataFrame(
pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs
)
# rename ID to Tract ID

View file

@ -4,63 +4,57 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
class TribalETL(ExtractTransformLoad):
def __init__(self):
self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.GEOGRAPHIC_BASE_PATH = (
self.DATA_PATH / "tribal" / "geographic_data"
)
self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.NATIONAL_TRIBAL_GEOJSON_PATH = (
self.GEOGRAPHIC_BASE_PATH / "usa.json"
)
self.USA_TRIBAL_DF_LIST = []
def extract(self) -> None:
"""Extract the tribal geojson zip files from Justice40 S3 data folder
def get_data_sources(self) -> [DataSource]:
Returns:
None
"""
bia_shapefile_zip_url = (
national_lar_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_National_LAR_updated_20220929.zip"
)
tsa_and_aian_geojson_zip_url = (
tsa_and_aian_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_TSA_and_AIAN_json.zip"
)
alaska_geojson_url = (
alaska_native_villages_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/Alaska_Native_Villages_json.zip"
)
unzip_file_from_url(
bia_shapefile_zip_url,
self.TMP_PATH,
self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
)
unzip_file_from_url(
tsa_and_aian_geojson_zip_url,
self.TMP_PATH,
self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
)
unzip_file_from_url(
alaska_geojson_url,
self.TMP_PATH,
self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
)
return [
ZIPDataSource(
national_lar_url,
destination=self.get_sources_path() / "bia_national_lar",
),
ZIPDataSource(
source=tsa_and_aian_url,
destination=self.get_sources_path() / "tsa_and_aian",
),
ZIPDataSource(
source=alaska_native_villages_url,
destination=self.get_sources_path() / "alaska_native_villages",
),
]
def _transform_bia_national_lar(self, path: Path) -> None:
"""Transform the Tribal BIA National Lar Geodataframe and appends it to the
@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad):
"""
# Set the filepaths:
bia_national_lar_shapefile = (
self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
self.get_sources_path() / "bia_national_lar"
)
bia_aian_supplemental_geojson = (
self.GEOGRAPHIC_BASE_PATH
self.get_sources_path()
/ "tsa_and_aian"
/ "BIA_AIAN_Supplemental.json"
)
bia_tsa_geojson = (
self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json"
)
alaska_native_villages_geojson = (
self.GEOGRAPHIC_BASE_PATH
self.get_sources_path()
/ "alaska_native_villages"
/ "AlaskaNativeVillages.gdb.geojson"
)
@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad):
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
# note this works a little different than many of the ETLs. The file
# being written here is used again downstream, so it's placed in a
# special directory.
logger.debug("Writing national geojson file")
self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True)
usa_tribal_df.to_file(
self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
)
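# When a single ETL pulls several archives, as TribalETL does above, each
# ZIPDataSource gets its own subfolder under get_sources_path() so the
# extracted files stay grouped by dataset. Sketched with placeholder names
# and URLs on a hypothetical subclass:
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleMultiArchiveETL(ExtractTransformLoad):
    def get_data_sources(self) -> [DataSource]:
        return [
            ZIPDataSource(
                source="https://example.com/dataset_a.zip",  # placeholder
                destination=self.get_sources_path() / "dataset_a",
            ),
            ZIPDataSource(
                source="https://example.com/dataset_b.zip",  # placeholder
                destination=self.get_sources_path() / "dataset_b",
            ),
        ]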

View file

@ -4,6 +4,7 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad):
self.census_tract_gdf: gpd.GeoDataFrame
self.tribal_gdf: gpd.GeoDataFrame
def get_data_sources(self) -> [DataSource]:
return [] # this uses already retrieved / calculated data
@staticmethod
def _create_string_from_list(series: pd.Series) -> str:
"""Helper method that creates a sorted string list (for tribal names)."""
@ -89,7 +93,12 @@ class TribalOverlapETL(ExtractTransformLoad):
return percentage_float
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.census_tract_gdf = get_tract_geojson()
self.tribal_gdf = get_tribal_geojson()

View file

@ -4,9 +4,10 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad):
def __init__(self):
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.FILE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
"all_data_reported_to_Congress_in_FY2020.geojson"
)
else:
self.FILE_URL: str = (
"https://opendata.arcgis.com/api/v3/datasets/"
"3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
"data?format=geojson&spatialRefId=4326&where=1%3D1"
)
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"
# Constants for output
@ -50,17 +38,27 @@ class USArmyFUDS(ExtractTransformLoad):
self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
]
self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"
self.fuds_source = self.get_sources_path() / "fuds.geojson"
self.raw_df: gpd.GeoDataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
download_file_from_url(
file_url=self.FILE_URL,
download_file_name=self.DOWNLOAD_FILE_NAME,
verify=True,
)
def get_data_sources(self) -> [DataSource]:
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
fuds_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
"all_data_reported_to_Congress_in_FY2020.geojson"
)
else:
fuds_url: str = (
"https://opendata.arcgis.com/api/v3/datasets/"
"3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
"data?format=geojson&spatialRefId=4326&where=1%3D1"
)
return [FileDataSource(source=fuds_url, destination=self.fuds_source)]
def transform(self) -> None:
# before we try to do any transformation, get the tract data
@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad):
logger.debug("Loading FUDS data as GeoDataFrame for transform")
raw_df = gpd.read_file(
filename=self.DOWNLOAD_FILE_NAME,
filename=self.fuds_source,
low_memory=False,
)
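# Several ETLs in this commit (HUD housing, HUD RECAP, NRI, US Army FUDS)
# choose their fetch URL with settings.DATASOURCE_RETRIEVAL_FROM_AWS, using the
# Justice40 S3 mirror when it is set and the upstream provider otherwise.
# USArmyFUDS resolves this inside get_data_sources(); sketched here on a
# hypothetical subclass with placeholder URLs and destination:
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource


class ExampleMirroredSourceETL(ExtractTransformLoad):
    def get_data_sources(self) -> [DataSource]:
        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
            data_url = (
                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                "example_dataset/example.geojson"  # placeholder mirror path
            )
        else:
            data_url = "https://example.org/example.geojson"  # placeholder upstream URL
        return [
            FileDataSource(
                source=data_url,
                destination=self.get_sources_path() / "example.geojson",  # placeholder
            )
        ]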