Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merge comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
Author: Travis Newby, 2023-03-03 12:26:24 -06:00 (committed by GitHub)
Parent commit: 4d9c1dd11e
Commit: 6f39033dde
52 changed files with 1787 additions and 686 deletions


@@ -92,7 +92,6 @@ If you want to run specific data tasks, you can open a terminal window, navigate
 - Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles`
 To learn more about these commands and when they should be run, refer to [Running for Local Development](#running-for-local-development).
 </details>
 ---
@@ -136,6 +135,9 @@ Once you've downloaded the census data, run the following commands in order
 Many commands have options. For example, you can run a single dataset with `etl-run` by passing the command line parameter `-d name-of-dataset-to-run`. Please use the `--help` option to find out more.
+> :bulb: **NOTE**
+> One important command line option is enabling cached data sources. Pass the command line parameter `-u` to many commands (e.g. `etl-run`) to use locally cached data sources within the ETL portion of the pipeline. This will ensure that you don't download many GB of data with each run of the data pipeline.
 ## How Scoring Works
 Scores are generated by running the `score-run` command via Poetry or Docker. This command executes [`data_pipeline/etl/score/etl_score.py`](data_pipeline/etl/score/etl_score.py). During execution,


@@ -7,6 +7,9 @@ from data_pipeline.etl.runner import etl_runner
 from data_pipeline.etl.runner import score_generate
 from data_pipeline.etl.runner import score_geo
 from data_pipeline.etl.runner import score_post
+from data_pipeline.etl.runner import get_data_sources
+from data_pipeline.etl.runner import extract_data_sources as extract_ds
+from data_pipeline.etl.runner import clear_data_source_cache as clear_ds_cache
 from data_pipeline.etl.sources.census.etl_utils import check_census_data_source
 from data_pipeline.etl.sources.census.etl_utils import (
     reset_data_directories as census_reset,
@@ -79,7 +82,14 @@ def data_cleanup():
     is_flag=True,
     help="Upload to AWS S3 a zipped archive of the census data.",
 )
-def census_data_download(zip_compress):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def census_data_download(zip_compress, use_cache):
     """CLI command to download all census shape files from the Census FTP and extract the geojson
     to generate national and by state Census Block Group CSVs"""
     log_title("Download Census Data ")
@@ -88,7 +98,7 @@ def census_data_download(zip_compress):
     census_reset(data_path)
     log_info("Downloading census data")
-    etl_runner("census")
+    etl_runner("census", use_cache)
     if zip_compress:
         log_info("Zipping census data")
@@ -129,7 +139,14 @@ def pull_census_data(data_source: str):
     type=str,
     help=dataset_cli_help,
 )
-def etl_run(dataset: str):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def etl_run(dataset: str, use_cache: bool):
     """Run a specific or all ETL processes
     Args:
@@ -141,7 +158,7 @@ def etl_run(dataset: str):
     log_title("Run ETL")
     log_info("Running dataset(s)")
-    etl_runner(dataset)
+    etl_runner(dataset, use_cache)
     log_goodbye()
     sys.exit()
@@ -167,7 +184,14 @@ def score_run():
 @cli.command(
     help="Run ETL + Score Generation",
 )
-def score_full_run():
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def score_full_run(use_cache: bool):
     """CLI command to run ETL and generate the score in one command"""
     log_title("Score Full Run", "Run ETL and Generate Score (no tiles)")
@@ -177,7 +201,7 @@ def score_full_run():
     temp_folder_cleanup()
     log_info("Running all ETLs")
-    etl_runner()
+    etl_runner(use_cache=use_cache)
     log_info("Generating score")
     score_generate()
@@ -297,7 +321,14 @@ def generate_map_tiles(generate_tribal_layer):
     type=str,
     help=dataset_cli_help,
 )
-def data_full_run(check: bool, data_source: str):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def data_full_run(check: bool, data_source: str, use_cache: bool):
     """CLI command to run ETL, score, JSON combine and generate tiles in one command
     Args:
@@ -330,10 +361,10 @@ def data_full_run(check: bool, data_source: str):
     if data_source == "local":
         log_info("Downloading census data")
-        etl_runner("census")
+        etl_runner("census", use_cache)
         log_info("Running all ETLs")
-        etl_runner()
+        etl_runner(use_cache=use_cache)
         log_info("Generating score")
         score_generate()
@@ -357,6 +388,103 @@ def data_full_run(check: bool, data_source: str):
     sys.exit()
@cli.command(
help="Print data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def print_data_sources(dataset: str):
"""Print data sources for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module to be run (optional)
Returns:
None
"""
log_title("Print ETL Datasources")
log_info("Retrieving dataset(s)")
sources = get_data_sources(dataset)
log_info(f"Discovered {len(sources)} files")
for s in sources:
log_info(s)
log_goodbye()
sys.exit()
@cli.command(
help="Fetch data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def extract_data_sources(dataset: str, use_cache: bool):
"""Extract and cache data source(s) for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose data sources you wish to fetch
use_cache (bool): Use this flag if you wish to use the cached data sources (if they exist)
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Fetching data source(s)")
extract_ds(dataset, use_cache)
log_goodbye()
sys.exit()
@cli.command(
help="Clear data source cache for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def clear_data_source_cache(dataset: str):
"""Clear data source(s) cache for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose cache you wish to clear
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Clear data source cache")
clear_ds_cache(dataset)
log_goodbye()
sys.exit()
def log_title(title: str, subtitle: str = None):
    """Logs a title in our fancy title format"""
    logger.info("-" * LOG_LINE_WIDTH)


@@ -2,7 +2,9 @@ import enum
 import pathlib
 import sys
 import typing
+import shutil
 from typing import Optional
+from abc import ABC, abstractmethod
 import pandas as pd
 from data_pipeline.config import settings
@@ -13,7 +15,7 @@ from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import remove_all_from_dir
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
 logger = get_module_logger(__name__)
@@ -25,7 +27,7 @@ class ValidGeoLevel(enum.Enum):
     CENSUS_BLOCK_GROUP = enum.auto()
-class ExtractTransformLoad:
+class ExtractTransformLoad(ABC):
     """
     A class used to instantiate an ETL object to retrieve and process data from
     datasets.
@@ -45,6 +47,7 @@ class ExtractTransformLoad:
     # Directories
     DATA_PATH: pathlib.Path = settings.DATA_PATH
     TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
+    SOURCES_PATH: pathlib.Path = DATA_PATH / "sources"
     CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
     DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
     DATASET_CONFIG: Optional[dict] = None
@@ -177,45 +180,60 @@ class ExtractTransformLoad:
         output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
         return output_file_path
-    def get_tmp_path(self) -> pathlib.Path:
+    def get_sources_path(self) -> pathlib.Path:
-        """Returns the temporary path associated with this ETL class."""
+        """Returns the sources path associated with this ETL class. The sources path
-        # Note: the temporary path will be defined on `init`, because it uses the class
+        is the home for cached data sources used by this ETL."""
-        # of the instance which is often a child class.
-        tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
+        sources_path = self.SOURCES_PATH / str(self.__class__.__name__)
         # Create directory if it doesn't exist
-        tmp_path.mkdir(parents=True, exist_ok=True)
+        sources_path.mkdir(parents=True, exist_ok=True)
-        return tmp_path
+        return sources_path
-    def extract(
+    @abstractmethod
-        self,
+    def get_data_sources(self) -> [DataSource]:
-        source_url: str = None,
+        pass
-        extract_path: pathlib.Path = None,
-        verify: Optional[bool] = True,
-    ) -> None:
-        """Extract the data from a remote source. By default it provides code
-        to get the file from a source url, unzips it and stores it on an
-        extract_path."""
-        if source_url is None:
+    def _fetch(self) -> None:
-            source_url = self.SOURCE_URL
+        """Fetch all data sources for this ETL. When data sources are fetched, they
+        are stored in a cache directory for consistency between runs."""
+        for ds in self.get_data_sources():
+            ds.fetch()
-        if extract_path is None:
+    def clear_data_source_cache(self) -> None:
-            extract_path = self.get_tmp_path()
+        """Clears the cache for this ETLs data source(s)"""
+        shutil.rmtree(self.get_sources_path())
-        unzip_file_from_url(
+    def extract(self, use_cached_data_sources: bool = False) -> None:
-            file_url=source_url,
+        """Extract (download) data from a remote source, and validate
-            download_path=self.get_tmp_path(),
+        that data. By default, this method fetches data from the set of
-            unzipped_file_path=extract_path,
+        data sources returned by get_data_sources.
-            verify=verify,
+        If use_cached_data_sources is true, this method attempts to use cached data
+        rather than re-downloading from the original source. The cache algorithm is very
+        simple: it just looks to see if the directory has any contents. If so, it uses
+        that content. If not, it downloads all data sources.
+        Subclasses should call super() before performing any work if they wish to take
+        advantage of the automatic downloading and caching ability of this superclass.
+        """
+        if use_cached_data_sources and any(self.get_sources_path().iterdir()):
+            logger.info(
+                f"Using cached data sources for {self.__class__.__name__}"
+            )
-        )
+        else:
+            self.clear_data_source_cache()
+            self._fetch()
+        # the rest of the work should be performed here
+    @abstractmethod
     def transform(self) -> None:
         """Transform the data extracted into a format that can be consumed by the
         score generator"""
+        pass
-        raise NotImplementedError
     def validate(self) -> None:
         """Validates the output.
@@ -380,3 +398,14 @@ class ExtractTransformLoad:
     def cleanup(self) -> None:
         """Clears out any files stored in the TMP folder"""
         remove_all_from_dir(self.get_tmp_path())
+    def get_tmp_path(self) -> pathlib.Path:
+        """Returns the temporary path associated with this ETL class."""
+        # Note: the temporary path will be defined on `init`, because it uses the class
+        # of the instance which is often a child class.
+        tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
+        # Create directory if it doesn't exist
+        tmp_path.mkdir(parents=True, exist_ok=True)
+        return tmp_path
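For orientation, here is a minimal sketch of what a concrete ETL looks like against this new base class; the class name, URL and file name are hypothetical, but the pattern (declare sources, call `super().extract()`, read from `get_sources_path()`) mirrors the subclasses changed later in this commit.

```python
# Hypothetical subclass illustrating the new caching hooks; the URL and
# file names are placeholders, not a real data source.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource, ZIPDataSource


class ExampleETL(ExtractTransformLoad):
    def __init__(self):
        self.example_url = "https://example.com/example_data.zip"
        self.example_source = self.get_sources_path() / "example_data.csv"
        self.df: pd.DataFrame

    def get_data_sources(self) -> [DataSource]:
        # Declare every remote file this ETL needs.
        return [
            ZIPDataSource(
                source=self.example_url, destination=self.get_sources_path()
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # Downloads the sources (or reuses the cache), then reads from the cache.
        super().extract(use_cached_data_sources)
        self.df = pd.read_csv(self.example_source)

    def transform(self) -> None:
        # Transformation is ETL-specific; nothing to do in this sketch.
        pass
```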


@@ -0,0 +1,124 @@
"""This module defines a set of classes that can be used to fetch data
from a remote source. They are meant to be used in conjuction with ETLs
or other classes that require downloading data.
There are three types of data sources defined in this file:
FileDataSource meant to be used when you have a single file to
retrive from a remote location and save to a destination.
ZipDataSource used when you need to fetch and unzip a file, and save
the contents of that file to a destination.
CensusDataSource used to download data from the Census API and store
the contents to a destination.
DataSource subclasses must implement the fetch method to define how
they will reach out to a remote source, download the data, and save
that data to the destination.
"""
from pathlib import Path
from typing import List
from dataclasses import dataclass
from abc import ABC, abstractmethod
from data_pipeline.etl.downloader import Downloader
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
@dataclass
class DataSource(ABC):
"""A data source represents any source of data that is fetchable
from a remote location.
Attributes:
source : str
the location of this data source, as a url
destination : Path
the Path where the data source should be saved locally upon being fetched
"""
source: str
destination: Path
@abstractmethod
def fetch(self) -> None:
pass
@dataclass
class FileDataSource(DataSource):
"""A data source representing a single file.
This single file will be fetched from the source and saved to a single
destination.
"""
def fetch(self) -> None:
"""Fetches a single file from a source and saves it to a destination."""
self.destination.parent.mkdir(parents=True, exist_ok=True)
Downloader.download_file_from_url(
file_url=self.source,
download_file_name=self.destination,
verify=True,
)
def __str__(self):
return f"File {self.source}"
@dataclass
class ZIPDataSource(DataSource):
"""A data source representing ZIP files.
Zip files will be fetched and placed in the destination folder, then unzipped.
"""
def fetch(self) -> None:
self.destination.mkdir(parents=True, exist_ok=True)
Downloader.download_zip_file_from_url(
file_url=self.source,
unzipped_file_path=self.destination,
verify=True,
)
def __str__(self):
return f"Zip {self.source}"
@dataclass
class CensusDataSource(DataSource):
"""A data source representing census data.
Data will be fetched using the Census API and saved to the destination file. Source is ignored.
"""
acs_year: int
variables: List[str]
tract_output_field_name: str
data_path_for_fips_codes: Path
acs_type: str
def fetch(self) -> None:
df = retrieve_census_acs_data(
acs_year=self.acs_year,
variables=self.variables,
tract_output_field_name=self.tract_output_field_name,
data_path_for_fips_codes=self.data_path_for_fips_codes,
acs_type=self.acs_type,
)
self.destination.parent.mkdir(parents=True, exist_ok=True)
# Write CSV representation of census data
df.to_csv(self.destination, index=False)
def __str__(self):
return f"Census {self.acs_type}, {self.acs_year}"


@@ -0,0 +1,95 @@
import uuid
import urllib3
import requests
import zipfile
import shutil
from pathlib import Path
from data_pipeline.config import settings
class Downloader:
"""A simple class to encapsulate the download capabilities of the application"""
@classmethod
def download_file_from_url(
cls,
file_url: str,
download_file_name: Path,
verify: bool = True,
) -> str:
"""Downloads a file from a remote URL location and returns the file location.
Args:
file_url (str): URL where the zip file is located
download_file_name (pathlib.Path): file path where the file will be downloaded (called downloaded.zip by default)
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, default to False)
Returns:
None
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
download_file_name.parent.mkdir(parents=True, exist_ok=True)
response = requests.get(
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
)
if response.status_code == 200:
file_contents = response.content
else:
raise Exception(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
# Write the contents to disk.
file = open(download_file_name, "wb")
file.write(file_contents)
file.close()
return download_file_name
@classmethod
def download_zip_file_from_url(
cls,
file_url: str,
unzipped_file_path: Path,
verify: bool = True,
) -> None:
"""Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after
Args:
file_url (str): URL where the zip file is located
unzipped_file_path (pathlib.Path): directory and name of the extracted file
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, default to False)
Returns:
None
"""
# dir_id allows us to evade race conditions on parallel ETLs
dir_id = uuid.uuid4()
zip_download_path = (
settings.DATA_PATH
/ "tmp"
/ "downloads"
/ f"{dir_id}"
/ "download.zip"
)
zip_file_path = Downloader.download_file_from_url(
file_url=file_url,
download_file_name=zip_download_path,
verify=verify,
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(unzipped_file_path)
# cleanup temporary file and directory
shutil.rmtree(zip_download_path.parent)
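For reference, the two class methods are intended to be called directly, without instantiating `Downloader`; a short sketch with placeholder URLs and paths:

```python
# Illustrative calls to the Downloader helper; URLs and paths are placeholders.
from pathlib import Path

from data_pipeline.etl.downloader import Downloader

# Fetch a single file to an exact location on disk.
Downloader.download_file_from_url(
    file_url="https://example.com/data.csv",
    download_file_name=Path("data/sources/ExampleETL/data.csv"),
)

# Fetch a zip archive and unpack its contents into a directory.
Downloader.download_zip_file_from_url(
    file_url="https://example.com/archive.zip",
    unzipped_file_path=Path("data/sources/ExampleETL"),
)
```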


@@ -2,10 +2,14 @@ import concurrent.futures
 import importlib
 import typing
+from functools import reduce
 from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
 from . import constants
@@ -40,20 +44,26 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
     return dataset_list
-def _run_one_dataset(dataset: dict) -> None:
+def _get_dataset(dataset: dict) -> ExtractTransformLoad:
-    """Runs one etl process."""
+    """Instantiates a dataset object from a dictionary description of that object's class"""
-    logger.info(f"Running ETL for {dataset['name']}")
     etl_module = importlib.import_module(
         f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
     )
     etl_class = getattr(etl_module, dataset["class_name"])
     etl_instance = etl_class()
+    return etl_instance
+def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
+    """Runs one etl process."""
+    logger.info(f"Running ETL for {dataset['name']}")
+    etl_instance = _get_dataset(dataset)
     # run extract
     logger.debug(f"Extracting {dataset['name']}")
-    etl_instance.extract()
+    etl_instance.extract(use_cache)
     # run transform
     logger.debug(f"Transforming {dataset['name']}")
@@ -74,11 +84,12 @@ def _run_one_dataset(dataset: dict) -> None:
     logger.info(f"Finished ETL for dataset {dataset['name']}")
-def etl_runner(dataset_to_run: str = None) -> None:
+def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
     """Runs all etl processes or a specific one
     Args:
         dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
+        use_cache (bool): Use the cached data sources if they exist rather than downloading them all from scratch
     Returns:
         None
@@ -105,7 +116,9 @@ def etl_runner(dataset_to_run: str = None) -> None:
     logger.info("Running concurrent ETL jobs")
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = {
-            executor.submit(_run_one_dataset, dataset=dataset)
+            executor.submit(
+                _run_one_dataset, dataset=dataset, use_cache=use_cache
+            )
             for dataset in concurrent_datasets
         }
@@ -119,7 +132,50 @@ def etl_runner(dataset_to_run: str = None) -> None:
     if high_memory_datasets:
         logger.info("Running high-memory ETL jobs")
         for dataset in high_memory_datasets:
-            _run_one_dataset(dataset=dataset)
+            _run_one_dataset(dataset=dataset, use_cache=use_cache)
def get_data_sources(dataset_to_run: str = None) -> [DataSource]:
dataset_list = _get_datasets_to_run(dataset_to_run)
sources = []
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
sources.append(etl_instance.get_data_sources())
sources = reduce(
list.__add__, sources
) # flatten the list of lists into a single list
return sources
def extract_data_sources(
dataset_to_run: str = None, use_cache: bool = False
) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Extracting data set for {etl_instance.__class__.__name__}"
)
etl_instance.extract(use_cache)
def clear_data_source_cache(dataset_to_run: str = None) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Clearing data set cache for {etl_instance.__class__.__name__}"
)
etl_instance.clear_data_source_cache()
def score_generate() -> None:
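The three new runner helpers mirror the new CLI commands; a sketch of calling them directly (the dataset name is an illustrative placeholder):

```python
# Sketch of the new runner helpers; "census_acs" is an illustrative dataset name.
from data_pipeline.etl.runner import (
    clear_data_source_cache,
    etl_runner,
    extract_data_sources,
    get_data_sources,
)

# List every remote data source the ETLs would fetch.
for source in get_data_sources():
    print(source)

# Pre-download (cache) the sources for one dataset, then run it from the cache.
extract_data_sources("census_acs", use_cache=False)
etl_runner("census_acs", use_cache=True)

# Remove the cached copies when they are no longer needed.
clear_data_source_cache("census_acs")
```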


@@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
 from data_pipeline.score import field_names
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
 logger = get_module_logger(__name__)
@@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):
         self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of running the ETLs
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         # EJSCreen csv Load
         ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
         self.ejscreen_df = pd.read_csv(


@@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
+from data_pipeline.etl.datasource import DataSource
 logger = get_module_logger(__name__)
@@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
         self.geojson_score_usa_high: gpd.GeoDataFrame
         self.geojson_score_usa_low: gpd.GeoDataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",


@@ -2,7 +2,9 @@ import json
 from pathlib import Path
 import numpy as np
+from numpy import float64
 import pandas as pd
 from data_pipeline.content.schemas.download_schemas import CodebookConfig
 from data_pipeline.content.schemas.download_schemas import CSVConfig
 from data_pipeline.content.schemas.download_schemas import ExcelConfig
@@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
-from numpy import float64
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.downloader import Downloader
 from . import constants
@@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
         self.yaml_global_config_sort_by_label = "sort_by_label"
         # End YAML definition constants
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the score
     def _extract_counties(self, county_path: Path) -> pd.DataFrame:
         logger.debug("Reading Counties CSV")
         return pd.read_csv(
@@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad):
         return df
-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",
             census_data_source=self.DATA_SOURCE,
         )
-        super().extract(
+        # TODO would could probably add this to the data sources for this file
-            constants.CENSUS_COUNTIES_ZIP_URL,
+        Downloader.download_zip_file_from_url(
-            constants.TMP_PATH,
+            constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
         )
         self.input_counties_df = self._extract_counties(
             constants.CENSUS_COUNTIES_FILE_NAME
         )


@@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
 from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
+from data_pipeline.etl.downloader import Downloader
 from data_pipeline.utils import get_module_logger
 from . import constants
@@ -48,7 +48,7 @@ def check_score_data_source(
     # download from s3 if census_data_source is aws
     if score_data_source == "aws":
         logger.debug("Fetching Score Tile data from AWS S3")
-        download_file_from_url(
+        Downloader.download_file_from_url(
             file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
         )
     else:


@@ -1,23 +1,36 @@
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)
 class CalEnviroScreenETL(ExtractTransformLoad):
+    """California environmental screen
+    TODO: Need good description
+    """
     def __init__(self):
-        self.CALENVIROSCREEN_FTP_URL = (
+        # fetch
+        self.calenviroscreen_ftp_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/CalEnviroScreen_4.0_2021.zip"
         )
-        self.CALENVIROSCREEN_CSV = (
-            self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
-        )
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
-        # Definining some variable names
+        # input
+        self.calenviroscreen_source = (
+            self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv"
+        )
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
+        # Defining some variable names
         self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
         self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
             "calenviroscreen_percentile"
@@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.calenviroscreen_ftp_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         super().extract(
-            self.CALENVIROSCREEN_FTP_URL,
+            use_cached_data_sources
-            self.get_tmp_path(),
+        ) # download and extract data sources
+        self.df = pd.read_csv(
+            self.calenviroscreen_source, dtype={"Census Tract": "string"}
         )
     def transform(self) -> None:
         # Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
         # https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
         # Load comparison index (CalEnviroScreen 4)
-        self.df = pd.read_csv(
-            self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
-        )
         self.df.rename(
             columns={
@@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad):
     def load(self) -> None:
         # write nationwide csv
-        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
-        self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
+        self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False)


@@ -7,8 +7,9 @@ from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.score.etl_utils import (
     compare_to_list_of_expected_state_fips_codes,
 )
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -17,59 +18,74 @@ logger = get_module_logger(__name__)
 class CDCLifeExpectancy(ExtractTransformLoad):
+    """#TODO: create description"""
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     NAME = "cdc_life_expectancy"
-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
-    else:
-        USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
     LOAD_YAML_CONFIG: bool = False
     LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
     INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
     STATES_MISSING_FROM_USA_FILE = ["23", "55"]
-    # For some reason, LEEP does not include Maine or Wisconsin in its "All of
-    # USA" file. Load these separately.
-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
-        MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
-    else:
-        WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
-        MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
     TRACT_INPUT_COLUMN_NAME = "Tract ID"
     STATE_INPUT_COLUMN_NAME = "STATE2KX"
-    raw_df: pd.DataFrame
+    raw_df: pd.DataFrame  # result of extraction
-    output_df: pd.DataFrame
+    output_df: pd.DataFrame  # result of transformation
     def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
# For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
# input
self.usa_source = self.get_sources_path() / "US_A.CSV"
self.maine_source = self.get_sources_path() / "ME_A.CSV"
self.wisconsin_source = self.get_sources_path() / "WI_A.CSV"
# output
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "cdc_life_expectancy"
         )
-        # Constants for output
+        self.COLUMNS_TO_KEEP = [  # the columns to save on output
-        self.COLUMNS_TO_KEEP = [
             self.GEOID_TRACT_FIELD_NAME,
             field_names.LIFE_EXPECTANCY_FIELD,
         ]
-    def _download_and_prep_data(
+    def get_data_sources(self) -> [DataSource]:
-        self, file_url: str, download_file_name: pathlib.Path
+        return [
-    ) -> pd.DataFrame:
+            FileDataSource(
-        download_file_from_url(
+                source=self.usa_file_url, destination=self.usa_source
-            file_url=file_url,
+            ),
-            download_file_name=download_file_name,
+            FileDataSource(
-            verify=True,
+                source=self.maine_file_url, destination=self.maine_source
-        )
+            ),
+            FileDataSource(
+                source=self.wisconsin_file_url,
+                destination=self.wisconsin_source,
+            ),
+        ]
+    def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame:
         df = pd.read_csv(
-            filepath_or_buffer=download_file_name,
+            filepath_or_buffer=file_name,
             dtype={
                 # The following need to remain as strings for all of their digits, not get converted to numbers.
                 self.TRACT_INPUT_COLUMN_NAME: "string",
@@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
         return df
-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
-        all_usa_raw_df = self._download_and_prep_data(
+        super().extract(
-            file_url=self.USA_FILE_URL,
+            use_cached_data_sources
-            download_file_name=self.get_tmp_path() / "US_A.CSV",
+        ) # download and extract data sources
-        )
+        all_usa_raw_df = self._read_data(self.usa_source)
         # Check which states are missing
         states_in_life_expectancy_usa_file = list(
@@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
             additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
         )
-        logger.debug("Downloading data for Maine")
+        maine_raw_df = self._read_data(
-        maine_raw_df = self._download_and_prep_data(
+            self.maine_source,
-            file_url=self.MAINE_FILE_URL,
-            download_file_name=self.get_tmp_path() / "maine.csv",
         )
-        logger.debug("Downloading data for Wisconsin")
+        wisconsin_raw_df = self._read_data(self.wisconsin_source)
-        wisconsin_raw_df = self._download_and_prep_data(
-            file_url=self.WISCONSIN_FILE_URL,
-            download_file_name=self.get_tmp_path() / "wisconsin.csv",
-        )
         combined_df = pd.concat(
             objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],


@@ -4,14 +4,17 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 logger = get_module_logger(__name__)
 class CDCPlacesETL(ExtractTransformLoad):
+    """#TODO: Need description"""
     NAME = "cdc_places"
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
@@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad):
     CDC_MEASURE_FIELD_NAME = "Measure"
     def __init__(self):
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.CDC_PLACES_URL = (
+            self.cdc_places_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
             )
         else:
-            self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+            self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+        # input
+        self.places_source = self.get_sources_path() / "census_tract.csv"
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
         self.COLUMNS_TO_KEEP: typing.List[str] = [
             self.GEOID_TRACT_FIELD_NAME,
@@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
-        file_path = download_file_from_url(
+        return [
-            file_url=self.CDC_PLACES_URL,
+            FileDataSource(
-            download_file_name=self.get_tmp_path() / "census_tract.csv",
+                source=self.cdc_places_url, destination=self.places_source
-        )
+            )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
         self.df = pd.read_csv(
-            filepath_or_buffer=file_path,
+            filepath_or_buffer=self.places_source,
             dtype={self.CDC_GEOID_FIELD_NAME: "string"},
             low_memory=False,
         )
     def transform(self) -> None:
         # Rename GEOID field
         self.df.rename(
             columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},


@@ -1,6 +1,8 @@
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -11,22 +13,28 @@ logger = get_module_logger(__name__)
 class CDCSVIIndex(ExtractTransformLoad):
     """CDC SVI Index class ingests 2018 dataset located
     here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html
     Please see the README in this module for further details.
     """
     def __init__(self):
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.CDC_SVI_INDEX_URL = (
+            self.cdc_svi_index_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "cdc_svi_index/SVI2018_US.csv"
             )
         else:
-            self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+            self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+        # input
+        self.svi_source = self.get_sources_path() / "SVI2018_US.csv"
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
         self.CDC_RPL_THEMES_THRESHOLD = 0.90
         self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS"
         self.COLUMNS_TO_KEEP = [
@@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.cdc_svi_index_url, destination=self.svi_source
+            )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
         self.df = pd.read_csv(
-            filepath_or_buffer=self.CDC_SVI_INDEX_URL,
+            filepath_or_buffer=self.svi_source,
             dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
             low_memory=False,
         )
@@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad):
         )
     def load(self) -> None:
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(
             path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
         )


@@ -8,7 +8,8 @@ import geopandas as gpd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 logger = get_module_logger(__name__)
@@ -20,7 +21,7 @@ class GeoFileType(Enum):
 class CensusETL(ExtractTransformLoad):
-    SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
+    # SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
     GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
     CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
     GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
     def __init__(self):
+        self.shape_file_path = self.get_sources_path() / "shp"
         # the fips_states_2010.csv is generated from data here
         # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
         self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
         file_path: Path
         if file_type == GeoFileType.SHP:
             file_path = Path(
-                self.SHP_BASE_PATH
+                self.shape_file_path
                 / fips_code
                 / f"tl_2010_{fips_code}_tract10.shp"
             )
@@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
             file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
         return file_path
-    def _extract_shp(self, fips_code: str) -> None:
+    def get_data_sources(self) -> [DataSource]:
-        """Download the SHP file for the provided FIPS code
-        Args:
+        sources = []
-            fips_code (str): the FIPS code for the region of interest
-        Returns:
+        for fips_code in self.STATE_FIPS_CODES:
-            None
-        """
-        shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
-        # check if file exists
-        if not shp_file_path.is_file():
             tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
-            unzip_file_from_url(
+            destination_path = self.shape_file_path / fips_code
-                tract_state_url,
-                self.TMP_PATH,
+            sources.append(
-                self.DATA_PATH / "census" / "shp" / fips_code,
+                ZIPDataSource(
+                    source=tract_state_url, destination=destination_path
+                )
             )
-    def extract(self) -> None:
+        return sources
-        logger.debug("Extracting census data")
-        for index, fips_code in enumerate(self.STATE_FIPS_CODES):
-            logger.debug(
-                f"Extracting shape for FIPS {fips_code} {index+1} of {len(self.STATE_FIPS_CODES)}"
-            )
-            self._extract_shp(fips_code)
     def _transform_to_geojson(self, fips_code: str) -> None:
         """Convert the downloaded SHP file for the associated FIPS to geojson


@@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
         else:
             fips = row[0].strip()
         fips_state_list.append(fips)
     return fips_state_list


@@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census_acs.etl_imputations import (
     calculate_income_measures,
 )
-from data_pipeline.etl.sources.census_acs.etl_utils import (
-    retrieve_census_acs_data,
-)
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import CensusDataSource
 logger = get_module_logger(__name__)
@@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad):
     MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
     def __init__(self):
+        self.census_acs_source = self.get_sources_path() / "acs.csv"
         self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
         self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
         self.EMPLOYMENT_FIELDS = [
@@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad):
         self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5",
)
]
     # pylint: disable=too-many-arguments
     def _merge_geojson(
         self,
@@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad):
             )
         )
-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
-        # Define the variables to retrieve
-        variables = (
-            [
-                self.MEDIAN_INCOME_FIELD,
-                self.MEDIAN_HOUSE_VALUE_FIELD,
-            ]
-            + self.EMPLOYMENT_FIELDS
-            + self.LINGUISTIC_ISOLATION_FIELDS
-            + self.POVERTY_FIELDS
-            + self.EDUCATIONAL_FIELDS
-            + self.RE_FIELDS
-            + self.COLLEGE_ATTENDANCE_FIELDS
-            + self.AGE_INPUT_FIELDS
-        )
-        self.df = retrieve_census_acs_data(
+        super().extract(
-            acs_year=self.ACS_YEAR,
+            use_cached_data_sources
-            variables=variables,
+        ) # download and extract data sources
-            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
-            data_path_for_fips_codes=self.DATA_PATH,
+        self.df = pd.read_csv(
+            self.census_acs_source,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
         )
     def transform(self) -> None:


@@ -1,10 +1,9 @@
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.sources.census_acs.etl_utils import (
-    retrieve_census_acs_data,
-)
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import CensusDataSource
 logger = get_module_logger(__name__)
@@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad):
     """
     def __init__(self):
+        self.census_acs_source = self.get_sources_path() / "acs_2010.csv"
         self.ACS_YEAR = 2010
         self.ACS_TYPE = "acs5"
         self.OUTPUT_PATH = (
@@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
         # Define the variables to retrieve
         variables = (
             self.UNEMPLOYED_FIELDS
@@ -107,14 +109,27 @@ class CensusACS2010ETL(ExtractTransformLoad):
             + self.POVERTY_FIELDS
         )
-        # Use the method defined on CensusACSETL to reduce coding redundancy.
+        return [
-        self.df = retrieve_census_acs_data(
+            CensusDataSource(
+                source=None,
+                destination=self.census_acs_source,
                 acs_year=self.ACS_YEAR,
                 variables=variables,
                 tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
                 data_path_for_fips_codes=self.DATA_PATH,
                 acs_type=self.ACS_TYPE,
             )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
+        self.df = pd.read_csv(
+            self.census_acs_source, dtype={"GEOID10_TRACT": "string"}
+        )
     def transform(self) -> None:
         df = self.df
View file
@ -1,14 +1,16 @@
import os
import json import json
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import requests
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
/ f"census_acs_median_income_{self.ACS_YEAR}" / f"census_acs_median_income_{self.ACS_YEAR}"
) )
self.GEOCORR_ALL_STATES_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip"
)
self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr"
self.GEOCORR_ALL_STATES_SOURCE = (
self.GEOCORR_ALL_STATES_PATH
/ "geocorr2014_all_states_tracts_only.csv"
)
# Set constants for Geocorr MSAs data. # Set constants for Geocorr MSAs data.
self.PLACE_FIELD_NAME: str = "Census Place Name" self.PLACE_FIELD_NAME: str = "Census Place Name"
self.COUNTY_FIELD_NAME: str = "County Name" self.COUNTY_FIELD_NAME: str = "County Name"
@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E" f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E"
+ "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area" + "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area"
) )
self.MSA_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "msa" / "msa_median_income.json"
)
self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)" self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)"
# Set constants for state median incomes # Set constants for state median incomes
self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state" self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state"
self.STATE_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "state" / "state_median_income.json"
)
self.STATE_GEOID_FIELD_NAME: str = "GEOID2" self.STATE_GEOID_FIELD_NAME: str = "GEOID2"
self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)" self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)"
@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.PUERTO_RICO_S3_LINK: str = ( self.PUERTO_RICO_S3_LINK: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv" settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv"
) )
self.PUERTO_RICO_ALL_STATES_SOURCE = (
self.get_sources_path() / "pr_tracts" / "pr_tracts.csv"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.MSA_MEDIAN_INCOME_URL = (
self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
self.STATE_MEDIAN_INCOME_URL = (
self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
# Constants for output # Constants for output
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference" self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.state_median_incomes: dict self.state_median_incomes: dict
self.pr_tracts: pd.DataFrame self.pr_tracts: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.GEOCORR_ALL_STATES_URL,
destination=self.GEOCORR_ALL_STATES_PATH,
),
FileDataSource(
source=self.PUERTO_RICO_S3_LINK,
destination=self.PUERTO_RICO_ALL_STATES_SOURCE,
),
FileDataSource(
source=self.MSA_MEDIAN_INCOME_URL,
destination=self.MSA_MEDIAN_INCOME_SOURCE,
),
FileDataSource(
source=self.STATE_MEDIAN_INCOME_URL,
destination=self.STATE_MEDIAN_INCOME_SOURCE,
),
]
def _transform_geocorr(self) -> pd.DataFrame: def _transform_geocorr(self) -> pd.DataFrame:
# Transform the geocorr data # Transform the geocorr data
geocorr_df = self.raw_geocorr_df geocorr_df = self.raw_geocorr_df
@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
) )
return state_median_incomes_df return state_median_incomes_df
def extract(self) -> None: def extract(self, use_cached_data_sources: bool = False) -> None:
# Load and clean GEOCORR data # Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census. # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run: # The specific query used is the following, which takes a couple of minutes to run:
@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# - Core based statistical area (CBSA) # - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro) # - CBSA Type (Metro or Micro)
logger.debug("Starting download of 1.5MB Geocorr information.") logger.debug("Starting download of 1.5MB Geocorr information.")
super().extract(
unzip_file_from_url( use_cached_data_sources
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL ) # download and extract data sources
+ "/geocorr2014_all_states_tracts_only.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "geocorr",
)
self.raw_geocorr_df = pd.read_csv( self.raw_geocorr_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE,
/ "geocorr"
/ "geocorr2014_all_states_tracts_only.csv",
# Skip second row, which has descriptions. # Skip second row, which has descriptions.
skiprows=[1], skiprows=[1],
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False, low_memory=False,
) )
logger.debug("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(
file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
)
self.pr_tracts = pd.read_csv( self.pr_tracts = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE,
/ "pr_tracts"
/ "pr_tracts.csv",
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={"GEOID10_TRACT": str}, dtype={"GEOID10_TRACT": str},
low_memory=False, low_memory=False,
) )
self.pr_tracts["State Abbreviation"] = "PR" self.pr_tracts["State Abbreviation"] = "PR"
# Download MSA median incomes with self.MSA_MEDIAN_INCOME_SOURCE.open() as source:
logger.debug("Starting download of MSA median incomes.") self.msa_median_incomes = json.load(source)
download = requests.get(
self.MSA_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.msa_median_incomes = json.loads(download.content)
# Download state median incomes with self.STATE_MEDIAN_INCOME_SOURCE.open() as source:
logger.debug("Starting download of state median incomes.") self.state_median_incomes = json.load(source)
download_state = requests.get(
self.STATE_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.state_median_incomes = json.loads(download_state.content)
## NOTE we already have PR's MI here ## NOTE we already have PR's MI here
def transform(self) -> None: def transform(self) -> None:
View file
@ -1,13 +1,14 @@
import json import json
from typing import List from typing import List
import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
pd.options.mode.chained_assignment = "raise" pd.options.mode.chained_assignment = "raise"
@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad):
+ "&for=tract:*&in=state:{}%20county:{}" + "&for=tract:*&in=state:{}%20county:{}"
) )
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.API_URL = self.API_URL + f"&key={census_api_key}"
self.final_race_fields: List[str] = [] self.final_race_fields: List[str] = []
self.df: pd.DataFrame self.df: pd.DataFrame
self.df_vi: pd.DataFrame self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame self.df_all: pd.DataFrame
-    def extract(self) -> None:
-        dfs = []
-        dfs_vi = []
+    def get_data_sources(self) -> [DataSource]:
+
+        sources = []
+
        for island in self.ISLAND_TERRITORIES:
-            logger.debug(
-                f"Downloading data for state/territory {island['state_abbreviation']}"
-            )
            for county in island["county_fips"]:
                api_url = self.API_URL.format(
                    self.DECENNIAL_YEAR,
                    island["state_abbreviation"],
@@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad):
                    island["fips"],
                    county,
                )
-                logger.debug(f"CENSUS: Requesting {api_url}")
-                download = requests.get(
-                    api_url,
-                    timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+
+                sources.append(
+                    FileDataSource(
+                        source=api_url,
+                        destination=self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json",
+                    )
                )

+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        dfs = []
+        dfs_vi = []
+        for island in self.ISLAND_TERRITORIES:
+            logger.debug(
+                f"Downloading data for state/territory {island['state_abbreviation']}"
+            )
+            for county in island["county_fips"]:
                try:
-                    df = json.loads(download.content)
+                    filepath = (
+                        self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json"
+                    )
+
+                    df = json.load(filepath.open())
                except ValueError as e:
                    logger.error(
-                        f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
+                        f"Could not load content in census decennial ETL because {e}."
                    )

                # First row is the header
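The per-county sources above are easier to see in isolation: the request URL only gets a Census API key when `CENSUS_API_KEY` is set in the environment, and each county's JSON response is cached under a year/state/county path. A hedged sketch, with a placeholder endpoint, an assumed cache root, and made-up values:

```python
import os
from pathlib import Path

# Placeholder endpoint, not the ETL's real API_URL template.
api_url = "https://api.census.gov/data/2010/dec/sf1?get=NAME&for=tract:*"

census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
    api_url = api_url + f"&key={census_api_key}"

sources_path = Path("data/sources/CensusDecennialETL")  # assumed cache root
destination = (
    sources_path
    / str(2010)   # DECENNIAL_YEAR
    / "VI"        # island["state_abbreviation"]
    / "78"        # island["fips"]
    / "010"       # county
    / "census.json"
)
print(api_url, destination)
```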
View file
@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self): def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS: if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.SOURCE_URL = ( self.child_opportunity_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"child_opportunity_index/raw.zip" "child_opportunity_index/raw.zip"
) )
else: else:
self.SOURCE_URL = ( self.child_opportunity_url = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-" "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv" "3a0ededa30a0?format=csv"
) )
# input
self.child_opportunity_index_source = (
self.get_sources_path() / "raw.csv"
)
# output
# TODO: Decide about nixing this # TODO: Decide about nixing this
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN" self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
self.READING_INPUT_FIELD = "ED_READING" self.READING_INPUT_FIELD = "ED_READING"
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame self.output_df: pd.DataFrame
-    def extract(self) -> None:
-        super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.child_opportunity_url,
+                destination=self.get_sources_path(),
            )
+        ]

-    def transform(self) -> None:
-        raw_df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path() / "raw.csv",
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.raw_df = pd.read_csv(
+            filepath_or_buffer=self.child_opportunity_index_source,
            # The following need to remain as strings for all of their digits, not get
            # converted to numbers.
            dtype={
@@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad):
            low_memory=False,
        )

-        output_df = raw_df.rename(
+    def transform(self) -> None:
+        output_df = self.raw_df.rename(
            columns={
                self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
                self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD,
View file
@ -5,22 +5,35 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class DOEEnergyBurden(ExtractTransformLoad): class DOEEnergyBurden(ExtractTransformLoad):
NAME = "doe_energy_burden" NAME = "doe_energy_burden"
SOURCE_URL: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
REVISED_ENERGY_BURDEN_FIELD_NAME: str REVISED_ENERGY_BURDEN_FIELD_NAME: str
def __init__(self): def __init__(self):
# fetch
self.doe_energy_burden_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
# input
self.doe_energy_burden_source = (
self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
)
# output
self.OUTPUT_PATH: Path = ( self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden" self.DATA_PATH / "dataset" / "doe_energy_burden"
) )
@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.raw_df: pd.DataFrame self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame self.output_df: pd.DataFrame
def transform(self) -> None: def get_data_sources(self) -> [DataSource]:
raw_df: pd.DataFrame = pd.read_csv( return [
filepath_or_buffer=self.get_tmp_path() ZIPDataSource(
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv", source=self.doe_energy_burden_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.doe_energy_burden_source,
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={ dtype={
self.INPUT_GEOID_TRACT_FIELD_NAME: "string", self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad):
low_memory=False, low_memory=False,
) )
def transform(self) -> None:
logger.debug("Renaming columns and ensuring output format is correct") logger.debug("Renaming columns and ensuring output format is correct")
output_df = raw_df.rename( output_df = self.raw_df.rename(
columns={ columns={
self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME, self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
View file
@ -3,6 +3,8 @@
import geopandas as gpd import geopandas as gpd
import pandas as pd import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings from data_pipeline.config import settings
@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad):
NAME = "travel_composite" NAME = "travel_composite"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad):
TRAVEL_BURDEN_FIELD_NAME: str TRAVEL_BURDEN_FIELD_NAME: str
def __init__(self): def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.travel_composite_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
# input
# define the full path for the input CSV file # define the full path for the input CSV file
self.INPUT_SHP = ( self.disadvantage_layer_shape_source = (
self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp" self.get_sources_path()
/ "DOT_Disadvantage_Layer_Final_April2022.shp"
) )
# output
# this is the main dataframe # this is the main dataframe
self.df: pd.DataFrame self.df: pd.DataFrame
self.df_dot: pd.DataFrame
# Start dataset-specific vars here # Start dataset-specific vars here
## Average of Transportation Indicator Percentiles (calculated) ## Average of Transportation Indicator Percentiles (calculated)
## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS ## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad):
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH" self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS" self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.travel_composite_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source)
def transform(self) -> None: def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following """Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method: transformations to prepare it for the load() method:
@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad):
- Converts to CSV - Converts to CSV
""" """
# read in the unzipped shapefile from data source
# reformat it to be standard df, remove unassigned rows, and # reformat it to be standard df, remove unassigned rows, and
# then rename the Census Tract column for merging # then rename the Census Tract column for merging
df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
df_dot = df_dot.rename( self.df_dot = self.df_dot.rename(
columns={ columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME, self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
} }
).dropna(subset=[self.GEOID_TRACT_FIELD_NAME]) ).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
# Assign the final df to the class' output_df for the load method # Assign the final df to the class' output_df for the load method
self.output_df = df_dot self.output_df = self.df_dot
View file
@ -1,12 +1,15 @@
from pathlib import Path from pathlib import Path
import geopandas as gpd
import pandas as pd import pandas as pd
import geopandas as gpd
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad):
"55", "55",
] ]
# Define these for easy code completion
def __init__(self): def __init__(self):
self.SOURCE_URL = (
# fetch
self.eamlis_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/eAMLIS export of all data.tsv.zip" + "/eAMLIS export of all data.tsv.zip"
) )
# input
self.eamlis_source = (
self.get_sources_path() / "eAMLIS export of all data.tsv"
)
# output
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
self.OUTPUT_PATH: Path = ( self.OUTPUT_PATH: Path = (
@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad):
] ]
self.output_df: pd.DataFrame self.output_df: pd.DataFrame
self.df: pd.DataFrame
def transform(self) -> None: def get_data_sources(self) -> [DataSource]:
df = pd.read_csv( return [
self.get_tmp_path() / "eAMLIS export of all data.tsv", ZIPDataSource(
source=self.eamlis_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.eamlis_source,
sep="\t", sep="\t",
low_memory=False, low_memory=False,
) )
def transform(self) -> None:
gdf = gpd.GeoDataFrame( gdf = gpd.GeoDataFrame(
df, self.df,
geometry=gpd.points_from_xy( geometry=gpd.points_from_xy(
x=df["Longitude"], x=self.df["Longitude"],
y=df["Latitude"], y=self.df["Latitude"],
), ),
crs="epsg:4326", crs="epsg:4326",
) )
@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad):
gdf_tracts = add_tracts_for_geometries(gdf) gdf_tracts = add_tracts_for_geometries(gdf)
gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME) gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
gdf_tracts[self.AML_BOOLEAN] = True gdf_tracts[self.AML_BOOLEAN] = True
self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP] self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]
View file
@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad):
INPUT_GEOID_TRACT_FIELD_NAME: str = "ID" INPUT_GEOID_TRACT_FIELD_NAME: str = "ID"
def __init__(self): def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
self.EJSCREEN_CSV = ( # fetch
self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv" self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
# input
self.ejscreen_source = (
self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv"
) )
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen" self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen"
self.df: pd.DataFrame self.df: pd.DataFrame
self.COLUMNS_TO_KEEP = [ self.COLUMNS_TO_KEEP = [
@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad):
field_names.UST_FIELD, field_names.UST_FIELD,
] ]
-    def extract(self) -> None:
-        super().extract(
-            self.EJSCREEN_FTP_URL,
-            self.get_tmp_path(),
-            verify=False,  # EPA EJScreen end point has certificate issues often
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.ejscreen_url, destination=self.get_sources_path()
            )
+        ]

-    def transform(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
        self.df = pd.read_csv(
-            self.EJSCREEN_CSV,
+            self.ejscreen_source,
            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
            # EJSCREEN writes the word "None" for NA data.
            na_values=["None"],
            low_memory=False,
        )

+    def transform(self) -> None:
        # rename ID to Tract ID
        self.output_df = self.df.rename(
            columns={
View file
@ -1,5 +1,6 @@
import pandas as pd import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# Note: while we normally set these properties in `__init__`, # Note: while we normally set these properties in `__init__`,
# we are setting them as class properties here so they can be accessed by the # we are setting them as class properties here so they can be accessed by the
# class method `ejscreen_areas_of_concern_data_exists`. # class method `ejscreen_areas_of_concern_data_exists`.
LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = ( EJSCREEN_AREAS_OF_CONCERN_SOURCE = (
LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv" ExtractTransformLoad.DATA_PATH
/ "sources"
/ "EJSCREENAreasOfConcernETL"
/ "ejscreen_areas_of_concerns_indicators.csv"
) )
def __init__(self): def __init__(self):
# output
self.OUTPUT_PATH = ( self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern" self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
) )
@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now # TO DO: Load from actual source; the issue is that this dataset is not public for now
self.df: pd.DataFrame self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
"""The source for this must be downloaded and saved manually. It is not publicly available"""
return []
@classmethod @classmethod
def ejscreen_areas_of_concern_data_exists(cls): def ejscreen_areas_of_concern_data_exists(cls):
"""Check whether or not the EJSCREEN areas of concern data exists. """Check whether or not the EJSCREEN areas of concern data exists.
@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
not reference this data. not reference this data.
""" """
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file() return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file()
def extract(self) -> None: def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE)
if self.ejscreen_areas_of_concern_data_exists(): if self.ejscreen_areas_of_concern_data_exists():
logger.debug("Loading EJSCREEN Areas of Concern Data Locally") logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA, filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE,
dtype={ dtype={
self.GEOID_FIELD_NAME: "string", self.GEOID_FIELD_NAME: "string",
}, },
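The class-level source path plus classmethod guard used here is a small pattern worth seeing on its own: because the path lives on the class rather than the instance, callers can check for the optional, manually supplied file before any ETL object exists. The path and column name in this sketch are assumptions, not the real dataset.

```python
from pathlib import Path

import pandas as pd


class AreasOfConcernExample:
    # Class attribute so the check below works without instantiating the ETL.
    SOURCE = Path("data/sources/EJSCREENAreasOfConcernETL/input.csv")  # assumed path

    @classmethod
    def data_exists(cls) -> bool:
        return cls.SOURCE.is_file()

    def extract(self) -> None:
        if self.data_exists():
            self.df = pd.read_csv(self.SOURCE, dtype={"GEOID10": "string"})  # assumed field
        else:
            # Non-public input: downstream steps simply skip this dataset.
            self.df = pd.DataFrame()
```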
View file
@ -5,18 +5,27 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
def __init__(self): def __init__(self):
self.DEFINITION_ALTERNATIVE_FILE_URL = (
# fetch
self.definition_alternative_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/alternative DAC definition.csv.zip" + "/alternative DAC definition.csv.zip"
) )
# input
self.definition_alternative_source = (
self.get_sources_path() / "J40 alternative DAC definition.csv"
)
# output
self.OUTPUT_PATH: Path = ( self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "energy_definition_alternative_draft" self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
) )
@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
self.df: pd.DataFrame self.df: pd.DataFrame
def extract(self) -> None: def get_data_sources(self) -> [DataSource]:
unzip_file_from_url( return [
file_url=self.DEFINITION_ALTERNATIVE_FILE_URL, ZIPDataSource(
download_path=self.get_tmp_path(), source=self.definition_alternative_url,
unzipped_file_path=self.get_tmp_path() destination=self.get_sources_path(),
/ "energy_definition_alternative_draft",
) )
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.definition_alternative_source,
/ "energy_definition_alternative_draft"
/ "J40 alternative DAC definition.csv",
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={ dtype={
self.TRACT_INPUT_COLUMN_NAME: "string", self.TRACT_INPUT_COLUMN_NAME: "string",
@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
) )
def transform(self) -> None: def transform(self) -> None:
self.df = self.df.rename( self.df = self.df.rename(
columns={ columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
View file
@ -4,8 +4,9 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
def __init__(self): def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS: if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.AGGREGATED_RSEI_SCORE_FILE_URL = ( self.aggregated_rsei_score_file_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"epa_rsei/CensusMicroTracts2019_2019_aggregated.zip" "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
) )
else: else:
self.AGGREGATED_RSEI_SCORE_FILE_URL = ( self.aggregated_rsei_score_file_url = (
"http://abt-rsei.s3.amazonaws.com/microdata2019/" "http://abt-rsei.s3.amazonaws.com/microdata2019/"
"census_agg/CensusMicroTracts2019_2019_aggregated.zip" "census_agg/CensusMicroTracts2019_2019_aggregated.zip"
) )
# input
self.aggregated_rsei_score_source = (
self.get_sources_path()
/ "CensusMicroTracts2019_2019_aggregated.csv"
)
# output
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei" self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75 self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
self.TRACT_INPUT_COLUMN_NAME = "GEOID10" self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.df: pd.DataFrame self.df: pd.DataFrame
def extract(self) -> None: def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.aggregated_rsei_score_file_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# the column headers from the above dataset are actually a census tract's data at this point # the column headers from the above dataset are actually a census tract's data at this point
# We will use this data structure later to specify the column names # We will use this data structure later to specify the column names
input_columns = [ input_columns = [
@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.NCSCORE_INPUT_FIELD, self.NCSCORE_INPUT_FIELD,
] ]
unzip_file_from_url(
file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "epa_rsei",
)
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.aggregated_rsei_score_source,
/ "epa_rsei"
/ "CensusMicroTracts2019_2019_aggregated.csv",
# The following need to remain as strings for all of their digits, not get # The following need to remain as strings for all of their digits, not get
# converted to numbers. # converted to numbers.
low_memory=False, low_memory=False,
View file
@ -5,6 +5,8 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad):
NAME = "fsf_flood_risk" NAME = "fsf_flood_risk"
# These data were emailed to the J40 team while first street got # These data were emailed to the J40 team while first street got
# their official data sharing channels setup. # their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
def __init__(self): def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = ( # fetch
self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv" self.flood_tract_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
) )
# this is the main dataframe # input
self.df: pd.DataFrame self.flood_tract_source = (
self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv"
)
# Start dataset-specific vars here # Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad):
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30" self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
self.CLIP_PROPERTIES_COUNT = 250 self.CLIP_PROPERTIES_COUNT = 250
self.df_fsf_flood: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.flood_tract_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv data source then rename the
# Census Tract column for merging
self.df_fsf_flood = pd.read_csv(
self.flood_tract_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None: def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following """Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method: transformations to prepare it for the load() method:
@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad):
- Calculates share of properties at risk, left-clipping number of properties at 250 - Calculates share of properties at risk, left-clipping number of properties at 250
""" """
# read in the unzipped csv data source then rename the self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[
# Census Tract column for merging
df_fsf_flood: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[
self.INPUT_GEOID_TRACT_FIELD_NAME self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11) ].str.zfill(11)
df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[ self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT) ].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = ( self.df_fsf_flood[
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY] self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY
/ df_fsf_flood[self.COUNT_PROPERTIES] ] = (
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
) )
df_fsf_flood[ self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
] = ( ] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_flood[self.COUNT_PROPERTIES] / self.df_fsf_flood[self.COUNT_PROPERTIES]
) )
# Assign the final df to the class' output_df for the load method with rename # Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_flood.rename( self.output_df = self.df_fsf_flood.rename(
columns={ columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY, self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,
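A worked example of the share calculation described in the docstring above (the wildfire ETL below follows the same arithmetic): the property count is left-clipped at 250 before dividing, so tiny tracts cannot produce extreme ratios. Column names and numbers here are made up.

```python
import pandas as pd

CLIP_PROPERTIES_COUNT = 250

df = pd.DataFrame(
    {
        "count_properties": [40, 1_000],
        "at_risk_today": [10, 300],
        "at_risk_in_30_years": [20, 450],
    }
)

# Left-clip the denominator so very small tracts cannot produce extreme shares.
clipped_count = df["count_properties"].clip(lower=CLIP_PROPERTIES_COUNT)

df["share_at_risk_today"] = df["at_risk_today"] / clipped_count
df["share_at_risk_in_30_years"] = df["at_risk_in_30_years"] / clipped_count

# First row: 10 / 250 = 0.04 rather than 10 / 40 = 0.25.
print(df[["share_at_risk_today", "share_at_risk_in_30_years"]])
```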
View file
@ -4,6 +4,8 @@ import pandas as pd
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
NAME = "fsf_wildfire_risk" NAME = "fsf_wildfire_risk"
# These data were emailed to the J40 team while first street got # These data were emailed to the J40 team while first street got
# their official data sharing channels setup. # their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str
def __init__(self): def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv"
# fetch
self.fsf_fire_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
)
# input
self.fsf_fire_source = (
self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv"
)
# output
# this is the main dataframe # this is the main dataframe
self.df: pd.DataFrame self.df: pd.DataFrame
self.df_fsf_fire: pd.DataFrame
# Start dataset-specific vars here # Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag" self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag"
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag" self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag"
self.CLIP_PROPERTIES_COUNT = 250 self.CLIP_PROPERTIES_COUNT = 250
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.fsf_fire_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_fsf_fire = pd.read_csv(
self.fsf_fire_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None: def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following """Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method: transformations to prepare it for the load() method:
@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad):
""" """
# read in the unzipped csv data source then rename the # read in the unzipped csv data source then rename the
# Census Tract column for merging # Census Tract column for merging
df_fsf_fire: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[ self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[
self.INPUT_GEOID_TRACT_FIELD_NAME self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11) ].str.zfill(11)
df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[ self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT) ].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = ( self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY] self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_fire[self.COUNT_PROPERTIES] / self.df_fsf_fire[self.COUNT_PROPERTIES]
) )
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = ( self.df_fsf_fire[
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS
/ df_fsf_fire[self.COUNT_PROPERTIES] ] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
) )
# Assign the final df to the class' output_df for the load method with rename # Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_fire.rename( self.output_df = self.df_fsf_fire.rename(
columns={ columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY, self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS,
View file
@ -3,17 +3,33 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class GeoCorrETL(ExtractTransformLoad): class GeoCorrETL(ExtractTransformLoad):
NAME = "geocorr" NAME = "geocorr"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self): def __init__(self):
# fetch
self.geocorr_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip"
)
# input
self.geocorr_source = (
self.get_sources_path() / "geocorr_urban_rural.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr" self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"
# Need to change hyperlink to S3 # Need to change hyperlink to S3
@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad):
# The source data for this notebook was downloaded from GeoCorr; # The source data for this notebook was downloaded from GeoCorr;
# the instructions for generating the source data is here: # the instructions for generating the source data is here:
# https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787 # https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787
self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT" self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.COLUMNS_TO_KEEP = [ self.COLUMNS_TO_KEEP = [
@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad):
self.df: pd.DataFrame self.df: pd.DataFrame
def extract(self) -> None: def get_data_sources(self) -> [DataSource]:
unzip_file_from_url( return [
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL ZIPDataSource(
+ "/geocorr_urban_rural.csv.zip", source=self.geocorr_url, destination=self.get_sources_path()
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path(),
) )
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv", filepath_or_buffer=self.geocorr_source,
dtype={ dtype={
self.GEOCORR_GEOID_FIELD_NAME: "string", self.GEOCORR_GEOID_FIELD_NAME: "string",
}, },
View file
@ -3,12 +3,16 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class HistoricRedliningETL(ExtractTransformLoad): class HistoricRedliningETL(ExtractTransformLoad):
NAME = "historic_redlining" NAME = "historic_redlining"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
EXPECTED_MISSING_STATES = [ EXPECTED_MISSING_STATES = [
"10", "10",
@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad):
] ]
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
def __init__(self): def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"
self.HISTORIC_REDLINING_FILE_PATH = ( # fetch
self.get_tmp_path() / "HRS_2010.xlsx" self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
)
# input
self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx"
self.REDLINING_SCALAR = "Tract-level redlining score" self.REDLINING_SCALAR = "Tract-level redlining score"
@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad):
self.GEOID_TRACT_FIELD_NAME, self.GEOID_TRACT_FIELD_NAME,
self.REDLINING_SCALAR, self.REDLINING_SCALAR,
] ]
self.df: pd.DataFrame self.df: pd.DataFrame
self.historic_redlining_data: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.hrs_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.historic_redlining_data = pd.read_excel(self.hrs_source)
def transform(self) -> None: def transform(self) -> None:
# this is obviously temporary # this is obviously temporary
historic_redlining_data = pd.read_excel(
self.HISTORIC_REDLINING_FILE_PATH self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
) )
historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = ( self.historic_redlining_data = self.historic_redlining_data.rename(
historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data = historic_redlining_data.rename(
columns={"HRS2010": self.REDLINING_SCALAR} columns={"HRS2010": self.REDLINING_SCALAR}
) )
logger.debug(f"{historic_redlining_data.columns}") logger.debug(f"{self.historic_redlining_data.columns}")
# Calculate lots of different score thresholds for convenience # Calculate lots of different score thresholds for convenience
for threshold in [3.25, 3.5, 3.75]: for threshold in [3.25, 3.5, 3.75]:
historic_redlining_data[ self.historic_redlining_data[
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
] = (historic_redlining_data[self.REDLINING_SCALAR] >= threshold) ] = (
self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold
)
## NOTE We add to columns to keep here ## NOTE We add to columns to keep here
self.COLUMNS_TO_KEEP.append( self.COLUMNS_TO_KEEP.append(
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
) )
self.output_df = historic_redlining_data self.output_df = self.historic_redlining_data
View file
@@ -1,8 +1,9 @@
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
 from pandas.errors import EmptyDataError

 logger = get_module_logger(__name__)
@@ -10,36 +11,46 @@ logger = get_module_logger(__name__)

 class HousingTransportationETL(ExtractTransformLoad):
     def __init__(self):
-        self.HOUSING_FTP_URL = (
-            "https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
-        )
         self.OUTPUT_PATH = (
             self.DATA_PATH / "dataset" / "housing_and_transportation_index"
         )
         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        housing_url = (
+            "https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
+        )
+
+        sources = []
+        for fips in get_state_fips_codes(self.DATA_PATH):
+            sources.append(
+                ZIPDataSource(
+                    source=f"{housing_url}{fips}",
+                    destination=self.get_sources_path(),
+                )
+            )
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         # Download each state / territory individually
         dfs = []
-        zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
         for fips in get_state_fips_codes(self.DATA_PATH):
-            logger.debug(
-                f"Downloading housing data for state/territory with FIPS code {fips}"
-            )
-            unzip_file_from_url(
-                f"{self.HOUSING_FTP_URL}{fips}",
-                self.get_tmp_path(),
-                zip_file_dir,
-            )
-
-            # New file name:
-            tmp_csv_file_path = (
-                zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv"
+            csv_source = (
+                self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv"
             )
             try:
-                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+                tmp_df = pd.read_csv(filepath_or_buffer=csv_source)
             except EmptyDataError:
                 logger.error(
                     f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
View file
@@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)


 class HudHousingETL(ExtractTransformLoad):
     NAME = "hud_housing"
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT

     def __init__(self):
-        self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
-
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.HOUSING_FTP_URL = (
+            self.housing_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "hud_housing/2014thru2018-140-csv.zip"
             )
         else:
-            self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+            self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+
+        # source
+
+        # output
+        self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
         self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
@@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad):
         self.df: pd.DataFrame

-    def extract(self) -> None:
-        super().extract(
-            self.HOUSING_FTP_URL,
-            self.HOUSING_ZIP_FILE_DIR,
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.housing_url, destination=self.get_sources_path()
+            )
+        ]

     def _read_chas_table(self, file_name):
-        # New file name:
-        tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name
+        tmp_csv_file_path = self.get_sources_path() / "140" / file_name
         tmp_df = pd.read_csv(
             filepath_or_buffer=tmp_csv_file_path,
             encoding="latin-1",
@@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad):
         return tmp_df

-    def transform(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         table_8 = self._read_chas_table("Table8.csv")
         table_3 = self._read_chas_table("Table3.csv")
@@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad):
             table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME
         )

+    def transform(self) -> None:
+
         # Calculate share that lacks indoor plumbing or kitchen
         # This is computed as
         # (
View file
@@ -1,7 +1,9 @@
 import pandas as pd
-import requests
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.utils import get_module_logger
@@ -11,44 +13,51 @@ logger = get_module_logger(__name__)

 class HudRecapETL(ExtractTransformLoad):
     def __init__(self):
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.HUD_RECAP_CSV_URL = (
+            self.hud_recap_csv_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
             )
         else:
-            self.HUD_RECAP_CSV_URL = (
+            self.hud_recap_csv_url = (
                 "https://opendata.arcgis.com/api/v3/datasets/"
                 "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
             )

-        self.HUD_RECAP_CSV = (
-            self.get_tmp_path()
+        # input
+        self.hud_recap_source = (
+            self.get_sources_path()
             / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
         )

+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"

-        # Definining some variable names
+        # Defining some variable names
         self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
             "hud_recap_priority_community"
         )

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        download = requests.get(
-            self.HUD_RECAP_CSV_URL,
-            verify=None,
-            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
-        )
-        file_contents = download.content
-        csv_file = open(self.HUD_RECAP_CSV, "wb")
-        csv_file.write(file_contents)
-        csv_file.close()
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.hud_recap_csv_url, destination=self.hud_recap_source
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        # Load comparison index (CalEnviroScreen 4)
+        self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})

     def transform(self) -> None:
-        # Load comparison index (CalEnviroScreen 4)
-        self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
         self.df.rename(
             columns={
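Single, non-zipped downloads such as this one use `FileDataSource` rather than `ZIPDataSource`. A short, hypothetical sketch of driving one of these refactored ETLs with cached sources; the import path and the exact call sequence are assumptions for illustration, not something this commit adds:

```python
# Hypothetical driver for a refactored ETL; the module path is assumed.
from data_pipeline.etl.sources.hud_recap.etl import HudRecapETL

etl = HudRecapETL()

# The declared sources (here a single FileDataSource) describe what to fetch
# and where the downloaded file should be placed.
for data_source in etl.get_data_sources():
    print(data_source)

# Reuse files already present under get_sources_path() instead of re-downloading.
etl.extract(use_cached_data_sources=True)
etl.transform()
etl.load()
```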
View file
@@ -2,6 +2,8 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -10,16 +12,25 @@ logger = get_module_logger(__name__)

 class MappingForEJETL(ExtractTransformLoad):
     def __init__(self):
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
-
-        self.MAPPING_FOR_EJ_VA_URL = (
+        # fetch
+        self.mapping_for_ej_va_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
         )
-        self.MAPPING_FOR_EJ_CO_URL = (
+        self.mapping_for_ej_co_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
         )

-        self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
-        self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"
+        # input
+        self.va_shp_file_source = (
+            self.get_sources_path() / "mej_virginia_7_1.shp"
+        )
+        self.co_shp_file_source = (
+            self.get_sources_path() / "mej_colorado_final.shp"
+        )
+
+        # output
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"

         # Defining variables
         self.COLUMNS_TO_KEEP = [
@@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad):
         self.df: pd.DataFrame

-    def extract(self) -> None:
-        super().extract(
-            self.MAPPING_FOR_EJ_VA_URL,
-            self.get_tmp_path(),
-        )
-        super().extract(
-            self.MAPPING_FOR_EJ_CO_URL,
-            self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.mapping_for_ej_va_url,
+                destination=self.get_sources_path(),
+            ),
+            ZIPDataSource(
+                source=self.mapping_for_ej_co_url,
+                destination=self.get_sources_path(),
+            ),
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

-    def transform(self) -> None:
         # Join (here, it's just concatenating) the two dataframes from
         # CO and VA
         self.df = pd.concat(
             [
-                gpd.read_file(self.VA_SHP_FILE_PATH),
-                gpd.read_file(self.CO_SHP_FILE_PATH),
+                gpd.read_file(self.va_shp_file_source),
+                gpd.read_file(self.co_shp_file_source),
             ]
         )

+    def transform(self) -> None:
         # Fill Census tract to get it to be 11 digits, incl. leading 0s
         # Note that VA and CO should never have leading 0s, so this isn't
         # strictly necessary, but if in the future, there are more states
View file
@@ -3,8 +3,9 @@ import pathlib
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad):
     Information on the mapping of this data to census tracts is available at
     https://github.com/americanpanorama/Census_HOLC_Research.
     """

     def __init__(self):
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.MAPPING_INEQUALITY_CSV_URL = (
+            self.mapping_inequality_csv_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "mapping_inequality/holc_tract_lookup.csv"
             )
         else:
-            self.MAPPING_INEQUALITY_CSV_URL = (
+            self.mapping_inequality_csv_url = (
                 "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
                 "main/2010_Census_Tracts/holc_tract_lookup.csv"
             )

-        self.MAPPING_INEQUALITY_CSV = (
-            self.get_tmp_path() / "holc_tract_lookup.csv"
-        )
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
-
-        self.HOLC_MANUAL_MAPPING_CSV_PATH = (
+        # input
+        self.mapping_inequality_source = (
+            self.get_sources_path() / "holc_tract_lookup.csv"
+        )
+        self.holc_manual_mapping_source = (  # here be dragons this file is pulled from a different place than most
             pathlib.Path(__file__).parent
             / "data"
             / "holc_grades_manually_mapped.csv"
         )

+        # output
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"

         # Some input field names. From documentation: 'Census Tracts were intersected
         # with HOLC Polygons. Census information can be joined via the "geoid" field.
         # There are two field "holc_prop" and "tract_prop" which give the proportion
@@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad):
         ]

         self.df: pd.DataFrame
+        self.holc_manually_mapped_df: pd.DataFrame

-    def extract(self) -> None:
-        download_file_from_url(
-            file_url=self.MAPPING_INEQUALITY_CSV_URL,
-            download_file_name=self.MAPPING_INEQUALITY_CSV,
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.mapping_inequality_csv_url,
+                destination=self.mapping_inequality_source,
+            )
+        ]

-    def transform(self) -> None:
-        df: pd.DataFrame = pd.read_csv(
-            self.MAPPING_INEQUALITY_CSV,
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df = pd.read_csv(
+            self.mapping_inequality_source,
             dtype={self.TRACT_INPUT_FIELD: "string"},
             low_memory=False,
         )

+        # Some data needs to be manually mapped to its grade.
+        # TODO: Investigate more data that may need to be manually mapped.
+        self.holc_manually_mapped_df = pd.read_csv(
+            filepath_or_buffer=self.holc_manual_mapping_source,
+            low_memory=False,
+        )
+
+    def transform(self) -> None:
         # rename Tract ID
-        df.rename(
+        self.df.rename(
             columns={
                 self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
             },
@@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad):
         # Keep the first character, which is the HOLC grade (A, B, C, D).
         # TODO: investigate why this dataframe triggers these pylint errors.
         # pylint: disable=unsupported-assignment-operation, unsubscriptable-object
-        df[self.HOLC_GRADE_DERIVED_FIELD] = df[
+        self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[
             self.HOLC_GRADE_AND_ID_FIELD
         ].str[0:1]

         # Remove nonsense when the field has no grade or invalid grades.
         valid_grades = ["A", "B", "C", "D"]
-        df.loc[
+        self.df.loc[
             # pylint: disable=unsubscriptable-object
-            ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
+            ~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
             self.HOLC_GRADE_DERIVED_FIELD,
         ] = None

-        # Some data needs to be manually mapped to its grade.
-        # TODO: Investigate more data that may need to be manually mapped.
-        holc_manually_mapped_df = pd.read_csv(
-            filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
-            low_memory=False,
-        )
-
         # Join on the existing data
-        merged_df = df.merge(
-            right=holc_manually_mapped_df,
+        merged_df = self.df.merge(
+            right=self.holc_manually_mapped_df,
             on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
             how="left",
         )
View file
@@ -4,6 +4,8 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MARYLAND_EJSCREEN_URL = (
+        # fetch
+        self.maryland_ejscreen_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
         )

-        self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
+        # input
+        self.shape_files_source = self.get_sources_path() / "mdejscreen"
+
+        # output
         self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"

         self.COLUMNS_TO_KEEP = [
@@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
         ]

         self.df: pd.DataFrame
+        self.dfs_list: pd.DataFrame

-    def extract(self) -> None:
-        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
-        super().extract(
-            self.MARYLAND_EJSCREEN_URL,
-            self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.maryland_ejscreen_url,
+                destination=self.get_sources_path(),
+            )
+        ]

-    def transform(self) -> None:
-        list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
-        # Ignore counties becauses this is not the level of measurement
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
+
+        list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))
+
+        # Ignore counties because this is not the level of measurement
         # that is consistent with our current scoring and ranking methodology.
-        dfs_list = [
+        self.dfs_list = [
             gpd.read_file(f)
             for f in list_of_files
             if not f.endswith("CountiesEJScore.shp")
         ]

+    def transform(self) -> None:
         # Set the Census tract as the index and drop the geometry column
         # that produces the census tract boundaries.
         # The latter is because Geopandas raises an exception if there
         # are duplicate geometry columns.
         # Moreover, since the unit of measurement is at the tract level
         # we can consistantly merge this with other datasets
-        dfs_list = [
+        self.dfs_list = [
             df.set_index("Census_Tra").drop("geometry", axis=1)
-            for df in dfs_list
+            for df in self.dfs_list
         ]
         # pylint: disable=unsubscriptable-object
-        self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
+        self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))

         # Reset index so that we no longer have the tract as our index
         self.df = self.df.reset_index()
View file
@@ -1,6 +1,8 @@
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MICHIGAN_EJSCREEN_S3_URL = (
+        # fetch
+        self.michigan_ejscreen_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/michigan_ejscore_12212021.csv"
         )

+        # input
+        self.michigan_ejscreen_source = (
+            self.get_sources_path() / "michigan_ejscore_12212021.csv"
+        )
+
+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"
         self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75

         self.COLUMNS_TO_KEEP = [
@@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.michigan_ejscreen_url,
+                destination=self.michigan_ejscreen_source,
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         self.df = pd.read_csv(
-            filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
+            filepath_or_buffer=self.michigan_ejscreen_source,
             dtype={"GEO_ID": "string"},
             low_memory=False,
         )

     def transform(self) -> None:
         self.df.rename(
             columns={
                 "GEO_ID": self.GEOID_TRACT_FIELD_NAME,
View file
@@ -4,6 +4,8 @@
 # pylint: disable=unsupported-assignment-operation

 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):

     NAME = "national_risk_index"

-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        SOURCE_URL = (
-            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
-            "national_risk_index/NRI_Table_CensusTracts.zip"
-        )
-    else:
-        SOURCE_URL = (
-            "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
-            "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
-        )
-
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     AGRIVALUE_LOWER_BOUND = 408000

     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
+        # fetch
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.risk_index_url = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "national_risk_index/NRI_Table_CensusTracts.zip"
+            )
+        else:
+            self.risk_index_url = (
+                "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
+                "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
+            )
+
+        # source
+        self.risk_index_source = (
+            self.get_sources_path() / "NRI_Table_CensusTracts.csv"
+        )
+
+        # output

         # this is the main dataframe
         self.df: pd.DataFrame
+        self.df_nri: pd.DataFrame

         # Start dataset-specific vars here
         self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
@@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
         self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"

-    def extract(self) -> None:
-        """Unzips NRI dataset from the FEMA data source and writes the files
-        to the temporary data folder for use in the transform() method
-        """
-        super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.risk_index_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        # read in the unzipped csv from NRI data source then rename the
+        # Census Tract column for merging
+        self.df_nri = pd.read_csv(
+            self.risk_index_source,
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
+            na_values=["None"],
+            low_memory=False,
+        )

     def transform(self) -> None:
@@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         Groups inside of that Tract
         """

-        # read in the unzipped csv from NRI data source then rename the
-        # Census Tract column for merging
-        df_nri: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
-            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
-            na_values=["None"],
-            low_memory=False,
-        )
-
-        df_nri.rename(
+        self.df_nri.rename(
             columns={
                 self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                 self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         agriculture_columns = [
             f"{x}_EALA"
             for x in disaster_categories
-            if f"{x}_EALA" in list(df_nri.columns)
+            if f"{x}_EALA" in list(self.df_nri.columns)
         ]

         population_columns = [
             f"{x}_EALP"
             for x in disaster_categories
-            if f"{x}_EALP" in list(df_nri.columns)
+            if f"{x}_EALP" in list(self.df_nri.columns)
         ]

         buildings_columns = [
             f"{x}_EALB"
             for x in disaster_categories
-            if f"{x}_EALB" in list(df_nri.columns)
+            if f"{x}_EALB" in list(self.df_nri.columns)
         ]

-        disaster_population_sum_series = df_nri[population_columns].sum(axis=1)
-        disaster_agriculture_sum_series = df_nri[agriculture_columns].sum(
-            axis=1
-        )
-        disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1)
+        disaster_population_sum_series = self.df_nri[population_columns].sum(
+            axis=1
+        )
+        disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum(
+            axis=1
+        )
+        disaster_buildings_sum_series = self.df_nri[buildings_columns].sum(
+            axis=1
+        )

         # Population EAL Rate = Eal Valp / Population
-        df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
+        self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
             disaster_population_sum_series
-            / df_nri[self.POPULATION_INPUT_FIELD_NAME]
+            / self.df_nri[self.POPULATION_INPUT_FIELD_NAME]
         )

         # Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000)
         ## FORMULA ADJUSTMENT 2/17
         ## Because AGRIVALUE contains a lot of 0s, we are going to consider
         ## 90th percentile only for places that have some agrivalue at all
-        df_nri[
+        self.df_nri[
             self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
-        ] = disaster_agriculture_sum_series / df_nri[
+        ] = disaster_agriculture_sum_series / self.df_nri[
             self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME
         ].clip(
             lower=self.AGRIVALUE_LOWER_BOUND
@@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
         base_expectation = (
             disaster_agriculture_sum_series
-            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
+            / self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
         )
         assert (
-            df_nri[
-                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            self.df_nri[
+                self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
                 != base_expectation
             ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
             <= self.AGRIVALUE_LOWER_BOUND
@@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         )

         assert (
-            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
             != base_expectation
         ).sum() > 0, "Clipping the agrivalue did nothing!"

         # This produces a boolean that is True in the case of non-zero agricultural value
-        df_nri[self.CONTAINS_AGRIVALUE] = (
-            df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
+        self.df_nri[self.CONTAINS_AGRIVALUE] = (
+            self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
         )

         # divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)).
-        df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
+        self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
             disaster_buildings_sum_series
-            / df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
+            / self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
         )

         # Round all float columns to just 10 digits.
         # Note: `round` is smart enough to only apply to float columns.
-        df_nri = df_nri.round(10)
+        self.df_nri = self.df_nri.round(10)

         # Assign the final df to the class' output_df for the load method
-        self.output_df = df_nri
+        self.output_df = self.df_nri

     def load(self) -> None:
         # Suppress scientific notation.
View file
@@ -3,6 +3,8 @@

 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
@@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
     """ETL class for the Nature Deprived Communities dataset"""

     NAME = "nlcd_nature_deprived"
-    SOURCE_URL = (
-        settings.AWS_JUSTICE40_DATASOURCES_URL
-        + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
-    )
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad):
     TRACT_PERCENT_CROPLAND_FIELD_NAME: str

     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = (
-            self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
+        # fetch
+        self.nature_deprived_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
         )

+        # source
+        # define the full path for the input CSV file
+        self.nature_deprived_source = (
+            self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
+        )
+
+        # output
+
         # this is the main dataframe
         self.df: pd.DataFrame
+        self.df_ncld: pd.DataFrame

         # Start dataset-specific vars here
         self.PERCENT_NATURAL_FIELD_NAME = "PctNatural"
         self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv"
@@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad):
         # for area. This does indeed remove tracts from the 90th+ percentile later on
         self.TRACT_ACRES_LOWER_BOUND = 35

-    def transform(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.nature_deprived_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         """Reads the unzipped data file into memory and applies the following
         transformations to prepare it for the load() method:

         - Renames columns as needed
         """
-        df_ncld: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df_ncld = pd.read_csv(
+            self.nature_deprived_source,
             dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
             low_memory=False,
         )

-        df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
-            df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND
+    def transform(self) -> None:
+
+        self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
+            self.df_ncld[self.TRACT_ACRES_FIELD_NAME]
+            >= self.TRACT_ACRES_LOWER_BOUND
         )
-        df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
-            100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
+        self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
+            100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
         )

         # Assign the final df to the class' output_df for the load method with rename
-        self.output_df = df_ncld.rename(
+        self.output_df = self.df_ncld.rename(
             columns={
                 self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME,
                 self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME,
View file
@@ -3,9 +3,10 @@ import functools

 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url

 logger = get_module_logger(__name__)
@@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False

     def __init__(self):
+        # fetch
+        self.poverty_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip"
+        )
+
+        # source
+        self.poverty_sources = [
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_1990_sample.csv",
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_2000_sample.csv",
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_2010_sample.csv",
+        ]
+
+        # output
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"

         # Need to change hyperlink to S3
@@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad):

         self.df: pd.DataFrame

+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.poverty_url, destination=self.get_sources_path()
+            )
+        ]
+
     def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
         df = functools.reduce(
             lambda df_a, df_b: pd.merge(
@@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad):

         return df

-    def extract(self) -> None:
-        unzipped_file_path = self.get_tmp_path()
+    def extract(self, use_cached_data_sources: bool = False) -> None:

-        unzip_file_from_url(
-            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/LTDB_Std_All_Sample.zip",
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=unzipped_file_path,
-        )
-
-        file_names = [
-            "ltdb_std_1990_sample.csv",
-            "ltdb_std_2000_sample.csv",
-            "ltdb_std_2010_sample.csv",
-        ]
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         temporary_input_dfs = []

-        for file_name in file_names:
+        for file_name in self.poverty_sources:
             temporary_input_df = pd.read_csv(
-                filepath_or_buffer=unzipped_file_path
-                / f"ltdb_std_all_sample/{file_name}",
+                filepath_or_buffer=file_name,
                 dtype={
                     self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
                     self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",
View file
@@ -1,6 +1,8 @@
 import geopandas as gpd
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
@@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
-        self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
+        # input
+        self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv"
+
+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
         self.df: gpd.GeoDataFrame
+        self.tes_state_dfs = []
+
+        # config
         self.states = [
             "al",
             "az",
@@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad):
             "wy",
         ]

-    def extract(self) -> None:
-        for state in self.states:
-            super().extract(
-                f"{self.TES_URL}{state}.zip.zip",
-                f"{self.get_tmp_path()}/{state}",
-            )
+    def get_data_sources(self) -> [DataSource]:
+        tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
+
+        sources = []
+        for state in self.states:
+            sources.append(
+                ZIPDataSource(
+                    source=f"{tes_url}{state}.zip.zip",
+                    destination=self.get_sources_path() / state,
+                )
+            )
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        for state in self.states:
+            self.tes_state_dfs.append(
+                gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp")
+            )

     def transform(self) -> None:
-        tes_state_dfs = []
-        for state in self.states:
-            tes_state_dfs.append(
-                gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
-            )
-
         self.df = gpd.GeoDataFrame(
-            pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
+            pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs
         )

         # rename ID to Tract ID
View file
@@ -4,63 +4,57 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url

 logger = get_module_logger(__name__)


 class TribalETL(ExtractTransformLoad):
     def __init__(self):
-        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
         self.GEOGRAPHIC_BASE_PATH = (
             self.DATA_PATH / "tribal" / "geographic_data"
         )
+        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
         self.NATIONAL_TRIBAL_GEOJSON_PATH = (
             self.GEOGRAPHIC_BASE_PATH / "usa.json"
         )
         self.USA_TRIBAL_DF_LIST = []

-    def extract(self) -> None:
-        """Extract the tribal geojson zip files from Justice40 S3 data folder
-
-        Returns:
-            None
-        """
-        bia_shapefile_zip_url = (
+    def get_data_sources(self) -> [DataSource]:
+
+        national_lar_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/BIA_National_LAR_updated_20220929.zip"
         )

-        tsa_and_aian_geojson_zip_url = (
+        tsa_and_aian_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/BIA_TSA_and_AIAN_json.zip"
         )

-        alaska_geojson_url = (
+        alaska_native_villages_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/Alaska_Native_Villages_json.zip"
         )

-        unzip_file_from_url(
-            bia_shapefile_zip_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
-        )
-
-        unzip_file_from_url(
-            tsa_and_aian_geojson_zip_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
-        )
-
-        unzip_file_from_url(
-            alaska_geojson_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
-        )
+        return [
+            ZIPDataSource(
+                national_lar_url,
+                destination=self.get_sources_path() / "bia_national_lar",
+            ),
+            ZIPDataSource(
+                source=tsa_and_aian_url,
+                destination=self.get_sources_path() / "tsa_and_aian",
+            ),
+            ZIPDataSource(
+                source=alaska_native_villages_url,
+                destination=self.get_sources_path() / "alaska_native_villages",
+            ),
+        ]

     def _transform_bia_national_lar(self, path: Path) -> None:
         """Transform the Tribal BIA National Lar Geodataframe and appends it to the
@@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad):
         """
         # Set the filepaths:
         bia_national_lar_shapefile = (
-            self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
+            self.get_sources_path() / "bia_national_lar"
         )

         bia_aian_supplemental_geojson = (
-            self.GEOGRAPHIC_BASE_PATH
+            self.get_sources_path()
             / "tsa_and_aian"
             / "BIA_AIAN_Supplemental.json"
         )

         bia_tsa_geojson = (
-            self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
+            self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json"
         )

         alaska_native_villages_geojson = (
-            self.GEOGRAPHIC_BASE_PATH
+            self.get_sources_path()
             / "alaska_native_villages"
             / "AlaskaNativeVillages.gdb.geojson"
         )
@@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad):
             "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
         )

+        # note this works a little different than many of the ETLs. The file
+        # being written here is used again downstream, so it's placed in a
+        # special directory.
         logger.debug("Writing national geojson file")
+        self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True)
         usa_tribal_df.to_file(
             self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
         )
View file
@@ -4,6 +4,7 @@ import geopandas as gpd
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
 from data_pipeline.etl.sources.geo_utils import get_tract_geojson
@@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad):
         self.census_tract_gdf: gpd.GeoDataFrame
         self.tribal_gdf: gpd.GeoDataFrame

+    def get_data_sources(self) -> [DataSource]:
+        return []  # this uses already retrieved / calculated data
+
     @staticmethod
     def _create_string_from_list(series: pd.Series) -> str:
         """Helper method that creates a sorted string list (for tribal names)."""
@@ -89,7 +93,12 @@ class TribalOverlapETL(ExtractTransformLoad):
         return percentage_float

-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         self.census_tract_gdf = get_tract_geojson()
         self.tribal_gdf = get_tribal_geojson()
View file
@@ -4,9 +4,10 @@ import geopandas as gpd
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad):

     def __init__(self):
-        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.FILE_URL = (
-                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
-                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
-                "all_data_reported_to_Congress_in_FY2020.geojson"
-            )
-        else:
-            self.FILE_URL: str = (
-                "https://opendata.arcgis.com/api/v3/datasets/"
-                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
-                "data?format=geojson&spatialRefId=4326&where=1%3D1"
-            )
-
         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"

         # Constants for output
@@ -50,17 +38,27 @@ class USArmyFUDS(ExtractTransformLoad):
             self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
             self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
         ]
-        self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"
+        self.fuds_source = self.get_sources_path() / "fuds.geojson"

         self.raw_df: gpd.GeoDataFrame
         self.output_df: pd.DataFrame

-    def extract(self) -> None:
-        download_file_from_url(
-            file_url=self.FILE_URL,
-            download_file_name=self.DOWNLOAD_FILE_NAME,
-            verify=True,
-        )
+    def get_data_sources(self) -> [DataSource]:
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            fuds_url = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
+                "all_data_reported_to_Congress_in_FY2020.geojson"
+            )
+        else:
+            fuds_url: str = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
+                "data?format=geojson&spatialRefId=4326&where=1%3D1"
+            )
+
+        return [FileDataSource(source=fuds_url, destination=self.fuds_source)]

     def transform(self) -> None:
         # before we try to do any transformation, get the tract data
@@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad):
         logger.debug("Loading FUDS data as GeoDataFrame for transform")
         raw_df = gpd.read_file(
-            filename=self.DOWNLOAD_FILE_NAME,
+            filename=self.fuds_source,
             low_memory=False,
         )
View file
@@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
         data. A basic version of that patching is included here for classes that can use it.
         """
+        data_path, tmp_path = mock_paths
+        sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+        sources_path.mkdir(parents=True, exist_ok=True)
+
         with mock.patch(
-            "data_pipeline.utils.requests"
+            "data_pipeline.etl.downloader.requests"
         ) as requests_mock, mock.patch(
+            "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+        ) as sources_mock, mock.patch(
             "data_pipeline.etl.score.etl_utils.get_state_fips_codes"
         ) as mock_get_state_fips_codes:
-            tmp_path = mock_paths[1]

+            # requests mock
             def fake_get(url, *args, **kwargs):
                 file_path = url.split("/")[-1]
                 with open(
@@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
                 return response_mock

             requests_mock.get = fake_get
+
+            # fips codes mock
             mock_get_state_fips_codes.return_value = [
                 x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
             ]

+            # sources mock
+            sources_mock.return_value = sources_path
+
             # Instantiate the ETL class.
             etl = self._get_instance_of_etl_class()

             # Monkey-patch the temporary directory to the one used in the test
             etl.TMP_PATH = tmp_path
+            etl.SOURCES_PATH = data_path / "sources"
+
             # Run the extract method.
             etl.extract()

+            def fake_get_sources_path() -> pathlib.PosixPath:
+                return sources_path
+
+            mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
+
             return etl

     def test_init(self, mock_etl, mock_paths):
View file
@@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
             mock_paths=mock_paths,
         )
         df = gpd.read_file(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
         )
         assert df.shape[0] == 30
View file
@@ -5,6 +5,7 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource

 logger = get_module_logger(__name__)
@@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
             self.EXAMPLE_FIELD_NAME,
         ]

+    def get_data_sources(self) -> [DataSource]:
+        return []
+
     def extract(self):
         # Pretend to download zip from external URL, write it to CSV.
         zip_file_path = (
@@ -42,11 +46,11 @@
         )

         with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-            zip_ref.extractall(self.get_tmp_path())
+            zip_ref.extractall(self.get_sources_path())

     def transform(self):
         df: pd.DataFrame = pd.read_csv(
-            self.get_tmp_path() / "input.csv",
+            self.get_sources_path() / "input.csv",
             dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
             low_memory=False,
         )
View file
@@ -124,12 +124,18 @@ class TestETL:
         data. A basic version of that patching is included here for classes that can use it.
         """
+        data_path, tmp_path = mock_paths
+        sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+        sources_path.mkdir(parents=True, exist_ok=True)
+
         with mock.patch(
-            "data_pipeline.utils.requests"
+            "data_pipeline.etl.downloader.requests"
         ) as requests_mock, mock.patch(
+            "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+        ) as sources_mock, mock.patch(
             "data_pipeline.etl.score.etl_utils.get_state_fips_codes"
         ) as mock_get_state_fips_codes:
-            tmp_path = mock_paths[1]
             if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
                 zip_file_fixture_src = (
                     self._DATA_DIRECTORY_FOR_TEST
@@ -145,6 +151,7 @@ class TestETL:
                     "rb",
                 ) as file:
                     file_contents = file.read()
+
                 response_mock = requests.Response()
                 response_mock.status_code = 200
                 # pylint: disable=protected-access
@@ -154,15 +161,25 @@ class TestETL:
             mock_get_state_fips_codes.return_value = [
                 x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
             ]

+            # sources mock
+            sources_mock.return_value = sources_path
+
             # Instantiate the ETL class.
             etl = self._get_instance_of_etl_class()

             # Monkey-patch the temporary directory to the one used in the test
             etl.TMP_PATH = tmp_path
+            etl.SOURCES_PATH = data_path / "sources"
+
             # Run the extract method.
             etl.extract()

+            def fake_get_sources_path() -> pathlib.PosixPath:
+                return sources_path
+
+            mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
+
             return etl

     def test_init_base(self, mock_etl, mock_paths):
@@ -263,17 +280,12 @@ class TestETL:
         file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
         """
         if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
-            tmp_path = mock_paths[1]
-
-            _ = self._setup_etl_instance_and_run_extract(
+            etl = self._setup_etl_instance_and_run_extract(
                 mock_etl=mock_etl,
                 mock_paths=mock_paths,
             )

-            assert (
-                tmp_path
-                / self._EXTRACT_TMP_FOLDER_NAME
-                / self._SAMPLE_DATA_FILE_NAME
-            ).exists()
+            assert (etl.get_sources_path()).exists()

     def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
         """Tests the extract method.
@@ -285,8 +297,11 @@ class TestETL:
             mock_etl=mock_etl,
             mock_paths=mock_paths,
         )
+        data_path, tmp_path = mock_paths

         tmp_df = pd.read_csv(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
         )

         snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
View file
@@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
             mock_paths=mock_paths,
         )
         tmp_df = pd.read_excel(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
        )
         assert tmp_df.shape == (15, 5)
View file
@@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):

     def test_init(self, mock_etl, mock_paths):
         """Tests that the mock NationalRiskIndexETL class instance was
-        initiliazed correctly.
-
-        Validates the following conditions:
-        - self.DATA_PATH points to the "data" folder in the temp directory
-        - self.TMP_PATH points to the "data/tmp" folder in the temp directory
-        - self.INPUT_PATH points to the correct path in the temp directory
-        - self.OUTPUT_PATH points to the correct path in the temp directory
+        initialized correctly.
         """
         # setup
         etl = NationalRiskIndexETL()
-        data_path, tmp_path = mock_paths
-        input_csv = (
-            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
-        )

         # validation
-        assert etl.INPUT_CSV == input_csv
         assert etl.GEOID_FIELD_NAME == "GEOID10"
         assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
         assert etl.NAME == "national_risk_index"