From 6f39033ddee026354d455537ced127e6eae18cf7 Mon Sep 17 00:00:00 2001 From: Travis Newby <83976412+travis-newby@users.noreply.github.com> Date: Fri, 3 Mar 2023 12:26:24 -0600 Subject: [PATCH] Add ability to cache ETL data sources (#2169) * Add a rough prototype allowing a developer to pre-download data sources for all ETLs * Update code to be more production-ish * Move fetch to Extract part of ETL * Create a downloader to house all downloading operations * Remove unnecessary "name" in data source * Format source files with black * Fix issues from pylint and get the tests working with the new folder structure * Clean up files with black * Fix unzip test * Add caching notes to README * Fix tests (linting and case sensitivity bug) * Address PR comments and add API keys for census where missing * Merging comparator changes from main into this branch for the sake of the PR * Add note on using cache (-u) during pipeline --- data/data-pipeline/README.md | 4 +- .../data_pipeline/application.py | 146 ++++++++++++++++-- data/data-pipeline/data_pipeline/etl/base.py | 89 +++++++---- .../data_pipeline/etl/datasource.py | 124 +++++++++++++++ .../data_pipeline/etl/downloader.py | 95 ++++++++++++ .../data-pipeline/data_pipeline/etl/runner.py | 74 +++++++-- .../data_pipeline/etl/score/etl_score.py | 10 +- .../data_pipeline/etl/score/etl_score_geo.py | 9 +- .../data_pipeline/etl/score/etl_score_post.py | 24 ++- .../data_pipeline/etl/score/etl_utils.py | 4 +- .../etl/sources/calenviroscreen/etl.py | 50 ++++-- .../etl/sources/cdc_life_expectancy/etl.py | 95 +++++++----- .../etl/sources/cdc_places/etl.py | 37 +++-- .../etl/sources/cdc_svi_index/etl.py | 34 +++- .../data_pipeline/etl/sources/census/etl.py | 41 ++--- .../etl/sources/census/etl_utils.py | 1 + .../etl/sources/census_acs/etl.py | 64 +++++--- .../etl/sources/census_acs_2010/etl.py | 37 +++-- .../sources/census_acs_median_income/etl.py | 104 ++++++++----- .../etl/sources/census_decennial/etl.py | 63 ++++++-- .../sources/child_opportunity_index/etl.py | 43 ++++-- .../etl/sources/doe_energy_burden/etl.py | 45 ++++-- .../etl/sources/dot_travel_composite/etl.py | 53 +++++-- .../data_pipeline/etl/sources/eamlis/etl.py | 49 ++++-- .../data_pipeline/etl/sources/ejscreen/etl.py | 38 +++-- .../sources/ejscreen_areas_of_concern/etl.py | 28 +++- .../etl.py | 38 +++-- .../data_pipeline/etl/sources/epa_rsei/etl.py | 40 +++-- .../etl/sources/fsf_flood_risk/etl.py | 68 +++++--- .../etl/sources/fsf_wildfire_risk/etl.py | 63 ++++++-- .../data_pipeline/etl/sources/geocorr/etl.py | 41 +++-- .../etl/sources/historic_redlining/etl.py | 51 ++++-- .../sources/housing_and_transportation/etl.py | 49 +++--- .../etl/sources/hud_housing/etl.py | 39 +++-- .../etl/sources/hud_recap/etl.py | 45 +++--- .../etl/sources/mapping_for_ej/etl.py | 54 +++++-- .../etl/sources/mapping_inequality/etl.py | 75 +++++---- .../etl/sources/maryland_ejscreen/etl.py | 45 ++++-- .../etl/sources/michigan_ejscreen/etl.py | 31 +++- .../etl/sources/national_risk_index/etl.py | 115 ++++++++------ .../etl/sources/nlcd_nature_deprived/etl.py | 55 +++++-- .../etl/sources/persistent_poverty/etl.py | 53 ++++--- .../etl/sources/tree_equity_score/etl.py | 48 ++++-- .../data_pipeline/etl/sources/tribal/etl.py | 66 ++++---- .../etl/sources/tribal_overlap/etl.py | 11 +- .../etl/sources/us_army_fuds/etl.py | 42 +++-- .../sources/cdc_life_expectancy/test_etl.py | 23 ++- .../sources/dot_travel_composite/test_etl.py | 2 +- .../tests/sources/example/etl.py | 8 +- .../tests/sources/example/test_etl.py | 35 +++-- 
.../sources/historic_redlining/test_etl.py | 2 +- .../sources/national_risk_index/test_etl.py | 13 +- 52 files changed, 1787 insertions(+), 686 deletions(-) create mode 100644 data/data-pipeline/data_pipeline/etl/datasource.py create mode 100644 data/data-pipeline/data_pipeline/etl/downloader.py diff --git a/data/data-pipeline/README.md b/data/data-pipeline/README.md index bd5d68e2..5372882b 100644 --- a/data/data-pipeline/README.md +++ b/data/data-pipeline/README.md @@ -92,7 +92,6 @@ If you want to run specific data tasks, you can open a terminal window, navigate - Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles` To learn more about these commands and when they should be run, refer to [Running for Local Development](#running-for-local-development). - --- @@ -136,6 +135,9 @@ Once you've downloaded the census data, run the following commands – in order Many commands have options. For example, you can run a single dataset with `etl-run` by passing the command line parameter `-d name-of-dataset-to-run`. Please use the `--help` option to find out more. +> :bulb: **NOTE** +> One important command line option is enabling cached data sources. Pass the command line parameter `-u` to many commands (e.g. `etl-run`) to use locally cached data sources within the ETL portion of the pipeline. This will ensure that you don't download many GB of data with each run of the data pipeline. + ## How Scoring Works Scores are generated by running the `score-run` command via Poetry or Docker. This command executes [`data_pipeline/etl/score/etl_score.py`](data_pipeline/etl/score/etl_score.py). During execution, diff --git a/data/data-pipeline/data_pipeline/application.py b/data/data-pipeline/data_pipeline/application.py index ad621894..a1c10865 100644 --- a/data/data-pipeline/data_pipeline/application.py +++ b/data/data-pipeline/data_pipeline/application.py @@ -7,6 +7,9 @@ from data_pipeline.etl.runner import etl_runner from data_pipeline.etl.runner import score_generate from data_pipeline.etl.runner import score_geo from data_pipeline.etl.runner import score_post +from data_pipeline.etl.runner import get_data_sources +from data_pipeline.etl.runner import extract_data_sources as extract_ds +from data_pipeline.etl.runner import clear_data_source_cache as clear_ds_cache from data_pipeline.etl.sources.census.etl_utils import check_census_data_source from data_pipeline.etl.sources.census.etl_utils import ( reset_data_directories as census_reset, @@ -79,7 +82,14 @@ def data_cleanup(): is_flag=True, help="Upload to AWS S3 a zipped archive of the census data.", ) -def census_data_download(zip_compress): +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def census_data_download(zip_compress, use_cache): """CLI command to download all census shape files from the Census FTP and extract the geojson to generate national and by state Census Block Group CSVs""" log_title("Download Census Data ") @@ -88,7 +98,7 @@ def census_data_download(zip_compress): census_reset(data_path) log_info("Downloading census data") - etl_runner("census") + etl_runner("census", use_cache) if zip_compress: log_info("Zipping census data") @@ -129,7 +139,14 @@ def pull_census_data(data_source: str): type=str, help=dataset_cli_help, ) -def etl_run(dataset: str): +@click.option( 
+ "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def etl_run(dataset: str, use_cache: bool): """Run a specific or all ETL processes Args: @@ -141,7 +158,7 @@ def etl_run(dataset: str): log_title("Run ETL") log_info("Running dataset(s)") - etl_runner(dataset) + etl_runner(dataset, use_cache) log_goodbye() sys.exit() @@ -167,7 +184,14 @@ def score_run(): @cli.command( help="Run ETL + Score Generation", ) -def score_full_run(): +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def score_full_run(use_cache: bool): """CLI command to run ETL and generate the score in one command""" log_title("Score Full Run", "Run ETL and Generate Score (no tiles)") @@ -177,7 +201,7 @@ def score_full_run(): temp_folder_cleanup() log_info("Running all ETLs") - etl_runner() + etl_runner(use_cache=use_cache) log_info("Generating score") score_generate() @@ -297,7 +321,14 @@ def generate_map_tiles(generate_tribal_layer): type=str, help=dataset_cli_help, ) -def data_full_run(check: bool, data_source: str): +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def data_full_run(check: bool, data_source: str, use_cache: bool): """CLI command to run ETL, score, JSON combine and generate tiles in one command Args: @@ -330,10 +361,10 @@ def data_full_run(check: bool, data_source: str): if data_source == "local": log_info("Downloading census data") - etl_runner("census") + etl_runner("census", use_cache) log_info("Running all ETLs") - etl_runner() + etl_runner(use_cache=use_cache) log_info("Generating score") score_generate() @@ -357,6 +388,103 @@ def data_full_run(check: bool, data_source: str): sys.exit() +@cli.command( + help="Print data sources for all ETL processes (or a specific one)", +) +@click.option( + "-d", + "--dataset", + required=False, + type=str, + help=dataset_cli_help, +) +def print_data_sources(dataset: str): + """Print data sources for all ETL processes (or a specific one) + + Args: + dataset (str): Name of the ETL module to be run (optional) + + Returns: + None + """ + log_title("Print ETL Datasources") + + log_info("Retrieving dataset(s)") + sources = get_data_sources(dataset) + + log_info(f"Discovered {len(sources)} files") + + for s in sources: + log_info(s) + + log_goodbye() + sys.exit() + + +@cli.command( + help="Fetch data sources for all ETL processes (or a specific one)", +) +@click.option( + "-d", + "--dataset", + required=False, + type=str, + help=dataset_cli_help, +) +@click.option( + "-u", + "--use-cache", + is_flag=True, + default=False, + help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.", +) +def extract_data_sources(dataset: str, use_cache: bool): + """Extract and cache data source(s) for all ETL processes (or a specific one) + + Args: + dataset (str): Name of the ETL module whose data sources you wish to fetch + use_cache (bool): Use this flag if you wish to use the cached data sources (if they exist) + + Returns: + None + """ + log_title("Fetch ETL Datasources") + + log_info("Fetching data source(s)") + extract_ds(dataset, use_cache) + + log_goodbye() + sys.exit() + + +@cli.command( + help="Clear data source 
cache for all ETL processes (or a specific one)", +) +@click.option( + "-d", + "--dataset", + required=False, + type=str, + help=dataset_cli_help, +) +def clear_data_source_cache(dataset: str): + """Clear data source(s) cache for all ETL processes (or a specific one) + + Args: + dataset (str): Name of the ETL module whose cache you wish to clear + + Returns: + None + """ + log_title("Fetch ETL Datasources") + + log_info("Clear data source cache") + clear_ds_cache(dataset) + + log_goodbye() + sys.exit() + + def log_title(title: str, subtitle: str = None): """Logs a title in our fancy title format""" logger.info("-" * LOG_LINE_WIDTH) diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index c15f0240..945b6ccb 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -2,7 +2,9 @@ import enum import pathlib import sys import typing +import shutil from typing import Optional +from abc import ABC, abstractmethod import pandas as pd from data_pipeline.config import settings @@ -13,7 +15,7 @@ from data_pipeline.etl.score.schemas.datasets import DatasetsConfig from data_pipeline.utils import get_module_logger from data_pipeline.utils import load_yaml_dict_from_file from data_pipeline.utils import remove_all_from_dir -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource logger = get_module_logger(__name__) @@ -25,7 +27,7 @@ class ValidGeoLevel(enum.Enum): CENSUS_BLOCK_GROUP = enum.auto() -class ExtractTransformLoad: +class ExtractTransformLoad(ABC): """ A class used to instantiate an ETL object to retrieve and process data from datasets. @@ -45,6 +47,7 @@ class ExtractTransformLoad: # Directories DATA_PATH: pathlib.Path = settings.DATA_PATH TMP_PATH: pathlib.Path = DATA_PATH / "tmp" + SOURCES_PATH: pathlib.Path = DATA_PATH / "sources" CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config" DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config" DATASET_CONFIG: Optional[dict] = None @@ -177,45 +180,60 @@ class ExtractTransformLoad: output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv" return output_file_path - def get_tmp_path(self) -> pathlib.Path: - """Returns the temporary path associated with this ETL class.""" - # Note: the temporary path will be defined on `init`, because it uses the class - # of the instance which is often a child class. - tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__) + def get_sources_path(self) -> pathlib.Path: + """Returns the sources path associated with this ETL class. The sources path + is the home for cached data sources used by this ETL.""" + + sources_path = self.SOURCES_PATH / str(self.__class__.__name__) # Create directory if it doesn't exist - tmp_path.mkdir(parents=True, exist_ok=True) + sources_path.mkdir(parents=True, exist_ok=True) - return tmp_path + return sources_path - def extract( - self, - source_url: str = None, - extract_path: pathlib.Path = None, - verify: Optional[bool] = True, - ) -> None: - """Extract the data from a remote source. By default it provides code - to get the file from a source url, unzips it and stores it on an - extract_path.""" + @abstractmethod + def get_data_sources(self) -> [DataSource]: + pass - if source_url is None: - source_url = self.SOURCE_URL + def _fetch(self) -> None: + """Fetch all data sources for this ETL. 
When data sources are fetched, they + are stored in a cache directory for consistency between runs.""" + for ds in self.get_data_sources(): + ds.fetch() - if extract_path is None: - extract_path = self.get_tmp_path() + def clear_data_source_cache(self) -> None: + """Clears the cache for this ETL's data source(s)""" + shutil.rmtree(self.get_sources_path()) - unzip_file_from_url( - file_url=source_url, - download_path=self.get_tmp_path(), - unzipped_file_path=extract_path, - verify=verify, - ) + def extract(self, use_cached_data_sources: bool = False) -> None: + """Extract (download) data from a remote source, and validate + that data. By default, this method fetches data from the set of + data sources returned by get_data_sources. + If use_cached_data_sources is true, this method attempts to use cached data + rather than re-downloading from the original source. The cache algorithm is very + simple: it just looks to see if the directory has any contents. If so, it uses + that content. If not, it downloads all data sources. + + Subclasses should call super() before performing any work if they wish to take + advantage of the automatic downloading and caching ability of this superclass. + """ + + if use_cached_data_sources and any(self.get_sources_path().iterdir()): + logger.info( + f"Using cached data sources for {self.__class__.__name__}" + ) + else: + self.clear_data_source_cache() + self._fetch() + + # the rest of the work should be performed here + + @abstractmethod def transform(self) -> None: """Transform the data extracted into a format that can be consumed by the score generator""" - - raise NotImplementedError + pass def validate(self) -> None: """Validates the output. @@ -380,3 +398,14 @@ class ExtractTransformLoad: def cleanup(self) -> None: """Clears out any files stored in the TMP folder""" remove_all_from_dir(self.get_tmp_path()) + + def get_tmp_path(self) -> pathlib.Path: + """Returns the temporary path associated with this ETL class.""" + # Note: the temporary path will be defined on `init`, because it uses the class + # of the instance which is often a child class. + tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__) + + # Create directory if it doesn't exist + tmp_path.mkdir(parents=True, exist_ok=True) + + return tmp_path diff --git a/data/data-pipeline/data_pipeline/etl/datasource.py b/data/data-pipeline/data_pipeline/etl/datasource.py new file mode 100644 index 00000000..3d299207 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/datasource.py @@ -0,0 +1,124 @@ +"""This module defines a set of classes that can be used to fetch data +from a remote source. They are meant to be used in conjunction with ETLs +or other classes that require downloading data. + +There are three types of data sources defined in this file: +
+FileDataSource – meant to be used when you have a single file to +retrieve from a remote location and save to a destination. + +ZIPDataSource – used when you need to fetch and unzip a file, and save +the contents of that file to a destination. + +CensusDataSource – used to download data from the Census API and store +the contents to a destination. + +DataSource subclasses must implement the fetch method to define how +they will reach out to a remote source, download the data, and save +that data to the destination.
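A minimal usage sketch (the URLs and destinations below are purely illustrative
and are not taken from any real ETL in this repository):

    from pathlib import Path

    from data_pipeline.etl.datasource import FileDataSource, ZIPDataSource

    # A single file fetched to an explicit file path (hypothetical URL/path).
    FileDataSource(
        source="https://example.com/tracts.csv",
        destination=Path("data/sources/ExampleETL/tracts.csv"),
    ).fetch()

    # A zip archive fetched and unpacked into a directory (hypothetical URL/path).
    ZIPDataSource(
        source="https://example.com/shapes.zip",
        destination=Path("data/sources/ExampleETL"),
    ).fetch()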
+""" + +from pathlib import Path +from typing import List +from dataclasses import dataclass +from abc import ABC, abstractmethod + +from data_pipeline.etl.downloader import Downloader +from data_pipeline.etl.sources.census_acs.etl_utils import ( + retrieve_census_acs_data, +) + + +@dataclass +class DataSource(ABC): + """A data source represents any source of data that is fetchable + from a remote location. + + Attributes: + source : str + the location of this data source, as a url + destination : Path + the Path where the data source should be saved locally upon being fetched + + """ + + source: str + destination: Path + + @abstractmethod + def fetch(self) -> None: + pass + + +@dataclass +class FileDataSource(DataSource): + """A data source representing a single file. + + This single file will be fetched from the source and saved to a single + destination. + """ + + def fetch(self) -> None: + """Fetches a single file from a source and saves it to a destination.""" + + self.destination.parent.mkdir(parents=True, exist_ok=True) + Downloader.download_file_from_url( + file_url=self.source, + download_file_name=self.destination, + verify=True, + ) + + def __str__(self): + return f"File – {self.source}" + + +@dataclass +class ZIPDataSource(DataSource): + """A data source representing ZIP files. + + Zip files will be fetched and placed in the destination folder, then unzipped. + """ + + def fetch(self) -> None: + + self.destination.mkdir(parents=True, exist_ok=True) + Downloader.download_zip_file_from_url( + file_url=self.source, + unzipped_file_path=self.destination, + verify=True, + ) + + def __str__(self): + return f"Zip – {self.source}" + + +@dataclass +class CensusDataSource(DataSource): + """A data source representing census data. + + Data will be fetched using the Census API and saved to the destination file. Source is ignored. + """ + + acs_year: int + variables: List[str] + tract_output_field_name: str + data_path_for_fips_codes: Path + acs_type: str + + def fetch(self) -> None: + + df = retrieve_census_acs_data( + acs_year=self.acs_year, + variables=self.variables, + tract_output_field_name=self.tract_output_field_name, + data_path_for_fips_codes=self.data_path_for_fips_codes, + acs_type=self.acs_type, + ) + + self.destination.parent.mkdir(parents=True, exist_ok=True) + + # Write CSV representation of census data + df.to_csv(self.destination, index=False) + + def __str__(self): + return f"Census – {self.acs_type}, {self.acs_year}" diff --git a/data/data-pipeline/data_pipeline/etl/downloader.py b/data/data-pipeline/data_pipeline/etl/downloader.py new file mode 100644 index 00000000..53ea2a38 --- /dev/null +++ b/data/data-pipeline/data_pipeline/etl/downloader.py @@ -0,0 +1,95 @@ +import uuid +import urllib3 +import requests +import zipfile +import shutil + +from pathlib import Path +from data_pipeline.config import settings + + +class Downloader: + """A simple class to encapsulate the download capabilities of the application""" + + @classmethod + def download_file_from_url( + cls, + file_url: str, + download_file_name: Path, + verify: bool = True, + ) -> str: + """Downloads a file from a remote URL location and returns the file location. + + Args: + file_url (str): URL where the zip file is located + download_file_name (pathlib.Path): file path where the file will be downloaded (called downloaded.zip by default) + verify (bool): A flag to check if the certificate is valid. 
If truthy, an invalid certificate will throw an + error (optional, default to False) + + Returns: + None + + """ + # disable https warning + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + download_file_name.parent.mkdir(parents=True, exist_ok=True) + + response = requests.get( + file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT + ) + if response.status_code == 200: + file_contents = response.content + else: + raise Exception( + f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}" + ) + + # Write the contents to disk. + file = open(download_file_name, "wb") + file.write(file_contents) + file.close() + + return download_file_name + + @classmethod + def download_zip_file_from_url( + cls, + file_url: str, + unzipped_file_path: Path, + verify: bool = True, + ) -> None: + """Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after + + Args: + file_url (str): URL where the zip file is located + unzipped_file_path (pathlib.Path): directory and name of the extracted file + verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an + error (optional, default to False) + + Returns: + None + + """ + # dir_id allows us to evade race conditions on parallel ETLs + dir_id = uuid.uuid4() + + zip_download_path = ( + settings.DATA_PATH + / "tmp" + / "downloads" + / f"{dir_id}" + / "download.zip" + ) + + zip_file_path = Downloader.download_file_from_url( + file_url=file_url, + download_file_name=zip_download_path, + verify=verify, + ) + + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(unzipped_file_path) + + # cleanup temporary file and directory + shutil.rmtree(zip_download_path.parent) diff --git a/data/data-pipeline/data_pipeline/etl/runner.py b/data/data-pipeline/data_pipeline/etl/runner.py index 8d896ded..5014771a 100644 --- a/data/data-pipeline/data_pipeline/etl/runner.py +++ b/data/data-pipeline/data_pipeline/etl/runner.py @@ -2,10 +2,14 @@ import concurrent.futures import importlib import typing +from functools import reduce + from data_pipeline.etl.score.etl_score import ScoreETL from data_pipeline.etl.score.etl_score_geo import GeoScoreETL from data_pipeline.etl.score.etl_score_post import PostScoreETL from data_pipeline.utils import get_module_logger +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource from . 
import constants @@ -40,20 +44,26 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]: return dataset_list -def _run_one_dataset(dataset: dict) -> None: - """Runs one etl process.""" - - logger.info(f"Running ETL for {dataset['name']}") - +def _get_dataset(dataset: dict) -> ExtractTransformLoad: + """Instantiates a dataset object from a dictionary description of that object's class""" etl_module = importlib.import_module( f"data_pipeline.etl.sources.{dataset['module_dir']}.etl" ) etl_class = getattr(etl_module, dataset["class_name"]) etl_instance = etl_class() + return etl_instance + + +def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None: + """Runs one etl process.""" + + logger.info(f"Running ETL for {dataset['name']}") + etl_instance = _get_dataset(dataset) + # run extract logger.debug(f"Extracting {dataset['name']}") - etl_instance.extract() + etl_instance.extract(use_cache) # run transform logger.debug(f"Transforming {dataset['name']}") @@ -74,11 +84,12 @@ def _run_one_dataset(dataset: dict) -> None: logger.info(f"Finished ETL for dataset {dataset['name']}") -def etl_runner(dataset_to_run: str = None) -> None: +def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None: """Runs all etl processes or a specific one Args: dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional) + use_cache (bool): Use the cached data sources – if they exist – rather than downloading them all from scratch Returns: None @@ -105,7 +116,9 @@ def etl_runner(dataset_to_run: str = None) -> None: logger.info("Running concurrent ETL jobs") with concurrent.futures.ThreadPoolExecutor() as executor: futures = { - executor.submit(_run_one_dataset, dataset=dataset) + executor.submit( + _run_one_dataset, dataset=dataset, use_cache=use_cache + ) for dataset in concurrent_datasets } @@ -119,7 +132,50 @@ def etl_runner(dataset_to_run: str = None) -> None: if high_memory_datasets: logger.info("Running high-memory ETL jobs") for dataset in high_memory_datasets: - _run_one_dataset(dataset=dataset) + _run_one_dataset(dataset=dataset, use_cache=use_cache) + + +def get_data_sources(dataset_to_run: str = None) -> [DataSource]: + + dataset_list = _get_datasets_to_run(dataset_to_run) + + sources = [] + + for dataset in dataset_list: + etl_instance = _get_dataset(dataset) + sources.append(etl_instance.get_data_sources()) + + sources = reduce( + list.__add__, sources + ) # flatten the list of lists into a single list + + return sources + + +def extract_data_sources( + dataset_to_run: str = None, use_cache: bool = False +) -> None: + + dataset_list = _get_datasets_to_run(dataset_to_run) + + for dataset in dataset_list: + etl_instance = _get_dataset(dataset) + logger.info( + f"Extracting data set for {etl_instance.__class__.__name__}" + ) + etl_instance.extract(use_cache) + + +def clear_data_source_cache(dataset_to_run: str = None) -> None: + + dataset_list = _get_datasets_to_run(dataset_to_run) + + for dataset in dataset_list: + etl_instance = _get_dataset(dataset) + logger.info( + f"Clearing data set cache for {etl_instance.__class__.__name__}" + ) + etl_instance.clear_data_source_cache() def score_generate() -> None: diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py index cf6c4366..0314512b 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py @@ -22,6 +22,8 @@ from 
data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS from data_pipeline.score import field_names from data_pipeline.score.score_runner import ScoreRunner from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource + logger = get_module_logger(__name__) @@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad): self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = [] - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return ( + [] + ) # we have all prerequisite sources locally as a result of running the ETLs + + def extract(self, use_cached_data_sources: bool = False) -> None: + # EJSCreen csv Load ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv" self.ejscreen_df = pd.read_csv( diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py index b7937272..75544e45 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py @@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger from data_pipeline.utils import load_dict_from_yaml_object_fields from data_pipeline.utils import load_yaml_dict_from_file from data_pipeline.utils import zip_files +from data_pipeline.etl.datasource import DataSource logger = get_module_logger(__name__) @@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad): self.geojson_score_usa_high: gpd.GeoDataFrame self.geojson_score_usa_low: gpd.GeoDataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return ( + [] + ) # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline + + def extract(self, use_cached_data_sources: bool = False) -> None: + # check census data check_census_data_source( census_data_path=self.DATA_PATH / "census", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 0111bb04..85ce1ba5 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -2,7 +2,9 @@ import json from pathlib import Path import numpy as np +from numpy import float64 import pandas as pd + from data_pipeline.content.schemas.download_schemas import CodebookConfig from data_pipeline.content.schemas.download_schemas import CSVConfig from data_pipeline.content.schemas.download_schemas import ExcelConfig @@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger from data_pipeline.utils import load_dict_from_yaml_object_fields from data_pipeline.utils import load_yaml_dict_from_file from data_pipeline.utils import zip_files -from numpy import float64 +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.downloader import Downloader from . 
import constants @@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad): self.yaml_global_config_sort_by_label = "sort_by_label" # End YAML definition constants + def get_data_sources(self) -> [DataSource]: + return ( + [] + ) # we have all prerequisite sources locally as a result of generating the score + def _extract_counties(self, county_path: Path) -> pd.DataFrame: logger.debug("Reading Counties CSV") return pd.read_csv( @@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad): return df - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + # check census data check_census_data_source( census_data_path=self.DATA_PATH / "census", census_data_source=self.DATA_SOURCE, ) - super().extract( - constants.CENSUS_COUNTIES_ZIP_URL, - constants.TMP_PATH, + # TODO would could probably add this to the data sources for this file + Downloader.download_zip_file_from_url( + constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH ) + self.input_counties_df = self._extract_counties( constants.CENSUS_COUNTIES_FILE_NAME ) diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py index bc0f45ac..7de96a42 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py @@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url +from data_pipeline.etl.downloader import Downloader from data_pipeline.utils import get_module_logger from . 
import constants @@ -48,7 +48,7 @@ def check_score_data_source( # download from s3 if census_data_source is aws if score_data_source == "aws": logger.debug("Fetching Score Tile data from AWS S3") - download_file_from_url( + Downloader.download_file_from_url( file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV ) else: diff --git a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py index 9e3b2db4..68fc010f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py @@ -1,23 +1,36 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) class CalEnviroScreenETL(ExtractTransformLoad): + """California environmental screen + + TODO: Need good description + """ + def __init__(self): - self.CALENVIROSCREEN_FTP_URL = ( + + # fetch + self.calenviroscreen_ftp_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/CalEnviroScreen_4.0_2021.zip" ) - self.CALENVIROSCREEN_CSV = ( - self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv" - ) - self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4" - # Definining some variable names + # input + self.calenviroscreen_source = ( + self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv" + ) + + # output + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4" + + # Defining some variable names self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score" self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = ( "calenviroscreen_percentile" @@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.calenviroscreen_ftp_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + super().extract( - self.CALENVIROSCREEN_FTP_URL, - self.get_tmp_path(), + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.calenviroscreen_source, dtype={"Census Tract": "string"} ) def transform(self) -> None: # Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically: # https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip # Load comparison index (CalEnviroScreen 4) - self.df = pd.read_csv( - self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"} - ) self.df.rename( columns={ @@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad): def load(self) -> None: # write nationwide csv - self.CSV_PATH.mkdir(parents=True, exist_ok=True) - self.df.to_csv(self.CSV_PATH / "data06.csv", index=False) + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 8c2da2e9..14908281 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -7,8 +7,9 @@ from data_pipeline.etl.base import 
ValidGeoLevel from data_pipeline.etl.score.etl_utils import ( compare_to_list_of_expected_state_fips_codes, ) +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -17,59 +18,74 @@ logger = get_module_logger(__name__) class CDCLifeExpectancy(ExtractTransformLoad): + """#TODO: create description""" + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False NAME = "cdc_life_expectancy" - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV" - else: - USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" - LOAD_YAML_CONFIG: bool = False LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)" INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID" STATES_MISSING_FROM_USA_FILE = ["23", "55"] - # For some reason, LEEP does not include Maine or Wisconsin in its "All of - # USA" file. Load these separately. - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV" - MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV" - else: - WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV" - MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV" - TRACT_INPUT_COLUMN_NAME = "Tract ID" STATE_INPUT_COLUMN_NAME = "STATE2KX" - raw_df: pd.DataFrame - output_df: pd.DataFrame + raw_df: pd.DataFrame # result of extraction + output_df: pd.DataFrame # result of transformation def __init__(self): + + # fetch + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV" + else: + self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" + + # For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately. 
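(A rough sketch of how the three separately cached files can be recombined after
extraction; the cache directory below is an assumption, and the real ETL keys its
dtypes off class constants rather than literals:)

    from pathlib import Path

    import pandas as pd

    sources_dir = Path("data/sources/CDCLifeExpectancy")  # assumed cache location

    # Read the national file plus the separately fetched Maine and Wisconsin files,
    # keeping tract IDs as strings, then stack them into one frame.
    frames = [
        pd.read_csv(sources_dir / name, dtype={"Tract ID": "string"})
        for name in ("US_A.CSV", "ME_A.CSV", "WI_A.CSV")
    ]
    combined = pd.concat(frames, ignore_index=True)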
+ if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV" + self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV" + else: + self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV" + self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV" + + # input + self.usa_source = self.get_sources_path() / "US_A.CSV" + self.maine_source = self.get_sources_path() / "ME_A.CSV" + self.wisconsin_source = self.get_sources_path() / "WI_A.CSV" + + # output self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "cdc_life_expectancy" ) - # Constants for output - self.COLUMNS_TO_KEEP = [ + self.COLUMNS_TO_KEEP = [ # the columns to save on output self.GEOID_TRACT_FIELD_NAME, field_names.LIFE_EXPECTANCY_FIELD, ] - def _download_and_prep_data( - self, file_url: str, download_file_name: pathlib.Path - ) -> pd.DataFrame: - download_file_from_url( - file_url=file_url, - download_file_name=download_file_name, - verify=True, - ) + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.usa_file_url, destination=self.usa_source + ), + FileDataSource( + source=self.maine_file_url, destination=self.maine_source + ), + FileDataSource( + source=self.wisconsin_file_url, + destination=self.wisconsin_source, + ), + ] + + def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame: df = pd.read_csv( - filepath_or_buffer=download_file_name, + filepath_or_buffer=file_name, dtype={ # The following need to remain as strings for all of their digits, not get converted to numbers. 
self.TRACT_INPUT_COLUMN_NAME: "string", @@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad): return df - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: - all_usa_raw_df = self._download_and_prep_data( - file_url=self.USA_FILE_URL, - download_file_name=self.get_tmp_path() / "US_A.CSV", - ) + super().extract( + use_cached_data_sources + ) # download and extract data sources + + all_usa_raw_df = self._read_data(self.usa_source) # Check which states are missing states_in_life_expectancy_usa_file = list( @@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad): additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE, ) - logger.debug("Downloading data for Maine") - maine_raw_df = self._download_and_prep_data( - file_url=self.MAINE_FILE_URL, - download_file_name=self.get_tmp_path() / "maine.csv", + maine_raw_df = self._read_data( + self.maine_source, ) - logger.debug("Downloading data for Wisconsin") - wisconsin_raw_df = self._download_and_prep_data( - file_url=self.WISCONSIN_FILE_URL, - download_file_name=self.get_tmp_path() / "wisconsin.csv", - ) + wisconsin_raw_df = self._read_data(self.wisconsin_source) combined_df = pd.concat( objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df], diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py index d940cec9..87f79396 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_places/etl.py @@ -4,14 +4,17 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource logger = get_module_logger(__name__) class CDCPlacesETL(ExtractTransformLoad): + """#TODO: Need description""" + NAME = "cdc_places" GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False @@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad): CDC_MEASURE_FIELD_NAME = "Measure" def __init__(self): - self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places" + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.CDC_PLACES_URL = ( + self.cdc_places_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv" ) else: - self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD" + self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD" + + # input + self.places_source = self.get_sources_path() / "census_tract.csv" + + # output + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places" self.COLUMNS_TO_KEEP: typing.List[str] = [ self.GEOID_TRACT_FIELD_NAME, @@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - file_path = download_file_from_url( - file_url=self.CDC_PLACES_URL, - download_file_name=self.get_tmp_path() / "census_tract.csv", - ) + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.cdc_places_url, destination=self.places_source + ) + ] + + def 
extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources self.df = pd.read_csv( - filepath_or_buffer=file_path, + filepath_or_buffer=self.places_source, dtype={self.CDC_GEOID_FIELD_NAME: "string"}, low_memory=False, ) def transform(self) -> None: + # Rename GEOID field self.df.rename( columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME}, diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py index 7f725e91..87c29000 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_svi_index/etl.py @@ -1,6 +1,8 @@ import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -11,22 +13,28 @@ logger = get_module_logger(__name__) class CDCSVIIndex(ExtractTransformLoad): """CDC SVI Index class ingests 2018 dataset located here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html + Please see the README in this module for further details. """ def __init__(self): - self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index" + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.CDC_SVI_INDEX_URL = ( + self.cdc_svi_index_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "cdc_svi_index/SVI2018_US.csv" ) else: - self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv" + self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv" + + # input + self.svi_source = self.get_sources_path() / "SVI2018_US.csv" + + # output + self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index" self.CDC_RPL_THEMES_THRESHOLD = 0.90 - self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS" self.COLUMNS_TO_KEEP = [ @@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.cdc_svi_index_url, destination=self.svi_source + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + self.df = pd.read_csv( - filepath_or_buffer=self.CDC_SVI_INDEX_URL, + filepath_or_buffer=self.svi_source, dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"}, low_memory=False, ) @@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad): ) def load(self) -> None: - self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) + self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) self.df[self.COLUMNS_TO_KEEP].to_csv( path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py index 407b83fc..1f4b260a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl.py @@ -8,7 +8,8 @@ import geopandas as gpd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import 
get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -20,7 +21,7 @@ class GeoFileType(Enum): class CensusETL(ExtractTransformLoad): - SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp" + # SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp" GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson" CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv" GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson" @@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad): GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT" def __init__(self): + + self.shape_file_path = self.get_sources_path() / "shp" + # the fips_states_2010.csv is generated from data here # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH) @@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad): file_path: Path if file_type == GeoFileType.SHP: file_path = Path( - self.SHP_BASE_PATH + self.shape_file_path / fips_code / f"tl_2010_{fips_code}_tract10.shp" ) @@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad): file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv") return file_path - def _extract_shp(self, fips_code: str) -> None: - """Download the SHP file for the provided FIPS code + def get_data_sources(self) -> [DataSource]: - Args: - fips_code (str): the FIPS code for the region of interest + sources = [] - Returns: - None - """ - shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP) + for fips_code in self.STATE_FIPS_CODES: - # check if file exists - if not shp_file_path.is_file(): tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip" - unzip_file_from_url( - tract_state_url, - self.TMP_PATH, - self.DATA_PATH / "census" / "shp" / fips_code, + destination_path = self.shape_file_path / fips_code + + sources.append( + ZIPDataSource( + source=tract_state_url, destination=destination_path + ) ) - def extract(self) -> None: - logger.debug("Extracting census data") - for index, fips_code in enumerate(self.STATE_FIPS_CODES): - logger.debug( - f"Extracting shape for FIPS {fips_code} – {index+1} of {len(self.STATE_FIPS_CODES)}" - ) - self._extract_shp(fips_code) + return sources def _transform_to_geojson(self, fips_code: str) -> None: """Convert the downloaded SHP file for the associated FIPS to geojson diff --git a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py index 67a9b32e..9806aa97 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census/etl_utils.py @@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list: else: fips = row[0].strip() fips_state_list.append(fips) + return fips_state_list diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py index c2965493..d67a2bc3 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs/etl.py @@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.sources.census_acs.etl_imputations import ( calculate_income_measures, ) -from 
data_pipeline.etl.sources.census_acs.etl_utils import ( - retrieve_census_acs_data, -) from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import CensusDataSource logger = get_module_logger(__name__) @@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad): MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1 def __init__(self): + + self.census_acs_source = self.get_sources_path() / "acs.csv" + self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E" self.TOTAL_IN_LABOR_FORCE = "B23025_003E" self.EMPLOYMENT_FIELDS = [ @@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad): self.df: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + # Define the variables to retrieve + variables = ( + [ + self.MEDIAN_INCOME_FIELD, + self.MEDIAN_HOUSE_VALUE_FIELD, + ] + + self.EMPLOYMENT_FIELDS + + self.LINGUISTIC_ISOLATION_FIELDS + + self.POVERTY_FIELDS + + self.EDUCATIONAL_FIELDS + + self.RE_FIELDS + + self.COLLEGE_ATTENDANCE_FIELDS + + self.AGE_INPUT_FIELDS + ) + + return [ + CensusDataSource( + source=None, + destination=self.census_acs_source, + acs_year=self.ACS_YEAR, + variables=variables, + tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, + data_path_for_fips_codes=self.DATA_PATH, + acs_type="acs5", + ) + ] + # pylint: disable=too-many-arguments def _merge_geojson( self, @@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad): ) ) - def extract(self) -> None: - # Define the variables to retrieve - variables = ( - [ - self.MEDIAN_INCOME_FIELD, - self.MEDIAN_HOUSE_VALUE_FIELD, - ] - + self.EMPLOYMENT_FIELDS - + self.LINGUISTIC_ISOLATION_FIELDS - + self.POVERTY_FIELDS - + self.EDUCATIONAL_FIELDS - + self.RE_FIELDS - + self.COLLEGE_ATTENDANCE_FIELDS - + self.AGE_INPUT_FIELDS - ) + def extract(self, use_cached_data_sources: bool = False) -> None: - self.df = retrieve_census_acs_data( - acs_year=self.ACS_YEAR, - variables=variables, - tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, - data_path_for_fips_codes=self.DATA_PATH, + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.census_acs_source, + dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, ) def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py index a6dc5869..50cfef76 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_2010/etl.py @@ -1,10 +1,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.etl.sources.census_acs.etl_utils import ( - retrieve_census_acs_data, -) from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import CensusDataSource logger = get_module_logger(__name__) @@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad): """ def __init__(self): + + self.census_acs_source = self.get_sources_path() / "acs_2010.csv" + self.ACS_YEAR = 2010 self.ACS_TYPE = "acs5" self.OUTPUT_PATH = ( @@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: # Define the variables to retrieve variables = ( 
self.UNEMPLOYED_FIELDS @@ -107,13 +109,26 @@ class CensusACS2010ETL(ExtractTransformLoad): + self.POVERTY_FIELDS ) - # Use the method defined on CensusACSETL to reduce coding redundancy. - self.df = retrieve_census_acs_data( - acs_year=self.ACS_YEAR, - variables=variables, - tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, - data_path_for_fips_codes=self.DATA_PATH, - acs_type=self.ACS_TYPE, + return [ + CensusDataSource( + source=None, + destination=self.census_acs_source, + acs_year=self.ACS_YEAR, + variables=variables, + tract_output_field_name=self.GEOID_TRACT_FIELD_NAME, + data_path_for_fips_codes=self.DATA_PATH, + acs_type=self.ACS_TYPE, + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.census_acs_source, dtype={"GEOID10_TRACT": "string"} ) def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py index f8abc7c4..2a1bf962 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_acs_median_income/etl.py @@ -1,14 +1,16 @@ +import os import json from pathlib import Path import numpy as np import pandas as pd -import requests + from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource +from data_pipeline.etl.datasource import FileDataSource logger = get_module_logger(__name__) @@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): / f"census_acs_median_income_{self.ACS_YEAR}" ) + self.GEOCORR_ALL_STATES_URL = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/geocorr2014_all_states_tracts_only.csv.zip" + ) + self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr" + self.GEOCORR_ALL_STATES_SOURCE = ( + self.GEOCORR_ALL_STATES_PATH + / "geocorr2014_all_states_tracts_only.csv" + ) + # Set constants for Geocorr MSAs data. 
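(Before the MSA constants that follow, a quick sketch of the zip-source pattern
configured just above; the cache directory is an assumption, while the URL suffix
and CSV name come from this diff:)

    from pathlib import Path

    from data_pipeline.config import settings
    from data_pipeline.etl.datasource import ZIPDataSource

    geocorr_dir = Path("data/sources/CensusACSMedianIncomeETL/geocorr")  # assumed

    # Fetch and unzip the geocorr archive into the cache directory.
    ZIPDataSource(
        source=settings.AWS_JUSTICE40_DATASOURCES_URL
        + "/geocorr2014_all_states_tracts_only.csv.zip",
        destination=geocorr_dir,
    ).fetch()

    # The unzipped CSV is then read from inside that directory.
    geocorr_csv = geocorr_dir / "geocorr2014_all_states_tracts_only.csv"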
self.PLACE_FIELD_NAME: str = "Census Place Name" self.COUNTY_FIELD_NAME: str = "County Name" @@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E" + "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area" ) + self.MSA_MEDIAN_INCOME_SOURCE = ( + self.get_sources_path() / "msa" / "msa_median_income.json" + ) self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)" # Set constants for state median incomes self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state" + self.STATE_MEDIAN_INCOME_SOURCE = ( + self.get_sources_path() / "state" / "state_median_income.json" + ) self.STATE_GEOID_FIELD_NAME: str = "GEOID2" self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)" @@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.PUERTO_RICO_S3_LINK: str = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv" ) + self.PUERTO_RICO_ALL_STATES_SOURCE = ( + self.get_sources_path() / "pr_tracts" / "pr_tracts.csv" + ) + + census_api_key = os.environ.get("CENSUS_API_KEY") + if census_api_key: + self.MSA_MEDIAN_INCOME_URL = ( + self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}" + ) + self.STATE_MEDIAN_INCOME_URL = ( + self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}" + ) # Constants for output self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference" @@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): self.state_median_incomes: dict self.pr_tracts: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + + return [ + ZIPDataSource( + source=self.GEOCORR_ALL_STATES_URL, + destination=self.GEOCORR_ALL_STATES_PATH, + ), + FileDataSource( + source=self.PUERTO_RICO_S3_LINK, + destination=self.PUERTO_RICO_ALL_STATES_SOURCE, + ), + FileDataSource( + source=self.MSA_MEDIAN_INCOME_URL, + destination=self.MSA_MEDIAN_INCOME_SOURCE, + ), + FileDataSource( + source=self.STATE_MEDIAN_INCOME_URL, + destination=self.STATE_MEDIAN_INCOME_SOURCE, + ), + ] + def _transform_geocorr(self) -> pd.DataFrame: # Transform the geocorr data geocorr_df = self.raw_geocorr_df @@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): ) return state_median_incomes_df - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + # Load and clean GEOCORR data # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census. # The specific query used is the following, which takes a couple of minutes to run: @@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): # - Core based statistical area (CBSA) # - CBSA Type (Metro or Micro) logger.debug("Starting download of 1.5MB Geocorr information.") - - unzip_file_from_url( - file_url=settings.AWS_JUSTICE40_DATASOURCES_URL - + "/geocorr2014_all_states_tracts_only.csv.zip", - download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() / "geocorr", - ) + super().extract( + use_cached_data_sources + ) # download and extract data sources self.raw_geocorr_df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "geocorr" - / "geocorr2014_all_states_tracts_only.csv", + filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE, # Skip second row, which has descriptions. 
skiprows=[1], # The following need to remain as strings for all of their digits, not get converted to numbers. @@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad): low_memory=False, ) - logger.debug("Pulling PR tract list down.") - # This step is necessary because PR is not in geocorr at the level that gets joined - pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv" - download_file_from_url( - file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file - ) self.pr_tracts = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "pr_tracts" - / "pr_tracts.csv", + filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE, # The following need to remain as strings for all of their digits, not get converted to numbers. dtype={"GEOID10_TRACT": str}, low_memory=False, ) self.pr_tracts["State Abbreviation"] = "PR" - # Download MSA median incomes - logger.debug("Starting download of MSA median incomes.") - download = requests.get( - self.MSA_MEDIAN_INCOME_URL, - verify=None, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, - ) - self.msa_median_incomes = json.loads(download.content) + with self.MSA_MEDIAN_INCOME_SOURCE.open() as source: + self.msa_median_incomes = json.load(source) - # Download state median incomes - logger.debug("Starting download of state median incomes.") - download_state = requests.get( - self.STATE_MEDIAN_INCOME_URL, - verify=None, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, - ) - self.state_median_incomes = json.loads(download_state.content) + with self.STATE_MEDIAN_INCOME_SOURCE.open() as source: + self.state_median_incomes = json.load(source) ## NOTE we already have PR's MI here def transform(self) -> None: diff --git a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py index 395697fc..4fe26249 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/census_decennial/etl.py @@ -1,13 +1,14 @@ import json from typing import List +import os import numpy as np import pandas as pd -import requests -from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource pd.options.mode.chained_assignment = "raise" @@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad): + "&for=tract:*&in=state:{}%20county:{}" ) + census_api_key = os.environ.get("CENSUS_API_KEY") + if census_api_key: + self.API_URL = self.API_URL + f"&key={census_api_key}" + self.final_race_fields: List[str] = [] self.df: pd.DataFrame self.df_vi: pd.DataFrame self.df_all: pd.DataFrame - def extract(self) -> None: - dfs = [] - dfs_vi = [] + def get_data_sources(self) -> [DataSource]: + + sources = [] + for island in self.ISLAND_TERRITORIES: - logger.debug( - f"Downloading data for state/territory {island['state_abbreviation']}" - ) for county in island["county_fips"]: + api_url = self.API_URL.format( self.DECENNIAL_YEAR, island["state_abbreviation"], @@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad): island["fips"], county, ) - logger.debug(f"CENSUS: Requesting {api_url}") - download = requests.get( - api_url, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, + + sources.append( + FileDataSource( + source=api_url, + destination=self.get_sources_path() + / 
str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json",
+                    )
                 )
+
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        dfs = []
+        dfs_vi = []
+        for island in self.ISLAND_TERRITORIES:
+            logger.debug(
+                f"Loading data for state/territory {island['state_abbreviation']}"
+            )
+            for county in island["county_fips"]:
+
                 try:
-                    df = json.loads(download.content)
+                    filepath = (
+                        self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json"
+                    )
+                    df = json.load(filepath.open())
                 except ValueError as e:
                     logger.error(
-                        f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
+                        f"Could not load content from {filepath} in census decennial ETL because {e}."
                     )
 
                 # First row is the header
diff --git a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
index 5f9a10b8..c9b95ecb 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
@@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 
 logger = get_module_logger(__name__)
 
@@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False
 
     def __init__(self):
+
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.SOURCE_URL = (
+            self.child_opportunity_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "child_opportunity_index/raw.zip"
             )
         else:
-            self.SOURCE_URL = (
+            self.child_opportunity_url = (
                 "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
                 "3a0ededa30a0?format=csv"
             )
 
+        # input
+        self.child_opportunity_index_source = (
+            self.get_sources_path() / "raw.csv"
+        )
+
+        # output
+
         # TODO: Decide about nixing this
         self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
 
@@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
         self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
         self.READING_INPUT_FIELD = "ED_READING"
 
+        self.raw_df: pd.DataFrame
         self.output_df: pd.DataFrame
 
-    def extract(self) -> None:
-        super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.child_opportunity_url,
+                destination=self.get_sources_path(),
+            )
+        ]
 
-    def transform(self) -> None:
-        raw_df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path() / "raw.csv",
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.raw_df = pd.read_csv(
+            filepath_or_buffer=self.child_opportunity_index_source,
             # The following need to remain as strings for all of their digits, not get
             # converted to numbers.
dtype={ @@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad): low_memory=False, ) - output_df = raw_df.rename( + def transform(self) -> None: + + output_df = self.raw_df.rename( columns={ self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD, diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py index 0056be9a..39c1ba6c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py @@ -5,22 +5,35 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class DOEEnergyBurden(ExtractTransformLoad): + NAME = "doe_energy_burden" - SOURCE_URL: str = ( - settings.AWS_JUSTICE40_DATASOURCES_URL - + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" - ) + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT LOAD_YAML_CONFIG: bool = True REVISED_ENERGY_BURDEN_FIELD_NAME: str def __init__(self): + + # fetch + self.doe_energy_burden_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip" + ) + + # input + self.doe_energy_burden_source = ( + self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv" + ) + + # output self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "doe_energy_burden" ) @@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad): self.raw_df: pd.DataFrame self.output_df: pd.DataFrame - def transform(self) -> None: - raw_df: pd.DataFrame = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "DOE_LEAD_AMI_TRACT_2018_ALL.csv", + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.doe_energy_burden_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.raw_df = pd.read_csv( + filepath_or_buffer=self.doe_energy_burden_source, # The following need to remain as strings for all of their digits, not get converted to numbers. 
dtype={ self.INPUT_GEOID_TRACT_FIELD_NAME: "string", @@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad): low_memory=False, ) + def transform(self) -> None: + logger.debug("Renaming columns and ensuring output format is correct") - output_df = raw_df.rename( + output_df = self.raw_df.rename( columns={ self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME, self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py index 3329ec6a..794ee97e 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/dot_travel_composite/etl.py @@ -3,6 +3,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad): NAME = "travel_composite" - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - SOURCE_URL = ( - f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" - "dot_travel_composite/Shapefile_and_Metadata.zip" - ) - else: - SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip" - GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad): TRAVEL_BURDEN_FIELD_NAME: str def __init__(self): + + # fetch + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.travel_composite_url = ( + f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" + "dot_travel_composite/Shapefile_and_Metadata.zip" + ) + else: + self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip" + + # input # define the full path for the input CSV file - self.INPUT_SHP = ( - self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp" + self.disadvantage_layer_shape_source = ( + self.get_sources_path() + / "DOT_Disadvantage_Layer_Final_April2022.shp" ) + # output # this is the main dataframe self.df: pd.DataFrame + self.df_dot: pd.DataFrame + # Start dataset-specific vars here ## Average of Transportation Indicator Percentiles (calculated) ## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS @@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad): self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH" self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS" + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.travel_composite_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source) + def transform(self) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: @@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad): - Converts to CSV """ - # read in the unzipped shapefile from data source # reformat it to be standard df, remove unassigned rows, and # 
then rename the Census Tract column for merging - df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP) - df_dot = df_dot.rename( + + self.df_dot = self.df_dot.rename( columns={ self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME, } ).dropna(subset=[self.GEOID_TRACT_FIELD_NAME]) + # Assign the final df to the class' output_df for the load method - self.output_df = df_dot + self.output_df = self.df_dot diff --git a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py index 3162c637..b5bb163f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/eamlis/etl.py @@ -1,12 +1,15 @@ from pathlib import Path -import geopandas as gpd import pandas as pd +import geopandas as gpd + from data_pipeline.config import settings -from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel -from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource +from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries logger = get_module_logger(__name__) @@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad): "55", ] - # Define these for easy code completion def __init__(self): - self.SOURCE_URL = ( + + # fetch + self.eamlis_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/eAMLIS export of all data.tsv.zip" ) + # input + self.eamlis_source = ( + self.get_sources_path() / "eAMLIS export of all data.tsv" + ) + + # output self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME self.OUTPUT_PATH: Path = ( @@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad): ] self.output_df: pd.DataFrame + self.df: pd.DataFrame - def transform(self) -> None: - df = pd.read_csv( - self.get_tmp_path() / "eAMLIS export of all data.tsv", + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.eamlis_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.eamlis_source, sep="\t", low_memory=False, ) + + def transform(self) -> None: + gdf = gpd.GeoDataFrame( - df, + self.df, geometry=gpd.points_from_xy( - x=df["Longitude"], - y=df["Latitude"], + x=self.df["Longitude"], + y=self.df["Latitude"], ), crs="epsg:4326", ) @@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad): gdf_tracts = add_tracts_for_geometries(gdf) gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME) gdf_tracts[self.AML_BOOLEAN] = True + self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP] diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py index 0db8e648..44962156 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen/etl.py @@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger +from 
data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad): INPUT_GEOID_TRACT_FIELD_NAME: str = "ID" def __init__(self): - self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip" - self.EJSCREEN_CSV = ( - self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv" + + # fetch + self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip" + + # input + self.ejscreen_source = ( + self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv" ) + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen" + self.df: pd.DataFrame self.COLUMNS_TO_KEEP = [ @@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad): field_names.UST_FIELD, ] - def extract(self) -> None: - super().extract( - self.EJSCREEN_FTP_URL, - self.get_tmp_path(), - verify=False, # EPA EJScreen end point has certificate issues often - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.ejscreen_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources - def transform(self) -> None: self.df = pd.read_csv( - self.EJSCREEN_CSV, + self.ejscreen_source, dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, # EJSCREEN writes the word "None" for NA data. na_values=["None"], low_memory=False, ) + def transform(self) -> None: + # rename ID to Tract ID self.output_df = self.df.rename( columns={ diff --git a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py index 8c18034d..f8e09cb1 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/ejscreen_areas_of_concern/etl.py @@ -1,5 +1,6 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) @@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): # Note: while we normally set these properties in `__init__`, # we are setting them as class properties here so they can be accessed by the # class method `ejscreen_areas_of_concern_data_exists`. - LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local" - EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = ( - LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv" + + EJSCREEN_AREAS_OF_CONCERN_SOURCE = ( + ExtractTransformLoad.DATA_PATH + / "sources" + / "EJSCREENAreasOfConcernETL" + / "ejscreen_areas_of_concerns_indicators.csv" ) def __init__(self): + + # output self.OUTPUT_PATH = ( self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern" ) @@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): # TO DO: Load from actual source; the issue is that this dataset is not public for now self.df: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + """The source for this must be downloaded and saved manually. It is not publicly available""" + return [] + @classmethod def ejscreen_areas_of_concern_data_exists(cls): """Check whether or not the EJSCREEN areas of concern data exists. @@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad): not reference this data. 
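+
+        The file is expected at EJSCREEN_AREAS_OF_CONCERN_SOURCE
+        (DATA_PATH / "sources" / "EJSCREENAreasOfConcernETL"); it must be
+        placed there manually because get_data_sources() does not download it.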
""" - return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file() + return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file() - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE) if self.ejscreen_areas_of_concern_data_exists(): logger.debug("Loading EJSCREEN Areas of Concern Data Locally") self.df = pd.read_csv( - filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA, + filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE, dtype={ self.GEOID_FIELD_NAME: "string", }, diff --git a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py index 3f27898e..136eaa54 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/energy_definition_alternative_draft/etl.py @@ -5,18 +5,27 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): def __init__(self): - self.DEFINITION_ALTERNATIVE_FILE_URL = ( + + # fetch + self.definition_alternative_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/alternative DAC definition.csv.zip" ) + # input + self.definition_alternative_source = ( + self.get_sources_path() / "J40 alternative DAC definition.csv" + ) + + # output self.OUTPUT_PATH: Path = ( self.DATA_PATH / "dataset" / "energy_definition_alternative_draft" ) @@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - unzip_file_from_url( - file_url=self.DEFINITION_ALTERNATIVE_FILE_URL, - download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() - / "energy_definition_alternative_draft", - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.definition_alternative_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources self.df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "energy_definition_alternative_draft" - / "J40 alternative DAC definition.csv", + filepath_or_buffer=self.definition_alternative_source, # The following need to remain as strings for all of their digits, not get converted to numbers. 
dtype={ self.TRACT_INPUT_COLUMN_NAME: "string", @@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): ) def transform(self) -> None: + self.df = self.df.rename( columns={ self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py index 56f8bcc4..199ed9ff 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/epa_rsei/etl.py @@ -4,8 +4,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url from data_pipeline.config import settings +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): def __init__(self): + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.AGGREGATED_RSEI_SCORE_FILE_URL = ( + self.aggregated_rsei_score_file_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip" ) else: - self.AGGREGATED_RSEI_SCORE_FILE_URL = ( + self.aggregated_rsei_score_file_url = ( "http://abt-rsei.s3.amazonaws.com/microdata2019/" "census_agg/CensusMicroTracts2019_2019_aggregated.zip" ) + # input + self.aggregated_rsei_score_source = ( + self.get_sources_path() + / "CensusMicroTracts2019_2019_aggregated.csv" + ) + + # output self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei" self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75 self.TRACT_INPUT_COLUMN_NAME = "GEOID10" @@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.aggregated_rsei_score_file_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + # the column headers from the above dataset are actually a census tract's data at this point # We will use this data structure later to specify the column names input_columns = [ @@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad): self.NCSCORE_INPUT_FIELD, ] - unzip_file_from_url( - file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL, - download_path=self.get_tmp_path(), - unzipped_file_path=self.get_tmp_path() / "epa_rsei", - ) - self.df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() - / "epa_rsei" - / "CensusMicroTracts2019_2019_aggregated.csv", + filepath_or_buffer=self.aggregated_rsei_score_source, # The following need to remain as strings for all of their digits, not get # converted to numbers. 
low_memory=False, diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py index 60534daa..55001436 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_flood_risk/etl.py @@ -5,6 +5,8 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) @@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad): NAME = "fsf_flood_risk" # These data were emailed to the J40 team while first street got # their official data sharing channels setup. - SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip" + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT LOAD_YAML_CONFIG: bool = True @@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad): SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = ( - self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv" + + # fetch + self.flood_tract_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip" ) - # this is the main dataframe - self.df: pd.DataFrame + # input + self.flood_tract_source = ( + self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv" + ) # Start dataset-specific vars here self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" @@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad): self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30" self.CLIP_PROPERTIES_COUNT = 250 + self.df_fsf_flood: pd.DataFrame + + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.flood_tract_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + # read in the unzipped csv data source then rename the + # Census Tract column for merging + self.df_fsf_flood = pd.read_csv( + self.flood_tract_source, + dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, + low_memory=False, + ) + def transform(self) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: @@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad): - Calculates share of properties at risk, left-clipping number of properties at 250 """ - # read in the unzipped csv data source then rename the - # Census Tract column for merging - df_fsf_flood: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, - dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, - low_memory=False, - ) - - df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[ + self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[ self.INPUT_GEOID_TRACT_FIELD_NAME ].str.zfill(11) - df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[ + self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[ self.COUNT_PROPERTIES_NATIVE_FIELD_NAME ].clip(lower=self.CLIP_PROPERTIES_COUNT) - df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = ( - df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY] - / df_fsf_flood[self.COUNT_PROPERTIES] + self.df_fsf_flood[ + 
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY + ] = ( + self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY] + / self.df_fsf_flood[self.COUNT_PROPERTIES] ) - df_fsf_flood[ + self.df_fsf_flood[ self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS ] = ( - df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] - / df_fsf_flood[self.COUNT_PROPERTIES] + self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] + / self.df_fsf_flood[self.COUNT_PROPERTIES] ) # Assign the final df to the class' output_df for the load method with rename - self.output_df = df_fsf_flood.rename( + self.output_df = self.df_fsf_flood.rename( columns={ self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS, diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py index 2680eaf3..ebb88b73 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py @@ -4,6 +4,8 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) @@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad): NAME = "fsf_wildfire_risk" # These data were emailed to the J40 team while first street got # their official data sharing channels setup. - SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip" + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad): SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv" + # fetch + self.fsf_fire_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip" + ) + + # input + self.fsf_fire_source = ( + self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv" + ) + + # output # this is the main dataframe self.df: pd.DataFrame + self.df_fsf_fire: pd.DataFrame + # Start dataset-specific vars here self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag" self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag" self.CLIP_PROPERTIES_COUNT = 250 + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.fsf_fire_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df_fsf_fire = pd.read_csv( + self.fsf_fire_source, + dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, + low_memory=False, + ) + def transform(self) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: @@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad): """ # read in the unzipped csv data source then rename the # Census Tract column for merging - df_fsf_fire: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, - 
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, - low_memory=False, - ) - df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[ + self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[ self.INPUT_GEOID_TRACT_FIELD_NAME ].str.zfill(11) - df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[ + self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[ self.COUNT_PROPERTIES_NATIVE_FIELD_NAME ].clip(lower=self.CLIP_PROPERTIES_COUNT) - df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = ( - df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY] - / df_fsf_fire[self.COUNT_PROPERTIES] + self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = ( + self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY] + / self.df_fsf_fire[self.COUNT_PROPERTIES] ) - df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = ( - df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] - / df_fsf_fire[self.COUNT_PROPERTIES] + self.df_fsf_fire[ + self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS + ] = ( + self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] + / self.df_fsf_fire[self.COUNT_PROPERTIES] ) # Assign the final df to the class' output_df for the load method with rename - self.output_df = df_fsf_fire.rename( + self.output_df = self.df_fsf_fire.rename( columns={ self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS, diff --git a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py index 223f0b09..16b719c2 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/geocorr/etl.py @@ -3,17 +3,33 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class GeoCorrETL(ExtractTransformLoad): + NAME = "geocorr" + GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False def __init__(self): + + # fetch + self.geocorr_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/geocorr_urban_rural.csv.zip" + ) + + # input + self.geocorr_source = ( + self.get_sources_path() / "geocorr_urban_rural.csv" + ) + + # output self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr" # Need to change hyperlink to S3 @@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad): # The source data for this notebook was downloaded from GeoCorr; # the instructions for generating the source data is here: # https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787 - self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" + # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT" self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" self.COLUMNS_TO_KEEP = [ @@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - unzip_file_from_url( - file_url=settings.AWS_JUSTICE40_DATASOURCES_URL - + "/geocorr_urban_rural.csv.zip", - download_path=self.get_tmp_path(), - 
unzipped_file_path=self.get_tmp_path(), - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.geocorr_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources self.df = pd.read_csv( - filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv", + filepath_or_buffer=self.geocorr_source, dtype={ self.GEOCORR_GEOID_FIELD_NAME: "string", }, diff --git a/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py b/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py index a65ed126..fb56b3cc 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/historic_redlining/etl.py @@ -3,12 +3,16 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class HistoricRedliningETL(ExtractTransformLoad): + NAME = "historic_redlining" + GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT EXPECTED_MISSING_STATES = [ "10", @@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad): ] PUERTO_RICO_EXPECTED_IN_DATA = False ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False - SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip" def __init__(self): - self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining" - self.HISTORIC_REDLINING_FILE_PATH = ( - self.get_tmp_path() / "HRS_2010.xlsx" - ) + # fetch + self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip" + + # input + self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx" self.REDLINING_SCALAR = "Tract-level redlining score" @@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad): self.GEOID_TRACT_FIELD_NAME, self.REDLINING_SCALAR, ] + self.df: pd.DataFrame + self.historic_redlining_data: pd.DataFrame + + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.hrs_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.historic_redlining_data = pd.read_excel(self.hrs_source) def transform(self) -> None: # this is obviously temporary - historic_redlining_data = pd.read_excel( - self.HISTORIC_REDLINING_FILE_PATH + + self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = ( + self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11) ) - historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = ( - historic_redlining_data["GEOID10"].astype(str).str.zfill(11) - ) - historic_redlining_data = historic_redlining_data.rename( + self.historic_redlining_data = self.historic_redlining_data.rename( columns={"HRS2010": self.REDLINING_SCALAR} ) - logger.debug(f"{historic_redlining_data.columns}") + logger.debug(f"{self.historic_redlining_data.columns}") # Calculate lots of different score thresholds for convenience for threshold in [3.25, 3.5, 3.75]: - historic_redlining_data[ + self.historic_redlining_data[ f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" - ] = (historic_redlining_data[self.REDLINING_SCALAR] >= 
threshold) + ] = ( + self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold + ) ## NOTE We add to columns to keep here self.COLUMNS_TO_KEEP.append( f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" ) - self.output_df = historic_redlining_data + self.output_df = self.historic_redlining_data diff --git a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py index b5e5a875..74e6623b 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/housing_and_transportation/etl.py @@ -1,8 +1,9 @@ import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url from pandas.errors import EmptyDataError logger = get_module_logger(__name__) @@ -10,36 +11,46 @@ logger = get_module_logger(__name__) class HousingTransportationETL(ExtractTransformLoad): def __init__(self): - self.HOUSING_FTP_URL = ( - "https://htaindex.cnt.org/download/download.php?focus=tract&geoid=" - ) + self.OUTPUT_PATH = ( self.DATA_PATH / "dataset" / "housing_and_transportation_index" ) self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + + housing_url = ( + "https://htaindex.cnt.org/download/download.php?focus=tract&geoid=" + ) + + sources = [] + + for fips in get_state_fips_codes(self.DATA_PATH): + sources.append( + ZIPDataSource( + source=f"{housing_url}{fips}", + destination=self.get_sources_path(), + ) + ) + + return sources + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + # Download each state / territory individually dfs = [] - zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index" for fips in get_state_fips_codes(self.DATA_PATH): - logger.debug( - f"Downloading housing data for state/territory with FIPS code {fips}" - ) - unzip_file_from_url( - f"{self.HOUSING_FTP_URL}{fips}", - self.get_tmp_path(), - zip_file_dir, - ) - - # New file name: - tmp_csv_file_path = ( - zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv" + csv_source = ( + self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv" ) try: - tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path) + tmp_df = pd.read_csv(filepath_or_buffer=csv_source) except EmptyDataError: logger.error( f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}" diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py index 4cf0ee7d..9ca02cf8 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_housing/etl.py @@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger from data_pipeline.config import settings +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource logger = get_module_logger(__name__) class HudHousingETL(ExtractTransformLoad): 
+ NAME = "hud_housing" GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT def __init__(self): - self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT" + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.HOUSING_FTP_URL = ( + self.housing_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "hud_housing/2014thru2018-140-csv.zip" ) else: - self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip" + self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip" + + # source + + # output + + self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT" self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path() @@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - super().extract( - self.HOUSING_FTP_URL, - self.HOUSING_ZIP_FILE_DIR, - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.housing_url, destination=self.get_sources_path() + ) + ] def _read_chas_table(self, file_name): - # New file name: - tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name + + tmp_csv_file_path = self.get_sources_path() / "140" / file_name tmp_df = pd.read_csv( filepath_or_buffer=tmp_csv_file_path, encoding="latin-1", @@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad): return tmp_df - def transform(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + table_8 = self._read_chas_table("Table8.csv") table_3 = self._read_chas_table("Table3.csv") @@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad): table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME ) + def transform(self) -> None: + # Calculate share that lacks indoor plumbing or kitchen # This is computed as # ( diff --git a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py index ddf476b6..f6c61bfa 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/hud_recap/etl.py @@ -1,7 +1,9 @@ import pandas as pd -import requests + from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.utils import get_module_logger @@ -11,44 +13,51 @@ logger = get_module_logger(__name__) class HudRecapETL(ExtractTransformLoad): def __init__(self): + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.HUD_RECAP_CSV_URL = ( + self.hud_recap_csv_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv" ) else: - self.HUD_RECAP_CSV_URL = ( + self.hud_recap_csv_url = ( "https://opendata.arcgis.com/api/v3/datasets/" "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326" ) - self.HUD_RECAP_CSV = ( - self.get_tmp_path() + # input + self.hud_recap_source = ( + self.get_sources_path() / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv" ) + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap" - # Definining some variable names + # Defining some variable names self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = ( "hud_recap_priority_community" ) self.df: pd.DataFrame - def extract(self) -> None: - download = requests.get( - 
self.HUD_RECAP_CSV_URL, - verify=None, - timeout=settings.REQUESTS_DEFAULT_TIMOUT, - ) - file_contents = download.content - csv_file = open(self.HUD_RECAP_CSV, "wb") - csv_file.write(file_contents) - csv_file.close() + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.hud_recap_csv_url, destination=self.hud_recap_source + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + # Load comparison index (CalEnviroScreen 4) + self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"}) def transform(self) -> None: - # Load comparison index (CalEnviroScreen 4) - self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"}) self.df.rename( columns={ diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py index 7b4879f3..68e01824 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_for_ej/etl.py @@ -2,6 +2,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger @@ -10,16 +12,25 @@ logger = get_module_logger(__name__) class MappingForEJETL(ExtractTransformLoad): def __init__(self): - self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej" - self.MAPPING_FOR_EJ_VA_URL = ( + # fetch + self.mapping_for_ej_va_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip" ) - self.MAPPING_FOR_EJ_CO_URL = ( + self.mapping_for_ej_co_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip" ) - self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp" - self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp" + + # input + self.va_shp_file_source = ( + self.get_sources_path() / "mej_virginia_7_1.shp" + ) + self.co_shp_file_source = ( + self.get_sources_path() / "mej_colorado_final.shp" + ) + + # output + self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej" # Defining variables self.COLUMNS_TO_KEEP = [ @@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: - super().extract( - self.MAPPING_FOR_EJ_VA_URL, - self.get_tmp_path(), - ) - super().extract( - self.MAPPING_FOR_EJ_CO_URL, - self.get_tmp_path(), - ) + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.mapping_for_ej_va_url, + destination=self.get_sources_path(), + ), + ZIPDataSource( + source=self.mapping_for_ej_co_url, + destination=self.get_sources_path(), + ), + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources - def transform(self) -> None: # Join (here, it's just concatenating) the two dataframes from # CO and VA self.df = pd.concat( [ - gpd.read_file(self.VA_SHP_FILE_PATH), - gpd.read_file(self.CO_SHP_FILE_PATH), + gpd.read_file(self.va_shp_file_source), + gpd.read_file(self.co_shp_file_source), ] ) + def transform(self) -> None: + # Fill Census tract to get it to be 11 digits, incl. 
leading 0s # Note that VA and CO should never have leading 0s, so this isn't # strictly necessary, but if in the future, there are more states diff --git a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py index 05ff0593..e983efb6 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/mapping_inequality/etl.py @@ -3,8 +3,9 @@ import pathlib import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad): Information on the mapping of this data to census tracts is available at https://github.com/americanpanorama/Census_HOLC_Research. - """ def __init__(self): + + # fetch if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.MAPPING_INEQUALITY_CSV_URL = ( + self.mapping_inequality_csv_url = ( f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" "mapping_inequality/holc_tract_lookup.csv" ) else: - self.MAPPING_INEQUALITY_CSV_URL = ( + self.mapping_inequality_csv_url = ( "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/" "main/2010_Census_Tracts/holc_tract_lookup.csv" ) - self.MAPPING_INEQUALITY_CSV = ( - self.get_tmp_path() / "holc_tract_lookup.csv" - ) - self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality" - self.HOLC_MANUAL_MAPPING_CSV_PATH = ( + # input + self.mapping_inequality_source = ( + self.get_sources_path() / "holc_tract_lookup.csv" + ) + self.holc_manual_mapping_source = ( # here be dragons – this file is pulled from a different place than most pathlib.Path(__file__).parent / "data" / "holc_grades_manually_mapped.csv" ) + # output + self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality" + # Some input field names. From documentation: 'Census Tracts were intersected # with HOLC Polygons. Census information can be joined via the "geoid" field. # There are two field "holc_prop" and "tract_prop" which give the proportion @@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad): ] self.df: pd.DataFrame + self.holc_manually_mapped_df: pd.DataFrame - def extract(self) -> None: - download_file_from_url( - file_url=self.MAPPING_INEQUALITY_CSV_URL, - download_file_name=self.MAPPING_INEQUALITY_CSV, - ) + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.mapping_inequality_csv_url, + destination=self.mapping_inequality_source, + ) + ] - def transform(self) -> None: - df: pd.DataFrame = pd.read_csv( - self.MAPPING_INEQUALITY_CSV, + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df = pd.read_csv( + self.mapping_inequality_source, dtype={self.TRACT_INPUT_FIELD: "string"}, low_memory=False, ) + # Some data needs to be manually mapped to its grade. + # TODO: Investigate more data that may need to be manually mapped. 
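+        # Note: the manually mapped CSV ships with the repository (see
+        # self.holc_manual_mapping_source above), so it is read directly here
+        # rather than being declared as a data source in get_data_sources().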
+ self.holc_manually_mapped_df = pd.read_csv( + filepath_or_buffer=self.holc_manual_mapping_source, + low_memory=False, + ) + + def transform(self) -> None: + # rename Tract ID - df.rename( + self.df.rename( columns={ self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME, }, @@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad): # Keep the first character, which is the HOLC grade (A, B, C, D). # TODO: investigate why this dataframe triggers these pylint errors. # pylint: disable=unsupported-assignment-operation, unsubscriptable-object - df[self.HOLC_GRADE_DERIVED_FIELD] = df[ + self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[ self.HOLC_GRADE_AND_ID_FIELD ].str[0:1] # Remove nonsense when the field has no grade or invalid grades. valid_grades = ["A", "B", "C", "D"] - df.loc[ + self.df.loc[ # pylint: disable=unsubscriptable-object - ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades), + ~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades), self.HOLC_GRADE_DERIVED_FIELD, ] = None - # Some data needs to be manually mapped to its grade. - # TODO: Investigate more data that may need to be manually mapped. - holc_manually_mapped_df = pd.read_csv( - filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH, - low_memory=False, - ) - # Join on the existing data - merged_df = df.merge( - right=holc_manually_mapped_df, + merged_df = self.df.merge( + right=self.holc_manually_mapped_df, on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD], how="left", ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py index 8f714c81..2f066525 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/maryland_ejscreen/etl.py @@ -4,6 +4,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger @@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad): """ def __init__(self): - self.MARYLAND_EJSCREEN_URL = ( + + # fetch + self.maryland_ejscreen_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip" ) - self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen" + # input + self.shape_files_source = self.get_sources_path() / "mdejscreen" + + # output self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen" self.COLUMNS_TO_KEEP = [ @@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad): ] self.df: pd.DataFrame + self.dfs_list: pd.DataFrame + + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.maryland_ejscreen_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: - def extract(self) -> None: - logger.debug("Downloading 207MB Maryland EJSCREEN Data") super().extract( - self.MARYLAND_EJSCREEN_URL, - self.get_tmp_path(), - ) + use_cached_data_sources + ) # download and extract data sources - def transform(self) -> None: - list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp")) + logger.debug("Downloading 207MB Maryland EJSCREEN Data") + list_of_files = list(glob(str(self.shape_files_source) + "/*.shp")) - # Ignore counties becauses this is not the level of measurement + # Ignore 
counties because this is not the level of measurement # that is consistent with our current scoring and ranking methodology. - dfs_list = [ + self.dfs_list = [ gpd.read_file(f) for f in list_of_files if not f.endswith("CountiesEJScore.shp") ] + def transform(self) -> None: + # Set the Census tract as the index and drop the geometry column # that produces the census tract boundaries. # The latter is because Geopandas raises an exception if there # are duplicate geometry columns. # Moreover, since the unit of measurement is at the tract level # we can consistantly merge this with other datasets - dfs_list = [ + self.dfs_list = [ df.set_index("Census_Tra").drop("geometry", axis=1) - for df in dfs_list + for df in self.dfs_list ] # pylint: disable=unsubscriptable-object - self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1)) + self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1)) # Reset index so that we no longer have the tract as our index self.df = self.df.reset_index() diff --git a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py index efde123c..2c33f888 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/michigan_ejscreen/etl.py @@ -1,6 +1,8 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger @@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad): """ def __init__(self): - self.MICHIGAN_EJSCREEN_S3_URL = ( + + # fetch + self.michigan_ejscreen_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/michigan_ejscore_12212021.csv" ) + # input + self.michigan_ejscreen_source = ( + self.get_sources_path() / "michigan_ejscore_12212021.csv" + ) + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen" + self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75 self.COLUMNS_TO_KEEP = [ @@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad): self.df: pd.DataFrame - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + FileDataSource( + source=self.michigan_ejscreen_url, + destination=self.michigan_ejscreen_source, + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + self.df = pd.read_csv( - filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL, + filepath_or_buffer=self.michigan_ejscreen_source, dtype={"GEO_ID": "string"}, low_memory=False, ) def transform(self) -> None: + self.df.rename( columns={ "GEO_ID": self.GEOID_TRACT_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py index bced98f5..b58d8f30 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py @@ -4,6 +4,8 @@ # pylint: disable=unsupported-assignment-operation import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base 
import ValidGeoLevel from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad): NAME = "national_risk_index" - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - SOURCE_URL = ( - f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" - "national_risk_index/NRI_Table_CensusTracts.zip" - ) - else: - SOURCE_URL = ( - "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/" - "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip" - ) - GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad): AGRIVALUE_LOWER_BOUND = 408000 def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv" + # fetch + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + self.risk_index_url = ( + f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" + "national_risk_index/NRI_Table_CensusTracts.zip" + ) + else: + self.risk_index_url = ( + "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/" + "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip" + ) + + # source + self.risk_index_source = ( + self.get_sources_path() / "NRI_Table_CensusTracts.csv" + ) + + # output # this is the main dataframe self.df: pd.DataFrame + self.df_nri: pd.DataFrame # Start dataset-specific vars here self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = ( @@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad): self.POPULATION_INPUT_FIELD_NAME = "POPULATION" self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE" - def extract(self) -> None: - """Unzips NRI dataset from the FEMA data source and writes the files - to the temporary data folder for use in the transform() method - """ + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.risk_index_url, destination=self.get_sources_path() + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: super().extract( - source_url=self.SOURCE_URL, - extract_path=self.get_tmp_path(), + use_cached_data_sources + ) # download and extract data sources + + # read in the unzipped csv from NRI data source then rename the + # Census Tract column for merging + self.df_nri = pd.read_csv( + self.risk_index_source, + dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"}, + na_values=["None"], + low_memory=False, ) def transform(self) -> None: @@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad): Groups inside of that Tract """ - # read in the unzipped csv from NRI data source then rename the - # Census Tract column for merging - df_nri: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, - dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"}, - na_values=["None"], - low_memory=False, - ) - - df_nri.rename( + self.df_nri.rename( columns={ self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME, @@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad): agriculture_columns = [ f"{x}_EALA" for x in disaster_categories - if f"{x}_EALA" in list(df_nri.columns) + if f"{x}_EALA" in list(self.df_nri.columns) ] population_columns = [ f"{x}_EALP" for x in disaster_categories - if f"{x}_EALP" in list(df_nri.columns) + if f"{x}_EALP" in list(self.df_nri.columns) ] buildings_columns = [ f"{x}_EALB" for 
x in disaster_categories - if f"{x}_EALB" in list(df_nri.columns) + if f"{x}_EALB" in list(self.df_nri.columns) ] - disaster_population_sum_series = df_nri[population_columns].sum(axis=1) - - disaster_agriculture_sum_series = df_nri[agriculture_columns].sum( + disaster_population_sum_series = self.df_nri[population_columns].sum( axis=1 ) - disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1) + disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum( + axis=1 + ) + + disaster_buildings_sum_series = self.df_nri[buildings_columns].sum( + axis=1 + ) # Population EAL Rate = Eal Valp / Population - df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = ( + self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = ( disaster_population_sum_series - / df_nri[self.POPULATION_INPUT_FIELD_NAME] + / self.df_nri[self.POPULATION_INPUT_FIELD_NAME] ) # Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000) ## FORMULA ADJUSTMENT 2/17 ## Because AGRIVALUE contains a lot of 0s, we are going to consider ## 90th percentile only for places that have some agrivalue at all - df_nri[ + self.df_nri[ self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME - ] = disaster_agriculture_sum_series / df_nri[ + ] = disaster_agriculture_sum_series / self.df_nri[ self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME ].clip( lower=self.AGRIVALUE_LOWER_BOUND @@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad): ## Check that this clip worked -- that the only place the value has changed is when the clip took effect base_expectation = ( disaster_agriculture_sum_series - / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] + / self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] ) assert ( - df_nri[ - df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] + self.df_nri[ + self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] != base_expectation ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max() <= self.AGRIVALUE_LOWER_BOUND @@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad): ) assert ( - df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] + self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME] != base_expectation ).sum() > 0, "Clipping the agrivalue did nothing!" # This produces a boolean that is True in the case of non-zero agricultural value - df_nri[self.CONTAINS_AGRIVALUE] = ( - df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0 + self.df_nri[self.CONTAINS_AGRIVALUE] = ( + self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0 ) # divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)). - df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = ( + self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = ( disaster_buildings_sum_series - / df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME] + / self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME] ) # Round all float columns to just 10 digits. # Note: `round` is smart enough to only apply to float columns. - df_nri = df_nri.round(10) + self.df_nri = self.df_nri.round(10) # Assign the final df to the class' output_df for the load method - self.output_df = df_nri + self.output_df = self.df_nri def load(self) -> None: # Suppress scientific notation. 
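
Every converted ETL in this patch follows the same shape: declare remote files in `get_data_sources()`, let `super().extract(use_cached_data_sources)` download (or reuse) them under `get_sources_path()`, and read only local files from that point on. The sketch below is illustrative and not part of this patch: the class name, URL, and file names are placeholders, and only the `ZIPDataSource(source=..., destination=...)` constructor, the `get_data_sources()` hook, and the `extract(use_cached_data_sources)` signature mirror what the diff actually introduces.

```python
# Hypothetical example (not part of this patch): a minimal ETL written against
# the cached-data-source pattern used throughout this diff.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleCachedETL(ExtractTransformLoad):
    """Illustrative ETL: declare sources once, let the base class fetch them."""

    def __init__(self):
        # fetch: remote archive (placeholder URL)
        self.example_url = "https://example.com/example_data.csv.zip"

        # source: where the unzipped CSV lands after extract()
        self.example_source = self.get_sources_path() / "example_data.csv"

        # output: dataframe populated during extract(), used by transform()
        self.df: pd.DataFrame

    def get_data_sources(self) -> [DataSource]:
        # Declaring the source here lets the pipeline pre-download or cache it.
        return [
            ZIPDataSource(
                source=self.example_url,
                destination=self.get_sources_path(),
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # Download (or reuse cached) sources, then read the local file.
        super().extract(use_cached_data_sources)
        self.df = pd.read_csv(self.example_source, low_memory=False)
```

Keeping all network access behind `get_data_sources()`/`extract()` is also why the converted classes above move their `pd.read_csv` and `gpd.read_file` calls out of `transform()` and into `extract()`.
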
diff --git a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py index 39b12af0..782e824f 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py @@ -3,6 +3,8 @@ import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger @@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad): """ETL class for the Nature Deprived Communities dataset""" NAME = "nlcd_nature_deprived" - SOURCE_URL = ( - settings.AWS_JUSTICE40_DATASOURCES_URL - + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" - ) + GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False LOAD_YAML_CONFIG: bool = True @@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad): TRACT_PERCENT_CROPLAND_FIELD_NAME: str def __init__(self): - # define the full path for the input CSV file - self.INPUT_CSV = ( - self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv" + + # fetch + self.nature_deprived_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip" ) + # source + # define the full path for the input CSV file + self.nature_deprived_source = ( + self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv" + ) + + # output # this is the main dataframe self.df: pd.DataFrame + self.df_ncld: pd.DataFrame + # Start dataset-specific vars here self.PERCENT_NATURAL_FIELD_NAME = "PctNatural" self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv" @@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad): # for area. 
This does indeed remove tracts from the 90th+ percentile later on self.TRACT_ACRES_LOWER_BOUND = 35 - def transform(self) -> None: + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.nature_deprived_url, + destination=self.get_sources_path(), + ) + ] + + def extract(self, use_cached_data_sources: bool = False) -> None: """Reads the unzipped data file into memory and applies the following transformations to prepare it for the load() method: - Renames columns as needed """ - df_ncld: pd.DataFrame = pd.read_csv( - self.INPUT_CSV, + super().extract( + use_cached_data_sources + ) # download and extract data sources + + self.df_ncld = pd.read_csv( + self.nature_deprived_source, dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str}, low_memory=False, ) - df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = ( - df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND + def transform(self) -> None: + + self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = ( + self.df_ncld[self.TRACT_ACRES_FIELD_NAME] + >= self.TRACT_ACRES_LOWER_BOUND ) - df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = ( - 100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME] + self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = ( + 100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME] ) # Assign the final df to the class' output_df for the load method with rename - self.output_df = df_ncld.rename( + self.output_df = self.df_ncld.rename( columns={ self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME, self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME, diff --git a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py index b797c418..7bdb7b55 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/persistent_poverty/etl.py @@ -3,9 +3,10 @@ import functools import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url logger = get_module_logger(__name__) @@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad): PUERTO_RICO_EXPECTED_IN_DATA = False def __init__(self): + + # fetch + self.poverty_url = ( + settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip" + ) + + # source + self.poverty_sources = [ + self.get_sources_path() + / "ltdb_std_all_sample" + / "ltdb_std_1990_sample.csv", + self.get_sources_path() + / "ltdb_std_all_sample" + / "ltdb_std_2000_sample.csv", + self.get_sources_path() + / "ltdb_std_all_sample" + / "ltdb_std_2010_sample.csv", + ] + + # output self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty" # Need to change hyperlink to S3 @@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad): self.df: pd.DataFrame + def get_data_sources(self) -> [DataSource]: + return [ + ZIPDataSource( + source=self.poverty_url, destination=self.get_sources_path() + ) + ] + def _join_input_dfs(self, dfs: list) -> pd.DataFrame: df = functools.reduce( lambda df_a, df_b: pd.merge( @@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad): return df - def extract(self) -> None: - unzipped_file_path = 
self.get_tmp_path() + def extract(self, use_cached_data_sources: bool = False) -> None: - unzip_file_from_url( - file_url=settings.AWS_JUSTICE40_DATASOURCES_URL - + "/LTDB_Std_All_Sample.zip", - download_path=self.get_tmp_path(), - unzipped_file_path=unzipped_file_path, - ) - - file_names = [ - "ltdb_std_1990_sample.csv", - "ltdb_std_2000_sample.csv", - "ltdb_std_2010_sample.csv", - ] + super().extract( + use_cached_data_sources + ) # download and extract data sources temporary_input_dfs = [] - for file_name in file_names: + for file_name in self.poverty_sources: temporary_input_df = pd.read_csv( - filepath_or_buffer=unzipped_file_path - / f"ltdb_std_all_sample/{file_name}", + filepath_or_buffer=file_name, dtype={ self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string", self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string", diff --git a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py index 0b99b01a..56f803bd 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tree_equity_score/etl.py @@ -1,6 +1,8 @@ import geopandas as gpd import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.utils import get_module_logger logger = get_module_logger(__name__) @@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad): """ def __init__(self): - self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/" - self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv" + + # input + self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv" + + # output self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score" self.df: gpd.GeoDataFrame + + self.tes_state_dfs = [] + + # config self.states = [ "al", "az", @@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad): "wy", ] - def extract(self) -> None: + def get_data_sources(self) -> [DataSource]: + + tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/" + + sources = [] for state in self.states: - super().extract( - f"{self.TES_URL}{state}.zip.zip", - f"{self.get_tmp_path()}/{state}", + sources.append( + ZIPDataSource( + source=f"{tes_url}{state}.zip.zip", + destination=self.get_sources_path() / state, + ) + ) + + return sources + + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + + for state in self.states: + self.tes_state_dfs.append( + gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp") ) def transform(self) -> None: - tes_state_dfs = [] - for state in self.states: - tes_state_dfs.append( - gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp") - ) + self.df = gpd.GeoDataFrame( - pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs + pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs ) # rename ID to Tract ID diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py index f8bd9df7..25f73366 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tribal/etl.py @@ -4,63 +4,57 @@ import geopandas as gpd import pandas as pd from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad 
+from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import ZIPDataSource from data_pipeline.score import field_names from data_pipeline.utils import get_module_logger -from data_pipeline.utils import unzip_file_from_url logger = get_module_logger(__name__) class TribalETL(ExtractTransformLoad): def __init__(self): + + self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv" + self.GEOGRAPHIC_BASE_PATH = ( self.DATA_PATH / "tribal" / "geographic_data" ) - self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv" self.NATIONAL_TRIBAL_GEOJSON_PATH = ( self.GEOGRAPHIC_BASE_PATH / "usa.json" ) + self.USA_TRIBAL_DF_LIST = [] - def extract(self) -> None: - """Extract the tribal geojson zip files from Justice40 S3 data folder + def get_data_sources(self) -> [DataSource]: - Returns: - None - """ - - bia_shapefile_zip_url = ( + national_lar_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/BIA_National_LAR_updated_20220929.zip" ) - - tsa_and_aian_geojson_zip_url = ( + tsa_and_aian_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/BIA_TSA_and_AIAN_json.zip" ) - - alaska_geojson_url = ( + alaska_native_villages_url = ( settings.AWS_JUSTICE40_DATASOURCES_URL + "/Alaska_Native_Villages_json.zip" ) - unzip_file_from_url( - bia_shapefile_zip_url, - self.TMP_PATH, - self.GEOGRAPHIC_BASE_PATH / "bia_national_lar", - ) - - unzip_file_from_url( - tsa_and_aian_geojson_zip_url, - self.TMP_PATH, - self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian", - ) - - unzip_file_from_url( - alaska_geojson_url, - self.TMP_PATH, - self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages", - ) + return [ + ZIPDataSource( + national_lar_url, + destination=self.get_sources_path() / "bia_national_lar", + ), + ZIPDataSource( + source=tsa_and_aian_url, + destination=self.get_sources_path() / "tsa_and_aian", + ), + ZIPDataSource( + source=alaska_native_villages_url, + destination=self.get_sources_path() / "alaska_native_villages", + ), + ] def _transform_bia_national_lar(self, path: Path) -> None: """Transform the Tribal BIA National Lar Geodataframe and appends it to the @@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad): """ # Set the filepaths: bia_national_lar_shapefile = ( - self.GEOGRAPHIC_BASE_PATH / "bia_national_lar" + self.get_sources_path() / "bia_national_lar" ) bia_aian_supplemental_geojson = ( - self.GEOGRAPHIC_BASE_PATH + self.get_sources_path() / "tsa_and_aian" / "BIA_AIAN_Supplemental.json" ) bia_tsa_geojson = ( - self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json" + self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json" ) alaska_native_villages_geojson = ( - self.GEOGRAPHIC_BASE_PATH + self.get_sources_path() / "alaska_native_villages" / "AlaskaNativeVillages.gdb.geojson" ) @@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad): "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs" ) + # note – this works a little different than many of the ETLs. The file + # being written here is used again downstream, so it's placed in a + # special directory. 
logger.debug("Writing national geojson file") + self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True) usa_tribal_df.to_file( self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON" ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py index ba2e2226..602e6005 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/tribal_overlap/etl.py @@ -4,6 +4,7 @@ import geopandas as gpd import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries from data_pipeline.etl.sources.geo_utils import get_tract_geojson @@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad): self.census_tract_gdf: gpd.GeoDataFrame self.tribal_gdf: gpd.GeoDataFrame + def get_data_sources(self) -> [DataSource]: + return [] # this uses already retrieved / calculated data + @staticmethod def _create_string_from_list(series: pd.Series) -> str: """Helper method that creates a sorted string list (for tribal names).""" @@ -89,7 +93,12 @@ class TribalOverlapETL(ExtractTransformLoad): return percentage_float - def extract(self) -> None: + def extract(self, use_cached_data_sources: bool = False) -> None: + + super().extract( + use_cached_data_sources + ) # download and extract data sources + self.census_tract_gdf = get_tract_geojson() self.tribal_gdf = get_tribal_geojson() diff --git a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py index 7f692603..3ad58a2a 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/us_army_fuds/etl.py @@ -4,9 +4,10 @@ import geopandas as gpd import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.datasource import DataSource +from data_pipeline.etl.datasource import FileDataSource from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries -from data_pipeline.utils import download_file_from_url from data_pipeline.utils import get_module_logger from data_pipeline.config import settings @@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad): def __init__(self): - if settings.DATASOURCE_RETRIEVAL_FROM_AWS: - self.FILE_URL = ( - f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" - "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_" - "all_data_reported_to_Congress_in_FY2020.geojson" - ) - else: - self.FILE_URL: str = ( - "https://opendata.arcgis.com/api/v3/datasets/" - "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/" - "data?format=geojson&spatialRefId=4326&where=1%3D1" - ) - self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds" # Constants for output @@ -50,17 +38,27 @@ class USArmyFUDS(ExtractTransformLoad): self.INELIGIBLE_FUDS_COUNT_FIELD_NAME, self.ELIGIBLE_FUDS_BINARY_FIELD_NAME, ] - self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson" + self.fuds_source = self.get_sources_path() / "fuds.geojson" self.raw_df: gpd.GeoDataFrame self.output_df: pd.DataFrame - def extract(self) -> None: - download_file_from_url( - file_url=self.FILE_URL, - download_file_name=self.DOWNLOAD_FILE_NAME, - verify=True, - ) + def 
get_data_sources(self) -> [DataSource]: + + if settings.DATASOURCE_RETRIEVAL_FROM_AWS: + fuds_url = ( + f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" + "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_" + "all_data_reported_to_Congress_in_FY2020.geojson" + ) + else: + fuds_url: str = ( + "https://opendata.arcgis.com/api/v3/datasets/" + "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/" + "data?format=geojson&spatialRefId=4326&where=1%3D1" + ) + + return [FileDataSource(source=fuds_url, destination=self.fuds_source)] def transform(self) -> None: # before we try to do any transformation, get the tract data @@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad): logger.debug("Loading FUDS data as GeoDataFrame for transform") raw_df = gpd.read_file( - filename=self.DOWNLOAD_FILE_NAME, + filename=self.fuds_source, low_memory=False, ) diff --git a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py index 271ba800..7450d754 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/cdc_life_expectancy/test_etl.py @@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL): data. A basic version of that patching is included here for classes that can use it. """ + data_path, tmp_path = mock_paths + sources_path = data_path / "sources" / self._ETL_CLASS.__name__ + sources_path.mkdir(parents=True, exist_ok=True) + with mock.patch( - "data_pipeline.utils.requests" + "data_pipeline.etl.downloader.requests" ) as requests_mock, mock.patch( + "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path" + ) as sources_mock, mock.patch( "data_pipeline.etl.score.etl_utils.get_state_fips_codes" ) as mock_get_state_fips_codes: - tmp_path = mock_paths[1] + # requests mock def fake_get(url, *args, **kwargs): file_path = url.split("/")[-1] with open( @@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL): return response_mock requests_mock.get = fake_get + + # fips codes mock mock_get_state_fips_codes.return_value = [ x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS ] + + # sources mock + sources_mock.return_value = sources_path + # Instantiate the ETL class. etl = self._get_instance_of_etl_class() # Monkey-patch the temporary directory to the one used in the test etl.TMP_PATH = tmp_path + etl.SOURCES_PATH = data_path / "sources" # Run the extract method. 
etl.extract() + + def fake_get_sources_path() -> pathlib.PosixPath: + return sources_path + + mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path) + return etl def test_init(self, mock_etl, mock_paths): diff --git a/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py index 602be901..f0200f62 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/dot_travel_composite/test_etl.py @@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL): mock_paths=mock_paths, ) df = gpd.read_file( - etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME, + etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME, dtype={etl.GEOID_TRACT_FIELD_NAME: str}, ) assert df.shape[0] == 30 diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/etl.py index 7f78d3e4..d461d913 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/etl.py @@ -5,6 +5,7 @@ from data_pipeline.config import settings from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.utils import get_module_logger +from data_pipeline.etl.datasource import DataSource logger = get_module_logger(__name__) @@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad): self.EXAMPLE_FIELD_NAME, ] + def get_data_sources(self) -> [DataSource]: + return [] + def extract(self): # Pretend to download zip from external URL, write it to CSV. zip_file_path = ( @@ -42,11 +46,11 @@ class ExampleETL(ExtractTransformLoad): ) with zipfile.ZipFile(zip_file_path, "r") as zip_ref: - zip_ref.extractall(self.get_tmp_path()) + zip_ref.extractall(self.get_sources_path()) def transform(self): df: pd.DataFrame = pd.read_csv( - self.get_tmp_path() / "input.csv", + self.get_sources_path() / "input.csv", dtype={self.GEOID_TRACT_FIELD_NAME: "string"}, low_memory=False, ) diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py index 888cb5f1..cf019b17 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py @@ -124,12 +124,18 @@ class TestETL: data. A basic version of that patching is included here for classes that can use it. 
""" + data_path, tmp_path = mock_paths + sources_path = data_path / "sources" / self._ETL_CLASS.__name__ + sources_path.mkdir(parents=True, exist_ok=True) + with mock.patch( - "data_pipeline.utils.requests" + "data_pipeline.etl.downloader.requests" ) as requests_mock, mock.patch( + "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path" + ) as sources_mock, mock.patch( "data_pipeline.etl.score.etl_utils.get_state_fips_codes" ) as mock_get_state_fips_codes: - tmp_path = mock_paths[1] + if self._SAMPLE_DATA_ZIP_FILE_NAME is not None: zip_file_fixture_src = ( self._DATA_DIRECTORY_FOR_TEST @@ -145,6 +151,7 @@ class TestETL: "rb", ) as file: file_contents = file.read() + response_mock = requests.Response() response_mock.status_code = 200 # pylint: disable=protected-access @@ -154,15 +161,25 @@ class TestETL: mock_get_state_fips_codes.return_value = [ x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS ] + + # sources mock + sources_mock.return_value = sources_path + # Instantiate the ETL class. etl = self._get_instance_of_etl_class() # Monkey-patch the temporary directory to the one used in the test etl.TMP_PATH = tmp_path + etl.SOURCES_PATH = data_path / "sources" # Run the extract method. etl.extract() + def fake_get_sources_path() -> pathlib.PosixPath: + return sources_path + + mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path) + return etl def test_init_base(self, mock_etl, mock_paths): @@ -263,17 +280,12 @@ class TestETL: file was unzipped from a "fake" downloaded zip (located in data) in a temporary path. """ if self._SAMPLE_DATA_ZIP_FILE_NAME is not None: - tmp_path = mock_paths[1] - _ = self._setup_etl_instance_and_run_extract( + etl = self._setup_etl_instance_and_run_extract( mock_etl=mock_etl, mock_paths=mock_paths, ) - assert ( - tmp_path - / self._EXTRACT_TMP_FOLDER_NAME - / self._SAMPLE_DATA_FILE_NAME - ).exists() + assert (etl.get_sources_path()).exists() def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths): """Tests the extract method. 
@@ -285,8 +297,11 @@ class TestETL: mock_etl=mock_etl, mock_paths=mock_paths, ) + + data_path, tmp_path = mock_paths + tmp_df = pd.read_csv( - etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME, + etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME, dtype={etl.GEOID_TRACT_FIELD_NAME: str}, ) snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST diff --git a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py index de2c7f8f..51aedce5 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/historic_redlining/test_etl.py @@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL): mock_paths=mock_paths, ) tmp_df = pd.read_excel( - etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME, + etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME, dtype={etl.GEOID_TRACT_FIELD_NAME: str}, ) assert tmp_df.shape == (15, 5) diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py index 493c0be2..26ef48bf 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py @@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL): def test_init(self, mock_etl, mock_paths): """Tests that the mock NationalRiskIndexETL class instance was - initiliazed correctly. - - Validates the following conditions: - - self.DATA_PATH points to the "data" folder in the temp directory - - self.TMP_PATH points to the "data/tmp" folder in the temp directory - - self.INPUT_PATH points to the correct path in the temp directory - - self.OUTPUT_PATH points to the correct path in the temp directory + initialized correctly. """ # setup etl = NationalRiskIndexETL() - data_path, tmp_path = mock_paths - input_csv = ( - tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv" - ) # validation - assert etl.INPUT_CSV == input_csv assert etl.GEOID_FIELD_NAME == "GEOID10" assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT" assert etl.NAME == "national_risk_index"
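
Taken together, the converted classes and tests imply a simple per-dataset flow: instantiate the ETL, extract with or without the local source cache, then transform and load. This is a hedged sketch, assuming only the methods shown in this diff (`extract(use_cached_data_sources)`, `transform()`, `load()`); it adds no behavior beyond what the patch defines.

```python
# Illustrative only: run one converted ETL end to end, reusing any source files
# already present under its get_sources_path() directory.
from data_pipeline.etl.sources.national_risk_index.etl import NationalRiskIndexETL

etl = NationalRiskIndexETL()
etl.extract(use_cached_data_sources=True)  # reuse locally cached sources when available
etl.transform()
etl.load()
```

This is also the flow the tests above exercise, with `data_pipeline.etl.downloader.requests` and `get_sources_path` patched so that `extract()` reads fixture files instead of hitting the network.
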