Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
Travis Newby 2023-03-03 12:26:24 -06:00 committed by GitHub
parent 4d9c1dd11e
commit 6f39033dde
52 changed files with 1787 additions and 686 deletions

View file

@ -92,7 +92,6 @@ If you want to run specific data tasks, you can open a terminal window, navigate
- Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles`
To learn more about these commands and when they should be run, refer to [Running for Local Development](#running-for-local-development).
</details>
---
@ -136,6 +135,9 @@ Once you've downloaded the census data, run the following commands in order
Many commands have options. For example, you can run a single dataset with `etl-run` by passing the command line parameter `-d name-of-dataset-to-run`. Please use the `--help` option to find out more.
> :bulb: **NOTE**
> One important command line option is enabling cached data sources. Pass the command line parameter `-u` to many commands (e.g. `etl-run`) to use locally cached data sources within the ETL portion of the pipeline. This will ensure that you don't download many GB of data with each run of the data pipeline.
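> For example, an invocation along the lines of `etl-run -d name-of-dataset-to-run -u` (the dataset name is a placeholder) runs a single ETL while reusing any data sources already in the local cache.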
## How Scoring Works
Scores are generated by running the `score-run` command via Poetry or Docker. This command executes [`data_pipeline/etl/score/etl_score.py`](data_pipeline/etl/score/etl_score.py). During execution,

View file

@ -7,6 +7,9 @@ from data_pipeline.etl.runner import etl_runner
from data_pipeline.etl.runner import score_generate
from data_pipeline.etl.runner import score_geo
from data_pipeline.etl.runner import score_post
from data_pipeline.etl.runner import get_data_sources
from data_pipeline.etl.runner import extract_data_sources as extract_ds
from data_pipeline.etl.runner import clear_data_source_cache as clear_ds_cache
from data_pipeline.etl.sources.census.etl_utils import check_census_data_source
from data_pipeline.etl.sources.census.etl_utils import (
reset_data_directories as census_reset,
@ -79,7 +82,14 @@ def data_cleanup():
is_flag=True,
help="Upload to AWS S3 a zipped archive of the census data.",
)
def census_data_download(zip_compress):
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def census_data_download(zip_compress, use_cache):
"""CLI command to download all census shape files from the Census FTP and extract the geojson
to generate national and by state Census Block Group CSVs"""
log_title("Download Census Data ")
@ -88,7 +98,7 @@ def census_data_download(zip_compress):
census_reset(data_path)
log_info("Downloading census data")
etl_runner("census")
etl_runner("census", use_cache)
if zip_compress:
log_info("Zipping census data")
@ -129,7 +139,14 @@ def pull_census_data(data_source: str):
type=str,
help=dataset_cli_help,
)
def etl_run(dataset: str):
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def etl_run(dataset: str, use_cache: bool):
"""Run a specific or all ETL processes
Args:
@ -141,7 +158,7 @@ def etl_run(dataset: str):
log_title("Run ETL")
log_info("Running dataset(s)")
etl_runner(dataset)
etl_runner(dataset, use_cache)
log_goodbye()
sys.exit()
@ -167,7 +184,14 @@ def score_run():
@cli.command(
help="Run ETL + Score Generation",
)
def score_full_run():
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def score_full_run(use_cache: bool):
"""CLI command to run ETL and generate the score in one command"""
log_title("Score Full Run", "Run ETL and Generate Score (no tiles)")
@ -177,7 +201,7 @@ def score_full_run():
temp_folder_cleanup()
log_info("Running all ETLs")
etl_runner()
etl_runner(use_cache=use_cache)
log_info("Generating score")
score_generate()
@ -297,7 +321,14 @@ def generate_map_tiles(generate_tribal_layer):
type=str,
help=dataset_cli_help,
)
def data_full_run(check: bool, data_source: str):
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def data_full_run(check: bool, data_source: str, use_cache: bool):
"""CLI command to run ETL, score, JSON combine and generate tiles in one command
Args:
@ -330,10 +361,10 @@ def data_full_run(check: bool, data_source: str):
if data_source == "local":
log_info("Downloading census data")
etl_runner("census")
etl_runner("census", use_cache)
log_info("Running all ETLs")
etl_runner()
etl_runner(use_cache=use_cache)
log_info("Generating score")
score_generate()
@ -357,6 +388,103 @@ def data_full_run(check: bool, data_source: str):
sys.exit()
@cli.command(
help="Print data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def print_data_sources(dataset: str):
"""Print data sources for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module to be run (optional)
Returns:
None
"""
log_title("Print ETL Datasources")
log_info("Retrieving dataset(s)")
sources = get_data_sources(dataset)
log_info(f"Discovered {len(sources)} files")
for s in sources:
log_info(s)
log_goodbye()
sys.exit()
@cli.command(
help="Fetch data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def extract_data_sources(dataset: str, use_cache: bool):
"""Extract and cache data source(s) for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose data sources you wish to fetch
use_cache (bool): Use this flag if you wish to use the cached data sources (if they exist)
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Fetching data source(s)")
extract_ds(dataset, use_cache)
log_goodbye()
sys.exit()
@cli.command(
help="Clear data source cache for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def clear_data_source_cache(dataset: str):
"""Clear data source(s) cache for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose cache you wish to clear
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Clear data source cache")
clear_ds_cache(dataset)
log_goodbye()
sys.exit()
def log_title(title: str, subtitle: str = None):
"""Logs a title in our fancy title format"""
logger.info("-" * LOG_LINE_WIDTH)
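A rough sketch of how the new commands could be exercised programmatically follows; it assumes the click group is exposed as `cli` in `data_pipeline.application` and that click applies its usual underscore-to-hyphen command naming (as it does for `etl-run`), and the dataset name is purely illustrative:

# Sketch only: command spellings and the dataset name are assumptions,
# not taken from this diff.
from click.testing import CliRunner
from data_pipeline.application import cli

runner = CliRunner()

# List the data sources every ETL (or a single ETL) depends on.
runner.invoke(cli, ["print-data-sources"])

# Pre-download and cache the data sources for one dataset; -u reuses
# anything already present in the cache.
runner.invoke(cli, ["extract-data-sources", "-d", "doe_energy_burden", "-u"])

# Remove that dataset's cached sources again.
runner.invoke(cli, ["clear-data-source-cache", "-d", "doe_energy_burden"])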

View file

@ -2,7 +2,9 @@ import enum
import pathlib
import sys
import typing
import shutil
from typing import Optional
from abc import ABC, abstractmethod
import pandas as pd
from data_pipeline.config import settings
@ -13,7 +15,7 @@ from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import remove_all_from_dir
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -25,7 +27,7 @@ class ValidGeoLevel(enum.Enum):
CENSUS_BLOCK_GROUP = enum.auto()
class ExtractTransformLoad:
class ExtractTransformLoad(ABC):
"""
A class used to instantiate an ETL object to retrieve and process data from
datasets.
@ -45,6 +47,7 @@ class ExtractTransformLoad:
# Directories
DATA_PATH: pathlib.Path = settings.DATA_PATH
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
SOURCES_PATH: pathlib.Path = DATA_PATH / "sources"
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
DATASET_CONFIG: Optional[dict] = None
@ -177,45 +180,60 @@ class ExtractTransformLoad:
output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
return output_file_path
def get_tmp_path(self) -> pathlib.Path:
"""Returns the temporary path associated with this ETL class."""
# Note: the temporary path will be defined on `init`, because it uses the class
# of the instance which is often a child class.
tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
def get_sources_path(self) -> pathlib.Path:
"""Returns the sources path associated with this ETL class. The sources path
is the home for cached data sources used by this ETL."""
sources_path = self.SOURCES_PATH / str(self.__class__.__name__)
# Create directory if it doesn't exist
tmp_path.mkdir(parents=True, exist_ok=True)
sources_path.mkdir(parents=True, exist_ok=True)
return tmp_path
return sources_path
def extract(
self,
source_url: str = None,
extract_path: pathlib.Path = None,
verify: Optional[bool] = True,
) -> None:
"""Extract the data from a remote source. By default it provides code
to get the file from a source url, unzips it and stores it on an
extract_path."""
@abstractmethod
def get_data_sources(self) -> [DataSource]:
pass
if source_url is None:
source_url = self.SOURCE_URL
def _fetch(self) -> None:
"""Fetch all data sources for this ETL. When data sources are fetched, they
are stored in a cache directory for consistency between runs."""
for ds in self.get_data_sources():
ds.fetch()
if extract_path is None:
extract_path = self.get_tmp_path()
def clear_data_source_cache(self) -> None:
"""Clears the cache for this ETLs data source(s)"""
shutil.rmtree(self.get_sources_path())
unzip_file_from_url(
file_url=source_url,
download_path=self.get_tmp_path(),
unzipped_file_path=extract_path,
verify=verify,
)
def extract(self, use_cached_data_sources: bool = False) -> None:
"""Extract (download) data from a remote source, and validate
that data. By default, this method fetches data from the set of
data sources returned by get_data_sources.
If use_cached_data_sources is true, this method attempts to use cached data
rather than re-downloading from the original source. The cache algorithm is very
simple: it just looks to see if the directory has any contents. If so, it uses
that content. If not, it downloads all data sources.
Subclasses should call super() before performing any work if they wish to take
advantage of the automatic downloading and caching ability of this superclass.
"""
if use_cached_data_sources and any(self.get_sources_path().iterdir()):
logger.info(
f"Using cached data sources for {self.__class__.__name__}"
)
else:
self.clear_data_source_cache()
self._fetch()
# the rest of the work should be performed here
@abstractmethod
def transform(self) -> None:
"""Transform the data extracted into a format that can be consumed by the
score generator"""
raise NotImplementedError
pass
def validate(self) -> None:
"""Validates the output.
@ -380,3 +398,14 @@ class ExtractTransformLoad:
def cleanup(self) -> None:
"""Clears out any files stored in the TMP folder"""
remove_all_from_dir(self.get_tmp_path())
def get_tmp_path(self) -> pathlib.Path:
"""Returns the temporary path associated with this ETL class."""
# Note: the temporary path will be defined on `init`, because it uses the class
# of the instance which is often a child class.
tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
# Create directory if it doesn't exist
tmp_path.mkdir(parents=True, exist_ok=True)
return tmp_path
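For reference, the pattern the refactored ETLs follow against this base class looks roughly like the sketch below; the class name, URL, and file name are hypothetical, and the structure mirrors the ETLs changed in this commit.

# Hypothetical ETL sketch: declare sources, let the base class fetch them
# (or reuse the cache), then read from the sources path.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleETL(ExtractTransformLoad):
    def __init__(self):
        self.example_url = "https://example.com/example.zip"  # hypothetical
        self.example_source = self.get_sources_path() / "example.csv"
        self.df: pd.DataFrame

    # return type annotation matches the style used throughout this commit
    def get_data_sources(self) -> [DataSource]:
        return [
            ZIPDataSource(
                source=self.example_url,
                destination=self.get_sources_path(),
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # Downloads the declared sources, or reuses the cache when asked to.
        super().extract(use_cached_data_sources)
        self.df = pd.read_csv(self.example_source)

    def transform(self) -> None:
        pass

    def load(self) -> None:
        pass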

View file

@ -0,0 +1,124 @@
"""This module defines a set of classes that can be used to fetch data
from a remote source. They are meant to be used in conjunction with ETLs
or other classes that require downloading data.
There are three types of data sources defined in this file:
FileDataSource meant to be used when you have a single file to
retrieve from a remote location and save to a destination.
ZIPDataSource used when you need to fetch and unzip a file, and save
the contents of that file to a destination.
CensusDataSource used to download data from the Census API and store
the contents to a destination.
DataSource subclasses must implement the fetch method to define how
they will reach out to a remote source, download the data, and save
that data to the destination.
"""
from pathlib import Path
from typing import List
from dataclasses import dataclass
from abc import ABC, abstractmethod
from data_pipeline.etl.downloader import Downloader
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
@dataclass
class DataSource(ABC):
"""A data source represents any source of data that is fetchable
from a remote location.
Attributes:
source : str
the location of this data source, as a url
destination : Path
the Path where the data source should be saved locally upon being fetched
"""
source: str
destination: Path
@abstractmethod
def fetch(self) -> None:
pass
@dataclass
class FileDataSource(DataSource):
"""A data source representing a single file.
This single file will be fetched from the source and saved to a single
destination.
"""
def fetch(self) -> None:
"""Fetches a single file from a source and saves it to a destination."""
self.destination.parent.mkdir(parents=True, exist_ok=True)
Downloader.download_file_from_url(
file_url=self.source,
download_file_name=self.destination,
verify=True,
)
def __str__(self):
return f"File {self.source}"
@dataclass
class ZIPDataSource(DataSource):
"""A data source representing ZIP files.
Zip files will be fetched and placed in the destination folder, then unzipped.
"""
def fetch(self) -> None:
self.destination.mkdir(parents=True, exist_ok=True)
Downloader.download_zip_file_from_url(
file_url=self.source,
unzipped_file_path=self.destination,
verify=True,
)
def __str__(self):
return f"Zip {self.source}"
@dataclass
class CensusDataSource(DataSource):
"""A data source representing census data.
Data will be fetched using the Census API and saved to the destination file. Source is ignored.
"""
acs_year: int
variables: List[str]
tract_output_field_name: str
data_path_for_fips_codes: Path
acs_type: str
def fetch(self) -> None:
df = retrieve_census_acs_data(
acs_year=self.acs_year,
variables=self.variables,
tract_output_field_name=self.tract_output_field_name,
data_path_for_fips_codes=self.data_path_for_fips_codes,
acs_type=self.acs_type,
)
self.destination.parent.mkdir(parents=True, exist_ok=True)
# Write CSV representation of census data
df.to_csv(self.destination, index=False)
def __str__(self):
return f"Census {self.acs_type}, {self.acs_year}"

View file

@ -0,0 +1,95 @@
import uuid
import urllib3
import requests
import zipfile
import shutil
from pathlib import Path
from data_pipeline.config import settings
class Downloader:
"""A simple class to encapsulate the download capabilities of the application"""
@classmethod
def download_file_from_url(
cls,
file_url: str,
download_file_name: Path,
verify: bool = True,
) -> str:
"""Downloads a file from a remote URL location and returns the file location.
Args:
file_url (str): URL where the file is located
download_file_name (pathlib.Path): file path where the file will be saved once downloaded
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, defaults to True)
Returns:
download_file_name (pathlib.Path): the path of the downloaded file
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
download_file_name.parent.mkdir(parents=True, exist_ok=True)
response = requests.get(
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
)
if response.status_code == 200:
file_contents = response.content
else:
raise Exception(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
# Write the contents to disk.
file = open(download_file_name, "wb")
file.write(file_contents)
file.close()
return download_file_name
@classmethod
def download_zip_file_from_url(
cls,
file_url: str,
unzipped_file_path: Path,
verify: bool = True,
) -> None:
"""Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after
Args:
file_url (str): URL where the zip file is located
unzipped_file_path (pathlib.Path): directory and name of the extracted file
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, defaults to True)
Returns:
None
"""
# dir_id allows us to evade race conditions on parallel ETLs
dir_id = uuid.uuid4()
zip_download_path = (
settings.DATA_PATH
/ "tmp"
/ "downloads"
/ f"{dir_id}"
/ "download.zip"
)
zip_file_path = Downloader.download_file_from_url(
file_url=file_url,
download_file_name=zip_download_path,
verify=verify,
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(unzipped_file_path)
# cleanup temporary file and directory
shutil.rmtree(zip_download_path.parent)
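A hedged usage sketch of the downloader's two class methods; the URLs and destination paths here are hypothetical.

# Illustrative only: URLs and destination paths are hypothetical.
from pathlib import Path

from data_pipeline.etl.downloader import Downloader

# Fetch a single file; parent directories are created as needed.
Downloader.download_file_from_url(
    file_url="https://example.com/data/US_A.CSV",
    download_file_name=Path("data/sources/example/US_A.CSV"),
)

# Fetch a zip archive and unpack it into a directory; the temporary
# download under DATA_PATH/tmp/downloads/<uuid>/ is removed afterwards.
Downloader.download_zip_file_from_url(
    file_url="https://example.com/data/archive.zip",
    unzipped_file_path=Path("data/sources/example"),
)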

View file

@ -2,10 +2,14 @@ import concurrent.futures
import importlib
import typing
from functools import reduce
from data_pipeline.etl.score.etl_score import ScoreETL
from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
from data_pipeline.etl.score.etl_score_post import PostScoreETL
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from . import constants
@ -40,20 +44,26 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
return dataset_list
def _run_one_dataset(dataset: dict) -> None:
"""Runs one etl process."""
logger.info(f"Running ETL for {dataset['name']}")
def _get_dataset(dataset: dict) -> ExtractTransformLoad:
"""Instantiates a dataset object from a dictionary description of that object's class"""
etl_module = importlib.import_module(
f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
)
etl_class = getattr(etl_module, dataset["class_name"])
etl_instance = etl_class()
return etl_instance
def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
"""Runs one etl process."""
logger.info(f"Running ETL for {dataset['name']}")
etl_instance = _get_dataset(dataset)
# run extract
logger.debug(f"Extracting {dataset['name']}")
etl_instance.extract()
etl_instance.extract(use_cache)
# run transform
logger.debug(f"Transforming {dataset['name']}")
@ -74,11 +84,12 @@ def _run_one_dataset(dataset: dict) -> None:
logger.info(f"Finished ETL for dataset {dataset['name']}")
def etl_runner(dataset_to_run: str = None) -> None:
def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
"""Runs all etl processes or a specific one
Args:
dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
use_cache (bool): Use the cached data sources if they exist rather than downloading them all from scratch
Returns:
None
@ -105,7 +116,9 @@ def etl_runner(dataset_to_run: str = None) -> None:
logger.info("Running concurrent ETL jobs")
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(_run_one_dataset, dataset=dataset)
executor.submit(
_run_one_dataset, dataset=dataset, use_cache=use_cache
)
for dataset in concurrent_datasets
}
@ -119,7 +132,50 @@ def etl_runner(dataset_to_run: str = None) -> None:
if high_memory_datasets:
logger.info("Running high-memory ETL jobs")
for dataset in high_memory_datasets:
_run_one_dataset(dataset=dataset)
_run_one_dataset(dataset=dataset, use_cache=use_cache)
def get_data_sources(dataset_to_run: str = None) -> [DataSource]:
dataset_list = _get_datasets_to_run(dataset_to_run)
sources = []
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
sources.append(etl_instance.get_data_sources())
sources = reduce(
list.__add__, sources
) # flatten the list of lists into a single list
return sources
def extract_data_sources(
dataset_to_run: str = None, use_cache: bool = False
) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Extracting data set for {etl_instance.__class__.__name__}"
)
etl_instance.extract(use_cache)
def clear_data_source_cache(dataset_to_run: str = None) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Clearing data set cache for {etl_instance.__class__.__name__}"
)
etl_instance.clear_data_source_cache()
def score_generate() -> None:
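Taken together, the runner-level helpers compose along these lines; the dataset name below is illustrative, while the functions are the ones defined in this module.

# Sketch of composing the helpers defined in this module.
from data_pipeline.etl.runner import etl_runner
from data_pipeline.etl.runner import extract_data_sources
from data_pipeline.etl.runner import get_data_sources

# Inspect which remote sources a single ETL depends on.
for source in get_data_sources("doe_energy_burden"):
    print(source)

# Warm the cache for that ETL without running transform/load.
extract_data_sources("doe_energy_burden", use_cache=False)

# Later runs reuse the cached sources instead of re-downloading them.
etl_runner("doe_energy_burden", use_cache=True)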

View file

@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
from data_pipeline.score import field_names
from data_pipeline.score.score_runner import ScoreRunner
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of running the ETLs
def extract(self, use_cached_data_sources: bool = False) -> None:
# EJSCreen csv Load
ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
self.ejscreen_df = pd.read_csv(

View file

@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_high: gpd.GeoDataFrame
self.geojson_score_usa_low: gpd.GeoDataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
def extract(self, use_cached_data_sources: bool = False) -> None:
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",

View file

@ -2,7 +2,9 @@ import json
from pathlib import Path
import numpy as np
from numpy import float64
import pandas as pd
from data_pipeline.content.schemas.download_schemas import CodebookConfig
from data_pipeline.content.schemas.download_schemas import CSVConfig
from data_pipeline.content.schemas.download_schemas import ExcelConfig
@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from numpy import float64
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.downloader import Downloader
from . import constants
@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
self.yaml_global_config_sort_by_label = "sort_by_label"
# End YAML definition constants
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the score
def _extract_counties(self, county_path: Path) -> pd.DataFrame:
logger.debug("Reading Counties CSV")
return pd.read_csv(
@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad):
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",
census_data_source=self.DATA_SOURCE,
)
super().extract(
constants.CENSUS_COUNTIES_ZIP_URL,
constants.TMP_PATH,
# TODO: we could probably add this to the data sources for this file
Downloader.download_zip_file_from_url(
constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
)
self.input_counties_df = self._extract_counties(
constants.CENSUS_COUNTIES_FILE_NAME
)

View file

@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.etl.downloader import Downloader
from data_pipeline.utils import get_module_logger
from . import constants
@ -48,7 +48,7 @@ def check_score_data_source(
# download from s3 if census_data_source is aws
if score_data_source == "aws":
logger.debug("Fetching Score Tile data from AWS S3")
download_file_from_url(
Downloader.download_file_from_url(
file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
)
else:

View file

@ -1,23 +1,36 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
"""California environmental screen
TODO: Need good description
"""
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = (
# fetch
self.calenviroscreen_ftp_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/CalEnviroScreen_4.0_2021.zip"
)
self.CALENVIROSCREEN_CSV = (
self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Definining some variable names
# input
self.calenviroscreen_source = (
self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Defining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
"calenviroscreen_percentile"
@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.calenviroscreen_ftp_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
self.CALENVIROSCREEN_FTP_URL,
self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.calenviroscreen_source, dtype={"Census Tract": "string"}
)
def transform(self) -> None:
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(
self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
)
self.df.rename(
columns={
@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad):
def load(self) -> None:
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False)

View file

@ -7,8 +7,9 @@ from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.score.etl_utils import (
compare_to_list_of_expected_state_fips_codes,
)
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -17,59 +18,74 @@ logger = get_module_logger(__name__)
class CDCLifeExpectancy(ExtractTransformLoad):
"""#TODO: create description"""
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
NAME = "cdc_life_expectancy"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
LOAD_YAML_CONFIG: bool = False
LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
STATES_MISSING_FROM_USA_FILE = ["23", "55"]
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
# USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
TRACT_INPUT_COLUMN_NAME = "Tract ID"
STATE_INPUT_COLUMN_NAME = "STATE2KX"
raw_df: pd.DataFrame
output_df: pd.DataFrame
raw_df: pd.DataFrame # result of extraction
output_df: pd.DataFrame # result of transformation
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
# For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
# input
self.usa_source = self.get_sources_path() / "US_A.CSV"
self.maine_source = self.get_sources_path() / "ME_A.CSV"
self.wisconsin_source = self.get_sources_path() / "WI_A.CSV"
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "cdc_life_expectancy"
)
# Constants for output
self.COLUMNS_TO_KEEP = [
self.COLUMNS_TO_KEEP = [ # the columns to save on output
self.GEOID_TRACT_FIELD_NAME,
field_names.LIFE_EXPECTANCY_FIELD,
]
def _download_and_prep_data(
self, file_url: str, download_file_name: pathlib.Path
) -> pd.DataFrame:
download_file_from_url(
file_url=file_url,
download_file_name=download_file_name,
verify=True,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.usa_file_url, destination=self.usa_source
),
FileDataSource(
source=self.maine_file_url, destination=self.maine_source
),
FileDataSource(
source=self.wisconsin_file_url,
destination=self.wisconsin_source,
),
]
def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame:
df = pd.read_csv(
filepath_or_buffer=download_file_name,
filepath_or_buffer=file_name,
dtype={
# The following need to remain as strings for all of their digits, not get converted to numbers.
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
all_usa_raw_df = self._download_and_prep_data(
file_url=self.USA_FILE_URL,
download_file_name=self.get_tmp_path() / "US_A.CSV",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
all_usa_raw_df = self._read_data(self.usa_source)
# Check which states are missing
states_in_life_expectancy_usa_file = list(
@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
)
logger.debug("Downloading data for Maine")
maine_raw_df = self._download_and_prep_data(
file_url=self.MAINE_FILE_URL,
download_file_name=self.get_tmp_path() / "maine.csv",
maine_raw_df = self._read_data(
self.maine_source,
)
logger.debug("Downloading data for Wisconsin")
wisconsin_raw_df = self._download_and_prep_data(
file_url=self.WISCONSIN_FILE_URL,
download_file_name=self.get_tmp_path() / "wisconsin.csv",
)
wisconsin_raw_df = self._read_data(self.wisconsin_source)
combined_df = pd.concat(
objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],

View file

@ -4,14 +4,17 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
class CDCPlacesETL(ExtractTransformLoad):
"""#TODO: Need description"""
NAME = "cdc_places"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad):
CDC_MEASURE_FIELD_NAME = "Measure"
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_PLACES_URL = (
self.cdc_places_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
)
else:
self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
# input
self.places_source = self.get_sources_path() / "census_tract.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
self.COLUMNS_TO_KEEP: typing.List[str] = [
self.GEOID_TRACT_FIELD_NAME,
@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
file_path = download_file_from_url(
file_url=self.CDC_PLACES_URL,
download_file_name=self.get_tmp_path() / "census_tract.csv",
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_places_url, destination=self.places_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=file_path,
filepath_or_buffer=self.places_source,
dtype={self.CDC_GEOID_FIELD_NAME: "string"},
low_memory=False,
)
def transform(self) -> None:
# Rename GEOID field
self.df.rename(
columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},

View file

@ -1,6 +1,8 @@
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -11,22 +13,28 @@ logger = get_module_logger(__name__)
class CDCSVIIndex(ExtractTransformLoad):
"""CDC SVI Index class ingests 2018 dataset located
here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html
Please see the README in this module for further details.
"""
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_SVI_INDEX_URL = (
self.cdc_svi_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_svi_index/SVI2018_US.csv"
)
else:
self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
# input
self.svi_source = self.get_sources_path() / "SVI2018_US.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
self.CDC_RPL_THEMES_THRESHOLD = 0.90
self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS"
self.COLUMNS_TO_KEEP = [
@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_svi_index_url, destination=self.svi_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.CDC_SVI_INDEX_URL,
filepath_or_buffer=self.svi_source,
dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
low_memory=False,
)
@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad):
)
def load(self) -> None:
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)

View file

@ -8,7 +8,8 @@ import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -20,7 +21,7 @@ class GeoFileType(Enum):
class CensusETL(ExtractTransformLoad):
SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
# SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def __init__(self):
self.shape_file_path = self.get_sources_path() / "shp"
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
file_path: Path
if file_type == GeoFileType.SHP:
file_path = Path(
self.SHP_BASE_PATH
self.shape_file_path
/ fips_code
/ f"tl_2010_{fips_code}_tract10.shp"
)
@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
return file_path
def _extract_shp(self, fips_code: str) -> None:
"""Download the SHP file for the provided FIPS code
def get_data_sources(self) -> [DataSource]:
Args:
fips_code (str): the FIPS code for the region of interest
sources = []
Returns:
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
for fips_code in self.STATE_FIPS_CODES:
# check if file exists
if not shp_file_path.is_file():
tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
unzip_file_from_url(
tract_state_url,
self.TMP_PATH,
self.DATA_PATH / "census" / "shp" / fips_code,
destination_path = self.shape_file_path / fips_code
sources.append(
ZIPDataSource(
source=tract_state_url, destination=destination_path
)
)
def extract(self) -> None:
logger.debug("Extracting census data")
for index, fips_code in enumerate(self.STATE_FIPS_CODES):
logger.debug(
f"Extracting shape for FIPS {fips_code} {index+1} of {len(self.STATE_FIPS_CODES)}"
)
self._extract_shp(fips_code)
return sources
def _transform_to_geojson(self, fips_code: str) -> None:
"""Convert the downloaded SHP file for the associated FIPS to geojson

View file

@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
else:
fips = row[0].strip()
fips_state_list.append(fips)
return fips_state_list

View file

@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_imputations import (
calculate_income_measures,
)
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad):
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs.csv"
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
self.EMPLOYMENT_FIELDS = [
@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5",
)
]
# pylint: disable=too-many-arguments
def _merge_geojson(
self,
@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad):
)
)
def extract(self) -> None:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
def extract(self, use_cached_data_sources: bool = False) -> None:
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
)
def transform(self) -> None:

View file

@ -1,10 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad):
"""
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs_2010.csv"
self.ACS_YEAR = 2010
self.ACS_TYPE = "acs5"
self.OUTPUT_PATH = (
@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
self.UNEMPLOYED_FIELDS
@ -107,13 +109,26 @@ class CensusACS2010ETL(ExtractTransformLoad):
+ self.POVERTY_FIELDS
)
# Use the method defined on CensusACSETL to reduce coding redundancy.
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source, dtype={"GEOID10_TRACT": "string"}
)
def transform(self) -> None:

View file

@ -1,14 +1,16 @@
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
/ f"census_acs_median_income_{self.ACS_YEAR}"
)
self.GEOCORR_ALL_STATES_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip"
)
self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr"
self.GEOCORR_ALL_STATES_SOURCE = (
self.GEOCORR_ALL_STATES_PATH
/ "geocorr2014_all_states_tracts_only.csv"
)
# Set constants for Geocorr MSAs data.
self.PLACE_FIELD_NAME: str = "Census Place Name"
self.COUNTY_FIELD_NAME: str = "County Name"
@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E"
+ "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area"
)
self.MSA_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "msa" / "msa_median_income.json"
)
self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)"
# Set constants for state median incomes
self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state"
self.STATE_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "state" / "state_median_income.json"
)
self.STATE_GEOID_FIELD_NAME: str = "GEOID2"
self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)"
@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.PUERTO_RICO_S3_LINK: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv"
)
self.PUERTO_RICO_ALL_STATES_SOURCE = (
self.get_sources_path() / "pr_tracts" / "pr_tracts.csv"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.MSA_MEDIAN_INCOME_URL = (
self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
self.STATE_MEDIAN_INCOME_URL = (
self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
# Constants for output
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.state_median_incomes: dict
self.pr_tracts: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.GEOCORR_ALL_STATES_URL,
destination=self.GEOCORR_ALL_STATES_PATH,
),
FileDataSource(
source=self.PUERTO_RICO_S3_LINK,
destination=self.PUERTO_RICO_ALL_STATES_SOURCE,
),
FileDataSource(
source=self.MSA_MEDIAN_INCOME_URL,
destination=self.MSA_MEDIAN_INCOME_SOURCE,
),
FileDataSource(
source=self.STATE_MEDIAN_INCOME_URL,
destination=self.STATE_MEDIAN_INCOME_SOURCE,
),
]
def _transform_geocorr(self) -> pd.DataFrame:
# Transform the geocorr data
geocorr_df = self.raw_geocorr_df
@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
)
return state_median_incomes_df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
# Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run:
@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro)
logger.debug("Starting download of 1.5MB Geocorr information.")
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "geocorr",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_geocorr_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "geocorr"
/ "geocorr2014_all_states_tracts_only.csv",
filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE,
# Skip second row, which has descriptions.
skiprows=[1],
# The following need to remain as strings for all of their digits, not get converted to numbers.
@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False,
)
logger.debug("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(
file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
)
self.pr_tracts = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "pr_tracts"
/ "pr_tracts.csv",
filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={"GEOID10_TRACT": str},
low_memory=False,
)
self.pr_tracts["State Abbreviation"] = "PR"
# Download MSA median incomes
logger.debug("Starting download of MSA median incomes.")
download = requests.get(
self.MSA_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.msa_median_incomes = json.loads(download.content)
with self.MSA_MEDIAN_INCOME_SOURCE.open() as source:
self.msa_median_incomes = json.load(source)
# Download state median incomes
logger.debug("Starting download of state median incomes.")
download_state = requests.get(
self.STATE_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.state_median_incomes = json.loads(download_state.content)
with self.STATE_MEDIAN_INCOME_SOURCE.open() as source:
self.state_median_incomes = json.load(source)
## NOTE we already have PR's MI here
def transform(self) -> None:

View file

@ -1,13 +1,14 @@
import json
from typing import List
import os
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
pd.options.mode.chained_assignment = "raise"
@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad):
+ "&for=tract:*&in=state:{}%20county:{}"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.API_URL = self.API_URL + f"&key={census_api_key}"
self.final_race_fields: List[str] = []
self.df: pd.DataFrame
self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame
def extract(self) -> None:
dfs = []
dfs_vi = []
def get_data_sources(self) -> [DataSource]:
sources = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
api_url = self.API_URL.format(
self.DECENNIAL_YEAR,
island["state_abbreviation"],
@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad):
island["fips"],
county,
)
logger.debug(f"CENSUS: Requesting {api_url}")
download = requests.get(
api_url,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
sources.append(
FileDataSource(
source=api_url,
destination=self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json",
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
dfs = []
dfs_vi = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
try:
df = json.loads(download.content)
filepath = (
self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json"
)
df = json.load(filepath.open())
except ValueError as e:
logger.error(
f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
f"Could not load content in census decennial ETL because {e}."
)
# First row is the header

View file

@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.SOURCE_URL = (
self.child_opportunity_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"child_opportunity_index/raw.zip"
)
else:
self.SOURCE_URL = (
self.child_opportunity_url = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv"
)
# input
self.child_opportunity_index_source = (
self.get_sources_path() / "raw.csv"
)
# output
# TODO: Decide about nixing this
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
self.READING_INPUT_FIELD = "ED_READING"
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.child_opportunity_url,
destination=self.get_sources_path(),
)
]
def transform(self) -> None:
raw_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "raw.csv",
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.child_opportunity_index_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
dtype={
@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad):
low_memory=False,
)
output_df = raw_df.rename(
def transform(self) -> None:
output_df = self.raw_df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD,

View file

@ -5,22 +5,35 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class DOEEnergyBurden(ExtractTransformLoad):
NAME = "doe_energy_burden"
SOURCE_URL: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
REVISED_ENERGY_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
self.doe_energy_burden_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
# input
self.doe_energy_burden_source = (
self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden"
)
@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def transform(self) -> None:
raw_df: pd.DataFrame = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.doe_energy_burden_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.doe_energy_burden_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad):
low_memory=False,
)
def transform(self) -> None:
logger.debug("Renaming columns and ensuring output format is correct")
output_df = raw_df.rename(
output_df = self.raw_df.rename(
columns={
self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
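
The `DataSource` classes imported in these files are defined in `data_pipeline/etl/datasource.py`, which is not part of this excerpt. From the call sites alone, each one records a remote `source` URL and a local `destination` under the ETL's sources directory. Purely as a reading aid, a hypothetical sketch of that shape — the `fetch` method name and the docstrings are guesses, not the committed implementation:

```python
from dataclasses import dataclass
from pathlib import Path


@dataclass
class DataSource:
    """Hypothetical base: a remote file plus the local path it should land at."""

    source: str        # remote URL
    destination: Path  # local path under data/sources/<ETL class name>/

    def fetch(self) -> None:  # method name assumed; the real API may differ
        raise NotImplementedError


@dataclass
class FileDataSource(DataSource):
    """Single file: download `source` directly to the `destination` file path."""


@dataclass
class ZIPDataSource(DataSource):
    """Archive: download `source` and unzip its contents into the `destination` directory."""
```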

View file

@ -3,6 +3,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad):
NAME = "travel_composite"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad):
TRAVEL_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.travel_composite_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
# input
# define the full path for the input CSV file
self.INPUT_SHP = (
self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
self.disadvantage_layer_shape_source = (
self.get_sources_path()
/ "DOT_Disadvantage_Layer_Final_April2022.shp"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_dot: pd.DataFrame
# Start dataset-specific vars here
## Average of Transportation Indicator Percentiles (calculated)
## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad):
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.travel_composite_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad):
- Converts to CSV
"""
# read in the unzipped shapefile from data source
# reformat it to be standard df, remove unassigned rows, and
# then rename the Census Tract column for merging
df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
df_dot = df_dot.rename(
self.df_dot = self.df_dot.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
}
).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
# Assign the final df to the class' output_df for the load method
self.output_df = df_dot
self.output_df = self.df_dot

View file

@ -1,12 +1,15 @@
from pathlib import Path
import geopandas as gpd
import pandas as pd
import geopandas as gpd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
logger = get_module_logger(__name__)
@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad):
"55",
]
# Define these for easy code completion
def __init__(self):
self.SOURCE_URL = (
# fetch
self.eamlis_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/eAMLIS export of all data.tsv.zip"
)
# input
self.eamlis_source = (
self.get_sources_path() / "eAMLIS export of all data.tsv"
)
# output
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
self.OUTPUT_PATH: Path = (
@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad):
]
self.output_df: pd.DataFrame
self.df: pd.DataFrame
def transform(self) -> None:
df = pd.read_csv(
self.get_tmp_path() / "eAMLIS export of all data.tsv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.eamlis_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.eamlis_source,
sep="\t",
low_memory=False,
)
def transform(self) -> None:
gdf = gpd.GeoDataFrame(
df,
self.df,
geometry=gpd.points_from_xy(
x=df["Longitude"],
y=df["Latitude"],
x=self.df["Longitude"],
y=self.df["Latitude"],
),
crs="epsg:4326",
)
@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad):
gdf_tracts = add_tracts_for_geometries(gdf)
gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
gdf_tracts[self.AML_BOOLEAN] = True
self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]
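
The eAMLIS change is the one ETL in this set that starts from point coordinates rather than tract identifiers: the extracted TSV is turned into a `GeoDataFrame` and then joined to tracts with the project's `add_tracts_for_geometries` helper (not reproduced here). A tiny self-contained illustration of the coordinate-to-geometry step, with made-up coordinates:

```python
import geopandas as gpd
import pandas as pd

# Two made-up mine locations, stored the way the eAMLIS TSV stores them.
df = pd.DataFrame(
    {
        "Longitude": [-81.0, -110.5],
        "Latitude": [37.8, 44.2],
    }
)

# Build point geometries from the coordinate columns; EPSG:4326 is plain lat/long.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(x=df["Longitude"], y=df["Latitude"]),
    crs="epsg:4326",
)

print(gdf.geometry.iloc[0])  # a shapely Point, ready for a spatial join to tracts
```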

View file

@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad):
INPUT_GEOID_TRACT_FIELD_NAME: str = "ID"
def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
self.EJSCREEN_CSV = (
self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv"
# fetch
self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
# input
self.ejscreen_source = (
self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen"
self.df: pd.DataFrame
self.COLUMNS_TO_KEEP = [
@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad):
field_names.UST_FIELD,
]
def extract(self) -> None:
super().extract(
self.EJSCREEN_FTP_URL,
self.get_tmp_path(),
verify=False, # EPA EJScreen end point has certificate issues often
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.ejscreen_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
self.df = pd.read_csv(
self.EJSCREEN_CSV,
self.ejscreen_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
# EJSCREEN writes the word "None" for NA data.
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
# rename ID to Tract ID
self.output_df = self.df.rename(
columns={

View file

@ -1,5 +1,6 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# Note: while we normally set these properties in `__init__`,
# we are setting them as class properties here so they can be accessed by the
# class method `ejscreen_areas_of_concern_data_exists`.
LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
EJSCREEN_AREAS_OF_CONCERN_SOURCE = (
ExtractTransformLoad.DATA_PATH
/ "sources"
/ "EJSCREENAreasOfConcernETL"
/ "ejscreen_areas_of_concerns_indicators.csv"
)
def __init__(self):
# output
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
)
@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
"""The source for this must be downloaded and saved manually. It is not publicly available"""
return []
@classmethod
def ejscreen_areas_of_concern_data_exists(cls):
"""Check whether or not the EJSCREEN areas of concern data exists.
@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
not reference this data.
"""
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file()
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file()
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE)
if self.ejscreen_areas_of_concern_data_exists():
logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
self.df = pd.read_csv(
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE,
dtype={
self.GEOID_FIELD_NAME: "string",
},

View file

@ -5,18 +5,27 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
def __init__(self):
self.DEFINITION_ALTERNATIVE_FILE_URL = (
# fetch
self.definition_alternative_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/alternative DAC definition.csv.zip"
)
# input
self.definition_alternative_source = (
self.get_sources_path() / "J40 alternative DAC definition.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
)
@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path()
/ "energy_definition_alternative_draft",
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.definition_alternative_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "energy_definition_alternative_draft"
/ "J40 alternative DAC definition.csv",
filepath_or_buffer=self.definition_alternative_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
)
def transform(self) -> None:
self.df = self.df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,8 +4,9 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
)
else:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
"http://abt-rsei.s3.amazonaws.com/microdata2019/"
"census_agg/CensusMicroTracts2019_2019_aggregated.zip"
)
# input
self.aggregated_rsei_score_source = (
self.get_sources_path()
/ "CensusMicroTracts2019_2019_aggregated.csv"
)
# output
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.aggregated_rsei_score_file_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# the column headers from the above dataset are actually a census tract's data at this point
# We will use this data structure later to specify the column names
input_columns = [
@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.NCSCORE_INPUT_FIELD,
]
unzip_file_from_url(
file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "epa_rsei",
)
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "epa_rsei"
/ "CensusMicroTracts2019_2019_aggregated.csv",
filepath_or_buffer=self.aggregated_rsei_score_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
low_memory=False,

View file

@ -5,6 +5,8 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad):
NAME = "fsf_flood_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv"
# fetch
self.flood_tract_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
)
# this is the main dataframe
self.df: pd.DataFrame
# input
self.flood_tract_source = (
self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv"
)
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad):
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
self.CLIP_PROPERTIES_COUNT = 250
self.df_fsf_flood: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.flood_tract_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv data source then rename the
# Census Tract column for merging
self.df_fsf_flood = pd.read_csv(
self.flood_tract_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad):
- Calculates share of properties at risk, left-clipping number of properties at 250
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_flood: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[
self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[
self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY
] = (
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
df_fsf_flood[
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_flood.rename(
self.output_df = self.df_fsf_flood.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,
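
Both First Street transforms (flood here, wildfire below) divide the at-risk property count by a denominator that is left-clipped at `CLIP_PROPERTIES_COUNT = 250`, so a tract with only a handful of properties cannot produce an outsized share. A small worked example of that clip — the column names and counts are illustrative:

```python
import pandas as pd

CLIP_PROPERTIES_COUNT = 250  # same lower bound the ETL uses

df = pd.DataFrame(
    {
        "count_properties": [40, 400],  # total properties per tract (made up)
        "at_risk_today": [40, 100],     # at-risk properties per tract (made up)
    }
)

# Left-clip the denominator, so the 40-property tract is treated as having 250.
clipped_denominator = df["count_properties"].clip(lower=CLIP_PROPERTIES_COUNT)
df["share_at_risk_today"] = df["at_risk_today"] / clipped_denominator

print(df["share_at_risk_today"].tolist())  # [0.16, 0.25]
```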

View file

@ -4,6 +4,8 @@ import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
NAME = "fsf_wildfire_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv"
# fetch
self.fsf_fire_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
)
# input
self.fsf_fire_source = (
self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_fsf_fire: pd.DataFrame
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag"
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag"
self.CLIP_PROPERTIES_COUNT = 250
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.fsf_fire_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_fsf_fire = pd.read_csv(
self.fsf_fire_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad):
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_fire: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[
self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[
self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS
] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_fire.rename(
self.output_df = self.df_fsf_fire.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS,

View file

@ -3,17 +3,33 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class GeoCorrETL(ExtractTransformLoad):
NAME = "geocorr"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.geocorr_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip"
)
# input
self.geocorr_source = (
self.get_sources_path() / "geocorr_urban_rural.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"
# Need to change hyperlink to S3
@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad):
# The source data for this notebook was downloaded from GeoCorr;
# the instructions for generating the source data is here:
# https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787
self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
# self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.COLUMNS_TO_KEEP = [
@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.geocorr_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv",
filepath_or_buffer=self.geocorr_source,
dtype={
self.GEOCORR_GEOID_FIELD_NAME: "string",
},

View file

@ -3,12 +3,16 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HistoricRedliningETL(ExtractTransformLoad):
NAME = "historic_redlining"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
EXPECTED_MISSING_STATES = [
"10",
@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad):
]
PUERTO_RICO_EXPECTED_IN_DATA = False
ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"
self.HISTORIC_REDLINING_FILE_PATH = (
self.get_tmp_path() / "HRS_2010.xlsx"
)
# fetch
self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
# input
self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx"
self.REDLINING_SCALAR = "Tract-level redlining score"
@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad):
self.GEOID_TRACT_FIELD_NAME,
self.REDLINING_SCALAR,
]
self.df: pd.DataFrame
self.historic_redlining_data: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.hrs_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.historic_redlining_data = pd.read_excel(self.hrs_source)
def transform(self) -> None:
# this is obviously temporary
historic_redlining_data = pd.read_excel(
self.HISTORIC_REDLINING_FILE_PATH
self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data = historic_redlining_data.rename(
self.historic_redlining_data = self.historic_redlining_data.rename(
columns={"HRS2010": self.REDLINING_SCALAR}
)
logger.debug(f"{historic_redlining_data.columns}")
logger.debug(f"{self.historic_redlining_data.columns}")
# Calculate lots of different score thresholds for convenience
for threshold in [3.25, 3.5, 3.75]:
historic_redlining_data[
self.historic_redlining_data[
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
] = (historic_redlining_data[self.REDLINING_SCALAR] >= threshold)
] = (
self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold
)
## NOTE We add to columns to keep here
self.COLUMNS_TO_KEEP.append(
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
)
self.output_df = historic_redlining_data
self.output_df = self.historic_redlining_data

View file

@ -1,8 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from pandas.errors import EmptyDataError
logger = get_module_logger(__name__)
@ -10,36 +11,46 @@ logger = get_module_logger(__name__)
class HousingTransportationETL(ExtractTransformLoad):
def __init__(self):
self.HOUSING_FTP_URL = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "housing_and_transportation_index"
)
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
housing_url = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
sources = []
for fips in get_state_fips_codes(self.DATA_PATH):
sources.append(
ZIPDataSource(
source=f"{housing_url}{fips}",
destination=self.get_sources_path(),
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Download each state / territory individually
dfs = []
zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
for fips in get_state_fips_codes(self.DATA_PATH):
logger.debug(
f"Downloading housing data for state/territory with FIPS code {fips}"
)
unzip_file_from_url(
f"{self.HOUSING_FTP_URL}{fips}",
self.get_tmp_path(),
zip_file_dir,
)
# New file name:
tmp_csv_file_path = (
zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv"
csv_source = (
self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv"
)
try:
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
tmp_df = pd.read_csv(filepath_or_buffer=csv_source)
except EmptyDataError:
logger.error(
f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"

View file

@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HudHousingETL(ExtractTransformLoad):
NAME = "hud_housing"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
def __init__(self):
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HOUSING_FTP_URL = (
self.housing_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_housing/2014thru2018-140-csv.zip"
)
else:
self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
# source
# output
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.HOUSING_FTP_URL,
self.HOUSING_ZIP_FILE_DIR,
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.housing_url, destination=self.get_sources_path()
)
]
def _read_chas_table(self, file_name):
# New file name:
tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name
tmp_csv_file_path = self.get_sources_path() / "140" / file_name
tmp_df = pd.read_csv(
filepath_or_buffer=tmp_csv_file_path,
encoding="latin-1",
@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad):
return tmp_df
def transform(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
table_8 = self._read_chas_table("Table8.csv")
table_3 = self._read_chas_table("Table3.csv")
@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad):
table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME
)
def transform(self) -> None:
# Calculate share that lacks indoor plumbing or kitchen
# This is computed as
# (

View file

@ -1,7 +1,9 @@
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.utils import get_module_logger
@ -11,44 +13,51 @@ logger = get_module_logger(__name__)
class HudRecapETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
else:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
"https://opendata.arcgis.com/api/v3/datasets/"
"56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
)
self.HUD_RECAP_CSV = (
self.get_tmp_path()
# input
self.hud_recap_source = (
self.get_sources_path()
/ "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
# Definining some variable names
# Defining some variable names
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
"hud_recap_priority_community"
)
self.df: pd.DataFrame
def extract(self) -> None:
download = requests.get(
self.HUD_RECAP_CSV_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
file_contents = download.content
csv_file = open(self.HUD_RECAP_CSV, "wb")
csv_file.write(file_contents)
csv_file.close()
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.hud_recap_csv_url, destination=self.hud_recap_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})
def transform(self) -> None:
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
self.df.rename(
columns={
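
HudRecapETL is the first ETL in this diff to use `FileDataSource` rather than `ZIPDataSource`: the remote object is a plain CSV, so `destination` names the exact file to write, while the ZIP variant is handed a directory to unzip into. Side by side, assuming the constructors simply record the URL and target path (the URLs and paths below are illustrative):

```python
from pathlib import Path

from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.datasource import ZIPDataSource

sources_dir = Path("data/sources/SomeETL")  # illustrative sources directory

# Plain file: the destination is the file itself.
csv_source = FileDataSource(
    source="https://example.com/data.csv",
    destination=sources_dir / "data.csv",
)

# Archive: the destination is the directory the archive is unzipped into.
zip_source = ZIPDataSource(
    source="https://example.com/data.csv.zip",
    destination=sources_dir,
)
```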

View file

@ -2,6 +2,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -10,16 +12,25 @@ logger = get_module_logger(__name__)
class MappingForEJETL(ExtractTransformLoad):
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
self.MAPPING_FOR_EJ_VA_URL = (
# fetch
self.mapping_for_ej_va_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
)
self.MAPPING_FOR_EJ_CO_URL = (
self.mapping_for_ej_co_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
)
self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"
# input
self.va_shp_file_source = (
self.get_sources_path() / "mej_virginia_7_1.shp"
)
self.co_shp_file_source = (
self.get_sources_path() / "mej_colorado_final.shp"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
# Defining variables
self.COLUMNS_TO_KEEP = [
@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.MAPPING_FOR_EJ_VA_URL,
self.get_tmp_path(),
)
super().extract(
self.MAPPING_FOR_EJ_CO_URL,
self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.mapping_for_ej_va_url,
destination=self.get_sources_path(),
),
ZIPDataSource(
source=self.mapping_for_ej_co_url,
destination=self.get_sources_path(),
),
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
# Join (here, it's just concatenating) the two dataframes from
# CO and VA
self.df = pd.concat(
[
gpd.read_file(self.VA_SHP_FILE_PATH),
gpd.read_file(self.CO_SHP_FILE_PATH),
gpd.read_file(self.va_shp_file_source),
gpd.read_file(self.co_shp_file_source),
]
)
def transform(self) -> None:
# Fill Census tract to get it to be 11 digits, incl. leading 0s
# Note that VA and CO should never have leading 0s, so this isn't
# strictly necessary, but if in the future, there are more states

View file

@ -3,8 +3,9 @@ import pathlib
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad):
Information on the mapping of this data to census tracts is available at
https://github.com/americanpanorama/Census_HOLC_Research.
"""
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"mapping_inequality/holc_tract_lookup.csv"
)
else:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
"https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
"main/2010_Census_Tracts/holc_tract_lookup.csv"
)
self.MAPPING_INEQUALITY_CSV = (
self.get_tmp_path() / "holc_tract_lookup.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
self.HOLC_MANUAL_MAPPING_CSV_PATH = (
# input
self.mapping_inequality_source = (
self.get_sources_path() / "holc_tract_lookup.csv"
)
self.holc_manual_mapping_source = (  # here be dragons: this file is pulled from a different place than most
pathlib.Path(__file__).parent
/ "data"
/ "holc_grades_manually_mapped.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
# Some input field names. From documentation: 'Census Tracts were intersected
# with HOLC Polygons. Census information can be joined via the "geoid" field.
# There are two field "holc_prop" and "tract_prop" which give the proportion
@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.holc_manually_mapped_df: pd.DataFrame
def extract(self) -> None:
download_file_from_url(
file_url=self.MAPPING_INEQUALITY_CSV_URL,
download_file_name=self.MAPPING_INEQUALITY_CSV,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.mapping_inequality_csv_url,
destination=self.mapping_inequality_source,
)
]
def transform(self) -> None:
df: pd.DataFrame = pd.read_csv(
self.MAPPING_INEQUALITY_CSV,
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.mapping_inequality_source,
dtype={self.TRACT_INPUT_FIELD: "string"},
low_memory=False,
)
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
self.holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.holc_manual_mapping_source,
low_memory=False,
)
def transform(self) -> None:
# rename Tract ID
df.rename(
self.df.rename(
columns={
self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
},
@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad):
# Keep the first character, which is the HOLC grade (A, B, C, D).
# TODO: investigate why this dataframe triggers these pylint errors.
# pylint: disable=unsupported-assignment-operation, unsubscriptable-object
df[self.HOLC_GRADE_DERIVED_FIELD] = df[
self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[
self.HOLC_GRADE_AND_ID_FIELD
].str[0:1]
# Remove nonsense when the field has no grade or invalid grades.
valid_grades = ["A", "B", "C", "D"]
df.loc[
self.df.loc[
# pylint: disable=unsubscriptable-object
~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
self.HOLC_GRADE_DERIVED_FIELD,
] = None
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
low_memory=False,
)
# Join on the existing data
merged_df = df.merge(
right=holc_manually_mapped_df,
merged_df = self.df.merge(
right=self.holc_manually_mapped_df,
on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
how="left",
)
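
MappingInequality is a useful edge case for the new caching scheme: only the HOLC tract lookup is a remote `FileDataSource`, while `holc_grades_manually_mapped.csv` ships inside the repository next to the ETL module and therefore never appears in `get_data_sources()`. A hypothetical skeleton of that split (class, URL, and file names are illustrative):

```python
import pathlib

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource


class ExampleMixedSourcesETL(ExtractTransformLoad):
    """Hypothetical ETL mixing a cached remote file with a repo-local file."""

    def __init__(self):
        # fetch: remote lookup table, downloaded/cached via the DataSource machinery
        self.lookup_url = "https://example.com/lookup.csv"

        # input: the cached copy of the remote file...
        self.lookup_source = self.get_sources_path() / "lookup.csv"
        # ...plus a reference CSV checked into the repo, which needs no DataSource
        self.manual_fixes_source = (
            pathlib.Path(__file__).parent / "data" / "manual_fixes.csv"
        )

    def get_data_sources(self) -> [DataSource]:
        return [
            FileDataSource(
                source=self.lookup_url, destination=self.lookup_source
            )
        ]
```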

View file

@ -4,6 +4,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MARYLAND_EJSCREEN_URL = (
# fetch
self.maryland_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
)
self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
# input
self.shape_files_source = self.get_sources_path() / "mdejscreen"
# output
self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"
self.COLUMNS_TO_KEEP = [
@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.dfs_list: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.maryland_ejscreen_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
def extract(self) -> None:
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
super().extract(
self.MARYLAND_EJSCREEN_URL,
self.get_tmp_path(),
)
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))
# Ignore counties becauses this is not the level of measurement
# Ignore counties because this is not the level of measurement
# that is consistent with our current scoring and ranking methodology.
dfs_list = [
self.dfs_list = [
gpd.read_file(f)
for f in list_of_files
if not f.endswith("CountiesEJScore.shp")
]
def transform(self) -> None:
# Set the Census tract as the index and drop the geometry column
# that produces the census tract boundaries.
# The latter is because Geopandas raises an exception if there
# are duplicate geometry columns.
# Moreover, since the unit of measurement is at the tract level
# we can consistently merge this with other datasets
dfs_list = [
self.dfs_list = [
df.set_index("Census_Tra").drop("geometry", axis=1)
for df in dfs_list
for df in self.dfs_list
]
# pylint: disable=unsubscriptable-object
self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))
# Reset index so that we no longer have the tract as our index
self.df = self.df.reset_index()

View file

@ -1,6 +1,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MICHIGAN_EJSCREEN_S3_URL = (
# fetch
self.michigan_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/michigan_ejscore_12212021.csv"
)
# input
self.michigan_ejscreen_source = (
self.get_sources_path() / "michigan_ejscore_12212021.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"
self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75
self.COLUMNS_TO_KEEP = [
@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.michigan_ejscreen_url,
destination=self.michigan_ejscreen_source,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
filepath_or_buffer=self.michigan_ejscreen_source,
dtype={"GEO_ID": "string"},
low_memory=False,
)
def transform(self) -> None:
self.df.rename(
columns={
"GEO_ID": self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,6 +4,8 @@
# pylint: disable=unsupported-assignment-operation
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
NAME = "national_risk_index"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
SOURCE_URL = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad):
AGRIVALUE_LOWER_BOUND = 408000
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.risk_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
self.risk_index_url = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
# source
self.risk_index_source = (
self.get_sources_path() / "NRI_Table_CensusTracts.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_nri: pd.DataFrame
# Start dataset-specific vars here
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad):
self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"
def extract(self) -> None:
"""Unzips NRI dataset from the FEMA data source and writes the files
to the temporary data folder for use in the transform() method
"""
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.risk_index_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
self.df_nri = pd.read_csv(
self.risk_index_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
Groups inside of that Tract
"""
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
df_nri: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
df_nri.rename(
self.df_nri.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad):
agriculture_columns = [
f"{x}_EALA"
for x in disaster_categories
if f"{x}_EALA" in list(df_nri.columns)
if f"{x}_EALA" in list(self.df_nri.columns)
]
population_columns = [
f"{x}_EALP"
for x in disaster_categories
if f"{x}_EALP" in list(df_nri.columns)
if f"{x}_EALP" in list(self.df_nri.columns)
]
buildings_columns = [
f"{x}_EALB"
for x in disaster_categories
if f"{x}_EALB" in list(df_nri.columns)
if f"{x}_EALB" in list(self.df_nri.columns)
]
disaster_population_sum_series = df_nri[population_columns].sum(axis=1)
disaster_agriculture_sum_series = df_nri[agriculture_columns].sum(
disaster_population_sum_series = self.df_nri[population_columns].sum(
axis=1
)
disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1)
disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum(
axis=1
)
disaster_buildings_sum_series = self.df_nri[buildings_columns].sum(
axis=1
)
# Population EAL Rate = Eal Valp / Population
df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
disaster_population_sum_series
/ df_nri[self.POPULATION_INPUT_FIELD_NAME]
/ self.df_nri[self.POPULATION_INPUT_FIELD_NAME]
)
# Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000)
## FORMULA ADJUSTMENT 2/17
## Because AGRIVALUE contains a lot of 0s, we are going to consider
## 90th percentile only for places that have some agrivalue at all
df_nri[
self.df_nri[
self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
] = disaster_agriculture_sum_series / df_nri[
] = disaster_agriculture_sum_series / self.df_nri[
self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME
].clip(
lower=self.AGRIVALUE_LOWER_BOUND
@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad):
## Check that this clip worked -- that the only place the value has changed is when the clip took effect
base_expectation = (
disaster_agriculture_sum_series
/ df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
)
assert (
df_nri[
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
<= self.AGRIVALUE_LOWER_BOUND
@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
)
assert (
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
).sum() > 0, "Clipping the agrivalue did nothing!"
# This produces a boolean that is True in the case of non-zero agricultural value
df_nri[self.CONTAINS_AGRIVALUE] = (
df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
self.df_nri[self.CONTAINS_AGRIVALUE] = (
self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
)
# divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)).
df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
disaster_buildings_sum_series
/ df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
)
# Round all float columns to just 10 digits.
# Note: `round` is smart enough to only apply to float columns.
df_nri = df_nri.round(10)
self.df_nri = self.df_nri.round(10)
# Assign the final df to the class' output_df for the load method
self.output_df = df_nri
self.output_df = self.df_nri
def load(self) -> None:
# Suppress scientific notation.

View file

@ -3,6 +3,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
"""ETL class for the Nature Deprived Communities dataset"""
NAME = "nlcd_nature_deprived"
SOURCE_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad):
TRACT_PERCENT_CROPLAND_FIELD_NAME: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
# fetch
self.nature_deprived_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
# source
# define the full path for the input CSV file
self.nature_deprived_source = (
self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_ncld: pd.DataFrame
# Start dataset-specific vars here
self.PERCENT_NATURAL_FIELD_NAME = "PctNatural"
self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv"
@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad):
# for area. This does indeed remove tracts from the 90th+ percentile later on
self.TRACT_ACRES_LOWER_BOUND = 35
def transform(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.nature_deprived_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
- Renames columns as needed
"""
df_ncld: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_ncld = pd.read_csv(
self.nature_deprived_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND
def transform(self) -> None:
self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
self.df_ncld[self.TRACT_ACRES_FIELD_NAME]
>= self.TRACT_ACRES_LOWER_BOUND
)
df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_ncld.rename(
self.output_df = self.df_ncld.rename(
columns={
self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME,
self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME,

View file

@ -3,9 +3,10 @@ import functools
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.poverty_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip"
)
# source
self.poverty_sources = [
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_1990_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2000_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2010_sample.csv",
]
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
# Need to change hyperlink to S3
@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.poverty_url, destination=self.get_sources_path()
)
]
def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
df = functools.reduce(
lambda df_a, df_b: pd.merge(
@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad):
return df
def extract(self) -> None:
unzipped_file_path = self.get_tmp_path()
def extract(self, use_cached_data_sources: bool = False) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/LTDB_Std_All_Sample.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=unzipped_file_path,
)
file_names = [
"ltdb_std_1990_sample.csv",
"ltdb_std_2000_sample.csv",
"ltdb_std_2010_sample.csv",
]
super().extract(
use_cached_data_sources
) # download and extract data sources
temporary_input_dfs = []
for file_name in file_names:
for file_name in self.poverty_sources:
temporary_input_df = pd.read_csv(
filepath_or_buffer=unzipped_file_path
/ f"ltdb_std_all_sample/{file_name}",
filepath_or_buffer=file_name,
dtype={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",

View file

@ -1,6 +1,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"""
def __init__(self):
- self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
- self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
+ # input
+ self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv"
+ # output
self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
self.df: gpd.GeoDataFrame
+ self.tes_state_dfs = []
+ # config
self.states = [
"al",
"az",
@@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"wy",
]
- def extract(self) -> None:
+ def get_data_sources(self) -> [DataSource]:
+ tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
+ sources = []
for state in self.states:
- super().extract(
- f"{self.TES_URL}{state}.zip.zip",
- f"{self.get_tmp_path()}/{state}",
+ sources.append(
+ ZIPDataSource(
+ source=f"{tes_url}{state}.zip.zip",
+ destination=self.get_sources_path() / state,
+ )
)
+ return sources
+ def extract(self, use_cached_data_sources: bool = False) -> None:
+ super().extract(
+ use_cached_data_sources
+ ) # download and extract data sources
+ for state in self.states:
+ self.tes_state_dfs.append(
+ gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp")
+ )
def transform(self) -> None:
- tes_state_dfs = []
- for state in self.states:
- tes_state_dfs.append(
- gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
- )
self.df = gpd.GeoDataFrame(
- pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
+ pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs
)
# rename ID to Tract ID

View file

@ -4,63 +4,57 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
+ from data_pipeline.etl.datasource import DataSource
+ from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
- from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
class TribalETL(ExtractTransformLoad):
def __init__(self):
- self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.GEOGRAPHIC_BASE_PATH = (
self.DATA_PATH / "tribal" / "geographic_data"
)
+ self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.NATIONAL_TRIBAL_GEOJSON_PATH = (
self.GEOGRAPHIC_BASE_PATH / "usa.json"
)
self.USA_TRIBAL_DF_LIST = []
- def extract(self) -> None:
- """Extract the tribal geojson zip files from Justice40 S3 data folder
+ def get_data_sources(self) -> [DataSource]:
- Returns:
- None
- """
- bia_shapefile_zip_url = (
+ national_lar_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_National_LAR_updated_20220929.zip"
)
- tsa_and_aian_geojson_zip_url = (
+ tsa_and_aian_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_TSA_and_AIAN_json.zip"
)
- alaska_geojson_url = (
+ alaska_native_villages_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/Alaska_Native_Villages_json.zip"
)
- unzip_file_from_url(
- bia_shapefile_zip_url,
- self.TMP_PATH,
- self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
- )
- unzip_file_from_url(
- tsa_and_aian_geojson_zip_url,
- self.TMP_PATH,
- self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
- )
- unzip_file_from_url(
- alaska_geojson_url,
- self.TMP_PATH,
- self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
- )
+ return [
+ ZIPDataSource(
+ national_lar_url,
+ destination=self.get_sources_path() / "bia_national_lar",
+ ),
+ ZIPDataSource(
+ source=tsa_and_aian_url,
+ destination=self.get_sources_path() / "tsa_and_aian",
+ ),
+ ZIPDataSource(
+ source=alaska_native_villages_url,
+ destination=self.get_sources_path() / "alaska_native_villages",
+ ),
+ ]
def _transform_bia_national_lar(self, path: Path) -> None:
"""Transform the Tribal BIA National Lar Geodataframe and appends it to the
@@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad):
"""
# Set the filepaths:
bia_national_lar_shapefile = (
- self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
+ self.get_sources_path() / "bia_national_lar"
)
bia_aian_supplemental_geojson = (
- self.GEOGRAPHIC_BASE_PATH
+ self.get_sources_path()
/ "tsa_and_aian"
/ "BIA_AIAN_Supplemental.json"
)
bia_tsa_geojson = (
- self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
+ self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json"
)
alaska_native_villages_geojson = (
- self.GEOGRAPHIC_BASE_PATH
+ self.get_sources_path()
/ "alaska_native_villages"
/ "AlaskaNativeVillages.gdb.geojson"
)
@@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad):
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
# note this works a little different than many of the ETLs. The file
# being written here is used again downstream, so it's placed in a
# special directory.
logger.debug("Writing national geojson file")
self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True)
usa_tribal_df.to_file(
self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
)

View file

@ -4,6 +4,7 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
@@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad):
self.census_tract_gdf: gpd.GeoDataFrame
self.tribal_gdf: gpd.GeoDataFrame
def get_data_sources(self) -> [DataSource]:
return [] # this uses already retrieved / calculated data
@staticmethod
def _create_string_from_list(series: pd.Series) -> str:
"""Helper method that creates a sorted string list (for tribal names)."""
@@ -89,7 +93,12 @@
return percentage_float
- def extract(self) -> None:
+ def extract(self, use_cached_data_sources: bool = False) -> None:
+ super().extract(
+ use_cached_data_sources
+ ) # download and extract data sources
self.census_tract_gdf = get_tract_geojson()
self.tribal_gdf = get_tribal_geojson()

View file

@ -4,9 +4,10 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
+ from data_pipeline.etl.datasource import DataSource
+ from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
- from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad):
def __init__(self):
- if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
- self.FILE_URL = (
- f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
- "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
- "all_data_reported_to_Congress_in_FY2020.geojson"
- )
- else:
- self.FILE_URL: str = (
- "https://opendata.arcgis.com/api/v3/datasets/"
- "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
- "data?format=geojson&spatialRefId=4326&where=1%3D1"
- )
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"
# Constants for output
@@ -50,17 +38,27 @@
self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
]
- self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"
+ self.fuds_source = self.get_sources_path() / "fuds.geojson"
self.raw_df: gpd.GeoDataFrame
self.output_df: pd.DataFrame
- def extract(self) -> None:
- download_file_from_url(
- file_url=self.FILE_URL,
- download_file_name=self.DOWNLOAD_FILE_NAME,
- verify=True,
- )
+ def get_data_sources(self) -> [DataSource]:
+ if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+ fuds_url = (
+ f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+ "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
+ "all_data_reported_to_Congress_in_FY2020.geojson"
+ )
+ else:
+ fuds_url: str = (
+ "https://opendata.arcgis.com/api/v3/datasets/"
+ "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
+ "data?format=geojson&spatialRefId=4326&where=1%3D1"
+ )
+ return [FileDataSource(source=fuds_url, destination=self.fuds_source)]
def transform(self) -> None:
# before we try to do any transformation, get the tract data
@@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad):
logger.debug("Loading FUDS data as GeoDataFrame for transform")
raw_df = gpd.read_file(
- filename=self.DOWNLOAD_FILE_NAME,
+ filename=self.fuds_source,
low_memory=False,
)
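`FileDataSource` appears to be the single-file counterpart of `ZIPDataSource`: one URL downloaded to one destination path. A hedged sketch (again, not the repository's actual class) of what its `fetch()` might look like:

```python
# Hedged sketch of a single-file data source; not the real FileDataSource.
from pathlib import Path

import requests


class SketchFileDataSource:
    def __init__(self, source: str, destination: Path):
        self.source = source
        self.destination = destination

    def fetch(self) -> None:
        # Write the downloaded bytes to the exact path that transform() reads.
        self.destination.parent.mkdir(parents=True, exist_ok=True)
        response = requests.get(self.source, timeout=300)
        response.raise_for_status()
        self.destination.write_bytes(response.content)
```

Because `transform()` reads `self.fuds_source` either way, cached and freshly downloaded runs go through identical code after extraction.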

View file

@@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
+ sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+ sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
- "data_pipeline.utils.requests"
+ "data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
+ "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+ ) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
- tmp_path = mock_paths[1]
+ # requests mock
def fake_get(url, *args, **kwargs):
file_path = url.split("/")[-1]
with open(
@@ -77,17 +83,30 @@
return response_mock
requests_mock.get = fake_get
# fips codes mock
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init(self, mock_etl, mock_paths):

View file

@@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
mock_paths=mock_paths,
)
df = gpd.read_file(
- etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+ etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert df.shape[0] == 30

View file

@ -5,6 +5,7 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
self.EXAMPLE_FIELD_NAME,
]
def get_data_sources(self) -> [DataSource]:
return []
def extract(self):
# Pretend to download zip from external URL, write it to CSV.
zip_file_path = (
@@ -42,11 +46,11 @@
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
- zip_ref.extractall(self.get_tmp_path())
+ zip_ref.extractall(self.get_sources_path())
def transform(self):
df: pd.DataFrame = pd.read_csv(
- self.get_tmp_path() / "input.csv",
+ self.get_sources_path() / "input.csv",
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
low_memory=False,
)

View file

@@ -124,12 +124,18 @@ class TestETL:
data. A basic version of that patching is included here for classes that can use it.
"""
data_path, tmp_path = mock_paths
+ sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+ sources_path.mkdir(parents=True, exist_ok=True)
with mock.patch(
- "data_pipeline.utils.requests"
+ "data_pipeline.etl.downloader.requests"
) as requests_mock, mock.patch(
+ "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+ ) as sources_mock, mock.patch(
"data_pipeline.etl.score.etl_utils.get_state_fips_codes"
) as mock_get_state_fips_codes:
- tmp_path = mock_paths[1]
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
zip_file_fixture_src = (
self._DATA_DIRECTORY_FOR_TEST
@@ -145,6 +151,7 @@
"rb",
) as file:
file_contents = file.read()
response_mock = requests.Response()
response_mock.status_code = 200
# pylint: disable=protected-access
@@ -154,15 +161,25 @@
mock_get_state_fips_codes.return_value = [
x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
]
# sources mock
sources_mock.return_value = sources_path
# Instantiate the ETL class.
etl = self._get_instance_of_etl_class()
# Monkey-patch the temporary directory to the one used in the test
etl.TMP_PATH = tmp_path
etl.SOURCES_PATH = data_path / "sources"
# Run the extract method.
etl.extract()
def fake_get_sources_path() -> pathlib.PosixPath:
return sources_path
mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
return etl
def test_init_base(self, mock_etl, mock_paths):
@@ -263,17 +280,12 @@
file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
"""
if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
- tmp_path = mock_paths[1]
- _ = self._setup_etl_instance_and_run_extract(
+ etl = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl,
mock_paths=mock_paths,
)
- assert (
- tmp_path
- / self._EXTRACT_TMP_FOLDER_NAME
- / self._SAMPLE_DATA_FILE_NAME
- ).exists()
+ assert (etl.get_sources_path()).exists()
def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
"""Tests the extract method.
@@ -285,8 +297,11 @@
mock_etl=mock_etl,
mock_paths=mock_paths,
)
- data_path, tmp_path = mock_paths
tmp_df = pd.read_csv(
- etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+ etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
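The test fixtures above patch `data_pipeline.etl.downloader.requests` and `ExtractTransformLoad.get_sources_path` so that `extract()` serves local fixture files instead of hitting the network. A compressed, self-contained illustration of that pattern (the helper below is hypothetical and assumes the `data_pipeline` package is importable; it is not the project's actual test code):

```python
# Hedged illustration of the mocking pattern used in these tests.
from pathlib import Path
from unittest import mock

import requests


def run_extract_against_fixtures(etl, fixture_dir: Path, sources_path: Path):
    """Run etl.extract() with downloads redirected to local fixture files."""

    def fake_get(url, *args, **kwargs):
        # Serve the fixture whose name matches the last path segment of the URL.
        response = requests.Response()
        response.status_code = 200
        # pylint: disable=protected-access
        response._content = (fixture_dir / url.split("/")[-1]).read_bytes()
        return response

    with mock.patch(
        "data_pipeline.etl.downloader.requests"
    ) as requests_mock, mock.patch.object(
        etl, "get_sources_path", return_value=sources_path
    ):
        requests_mock.get = fake_get
        etl.extract()
    return etl
```

Note that `mock.patch` and `mock.patch.object` only take effect while used as a context manager or decorator (or after an explicit `.start()`), which is why the patches here wrap the `etl.extract()` call.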

View file

@@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
mock_paths=mock_paths,
)
tmp_df = pd.read_excel(
- etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+ etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
dtype={etl.GEOID_TRACT_FIELD_NAME: str},
)
assert tmp_df.shape == (15, 5)

View file

@@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):
def test_init(self, mock_etl, mock_paths):
"""Tests that the mock NationalRiskIndexETL class instance was
- initiliazed correctly.
- Validates the following conditions:
- - self.DATA_PATH points to the "data" folder in the temp directory
- - self.TMP_PATH points to the "data/tmp" folder in the temp directory
- - self.INPUT_PATH points to the correct path in the temp directory
- - self.OUTPUT_PATH points to the correct path in the temp directory
+ initialized correctly.
"""
# setup
etl = NationalRiskIndexETL()
- data_path, tmp_path = mock_paths
- input_csv = (
- tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
- )
# validation
- assert etl.INPUT_CSV == input_csv
assert etl.GEOID_FIELD_NAME == "GEOID10"
assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
assert etl.NAME == "national_risk_index"