Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merge comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
Author: Travis Newby, 2023-03-03 12:26:24 -06:00 (committed by GitHub)
Parent commit: 4d9c1dd11e
Commit: 6f39033dde
52 changed files with 1787 additions and 686 deletions


@@ -92,7 +92,6 @@ If you want to run specific data tasks, you can open a terminal window, navigate
 - Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles`
 To learn more about these commands and when they should be run, refer to [Running for Local Development](#running-for-local-development).
 </details>
 ---
@@ -136,6 +135,9 @@ Once you've downloaded the census data, run the following commands in order
 Many commands have options. For example, you can run a single dataset with `etl-run` by passing the command line parameter `-d name-of-dataset-to-run`. Please use the `--help` option to find out more.
+> :bulb: **NOTE**
+> One important command line option is enabling cached data sources. Pass the command line parameter `-u` to many commands (e.g. `etl-run`) to use locally cached data sources within the ETL portion of the pipeline. This will ensure that you don't download many GB of data with each run of the data pipeline.
 ## How Scoring Works
 Scores are generated by running the `score-run` command via Poetry or Docker. This command executes [`data_pipeline/etl/score/etl_score.py`](data_pipeline/etl/score/etl_score.py). During execution,


@@ -7,6 +7,9 @@ from data_pipeline.etl.runner import etl_runner
 from data_pipeline.etl.runner import score_generate
 from data_pipeline.etl.runner import score_geo
 from data_pipeline.etl.runner import score_post
+from data_pipeline.etl.runner import get_data_sources
+from data_pipeline.etl.runner import extract_data_sources as extract_ds
+from data_pipeline.etl.runner import clear_data_source_cache as clear_ds_cache
 from data_pipeline.etl.sources.census.etl_utils import check_census_data_source
 from data_pipeline.etl.sources.census.etl_utils import (
     reset_data_directories as census_reset,
@@ -79,7 +82,14 @@ def data_cleanup():
     is_flag=True,
     help="Upload to AWS S3 a zipped archive of the census data.",
 )
-def census_data_download(zip_compress):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def census_data_download(zip_compress, use_cache):
     """CLI command to download all census shape files from the Census FTP and extract the geojson
     to generate national and by state Census Block Group CSVs"""
     log_title("Download Census Data ")
@@ -88,7 +98,7 @@ def census_data_download(zip_compress):
     census_reset(data_path)
     log_info("Downloading census data")
-    etl_runner("census")
+    etl_runner("census", use_cache)
     if zip_compress:
         log_info("Zipping census data")
@@ -129,7 +139,14 @@ def pull_census_data(data_source: str):
     type=str,
     help=dataset_cli_help,
 )
-def etl_run(dataset: str):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def etl_run(dataset: str, use_cache: bool):
     """Run a specific or all ETL processes
     Args:
@@ -141,7 +158,7 @@ def etl_run(dataset: str):
     log_title("Run ETL")
     log_info("Running dataset(s)")
-    etl_runner(dataset)
+    etl_runner(dataset, use_cache)
     log_goodbye()
     sys.exit()
@@ -167,7 +184,14 @@ def score_run():
 @cli.command(
     help="Run ETL + Score Generation",
 )
-def score_full_run():
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def score_full_run(use_cache: bool):
     """CLI command to run ETL and generate the score in one command"""
     log_title("Score Full Run", "Run ETL and Generate Score (no tiles)")
@@ -177,7 +201,7 @@ def score_full_run():
     temp_folder_cleanup()
     log_info("Running all ETLs")
-    etl_runner()
+    etl_runner(use_cache=use_cache)
     log_info("Generating score")
     score_generate()
@@ -297,7 +321,14 @@ def generate_map_tiles(generate_tribal_layer):
     type=str,
     help=dataset_cli_help,
 )
-def data_full_run(check: bool, data_source: str):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def data_full_run(check: bool, data_source: str, use_cache: bool):
     """CLI command to run ETL, score, JSON combine and generate tiles in one command
     Args:
@@ -330,10 +361,10 @@ def data_full_run(check: bool, data_source: str):
     if data_source == "local":
         log_info("Downloading census data")
-        etl_runner("census")
+        etl_runner("census", use_cache)
         log_info("Running all ETLs")
-        etl_runner()
+        etl_runner(use_cache=use_cache)
         log_info("Generating score")
         score_generate()
@@ -357,6 +388,103 @@ def data_full_run(check: bool, data_source: str):
     sys.exit()
@cli.command(
help="Print data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def print_data_sources(dataset: str):
"""Print data sources for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module to be run (optional)
Returns:
None
"""
log_title("Print ETL Datasources")
log_info("Retrieving dataset(s)")
sources = get_data_sources(dataset)
log_info(f"Discovered {len(sources)} files")
for s in sources:
log_info(s)
log_goodbye()
sys.exit()
@cli.command(
help="Fetch data sources for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
@click.option(
"-u",
"--use-cache",
is_flag=True,
default=False,
help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
)
def extract_data_sources(dataset: str, use_cache: bool):
"""Extract and cache data source(s) for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose data sources you wish to fetch
use_cache (bool): Use this flag if you wish to use the cached data sources (if they exist)
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Fetching data source(s)")
extract_ds(dataset, use_cache)
log_goodbye()
sys.exit()
@cli.command(
help="Clear data source cache for all ETL processes (or a specific one)",
)
@click.option(
"-d",
"--dataset",
required=False,
type=str,
help=dataset_cli_help,
)
def clear_data_source_cache(dataset: str):
"""Clear data source(s) cache for all ETL processes (or a specific one)
Args:
dataset (str): Name of the ETL module whose cache you wish to clear
Returns:
None
"""
log_title("Fetch ETL Datasources")
log_info("Clear data source cache")
clear_ds_cache(dataset)
log_goodbye()
sys.exit()
def log_title(title: str, subtitle: str = None):
    """Logs a title in our fancy title format"""
    logger.info("-" * LOG_LINE_WIDTH)


@@ -2,7 +2,9 @@ import enum
 import pathlib
 import sys
 import typing
+import shutil
 from typing import Optional
+from abc import ABC, abstractmethod
 import pandas as pd
 from data_pipeline.config import settings
@@ -13,7 +15,7 @@ from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import remove_all_from_dir
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
 logger = get_module_logger(__name__)
@@ -25,7 +27,7 @@ class ValidGeoLevel(enum.Enum):
     CENSUS_BLOCK_GROUP = enum.auto()
-class ExtractTransformLoad:
+class ExtractTransformLoad(ABC):
     """
     A class used to instantiate an ETL object to retrieve and process data from
     datasets.
@@ -45,6 +47,7 @@ class ExtractTransformLoad:
     # Directories
     DATA_PATH: pathlib.Path = settings.DATA_PATH
     TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
+    SOURCES_PATH: pathlib.Path = DATA_PATH / "sources"
     CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
     DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
     DATASET_CONFIG: Optional[dict] = None
@@ -177,45 +180,60 @@ class ExtractTransformLoad:
         output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
         return output_file_path
-    def get_tmp_path(self) -> pathlib.Path:
+    def get_sources_path(self) -> pathlib.Path:
-        """Returns the temporary path associated with this ETL class."""
+        """Returns the sources path associated with this ETL class. The sources path
-        # Note: the temporary path will be defined on `init`, because it uses the class
+        is the home for cached data sources used by this ETL."""
-        # of the instance which is often a child class.
-        tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
+        sources_path = self.SOURCES_PATH / str(self.__class__.__name__)
         # Create directory if it doesn't exist
-        tmp_path.mkdir(parents=True, exist_ok=True)
+        sources_path.mkdir(parents=True, exist_ok=True)
-        return tmp_path
+        return sources_path
-    def extract(
+    @abstractmethod
-        self,
+    def get_data_sources(self) -> [DataSource]:
-        source_url: str = None,
+        pass
-        extract_path: pathlib.Path = None,
-        verify: Optional[bool] = True,
-    ) -> None:
-        """Extract the data from a remote source. By default it provides code
-        to get the file from a source url, unzips it and stores it on an
-        extract_path."""
-        if source_url is None:
+    def _fetch(self) -> None:
-            source_url = self.SOURCE_URL
+        """Fetch all data sources for this ETL. When data sources are fetched, they
+        are stored in a cache directory for consistency between runs."""
+        for ds in self.get_data_sources():
+            ds.fetch()
-        if extract_path is None:
+    def clear_data_source_cache(self) -> None:
-            extract_path = self.get_tmp_path()
+        """Clears the cache for this ETLs data source(s)"""
+        shutil.rmtree(self.get_sources_path())
-        unzip_file_from_url(
+    def extract(self, use_cached_data_sources: bool = False) -> None:
-            file_url=source_url,
+        """Extract (download) data from a remote source, and validate
-            download_path=self.get_tmp_path(),
+        that data. By default, this method fetches data from the set of
-            unzipped_file_path=extract_path,
+        data sources returned by get_data_sources.
-            verify=verify,
+        If use_cached_data_sources is true, this method attempts to use cached data
+        rather than re-downloading from the original source. The cache algorithm is very
+        simple: it just looks to see if the directory has any contents. If so, it uses
+        that content. If not, it downloads all data sources.
+        Subclasses should call super() before performing any work if they wish to take
+        advantage of the automatic downloading and caching ability of this superclass.
+        """
+        if use_cached_data_sources and any(self.get_sources_path().iterdir()):
+            logger.info(
+                f"Using cached data sources for {self.__class__.__name__}"
+            )
-        )
+        else:
+            self.clear_data_source_cache()
+            self._fetch()
+        # the rest of the work should be performed here
+    @abstractmethod
     def transform(self) -> None:
         """Transform the data extracted into a format that can be consumed by the
         score generator"""
+        pass
-        raise NotImplementedError
     def validate(self) -> None:
         """Validates the output.
@@ -380,3 +398,14 @@ class ExtractTransformLoad:
     def cleanup(self) -> None:
         """Clears out any files stored in the TMP folder"""
         remove_all_from_dir(self.get_tmp_path())
+    def get_tmp_path(self) -> pathlib.Path:
+        """Returns the temporary path associated with this ETL class."""
+        # Note: the temporary path will be defined on `init`, because it uses the class
+        # of the instance which is often a child class.
+        tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
+        # Create directory if it doesn't exist
+        tmp_path.mkdir(parents=True, exist_ok=True)
+        return tmp_path
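For orientation, here is a minimal sketch of what a concrete ETL looks like against this new base class; the class name, URL and file name are hypothetical, but the pattern (declare sources, call `super().extract()`, read from `get_sources_path()`) mirrors the subclasses changed later in this commit.

```python
# Hypothetical subclass illustrating the new caching hooks; the URL and
# file names are placeholders, not a real data source.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource, ZIPDataSource


class ExampleETL(ExtractTransformLoad):
    def __init__(self):
        self.example_url = "https://example.com/example_data.zip"
        self.example_source = self.get_sources_path() / "example_data.csv"
        self.df: pd.DataFrame

    def get_data_sources(self) -> [DataSource]:
        # Declare every remote file this ETL needs.
        return [
            ZIPDataSource(
                source=self.example_url, destination=self.get_sources_path()
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # Downloads the sources (or reuses the cache), then reads from the cache.
        super().extract(use_cached_data_sources)
        self.df = pd.read_csv(self.example_source)

    def transform(self) -> None:
        # Transformation is ETL-specific; nothing to do in this sketch.
        pass
```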


@@ -0,0 +1,124 @@
"""This module defines a set of classes that can be used to fetch data
from a remote source. They are meant to be used in conjuction with ETLs
or other classes that require downloading data.
There are three types of data sources defined in this file:
FileDataSource meant to be used when you have a single file to
retrive from a remote location and save to a destination.
ZipDataSource used when you need to fetch and unzip a file, and save
the contents of that file to a destination.
CensusDataSource used to download data from the Census API and store
the contents to a destination.
DataSource subclasses must implement the fetch method to define how
they will reach out to a remote source, download the data, and save
that data to the destination.
"""
from pathlib import Path
from typing import List
from dataclasses import dataclass
from abc import ABC, abstractmethod
from data_pipeline.etl.downloader import Downloader
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
@dataclass
class DataSource(ABC):
"""A data source represents any source of data that is fetchable
from a remote location.
Attributes:
source : str
the location of this data source, as a url
destination : Path
the Path where the data source should be saved locally upon being fetched
"""
source: str
destination: Path
@abstractmethod
def fetch(self) -> None:
pass
@dataclass
class FileDataSource(DataSource):
"""A data source representing a single file.
This single file will be fetched from the source and saved to a single
destination.
"""
def fetch(self) -> None:
"""Fetches a single file from a source and saves it to a destination."""
self.destination.parent.mkdir(parents=True, exist_ok=True)
Downloader.download_file_from_url(
file_url=self.source,
download_file_name=self.destination,
verify=True,
)
def __str__(self):
return f"File {self.source}"
@dataclass
class ZIPDataSource(DataSource):
"""A data source representing ZIP files.
Zip files will be fetched and placed in the destination folder, then unzipped.
"""
def fetch(self) -> None:
self.destination.mkdir(parents=True, exist_ok=True)
Downloader.download_zip_file_from_url(
file_url=self.source,
unzipped_file_path=self.destination,
verify=True,
)
def __str__(self):
return f"Zip {self.source}"
@dataclass
class CensusDataSource(DataSource):
"""A data source representing census data.
Data will be fetched using the Census API and saved to the destination file. Source is ignored.
"""
acs_year: int
variables: List[str]
tract_output_field_name: str
data_path_for_fips_codes: Path
acs_type: str
def fetch(self) -> None:
df = retrieve_census_acs_data(
acs_year=self.acs_year,
variables=self.variables,
tract_output_field_name=self.tract_output_field_name,
data_path_for_fips_codes=self.data_path_for_fips_codes,
acs_type=self.acs_type,
)
self.destination.parent.mkdir(parents=True, exist_ok=True)
# Write CSV representation of census data
df.to_csv(self.destination, index=False)
def __str__(self):
return f"Census {self.acs_type}, {self.acs_year}"


@@ -0,0 +1,95 @@
import uuid
import urllib3
import requests
import zipfile
import shutil
from pathlib import Path
from data_pipeline.config import settings
class Downloader:
"""A simple class to encapsulate the download capabilities of the application"""
@classmethod
def download_file_from_url(
cls,
file_url: str,
download_file_name: Path,
verify: bool = True,
) -> str:
"""Downloads a file from a remote URL location and returns the file location.
Args:
file_url (str): URL where the zip file is located
download_file_name (pathlib.Path): file path where the file will be downloaded (called downloaded.zip by default)
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, default to False)
Returns:
None
"""
# disable https warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
download_file_name.parent.mkdir(parents=True, exist_ok=True)
response = requests.get(
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
)
if response.status_code == 200:
file_contents = response.content
else:
raise Exception(
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
)
# Write the contents to disk.
file = open(download_file_name, "wb")
file.write(file_contents)
file.close()
return download_file_name
@classmethod
def download_zip_file_from_url(
cls,
file_url: str,
unzipped_file_path: Path,
verify: bool = True,
) -> None:
"""Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after
Args:
file_url (str): URL where the zip file is located
unzipped_file_path (pathlib.Path): directory and name of the extracted file
verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
error (optional, default to False)
Returns:
None
"""
# dir_id allows us to evade race conditions on parallel ETLs
dir_id = uuid.uuid4()
zip_download_path = (
settings.DATA_PATH
/ "tmp"
/ "downloads"
/ f"{dir_id}"
/ "download.zip"
)
zip_file_path = Downloader.download_file_from_url(
file_url=file_url,
download_file_name=zip_download_path,
verify=verify,
)
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(unzipped_file_path)
# cleanup temporary file and directory
shutil.rmtree(zip_download_path.parent)
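For reference, the two class methods are intended to be called directly, without instantiating `Downloader`; a short sketch with placeholder URLs and paths:

```python
# Illustrative calls to the Downloader helper; URLs and paths are placeholders.
from pathlib import Path

from data_pipeline.etl.downloader import Downloader

# Fetch a single file to an exact location on disk.
Downloader.download_file_from_url(
    file_url="https://example.com/data.csv",
    download_file_name=Path("data/sources/ExampleETL/data.csv"),
)

# Fetch a zip archive and unpack its contents into a directory.
Downloader.download_zip_file_from_url(
    file_url="https://example.com/archive.zip",
    unzipped_file_path=Path("data/sources/ExampleETL"),
)
```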


@@ -2,10 +2,14 @@ import concurrent.futures
 import importlib
 import typing
+from functools import reduce
 from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
 from . import constants
@@ -40,20 +44,26 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
     return dataset_list
-def _run_one_dataset(dataset: dict) -> None:
+def _get_dataset(dataset: dict) -> ExtractTransformLoad:
-    """Runs one etl process."""
+    """Instantiates a dataset object from a dictionary description of that object's class"""
-    logger.info(f"Running ETL for {dataset['name']}")
     etl_module = importlib.import_module(
         f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
     )
     etl_class = getattr(etl_module, dataset["class_name"])
     etl_instance = etl_class()
+    return etl_instance
+def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
+    """Runs one etl process."""
+    logger.info(f"Running ETL for {dataset['name']}")
+    etl_instance = _get_dataset(dataset)
     # run extract
     logger.debug(f"Extracting {dataset['name']}")
-    etl_instance.extract()
+    etl_instance.extract(use_cache)
     # run transform
     logger.debug(f"Transforming {dataset['name']}")
@@ -74,11 +84,12 @@ def _run_one_dataset(dataset: dict) -> None:
     logger.info(f"Finished ETL for dataset {dataset['name']}")
-def etl_runner(dataset_to_run: str = None) -> None:
+def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
     """Runs all etl processes or a specific one
     Args:
         dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
+        use_cache (bool): Use the cached data sources if they exist rather than downloading them all from scratch
     Returns:
         None
@@ -105,7 +116,9 @@ def etl_runner(dataset_to_run: str = None) -> None:
     logger.info("Running concurrent ETL jobs")
     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = {
-            executor.submit(_run_one_dataset, dataset=dataset)
+            executor.submit(
+                _run_one_dataset, dataset=dataset, use_cache=use_cache
+            )
             for dataset in concurrent_datasets
         }
@@ -119,7 +132,50 @@ def etl_runner(dataset_to_run: str = None) -> None:
     if high_memory_datasets:
         logger.info("Running high-memory ETL jobs")
         for dataset in high_memory_datasets:
-            _run_one_dataset(dataset=dataset)
+            _run_one_dataset(dataset=dataset, use_cache=use_cache)
def get_data_sources(dataset_to_run: str = None) -> [DataSource]:
dataset_list = _get_datasets_to_run(dataset_to_run)
sources = []
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
sources.append(etl_instance.get_data_sources())
sources = reduce(
list.__add__, sources
) # flatten the list of lists into a single list
return sources
def extract_data_sources(
dataset_to_run: str = None, use_cache: bool = False
) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Extracting data set for {etl_instance.__class__.__name__}"
)
etl_instance.extract(use_cache)
def clear_data_source_cache(dataset_to_run: str = None) -> None:
dataset_list = _get_datasets_to_run(dataset_to_run)
for dataset in dataset_list:
etl_instance = _get_dataset(dataset)
logger.info(
f"Clearing data set cache for {etl_instance.__class__.__name__}"
)
etl_instance.clear_data_source_cache()
def score_generate() -> None:
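The three new runner helpers mirror the new CLI commands; a sketch of calling them directly (the dataset name is an illustrative placeholder):

```python
# Sketch of the new runner helpers; "census_acs" is an illustrative dataset name.
from data_pipeline.etl.runner import (
    clear_data_source_cache,
    etl_runner,
    extract_data_sources,
    get_data_sources,
)

# List every remote data source the ETLs would fetch.
for source in get_data_sources():
    print(source)

# Pre-download (cache) the sources for one dataset, then run it from the cache.
extract_data_sources("census_acs", use_cache=False)
etl_runner("census_acs", use_cache=True)

# Remove the cached copies when they are no longer needed.
clear_data_source_cache("census_acs")
```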


@@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
 from data_pipeline.score import field_names
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
 logger = get_module_logger(__name__)
@@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):
         self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of running the ETLs
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         # EJSCreen csv Load
         ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
         self.ejscreen_df = pd.read_csv(


@@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
+from data_pipeline.etl.datasource import DataSource
 logger = get_module_logger(__name__)
@@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
         self.geojson_score_usa_high: gpd.GeoDataFrame
         self.geojson_score_usa_low: gpd.GeoDataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",


@@ -2,7 +2,9 @@ import json
 from pathlib import Path
 import numpy as np
+from numpy import float64
 import pandas as pd
 from data_pipeline.content.schemas.download_schemas import CodebookConfig
 from data_pipeline.content.schemas.download_schemas import CSVConfig
 from data_pipeline.content.schemas.download_schemas import ExcelConfig
@@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
-from numpy import float64
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.downloader import Downloader
 from . import constants
@@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
         self.yaml_global_config_sort_by_label = "sort_by_label"
         # End YAML definition constants
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the score
     def _extract_counties(self, county_path: Path) -> pd.DataFrame:
         logger.debug("Reading Counties CSV")
         return pd.read_csv(
@@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad):
         return df
-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",
             census_data_source=self.DATA_SOURCE,
         )
-        super().extract(
+        # TODO would could probably add this to the data sources for this file
-            constants.CENSUS_COUNTIES_ZIP_URL,
+        Downloader.download_zip_file_from_url(
-            constants.TMP_PATH,
+            constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
         )
         self.input_counties_df = self._extract_counties(
             constants.CENSUS_COUNTIES_FILE_NAME
         )


@@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
 from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
+from data_pipeline.etl.downloader import Downloader
 from data_pipeline.utils import get_module_logger
 from . import constants
@@ -48,7 +48,7 @@ def check_score_data_source(
     # download from s3 if census_data_source is aws
     if score_data_source == "aws":
         logger.debug("Fetching Score Tile data from AWS S3")
-        download_file_from_url(
+        Downloader.download_file_from_url(
             file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
         )
     else:


@@ -1,23 +1,36 @@
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.utils import get_module_logger
 logger = get_module_logger(__name__)
 class CalEnviroScreenETL(ExtractTransformLoad):
+    """California environmental screen
+    TODO: Need good description
+    """
     def __init__(self):
-        self.CALENVIROSCREEN_FTP_URL = (
+        # fetch
+        self.calenviroscreen_ftp_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/CalEnviroScreen_4.0_2021.zip"
         )
-        self.CALENVIROSCREEN_CSV = (
-            self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
-        )
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
-        # Definining some variable names
+        # input
+        self.calenviroscreen_source = (
+            self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv"
+        )
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
+        # Defining some variable names
         self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
         self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
             "calenviroscreen_percentile"
@@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.calenviroscreen_ftp_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         super().extract(
-            self.CALENVIROSCREEN_FTP_URL,
+            use_cached_data_sources
-            self.get_tmp_path(),
+        ) # download and extract data sources
+        self.df = pd.read_csv(
+            self.calenviroscreen_source, dtype={"Census Tract": "string"}
         )
     def transform(self) -> None:
         # Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
         # https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
         # Load comparison index (CalEnviroScreen 4)
-        self.df = pd.read_csv(
-            self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
-        )
         self.df.rename(
             columns={
@@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad):
     def load(self) -> None:
         # write nationwide csv
-        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
-        self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
+        self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False)


@@ -7,8 +7,9 @@ from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.score.etl_utils import (
     compare_to_list_of_expected_state_fips_codes,
 )
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -17,59 +18,74 @@ logger = get_module_logger(__name__)
 class CDCLifeExpectancy(ExtractTransformLoad):
+    """#TODO: create description"""
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     NAME = "cdc_life_expectancy"
-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
-    else:
-        USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
     LOAD_YAML_CONFIG: bool = False
     LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
     INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
     STATES_MISSING_FROM_USA_FILE = ["23", "55"]
-    # For some reason, LEEP does not include Maine or Wisconsin in its "All of
-    # USA" file. Load these separately.
-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
-        MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
-    else:
-        WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
-        MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
     TRACT_INPUT_COLUMN_NAME = "Tract ID"
     STATE_INPUT_COLUMN_NAME = "STATE2KX"
-    raw_df: pd.DataFrame
+    raw_df: pd.DataFrame  # result of extraction
-    output_df: pd.DataFrame
+    output_df: pd.DataFrame  # result of transformation
     def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
# For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
# input
self.usa_source = self.get_sources_path() / "US_A.CSV"
self.maine_source = self.get_sources_path() / "ME_A.CSV"
self.wisconsin_source = self.get_sources_path() / "WI_A.CSV"
# output
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "cdc_life_expectancy"
         )
-        # Constants for output
+        self.COLUMNS_TO_KEEP = [  # the columns to save on output
-        self.COLUMNS_TO_KEEP = [
             self.GEOID_TRACT_FIELD_NAME,
             field_names.LIFE_EXPECTANCY_FIELD,
         ]
-    def _download_and_prep_data(
+    def get_data_sources(self) -> [DataSource]:
-        self, file_url: str, download_file_name: pathlib.Path
+        return [
-    ) -> pd.DataFrame:
+            FileDataSource(
-        download_file_from_url(
+                source=self.usa_file_url, destination=self.usa_source
-            file_url=file_url,
+            ),
-            download_file_name=download_file_name,
+            FileDataSource(
-            verify=True,
+                source=self.maine_file_url, destination=self.maine_source
-        )
+            ),
+            FileDataSource(
+                source=self.wisconsin_file_url,
+                destination=self.wisconsin_source,
+            ),
+        ]
+    def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame:
         df = pd.read_csv(
-            filepath_or_buffer=download_file_name,
+            filepath_or_buffer=file_name,
             dtype={
                 # The following need to remain as strings for all of their digits, not get converted to numbers.
                 self.TRACT_INPUT_COLUMN_NAME: "string",
@@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
         return df
-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
-        all_usa_raw_df = self._download_and_prep_data(
+        super().extract(
-            file_url=self.USA_FILE_URL,
+            use_cached_data_sources
-            download_file_name=self.get_tmp_path() / "US_A.CSV",
+        ) # download and extract data sources
-        )
+        all_usa_raw_df = self._read_data(self.usa_source)
         # Check which states are missing
         states_in_life_expectancy_usa_file = list(
@@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
             additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
         )
-        logger.debug("Downloading data for Maine")
+        maine_raw_df = self._read_data(
-        maine_raw_df = self._download_and_prep_data(
+            self.maine_source,
-            file_url=self.MAINE_FILE_URL,
-            download_file_name=self.get_tmp_path() / "maine.csv",
         )
-        logger.debug("Downloading data for Wisconsin")
+        wisconsin_raw_df = self._read_data(self.wisconsin_source)
-        wisconsin_raw_df = self._download_and_prep_data(
-            file_url=self.WISCONSIN_FILE_URL,
-            download_file_name=self.get_tmp_path() / "wisconsin.csv",
-        )
         combined_df = pd.concat(
             objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],


@@ -4,14 +4,17 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 logger = get_module_logger(__name__)
 class CDCPlacesETL(ExtractTransformLoad):
+    """#TODO: Need description"""
     NAME = "cdc_places"
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
@@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad):
     CDC_MEASURE_FIELD_NAME = "Measure"
     def __init__(self):
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.CDC_PLACES_URL = (
+            self.cdc_places_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
             )
         else:
-            self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+            self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+        # input
+        self.places_source = self.get_sources_path() / "census_tract.csv"
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
         self.COLUMNS_TO_KEEP: typing.List[str] = [
             self.GEOID_TRACT_FIELD_NAME,
@@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
-        file_path = download_file_from_url(
+        return [
-            file_url=self.CDC_PLACES_URL,
+            FileDataSource(
-            download_file_name=self.get_tmp_path() / "census_tract.csv",
+                source=self.cdc_places_url, destination=self.places_source
-        )
+            )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
         self.df = pd.read_csv(
-            filepath_or_buffer=file_path,
+            filepath_or_buffer=self.places_source,
             dtype={self.CDC_GEOID_FIELD_NAME: "string"},
             low_memory=False,
         )
     def transform(self) -> None:
         # Rename GEOID field
         self.df.rename(
             columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},


@@ -1,6 +1,8 @@
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -11,22 +13,28 @@ logger = get_module_logger(__name__)
 class CDCSVIIndex(ExtractTransformLoad):
     """CDC SVI Index class ingests 2018 dataset located
     here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html
     Please see the README in this module for further details.
     """
     def __init__(self):
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.CDC_SVI_INDEX_URL = (
+            self.cdc_svi_index_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "cdc_svi_index/SVI2018_US.csv"
             )
         else:
-            self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+            self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+        # input
+        self.svi_source = self.get_sources_path() / "SVI2018_US.csv"
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
         self.CDC_RPL_THEMES_THRESHOLD = 0.90
         self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS"
         self.COLUMNS_TO_KEEP = [
@@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.cdc_svi_index_url, destination=self.svi_source
+            )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
         self.df = pd.read_csv(
-            filepath_or_buffer=self.CDC_SVI_INDEX_URL,
+            filepath_or_buffer=self.svi_source,
             dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
             low_memory=False,
         )
@@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad):
         )
     def load(self) -> None:
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(
             path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
         )


@@ -8,7 +8,8 @@ import geopandas as gpd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 logger = get_module_logger(__name__)
@@ -20,7 +21,7 @@ class GeoFileType(Enum):
 class CensusETL(ExtractTransformLoad):
-    SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
+    # SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
     GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
     CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
     GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
     def __init__(self):
+        self.shape_file_path = self.get_sources_path() / "shp"
         # the fips_states_2010.csv is generated from data here
         # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
         self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
         file_path: Path
         if file_type == GeoFileType.SHP:
             file_path = Path(
-                self.SHP_BASE_PATH
+                self.shape_file_path
                 / fips_code
                 / f"tl_2010_{fips_code}_tract10.shp"
             )
@@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
             file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
         return file_path
-    def _extract_shp(self, fips_code: str) -> None:
+    def get_data_sources(self) -> [DataSource]:
-        """Download the SHP file for the provided FIPS code
-        Args:
+        sources = []
-            fips_code (str): the FIPS code for the region of interest
-        Returns:
+        for fips_code in self.STATE_FIPS_CODES:
-            None
-        """
-        shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
-        # check if file exists
-        if not shp_file_path.is_file():
             tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
-            unzip_file_from_url(
+            destination_path = self.shape_file_path / fips_code
-                tract_state_url,
-                self.TMP_PATH,
+            sources.append(
-                self.DATA_PATH / "census" / "shp" / fips_code,
+                ZIPDataSource(
+                    source=tract_state_url, destination=destination_path
+                )
             )
-    def extract(self) -> None:
+        return sources
-        logger.debug("Extracting census data")
-        for index, fips_code in enumerate(self.STATE_FIPS_CODES):
-            logger.debug(
-                f"Extracting shape for FIPS {fips_code} {index+1} of {len(self.STATE_FIPS_CODES)}"
-            )
-            self._extract_shp(fips_code)
     def _transform_to_geojson(self, fips_code: str) -> None:
         """Convert the downloaded SHP file for the associated FIPS to geojson


@@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
         else:
             fips = row[0].strip()
         fips_state_list.append(fips)
     return fips_state_list


@@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census_acs.etl_imputations import (
     calculate_income_measures,
 )
-from data_pipeline.etl.sources.census_acs.etl_utils import (
-    retrieve_census_acs_data,
-)
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import CensusDataSource
 logger = get_module_logger(__name__)
@@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad):
     MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
     def __init__(self):
+        self.census_acs_source = self.get_sources_path() / "acs.csv"
         self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
         self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
         self.EMPLOYMENT_FIELDS = [
@@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad):
         self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5",
)
]
     # pylint: disable=too-many-arguments
     def _merge_geojson(
         self,
@@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad):
             )
         )
-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
-        # Define the variables to retrieve
-        variables = (
-            [
-                self.MEDIAN_INCOME_FIELD,
-                self.MEDIAN_HOUSE_VALUE_FIELD,
-            ]
-            + self.EMPLOYMENT_FIELDS
-            + self.LINGUISTIC_ISOLATION_FIELDS
-            + self.POVERTY_FIELDS
-            + self.EDUCATIONAL_FIELDS
-            + self.RE_FIELDS
-            + self.COLLEGE_ATTENDANCE_FIELDS
-            + self.AGE_INPUT_FIELDS
-        )
-        self.df = retrieve_census_acs_data(
+        super().extract(
-            acs_year=self.ACS_YEAR,
+            use_cached_data_sources
-            variables=variables,
+        ) # download and extract data sources
-            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
-            data_path_for_fips_codes=self.DATA_PATH,
+        self.df = pd.read_csv(
+            self.census_acs_source,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
         )
     def transform(self) -> None:


@@ -1,10 +1,9 @@
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.sources.census_acs.etl_utils import (
-    retrieve_census_acs_data,
-)
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import CensusDataSource
 logger = get_module_logger(__name__)
@@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad):
     """
     def __init__(self):
+        self.census_acs_source = self.get_sources_path() / "acs_2010.csv"
         self.ACS_YEAR = 2010
         self.ACS_TYPE = "acs5"
         self.OUTPUT_PATH = (
@@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
         self.df: pd.DataFrame
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
         # Define the variables to retrieve
         variables = (
             self.UNEMPLOYED_FIELDS
@@ -107,14 +109,27 @@ class CensusACS2010ETL(ExtractTransformLoad):
             + self.POVERTY_FIELDS
         )
-        # Use the method defined on CensusACSETL to reduce coding redundancy.
+        return [
-        self.df = retrieve_census_acs_data(
+            CensusDataSource(
+                source=None,
+                destination=self.census_acs_source,
                 acs_year=self.ACS_YEAR,
                 variables=variables,
                 tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
                 data_path_for_fips_codes=self.DATA_PATH,
                 acs_type=self.ACS_TYPE,
             )
+        ]
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        ) # download and extract data sources
+        self.df = pd.read_csv(
+            self.census_acs_source, dtype={"GEOID10_TRACT": "string"}
+        )
     def transform(self) -> None:
         df = self.df
View file
@ -1,14 +1,16 @@
import os
import json import json
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import requests
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
/ f"census_acs_median_income_{self.ACS_YEAR}" / f"census_acs_median_income_{self.ACS_YEAR}"
) )
self.GEOCORR_ALL_STATES_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip"
)
self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr"
self.GEOCORR_ALL_STATES_SOURCE = (
self.GEOCORR_ALL_STATES_PATH
/ "geocorr2014_all_states_tracts_only.csv"
)
# Set constants for Geocorr MSAs data. # Set constants for Geocorr MSAs data.
self.PLACE_FIELD_NAME: str = "Census Place Name" self.PLACE_FIELD_NAME: str = "Census Place Name"
self.COUNTY_FIELD_NAME: str = "County Name" self.COUNTY_FIELD_NAME: str = "County Name"
@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E" f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E"
+ "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area" + "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area"
) )
self.MSA_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "msa" / "msa_median_income.json"
)
self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)" self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)"
# Set constants for state median incomes # Set constants for state median incomes
self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state" self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state"
self.STATE_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "state" / "state_median_income.json"
)
self.STATE_GEOID_FIELD_NAME: str = "GEOID2" self.STATE_GEOID_FIELD_NAME: str = "GEOID2"
self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)" self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)"
@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.PUERTO_RICO_S3_LINK: str = ( self.PUERTO_RICO_S3_LINK: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv" settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv"
) )
self.PUERTO_RICO_ALL_STATES_SOURCE = (
self.get_sources_path() / "pr_tracts" / "pr_tracts.csv"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.MSA_MEDIAN_INCOME_URL = (
self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
self.STATE_MEDIAN_INCOME_URL = (
self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
# Constants for output # Constants for output
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference" self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.state_median_incomes: dict self.state_median_incomes: dict
self.pr_tracts: pd.DataFrame self.pr_tracts: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.GEOCORR_ALL_STATES_URL,
destination=self.GEOCORR_ALL_STATES_PATH,
),
FileDataSource(
source=self.PUERTO_RICO_S3_LINK,
destination=self.PUERTO_RICO_ALL_STATES_SOURCE,
),
FileDataSource(
source=self.MSA_MEDIAN_INCOME_URL,
destination=self.MSA_MEDIAN_INCOME_SOURCE,
),
FileDataSource(
source=self.STATE_MEDIAN_INCOME_URL,
destination=self.STATE_MEDIAN_INCOME_SOURCE,
),
]
def _transform_geocorr(self) -> pd.DataFrame: def _transform_geocorr(self) -> pd.DataFrame:
# Transform the geocorr data # Transform the geocorr data
geocorr_df = self.raw_geocorr_df geocorr_df = self.raw_geocorr_df
@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
) )
return state_median_incomes_df return state_median_incomes_df
def extract(self) -> None: def extract(self, use_cached_data_sources: bool = False) -> None:
# Load and clean GEOCORR data # Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census. # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run: # The specific query used is the following, which takes a couple of minutes to run:
@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# - Core based statistical area (CBSA) # - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro) # - CBSA Type (Metro or Micro)
logger.debug("Starting download of 1.5MB Geocorr information.") logger.debug("Starting download of 1.5MB Geocorr information.")
super().extract(
unzip_file_from_url( use_cached_data_sources
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL ) # download and extract data sources
+ "/geocorr2014_all_states_tracts_only.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "geocorr",
)
self.raw_geocorr_df = pd.read_csv( self.raw_geocorr_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE,
/ "geocorr"
/ "geocorr2014_all_states_tracts_only.csv",
# Skip second row, which has descriptions. # Skip second row, which has descriptions.
skiprows=[1], skiprows=[1],
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False, low_memory=False,
) )
logger.debug("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(
file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
)
self.pr_tracts = pd.read_csv( self.pr_tracts = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE,
/ "pr_tracts"
/ "pr_tracts.csv",
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={"GEOID10_TRACT": str}, dtype={"GEOID10_TRACT": str},
low_memory=False, low_memory=False,
) )
self.pr_tracts["State Abbreviation"] = "PR" self.pr_tracts["State Abbreviation"] = "PR"
# Download MSA median incomes with self.MSA_MEDIAN_INCOME_SOURCE.open() as source:
logger.debug("Starting download of MSA median incomes.") self.msa_median_incomes = json.load(source)
download = requests.get(
self.MSA_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.msa_median_incomes = json.loads(download.content)
# Download state median incomes with self.STATE_MEDIAN_INCOME_SOURCE.open() as source:
logger.debug("Starting download of state median incomes.") self.state_median_incomes = json.load(source)
download_state = requests.get(
self.STATE_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.state_median_incomes = json.loads(download_state.content)
## NOTE we already have PR's MI here ## NOTE we already have PR's MI here
def transform(self) -> None: def transform(self) -> None:
View file
@ -1,13 +1,14 @@
import json import json
from typing import List from typing import List
import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
pd.options.mode.chained_assignment = "raise" pd.options.mode.chained_assignment = "raise"
@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad):
+ "&for=tract:*&in=state:{}%20county:{}" + "&for=tract:*&in=state:{}%20county:{}"
) )
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.API_URL = self.API_URL + f"&key={census_api_key}"
self.final_race_fields: List[str] = [] self.final_race_fields: List[str] = []
self.df: pd.DataFrame self.df: pd.DataFrame
self.df_vi: pd.DataFrame self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame self.df_all: pd.DataFrame
-    def extract(self) -> None:
-        dfs = []
-        dfs_vi = []
+    def get_data_sources(self) -> [DataSource]:
+
+        sources = []
+
        for island in self.ISLAND_TERRITORIES:
-            logger.debug(
-                f"Downloading data for state/territory {island['state_abbreviation']}"
-            )
            for county in island["county_fips"]:
                api_url = self.API_URL.format(
                    self.DECENNIAL_YEAR,
                    island["state_abbreviation"],
@@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad):
                    island["fips"],
                    county,
                )
-                logger.debug(f"CENSUS: Requesting {api_url}")
-                download = requests.get(
-                    api_url,
-                    timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+
+                sources.append(
+                    FileDataSource(
+                        source=api_url,
+                        destination=self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json",
+                    )
                )

+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        dfs = []
+        dfs_vi = []
+        for island in self.ISLAND_TERRITORIES:
+            logger.debug(
+                f"Downloading data for state/territory {island['state_abbreviation']}"
+            )
+            for county in island["county_fips"]:
                try:
-                    df = json.loads(download.content)
+                    filepath = (
+                        self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json"
+                    )
+
+                    df = json.load(filepath.open())
                except ValueError as e:
                    logger.error(
-                        f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
+                        f"Could not load content in census decennial ETL because {e}."
                    )

                # First row is the header
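The per-county sources above are easier to see in isolation: the request URL only gets a Census API key when `CENSUS_API_KEY` is set in the environment, and each county's JSON response is cached under a year/state/county path. A hedged sketch, with a placeholder endpoint, an assumed cache root, and made-up values:

```python
import os
from pathlib import Path

# Placeholder endpoint, not the ETL's real API_URL template.
api_url = "https://api.census.gov/data/2010/dec/sf1?get=NAME&for=tract:*"

census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
    api_url = api_url + f"&key={census_api_key}"

sources_path = Path("data/sources/CensusDecennialETL")  # assumed cache root
destination = (
    sources_path
    / str(2010)   # DECENNIAL_YEAR
    / "VI"        # island["state_abbreviation"]
    / "78"        # island["fips"]
    / "010"       # county
    / "census.json"
)
print(api_url, destination)
```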
View file
@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self): def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS: if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.SOURCE_URL = ( self.child_opportunity_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"child_opportunity_index/raw.zip" "child_opportunity_index/raw.zip"
) )
else: else:
self.SOURCE_URL = ( self.child_opportunity_url = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-" "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv" "3a0ededa30a0?format=csv"
) )
# input
self.child_opportunity_index_source = (
self.get_sources_path() / "raw.csv"
)
# output
# TODO: Decide about nixing this # TODO: Decide about nixing this
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN" self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
self.READING_INPUT_FIELD = "ED_READING" self.READING_INPUT_FIELD = "ED_READING"
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame self.output_df: pd.DataFrame
-    def extract(self) -> None:
-        super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.child_opportunity_url,
+                destination=self.get_sources_path(),
            )
+        ]

-    def transform(self) -> None:
-        raw_df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path() / "raw.csv",
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.raw_df = pd.read_csv(
+            filepath_or_buffer=self.child_opportunity_index_source,
            # The following need to remain as strings for all of their digits, not get
            # converted to numbers.
            dtype={
@@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad):
            low_memory=False,
        )

-        output_df = raw_df.rename(
+    def transform(self) -> None:
+        output_df = self.raw_df.rename(
            columns={
                self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
                self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD,
View file
@ -5,22 +5,35 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class DOEEnergyBurden(ExtractTransformLoad): class DOEEnergyBurden(ExtractTransformLoad):
NAME = "doe_energy_burden" NAME = "doe_energy_burden"
SOURCE_URL: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
REVISED_ENERGY_BURDEN_FIELD_NAME: str REVISED_ENERGY_BURDEN_FIELD_NAME: str
def __init__(self): def __init__(self):
# fetch
self.doe_energy_burden_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
# input
self.doe_energy_burden_source = (
self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
)
# output
self.OUTPUT_PATH: Path = ( self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden" self.DATA_PATH / "dataset" / "doe_energy_burden"
) )
@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.raw_df: pd.DataFrame self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame self.output_df: pd.DataFrame
def transform(self) -> None: def get_data_sources(self) -> [DataSource]:
raw_df: pd.DataFrame = pd.read_csv( return [
filepath_or_buffer=self.get_tmp_path() ZIPDataSource(
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv", source=self.doe_energy_burden_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.doe_energy_burden_source,
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={ dtype={
self.INPUT_GEOID_TRACT_FIELD_NAME: "string", self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad):
low_memory=False, low_memory=False,
) )
def transform(self) -> None:
logger.debug("Renaming columns and ensuring output format is correct") logger.debug("Renaming columns and ensuring output format is correct")
output_df = raw_df.rename( output_df = self.raw_df.rename(
columns={ columns={
self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME, self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
View file
@ -3,6 +3,8 @@
import geopandas as gpd import geopandas as gpd
import pandas as pd import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings from data_pipeline.config import settings
@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad):
NAME = "travel_composite" NAME = "travel_composite"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad):
TRAVEL_BURDEN_FIELD_NAME: str TRAVEL_BURDEN_FIELD_NAME: str
def __init__(self): def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.travel_composite_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
# input
# define the full path for the input CSV file # define the full path for the input CSV file
self.INPUT_SHP = ( self.disadvantage_layer_shape_source = (
self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp" self.get_sources_path()
/ "DOT_Disadvantage_Layer_Final_April2022.shp"
) )
# output
# this is the main dataframe # this is the main dataframe
self.df: pd.DataFrame self.df: pd.DataFrame
self.df_dot: pd.DataFrame
# Start dataset-specific vars here # Start dataset-specific vars here
## Average of Transportation Indicator Percentiles (calculated) ## Average of Transportation Indicator Percentiles (calculated)
## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS ## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad):
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH" self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS" self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.travel_composite_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source)
def transform(self) -> None: def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following """Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method: transformations to prepare it for the load() method:
@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad):
- Converts to CSV - Converts to CSV
""" """
# read in the unzipped shapefile from data source
# reformat it to be standard df, remove unassigned rows, and # reformat it to be standard df, remove unassigned rows, and
# then rename the Census Tract column for merging # then rename the Census Tract column for merging
df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
df_dot = df_dot.rename( self.df_dot = self.df_dot.rename(
columns={ columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME, self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME, self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
} }
).dropna(subset=[self.GEOID_TRACT_FIELD_NAME]) ).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
# Assign the final df to the class' output_df for the load method # Assign the final df to the class' output_df for the load method
self.output_df = df_dot self.output_df = self.df_dot
View file
@ -1,12 +1,15 @@
from pathlib import Path from pathlib import Path
import geopandas as gpd
import pandas as pd import pandas as pd
import geopandas as gpd
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad):
"55", "55",
] ]
# Define these for easy code completion
def __init__(self): def __init__(self):
self.SOURCE_URL = (
# fetch
self.eamlis_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/eAMLIS export of all data.tsv.zip" + "/eAMLIS export of all data.tsv.zip"
) )
# input
self.eamlis_source = (
self.get_sources_path() / "eAMLIS export of all data.tsv"
)
# output
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
self.OUTPUT_PATH: Path = ( self.OUTPUT_PATH: Path = (
@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad):
] ]
self.output_df: pd.DataFrame self.output_df: pd.DataFrame
self.df: pd.DataFrame
def transform(self) -> None: def get_data_sources(self) -> [DataSource]:
df = pd.read_csv( return [
self.get_tmp_path() / "eAMLIS export of all data.tsv", ZIPDataSource(
source=self.eamlis_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.eamlis_source,
sep="\t", sep="\t",
low_memory=False, low_memory=False,
) )
def transform(self) -> None:
gdf = gpd.GeoDataFrame( gdf = gpd.GeoDataFrame(
df, self.df,
geometry=gpd.points_from_xy( geometry=gpd.points_from_xy(
x=df["Longitude"], x=self.df["Longitude"],
y=df["Latitude"], y=self.df["Latitude"],
), ),
crs="epsg:4326", crs="epsg:4326",
) )
@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad):
gdf_tracts = add_tracts_for_geometries(gdf) gdf_tracts = add_tracts_for_geometries(gdf)
gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME) gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
gdf_tracts[self.AML_BOOLEAN] = True gdf_tracts[self.AML_BOOLEAN] = True
self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP] self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]
View file
@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad):
INPUT_GEOID_TRACT_FIELD_NAME: str = "ID" INPUT_GEOID_TRACT_FIELD_NAME: str = "ID"
def __init__(self): def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
self.EJSCREEN_CSV = ( # fetch
self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv" self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
# input
self.ejscreen_source = (
self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv"
) )
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen" self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen"
self.df: pd.DataFrame self.df: pd.DataFrame
self.COLUMNS_TO_KEEP = [ self.COLUMNS_TO_KEEP = [
@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad):
field_names.UST_FIELD, field_names.UST_FIELD,
] ]
-    def extract(self) -> None:
-        super().extract(
-            self.EJSCREEN_FTP_URL,
-            self.get_tmp_path(),
-            verify=False,  # EPA EJScreen end point has certificate issues often
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.ejscreen_url, destination=self.get_sources_path()
            )
+        ]

-    def transform(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
        self.df = pd.read_csv(
-            self.EJSCREEN_CSV,
+            self.ejscreen_source,
            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
            # EJSCREEN writes the word "None" for NA data.
            na_values=["None"],
            low_memory=False,
        )

+    def transform(self) -> None:
        # rename ID to Tract ID
        self.output_df = self.df.rename(
            columns={
View file
@ -1,5 +1,6 @@
import pandas as pd import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# Note: while we normally set these properties in `__init__`, # Note: while we normally set these properties in `__init__`,
# we are setting them as class properties here so they can be accessed by the # we are setting them as class properties here so they can be accessed by the
# class method `ejscreen_areas_of_concern_data_exists`. # class method `ejscreen_areas_of_concern_data_exists`.
LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = ( EJSCREEN_AREAS_OF_CONCERN_SOURCE = (
LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv" ExtractTransformLoad.DATA_PATH
/ "sources"
/ "EJSCREENAreasOfConcernETL"
/ "ejscreen_areas_of_concerns_indicators.csv"
) )
def __init__(self): def __init__(self):
# output
self.OUTPUT_PATH = ( self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern" self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
) )
@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now # TO DO: Load from actual source; the issue is that this dataset is not public for now
self.df: pd.DataFrame self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
"""The source for this must be downloaded and saved manually. It is not publicly available"""
return []
@classmethod @classmethod
def ejscreen_areas_of_concern_data_exists(cls): def ejscreen_areas_of_concern_data_exists(cls):
"""Check whether or not the EJSCREEN areas of concern data exists. """Check whether or not the EJSCREEN areas of concern data exists.
@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
not reference this data. not reference this data.
""" """
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file() return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file()
def extract(self) -> None: def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE)
if self.ejscreen_areas_of_concern_data_exists(): if self.ejscreen_areas_of_concern_data_exists():
logger.debug("Loading EJSCREEN Areas of Concern Data Locally") logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA, filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE,
dtype={ dtype={
self.GEOID_FIELD_NAME: "string", self.GEOID_FIELD_NAME: "string",
}, },
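The class-level source path plus classmethod guard used here is a small pattern worth seeing on its own: because the path lives on the class rather than the instance, callers can check for the optional, manually supplied file before any ETL object exists. The path and column name in this sketch are assumptions, not the real dataset.

```python
from pathlib import Path

import pandas as pd


class AreasOfConcernExample:
    # Class attribute so the check below works without instantiating the ETL.
    SOURCE = Path("data/sources/EJSCREENAreasOfConcernETL/input.csv")  # assumed path

    @classmethod
    def data_exists(cls) -> bool:
        return cls.SOURCE.is_file()

    def extract(self) -> None:
        if self.data_exists():
            self.df = pd.read_csv(self.SOURCE, dtype={"GEOID10": "string"})  # assumed field
        else:
            # Non-public input: downstream steps simply skip this dataset.
            self.df = pd.DataFrame()
```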
View file
@ -5,18 +5,27 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class EnergyDefinitionAlternativeDraft(ExtractTransformLoad): class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
def __init__(self): def __init__(self):
self.DEFINITION_ALTERNATIVE_FILE_URL = (
# fetch
self.definition_alternative_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/alternative DAC definition.csv.zip" + "/alternative DAC definition.csv.zip"
) )
# input
self.definition_alternative_source = (
self.get_sources_path() / "J40 alternative DAC definition.csv"
)
# output
self.OUTPUT_PATH: Path = ( self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "energy_definition_alternative_draft" self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
) )
@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
self.df: pd.DataFrame self.df: pd.DataFrame
def extract(self) -> None: def get_data_sources(self) -> [DataSource]:
unzip_file_from_url( return [
file_url=self.DEFINITION_ALTERNATIVE_FILE_URL, ZIPDataSource(
download_path=self.get_tmp_path(), source=self.definition_alternative_url,
unzipped_file_path=self.get_tmp_path() destination=self.get_sources_path(),
/ "energy_definition_alternative_draft",
) )
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.definition_alternative_source,
/ "energy_definition_alternative_draft"
/ "J40 alternative DAC definition.csv",
# The following need to remain as strings for all of their digits, not get converted to numbers. # The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={ dtype={
self.TRACT_INPUT_COLUMN_NAME: "string", self.TRACT_INPUT_COLUMN_NAME: "string",
@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
) )
def transform(self) -> None: def transform(self) -> None:
self.df = self.df.rename( self.df = self.df.rename(
columns={ columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME, self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
View file
@ -4,8 +4,9 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
def __init__(self): def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS: if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.AGGREGATED_RSEI_SCORE_FILE_URL = ( self.aggregated_rsei_score_file_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/" f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"epa_rsei/CensusMicroTracts2019_2019_aggregated.zip" "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
) )
else: else:
self.AGGREGATED_RSEI_SCORE_FILE_URL = ( self.aggregated_rsei_score_file_url = (
"http://abt-rsei.s3.amazonaws.com/microdata2019/" "http://abt-rsei.s3.amazonaws.com/microdata2019/"
"census_agg/CensusMicroTracts2019_2019_aggregated.zip" "census_agg/CensusMicroTracts2019_2019_aggregated.zip"
) )
# input
self.aggregated_rsei_score_source = (
self.get_sources_path()
/ "CensusMicroTracts2019_2019_aggregated.csv"
)
# output
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei" self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75 self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
self.TRACT_INPUT_COLUMN_NAME = "GEOID10" self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.df: pd.DataFrame self.df: pd.DataFrame
def extract(self) -> None: def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.aggregated_rsei_score_file_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# the column headers from the above dataset are actually a census tract's data at this point # the column headers from the above dataset are actually a census tract's data at this point
# We will use this data structure later to specify the column names # We will use this data structure later to specify the column names
input_columns = [ input_columns = [
@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.NCSCORE_INPUT_FIELD, self.NCSCORE_INPUT_FIELD,
] ]
unzip_file_from_url(
file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "epa_rsei",
)
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() filepath_or_buffer=self.aggregated_rsei_score_source,
/ "epa_rsei"
/ "CensusMicroTracts2019_2019_aggregated.csv",
# The following need to remain as strings for all of their digits, not get # The following need to remain as strings for all of their digits, not get
# converted to numbers. # converted to numbers.
low_memory=False, low_memory=False,
View file
@ -5,6 +5,8 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad):
NAME = "fsf_flood_risk" NAME = "fsf_flood_risk"
# These data were emailed to the J40 team while first street got # These data were emailed to the J40 team while first street got
# their official data sharing channels setup. # their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
def __init__(self): def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = ( # fetch
self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv" self.flood_tract_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
) )
# this is the main dataframe # input
self.df: pd.DataFrame self.flood_tract_source = (
self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv"
)
# Start dataset-specific vars here # Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad):
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30" self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
self.CLIP_PROPERTIES_COUNT = 250 self.CLIP_PROPERTIES_COUNT = 250
self.df_fsf_flood: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.flood_tract_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv data source then rename the
# Census Tract column for merging
self.df_fsf_flood = pd.read_csv(
self.flood_tract_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None: def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following """Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method: transformations to prepare it for the load() method:
@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad):
- Calculates share of properties at risk, left-clipping number of properties at 250 - Calculates share of properties at risk, left-clipping number of properties at 250
""" """
# read in the unzipped csv data source then rename the self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[
# Census Tract column for merging
df_fsf_flood: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[
self.INPUT_GEOID_TRACT_FIELD_NAME self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11) ].str.zfill(11)
df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[ self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT) ].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = ( self.df_fsf_flood[
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY] self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY
/ df_fsf_flood[self.COUNT_PROPERTIES] ] = (
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
) )
df_fsf_flood[ self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
] = ( ] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_flood[self.COUNT_PROPERTIES] / self.df_fsf_flood[self.COUNT_PROPERTIES]
) )
# Assign the final df to the class' output_df for the load method with rename # Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_flood.rename( self.output_df = self.df_fsf_flood.rename(
columns={ columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY, self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,
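A worked example of the share calculation described in the docstring above (the wildfire ETL below follows the same arithmetic): the property count is left-clipped at 250 before dividing, so tiny tracts cannot produce extreme ratios. Column names and numbers here are made up.

```python
import pandas as pd

CLIP_PROPERTIES_COUNT = 250

df = pd.DataFrame(
    {
        "count_properties": [40, 1_000],
        "at_risk_today": [10, 300],
        "at_risk_in_30_years": [20, 450],
    }
)

# Left-clip the denominator so very small tracts cannot produce extreme shares.
clipped_count = df["count_properties"].clip(lower=CLIP_PROPERTIES_COUNT)

df["share_at_risk_today"] = df["at_risk_today"] / clipped_count
df["share_at_risk_in_30_years"] = df["at_risk_in_30_years"] / clipped_count

# First row: 10 / 250 = 0.04 rather than 10 / 40 = 0.25.
print(df[["share_at_risk_today", "share_at_risk_in_30_years"]])
```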
View file
@ -4,6 +4,8 @@ import pandas as pd
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
NAME = "fsf_wildfire_risk" NAME = "fsf_wildfire_risk"
# These data were emailed to the J40 team while first street got # These data were emailed to the J40 team while first street got
# their official data sharing channels setup. # their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True LOAD_YAML_CONFIG: bool = True
@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str
def __init__(self): def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv"
# fetch
self.fsf_fire_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
)
# input
self.fsf_fire_source = (
self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv"
)
# output
# this is the main dataframe # this is the main dataframe
self.df: pd.DataFrame self.df: pd.DataFrame
self.df_fsf_fire: pd.DataFrame
# Start dataset-specific vars here # Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties" self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag" self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag"
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag" self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag"
self.CLIP_PROPERTIES_COUNT = 250 self.CLIP_PROPERTIES_COUNT = 250
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.fsf_fire_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_fsf_fire = pd.read_csv(
self.fsf_fire_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None: def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following """Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method: transformations to prepare it for the load() method:
@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad):
""" """
# read in the unzipped csv data source then rename the # read in the unzipped csv data source then rename the
# Census Tract column for merging # Census Tract column for merging
df_fsf_fire: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[ self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[
self.INPUT_GEOID_TRACT_FIELD_NAME self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11) ].str.zfill(11)
df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[ self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT) ].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = ( self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY] self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_fire[self.COUNT_PROPERTIES] / self.df_fsf_fire[self.COUNT_PROPERTIES]
) )
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = ( self.df_fsf_fire[
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS] self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS
/ df_fsf_fire[self.COUNT_PROPERTIES] ] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
) )
# Assign the final df to the class' output_df for the load method with rename # Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_fire.rename( self.output_df = self.df_fsf_fire.rename(
columns={ columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY, self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS, self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS,
View file
@ -3,17 +3,33 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class GeoCorrETL(ExtractTransformLoad): class GeoCorrETL(ExtractTransformLoad):
NAME = "geocorr" NAME = "geocorr"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self): def __init__(self):
# fetch
self.geocorr_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip"
)
# input
self.geocorr_source = (
self.get_sources_path() / "geocorr_urban_rural.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr" self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"
# Need to change hyperlink to S3 # Need to change hyperlink to S3
@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad):
# The source data for this notebook was downloaded from GeoCorr; # The source data for this notebook was downloaded from GeoCorr;
# the instructions for generating the source data is here: # the instructions for generating the source data is here:
# https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787 # https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787
self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip" # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT" self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag" self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.COLUMNS_TO_KEEP = [ self.COLUMNS_TO_KEEP = [
@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad):
self.df: pd.DataFrame self.df: pd.DataFrame
def extract(self) -> None: def get_data_sources(self) -> [DataSource]:
unzip_file_from_url( return [
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL ZIPDataSource(
+ "/geocorr_urban_rural.csv.zip", source=self.geocorr_url, destination=self.get_sources_path()
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path(),
) )
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv( self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv", filepath_or_buffer=self.geocorr_source,
dtype={ dtype={
self.GEOCORR_GEOID_FIELD_NAME: "string", self.GEOCORR_GEOID_FIELD_NAME: "string",
}, },
View file
@ -3,12 +3,16 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
class HistoricRedliningETL(ExtractTransformLoad): class HistoricRedliningETL(ExtractTransformLoad):
NAME = "historic_redlining" NAME = "historic_redlining"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
EXPECTED_MISSING_STATES = [ EXPECTED_MISSING_STATES = [
"10", "10",
@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad):
] ]
PUERTO_RICO_EXPECTED_IN_DATA = False PUERTO_RICO_EXPECTED_IN_DATA = False
ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
def __init__(self): def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"
self.HISTORIC_REDLINING_FILE_PATH = ( # fetch
self.get_tmp_path() / "HRS_2010.xlsx" self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
)
# input
self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx"
self.REDLINING_SCALAR = "Tract-level redlining score" self.REDLINING_SCALAR = "Tract-level redlining score"
@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad):
self.GEOID_TRACT_FIELD_NAME, self.GEOID_TRACT_FIELD_NAME,
self.REDLINING_SCALAR, self.REDLINING_SCALAR,
] ]
self.df: pd.DataFrame self.df: pd.DataFrame
self.historic_redlining_data: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.hrs_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.historic_redlining_data = pd.read_excel(self.hrs_source)
def transform(self) -> None: def transform(self) -> None:
# this is obviously temporary # this is obviously temporary
historic_redlining_data = pd.read_excel(
self.HISTORIC_REDLINING_FILE_PATH self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
) )
historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = ( self.historic_redlining_data = self.historic_redlining_data.rename(
historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data = historic_redlining_data.rename(
columns={"HRS2010": self.REDLINING_SCALAR} columns={"HRS2010": self.REDLINING_SCALAR}
) )
logger.debug(f"{historic_redlining_data.columns}") logger.debug(f"{self.historic_redlining_data.columns}")
# Calculate lots of different score thresholds for convenience # Calculate lots of different score thresholds for convenience
for threshold in [3.25, 3.5, 3.75]: for threshold in [3.25, 3.5, 3.75]:
historic_redlining_data[ self.historic_redlining_data[
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
] = (historic_redlining_data[self.REDLINING_SCALAR] >= threshold) ] = (
self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold
)
## NOTE We add to columns to keep here ## NOTE We add to columns to keep here
self.COLUMNS_TO_KEEP.append( self.COLUMNS_TO_KEEP.append(
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}" f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
) )
self.output_df = historic_redlining_data self.output_df = self.historic_redlining_data
View file
@@ -1,8 +1,9 @@
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
 from pandas.errors import EmptyDataError

 logger = get_module_logger(__name__)
@@ -10,36 +11,46 @@ logger = get_module_logger(__name__)

 class HousingTransportationETL(ExtractTransformLoad):
     def __init__(self):
-        self.HOUSING_FTP_URL = (
-            "https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
-        )
         self.OUTPUT_PATH = (
             self.DATA_PATH / "dataset" / "housing_and_transportation_index"
         )
         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        housing_url = (
+            "https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
+        )
+
+        sources = []
+        for fips in get_state_fips_codes(self.DATA_PATH):
+            sources.append(
+                ZIPDataSource(
+                    source=f"{housing_url}{fips}",
+                    destination=self.get_sources_path(),
+                )
+            )
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         # Download each state / territory individually
         dfs = []
-        zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
         for fips in get_state_fips_codes(self.DATA_PATH):
-            logger.debug(
-                f"Downloading housing data for state/territory with FIPS code {fips}"
-            )
-            unzip_file_from_url(
-                f"{self.HOUSING_FTP_URL}{fips}",
-                self.get_tmp_path(),
-                zip_file_dir,
-            )
-
-            # New file name:
-            tmp_csv_file_path = (
-                zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv"
+            csv_source = (
+                self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv"
             )
             try:
-                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+                tmp_df = pd.read_csv(filepath_or_buffer=csv_source)
             except EmptyDataError:
                 logger.error(
                     f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
View file
@@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)


 class HudHousingETL(ExtractTransformLoad):
     NAME = "hud_housing"
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT

     def __init__(self):
-        self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
-
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.HOUSING_FTP_URL = (
+            self.housing_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "hud_housing/2014thru2018-140-csv.zip"
             )
         else:
-            self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+            self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+
+        # source
+
+        # output
+        self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
         self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
@@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad):
         self.df: pd.DataFrame

-    def extract(self) -> None:
-        super().extract(
-            self.HOUSING_FTP_URL,
-            self.HOUSING_ZIP_FILE_DIR,
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.housing_url, destination=self.get_sources_path()
+            )
+        ]

     def _read_chas_table(self, file_name):
-        # New file name:
-        tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name
+        tmp_csv_file_path = self.get_sources_path() / "140" / file_name
         tmp_df = pd.read_csv(
             filepath_or_buffer=tmp_csv_file_path,
             encoding="latin-1",
@@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad):
         return tmp_df

-    def transform(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         table_8 = self._read_chas_table("Table8.csv")
         table_3 = self._read_chas_table("Table3.csv")
@@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad):
             table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME
         )

+    def transform(self) -> None:
+
         # Calculate share that lacks indoor plumbing or kitchen
         # This is computed as
         # (
View file
@@ -1,7 +1,9 @@
 import pandas as pd
-import requests
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.utils import get_module_logger
@@ -11,44 +13,51 @@ logger = get_module_logger(__name__)

 class HudRecapETL(ExtractTransformLoad):
     def __init__(self):
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.HUD_RECAP_CSV_URL = (
+            self.hud_recap_csv_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
             )
         else:
-            self.HUD_RECAP_CSV_URL = (
+            self.hud_recap_csv_url = (
                 "https://opendata.arcgis.com/api/v3/datasets/"
                 "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
             )

-        self.HUD_RECAP_CSV = (
-            self.get_tmp_path()
+        # input
+        self.hud_recap_source = (
+            self.get_sources_path()
             / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
         )

+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"

-        # Definining some variable names
+        # Defining some variable names
         self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
             "hud_recap_priority_community"
         )

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        download = requests.get(
-            self.HUD_RECAP_CSV_URL,
-            verify=None,
-            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
-        )
-        file_contents = download.content
-        csv_file = open(self.HUD_RECAP_CSV, "wb")
-        csv_file.write(file_contents)
-        csv_file.close()
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.hud_recap_csv_url, destination=self.hud_recap_source
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        # Load comparison index (CalEnviroScreen 4)
+        self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})

     def transform(self) -> None:
-        # Load comparison index (CalEnviroScreen 4)
-        self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
         self.df.rename(
             columns={
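Single, non-zipped downloads such as this one use `FileDataSource` rather than `ZIPDataSource`. A short, hypothetical sketch of driving one of these refactored ETLs with cached sources; the import path and the exact call sequence are assumptions for illustration, not something this commit adds:

```python
# Hypothetical driver for a refactored ETL; the module path is assumed.
from data_pipeline.etl.sources.hud_recap.etl import HudRecapETL

etl = HudRecapETL()

# The declared sources (here a single FileDataSource) describe what to fetch
# and where the downloaded file should be placed.
for data_source in etl.get_data_sources():
    print(data_source)

# Reuse files already present under get_sources_path() instead of re-downloading.
etl.extract(use_cached_data_sources=True)
etl.transform()
etl.load()
```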
View file
@@ -2,6 +2,8 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -10,16 +12,25 @@ logger = get_module_logger(__name__)

 class MappingForEJETL(ExtractTransformLoad):
     def __init__(self):
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
-
-        self.MAPPING_FOR_EJ_VA_URL = (
+        # fetch
+        self.mapping_for_ej_va_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
         )
-        self.MAPPING_FOR_EJ_CO_URL = (
+        self.mapping_for_ej_co_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
         )

-        self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
-        self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"
+        # input
+        self.va_shp_file_source = (
+            self.get_sources_path() / "mej_virginia_7_1.shp"
+        )
+        self.co_shp_file_source = (
+            self.get_sources_path() / "mej_colorado_final.shp"
+        )
+
+        # output
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"

         # Defining variables
         self.COLUMNS_TO_KEEP = [
@@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad):
         self.df: pd.DataFrame

-    def extract(self) -> None:
-        super().extract(
-            self.MAPPING_FOR_EJ_VA_URL,
-            self.get_tmp_path(),
-        )
-        super().extract(
-            self.MAPPING_FOR_EJ_CO_URL,
-            self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.mapping_for_ej_va_url,
+                destination=self.get_sources_path(),
+            ),
+            ZIPDataSource(
+                source=self.mapping_for_ej_co_url,
+                destination=self.get_sources_path(),
+            ),
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

-    def transform(self) -> None:
         # Join (here, it's just concatenating) the two dataframes from
         # CO and VA
         self.df = pd.concat(
             [
-                gpd.read_file(self.VA_SHP_FILE_PATH),
-                gpd.read_file(self.CO_SHP_FILE_PATH),
+                gpd.read_file(self.va_shp_file_source),
+                gpd.read_file(self.co_shp_file_source),
             ]
         )

+    def transform(self) -> None:
         # Fill Census tract to get it to be 11 digits, incl. leading 0s
         # Note that VA and CO should never have leading 0s, so this isn't
         # strictly necessary, but if in the future, there are more states
View file
@@ -3,8 +3,9 @@ import pathlib
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad):
     Information on the mapping of this data to census tracts is available at
     https://github.com/americanpanorama/Census_HOLC_Research.
     """

     def __init__(self):
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.MAPPING_INEQUALITY_CSV_URL = (
+            self.mapping_inequality_csv_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "mapping_inequality/holc_tract_lookup.csv"
             )
         else:
-            self.MAPPING_INEQUALITY_CSV_URL = (
+            self.mapping_inequality_csv_url = (
                 "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
                 "main/2010_Census_Tracts/holc_tract_lookup.csv"
             )

-        self.MAPPING_INEQUALITY_CSV = (
-            self.get_tmp_path() / "holc_tract_lookup.csv"
-        )
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
-
-        self.HOLC_MANUAL_MAPPING_CSV_PATH = (
+        # input
+        self.mapping_inequality_source = (
+            self.get_sources_path() / "holc_tract_lookup.csv"
+        )
+        self.holc_manual_mapping_source = (  # here be dragons this file is pulled from a different place than most
             pathlib.Path(__file__).parent
             / "data"
             / "holc_grades_manually_mapped.csv"
         )

+        # output
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"

         # Some input field names. From documentation: 'Census Tracts were intersected
         # with HOLC Polygons. Census information can be joined via the "geoid" field.
         # There are two field "holc_prop" and "tract_prop" which give the proportion
@@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad):
         ]

         self.df: pd.DataFrame
+        self.holc_manually_mapped_df: pd.DataFrame

-    def extract(self) -> None:
-        download_file_from_url(
-            file_url=self.MAPPING_INEQUALITY_CSV_URL,
-            download_file_name=self.MAPPING_INEQUALITY_CSV,
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.mapping_inequality_csv_url,
+                destination=self.mapping_inequality_source,
+            )
+        ]

-    def transform(self) -> None:
-        df: pd.DataFrame = pd.read_csv(
-            self.MAPPING_INEQUALITY_CSV,
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df = pd.read_csv(
+            self.mapping_inequality_source,
             dtype={self.TRACT_INPUT_FIELD: "string"},
             low_memory=False,
         )

+        # Some data needs to be manually mapped to its grade.
+        # TODO: Investigate more data that may need to be manually mapped.
+        self.holc_manually_mapped_df = pd.read_csv(
+            filepath_or_buffer=self.holc_manual_mapping_source,
+            low_memory=False,
+        )
+
+    def transform(self) -> None:
         # rename Tract ID
-        df.rename(
+        self.df.rename(
             columns={
                 self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
             },
@@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad):
         # Keep the first character, which is the HOLC grade (A, B, C, D).
         # TODO: investigate why this dataframe triggers these pylint errors.
         # pylint: disable=unsupported-assignment-operation, unsubscriptable-object
-        df[self.HOLC_GRADE_DERIVED_FIELD] = df[
+        self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[
             self.HOLC_GRADE_AND_ID_FIELD
         ].str[0:1]

         # Remove nonsense when the field has no grade or invalid grades.
         valid_grades = ["A", "B", "C", "D"]
-        df.loc[
+        self.df.loc[
             # pylint: disable=unsubscriptable-object
-            ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
+            ~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
             self.HOLC_GRADE_DERIVED_FIELD,
         ] = None

-        # Some data needs to be manually mapped to its grade.
-        # TODO: Investigate more data that may need to be manually mapped.
-        holc_manually_mapped_df = pd.read_csv(
-            filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
-            low_memory=False,
-        )
-
         # Join on the existing data
-        merged_df = df.merge(
-            right=holc_manually_mapped_df,
+        merged_df = self.df.merge(
+            right=self.holc_manually_mapped_df,
             on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
             how="left",
         )
View file
@@ -4,6 +4,8 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MARYLAND_EJSCREEN_URL = (
+        # fetch
+        self.maryland_ejscreen_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
         )

-        self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
+        # input
+        self.shape_files_source = self.get_sources_path() / "mdejscreen"
+
+        # output
         self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"

         self.COLUMNS_TO_KEEP = [
@@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
         ]

         self.df: pd.DataFrame
+        self.dfs_list: pd.DataFrame

-    def extract(self) -> None:
-        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
-        super().extract(
-            self.MARYLAND_EJSCREEN_URL,
-            self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.maryland_ejscreen_url,
+                destination=self.get_sources_path(),
+            )
+        ]

-    def transform(self) -> None:
-        list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
-        # Ignore counties becauses this is not the level of measurement
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
+
+        list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))
+
+        # Ignore counties because this is not the level of measurement
         # that is consistent with our current scoring and ranking methodology.
-        dfs_list = [
+        self.dfs_list = [
             gpd.read_file(f)
             for f in list_of_files
             if not f.endswith("CountiesEJScore.shp")
         ]

+    def transform(self) -> None:
         # Set the Census tract as the index and drop the geometry column
         # that produces the census tract boundaries.
         # The latter is because Geopandas raises an exception if there
         # are duplicate geometry columns.
         # Moreover, since the unit of measurement is at the tract level
         # we can consistantly merge this with other datasets
-        dfs_list = [
+        self.dfs_list = [
             df.set_index("Census_Tra").drop("geometry", axis=1)
-            for df in dfs_list
+            for df in self.dfs_list
         ]
         # pylint: disable=unsubscriptable-object
-        self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
+        self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))

         # Reset index so that we no longer have the tract as our index
         self.df = self.df.reset_index()
View file
@@ -1,6 +1,8 @@
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
@@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MICHIGAN_EJSCREEN_S3_URL = (
+        # fetch
+        self.michigan_ejscreen_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/michigan_ejscore_12212021.csv"
         )

+        # input
+        self.michigan_ejscreen_source = (
+            self.get_sources_path() / "michigan_ejscore_12212021.csv"
+        )
+
+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"
         self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75

         self.COLUMNS_TO_KEEP = [
@@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.michigan_ejscreen_url,
+                destination=self.michigan_ejscreen_source,
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         self.df = pd.read_csv(
-            filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
+            filepath_or_buffer=self.michigan_ejscreen_source,
             dtype={"GEO_ID": "string"},
             low_memory=False,
         )

     def transform(self) -> None:
         self.df.rename(
             columns={
                 "GEO_ID": self.GEOID_TRACT_FIELD_NAME,
View file
@@ -4,6 +4,8 @@
 # pylint: disable=unsupported-assignment-operation

 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):

     NAME = "national_risk_index"

-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        SOURCE_URL = (
-            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
-            "national_risk_index/NRI_Table_CensusTracts.zip"
-        )
-    else:
-        SOURCE_URL = (
-            "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
-            "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
-        )
-
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     AGRIVALUE_LOWER_BOUND = 408000

     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
+        # fetch
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.risk_index_url = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "national_risk_index/NRI_Table_CensusTracts.zip"
+            )
+        else:
+            self.risk_index_url = (
+                "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
+                "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
+            )
+
+        # source
+        self.risk_index_source = (
+            self.get_sources_path() / "NRI_Table_CensusTracts.csv"
+        )
+
+        # output

         # this is the main dataframe
         self.df: pd.DataFrame
+        self.df_nri: pd.DataFrame

         # Start dataset-specific vars here
         self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
@@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
         self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"

-    def extract(self) -> None:
-        """Unzips NRI dataset from the FEMA data source and writes the files
-        to the temporary data folder for use in the transform() method
-        """
-        super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.risk_index_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        # read in the unzipped csv from NRI data source then rename the
+        # Census Tract column for merging
+        self.df_nri = pd.read_csv(
+            self.risk_index_source,
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
+            na_values=["None"],
+            low_memory=False,
+        )

     def transform(self) -> None:
@@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         Groups inside of that Tract
         """

-        # read in the unzipped csv from NRI data source then rename the
-        # Census Tract column for merging
-        df_nri: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
-            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
-            na_values=["None"],
-            low_memory=False,
-        )
-
-        df_nri.rename(
+        self.df_nri.rename(
             columns={
                 self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                 self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         agriculture_columns = [
             f"{x}_EALA"
             for x in disaster_categories
-            if f"{x}_EALA" in list(df_nri.columns)
+            if f"{x}_EALA" in list(self.df_nri.columns)
         ]

         population_columns = [
             f"{x}_EALP"
             for x in disaster_categories
-            if f"{x}_EALP" in list(df_nri.columns)
+            if f"{x}_EALP" in list(self.df_nri.columns)
         ]

         buildings_columns = [
             f"{x}_EALB"
             for x in disaster_categories
-            if f"{x}_EALB" in list(df_nri.columns)
+            if f"{x}_EALB" in list(self.df_nri.columns)
         ]

-        disaster_population_sum_series = df_nri[population_columns].sum(axis=1)
-        disaster_agriculture_sum_series = df_nri[agriculture_columns].sum(
-            axis=1
-        )
-        disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1)
+        disaster_population_sum_series = self.df_nri[population_columns].sum(
+            axis=1
+        )
+        disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum(
+            axis=1
+        )
+        disaster_buildings_sum_series = self.df_nri[buildings_columns].sum(
+            axis=1
+        )

         # Population EAL Rate = Eal Valp / Population
-        df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
+        self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
             disaster_population_sum_series
-            / df_nri[self.POPULATION_INPUT_FIELD_NAME]
+            / self.df_nri[self.POPULATION_INPUT_FIELD_NAME]
         )

         # Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000)
         ## FORMULA ADJUSTMENT 2/17
         ## Because AGRIVALUE contains a lot of 0s, we are going to consider
         ## 90th percentile only for places that have some agrivalue at all
-        df_nri[
+        self.df_nri[
             self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
-        ] = disaster_agriculture_sum_series / df_nri[
+        ] = disaster_agriculture_sum_series / self.df_nri[
             self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME
         ].clip(
             lower=self.AGRIVALUE_LOWER_BOUND
@@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
         base_expectation = (
             disaster_agriculture_sum_series
-            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
+            / self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
         )
         assert (
-            df_nri[
-                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            self.df_nri[
+                self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
                 != base_expectation
             ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
             <= self.AGRIVALUE_LOWER_BOUND
@@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         )

         assert (
-            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
             != base_expectation
         ).sum() > 0, "Clipping the agrivalue did nothing!"

         # This produces a boolean that is True in the case of non-zero agricultural value
-        df_nri[self.CONTAINS_AGRIVALUE] = (
-            df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
+        self.df_nri[self.CONTAINS_AGRIVALUE] = (
+            self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
         )

         # divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)).
-        df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
+        self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
             disaster_buildings_sum_series
-            / df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
+            / self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
         )

         # Round all float columns to just 10 digits.
         # Note: `round` is smart enough to only apply to float columns.
-        df_nri = df_nri.round(10)
+        self.df_nri = self.df_nri.round(10)

         # Assign the final df to the class' output_df for the load method
-        self.output_df = df_nri
+        self.output_df = self.df_nri

     def load(self) -> None:
         # Suppress scientific notation.
View file
@@ -3,6 +3,8 @@

 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
@@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
     """ETL class for the Nature Deprived Communities dataset"""

     NAME = "nlcd_nature_deprived"
-    SOURCE_URL = (
-        settings.AWS_JUSTICE40_DATASOURCES_URL
-        + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
-    )
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad):
     TRACT_PERCENT_CROPLAND_FIELD_NAME: str

     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = (
-            self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
+        # fetch
+        self.nature_deprived_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
         )

+        # source
+        # define the full path for the input CSV file
+        self.nature_deprived_source = (
+            self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
+        )
+
+        # output
+
         # this is the main dataframe
         self.df: pd.DataFrame
+        self.df_ncld: pd.DataFrame

         # Start dataset-specific vars here
         self.PERCENT_NATURAL_FIELD_NAME = "PctNatural"
         self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv"
@@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad):
         # for area. This does indeed remove tracts from the 90th+ percentile later on
         self.TRACT_ACRES_LOWER_BOUND = 35

-    def transform(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.nature_deprived_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         """Reads the unzipped data file into memory and applies the following
         transformations to prepare it for the load() method:

         - Renames columns as needed
         """
-        df_ncld: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df_ncld = pd.read_csv(
+            self.nature_deprived_source,
             dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
             low_memory=False,
         )

-        df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
-            df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND
+    def transform(self) -> None:
+
+        self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
+            self.df_ncld[self.TRACT_ACRES_FIELD_NAME]
+            >= self.TRACT_ACRES_LOWER_BOUND
         )
-        df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
-            100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
+        self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
+            100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
         )

         # Assign the final df to the class' output_df for the load method with rename
-        self.output_df = df_ncld.rename(
+        self.output_df = self.df_ncld.rename(
             columns={
                 self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME,
                 self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME,
View file
@@ -3,9 +3,10 @@ import functools

 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url

 logger = get_module_logger(__name__)
@@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False

     def __init__(self):
+        # fetch
+        self.poverty_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip"
+        )
+
+        # source
+        self.poverty_sources = [
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_1990_sample.csv",
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_2000_sample.csv",
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_2010_sample.csv",
+        ]
+
+        # output
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"

         # Need to change hyperlink to S3
@@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad):

         self.df: pd.DataFrame

+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.poverty_url, destination=self.get_sources_path()
+            )
+        ]
+
     def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
         df = functools.reduce(
             lambda df_a, df_b: pd.merge(
@@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad):

         return df

-    def extract(self) -> None:
-        unzipped_file_path = self.get_tmp_path()
+    def extract(self, use_cached_data_sources: bool = False) -> None:

-        unzip_file_from_url(
-            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/LTDB_Std_All_Sample.zip",
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=unzipped_file_path,
-        )
-
-        file_names = [
-            "ltdb_std_1990_sample.csv",
-            "ltdb_std_2000_sample.csv",
-            "ltdb_std_2010_sample.csv",
-        ]
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         temporary_input_dfs = []

-        for file_name in file_names:
+        for file_name in self.poverty_sources:
             temporary_input_df = pd.read_csv(
-                filepath_or_buffer=unzipped_file_path
-                / f"ltdb_std_all_sample/{file_name}",
+                filepath_or_buffer=file_name,
                 dtype={
                     self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
                     self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",
View file
@@ -1,6 +1,8 @@
 import geopandas as gpd
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
@@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
-        self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
+        # input
+        self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv"
+
+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
         self.df: gpd.GeoDataFrame
+        self.tes_state_dfs = []
+
+        # config
         self.states = [
             "al",
             "az",
@@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad):
             "wy",
         ]

-    def extract(self) -> None:
-        for state in self.states:
-            super().extract(
-                f"{self.TES_URL}{state}.zip.zip",
-                f"{self.get_tmp_path()}/{state}",
-            )
+    def get_data_sources(self) -> [DataSource]:
+        tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
+
+        sources = []
+        for state in self.states:
+            sources.append(
+                ZIPDataSource(
+                    source=f"{tes_url}{state}.zip.zip",
+                    destination=self.get_sources_path() / state,
+                )
+            )
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        for state in self.states:
+            self.tes_state_dfs.append(
+                gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp")
+            )

     def transform(self) -> None:
-        tes_state_dfs = []
-        for state in self.states:
-            tes_state_dfs.append(
-                gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
-            )
-
         self.df = gpd.GeoDataFrame(
-            pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
+            pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs
         )

         # rename ID to Tract ID
View file
@@ -4,63 +4,57 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url

 logger = get_module_logger(__name__)


 class TribalETL(ExtractTransformLoad):
     def __init__(self):
-        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
         self.GEOGRAPHIC_BASE_PATH = (
             self.DATA_PATH / "tribal" / "geographic_data"
         )
+        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
         self.NATIONAL_TRIBAL_GEOJSON_PATH = (
             self.GEOGRAPHIC_BASE_PATH / "usa.json"
         )
         self.USA_TRIBAL_DF_LIST = []

-    def extract(self) -> None:
-        """Extract the tribal geojson zip files from Justice40 S3 data folder
-
-        Returns:
-            None
-        """
-        bia_shapefile_zip_url = (
+    def get_data_sources(self) -> [DataSource]:
+
+        national_lar_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/BIA_National_LAR_updated_20220929.zip"
         )

-        tsa_and_aian_geojson_zip_url = (
+        tsa_and_aian_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/BIA_TSA_and_AIAN_json.zip"
         )

-        alaska_geojson_url = (
+        alaska_native_villages_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/Alaska_Native_Villages_json.zip"
         )

-        unzip_file_from_url(
-            bia_shapefile_zip_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
-        )
-
-        unzip_file_from_url(
-            tsa_and_aian_geojson_zip_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
-        )
-
-        unzip_file_from_url(
-            alaska_geojson_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
-        )
+        return [
+            ZIPDataSource(
+                national_lar_url,
+                destination=self.get_sources_path() / "bia_national_lar",
+            ),
+            ZIPDataSource(
+                source=tsa_and_aian_url,
+                destination=self.get_sources_path() / "tsa_and_aian",
+            ),
+            ZIPDataSource(
+                source=alaska_native_villages_url,
+                destination=self.get_sources_path() / "alaska_native_villages",
+            ),
+        ]

     def _transform_bia_national_lar(self, path: Path) -> None:
         """Transform the Tribal BIA National Lar Geodataframe and appends it to the
@@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad):
         """
         # Set the filepaths:
         bia_national_lar_shapefile = (
-            self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
+            self.get_sources_path() / "bia_national_lar"
         )

         bia_aian_supplemental_geojson = (
-            self.GEOGRAPHIC_BASE_PATH
+            self.get_sources_path()
             / "tsa_and_aian"
             / "BIA_AIAN_Supplemental.json"
         )

         bia_tsa_geojson = (
-            self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
+            self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json"
         )

         alaska_native_villages_geojson = (
-            self.GEOGRAPHIC_BASE_PATH
+            self.get_sources_path()
             / "alaska_native_villages"
             / "AlaskaNativeVillages.gdb.geojson"
         )
@@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad):
             "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
         )

+        # note this works a little different than many of the ETLs. The file
+        # being written here is used again downstream, so it's placed in a
+        # special directory.
         logger.debug("Writing national geojson file")
+        self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True)
         usa_tribal_df.to_file(
             self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
         )
View file
@@ -4,6 +4,7 @@ import geopandas as gpd
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
 from data_pipeline.etl.sources.geo_utils import get_tract_geojson
@@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad):
         self.census_tract_gdf: gpd.GeoDataFrame
         self.tribal_gdf: gpd.GeoDataFrame

+    def get_data_sources(self) -> [DataSource]:
+        return []  # this uses already retrieved / calculated data
+
     @staticmethod
     def _create_string_from_list(series: pd.Series) -> str:
         """Helper method that creates a sorted string list (for tribal names)."""
@@ -89,7 +93,12 @@ class TribalOverlapETL(ExtractTransformLoad):
         return percentage_float

-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         self.census_tract_gdf = get_tract_geojson()
         self.tribal_gdf = get_tribal_geojson()
View file
@@ -4,9 +4,10 @@ import geopandas as gpd
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad):

     def __init__(self):
-        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.FILE_URL = (
-                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
-                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
-                "all_data_reported_to_Congress_in_FY2020.geojson"
-            )
-        else:
-            self.FILE_URL: str = (
-                "https://opendata.arcgis.com/api/v3/datasets/"
-                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
-                "data?format=geojson&spatialRefId=4326&where=1%3D1"
-            )
-
         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"

         # Constants for output
@@ -50,17 +38,27 @@ class USArmyFUDS(ExtractTransformLoad):
             self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
             self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
         ]
-        self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"
+        self.fuds_source = self.get_sources_path() / "fuds.geojson"

         self.raw_df: gpd.GeoDataFrame
         self.output_df: pd.DataFrame

-    def extract(self) -> None:
-        download_file_from_url(
-            file_url=self.FILE_URL,
-            download_file_name=self.DOWNLOAD_FILE_NAME,
-            verify=True,
-        )
+    def get_data_sources(self) -> [DataSource]:
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            fuds_url = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
+                "all_data_reported_to_Congress_in_FY2020.geojson"
+            )
+        else:
+            fuds_url: str = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
+                "data?format=geojson&spatialRefId=4326&where=1%3D1"
+            )
+
+        return [FileDataSource(source=fuds_url, destination=self.fuds_source)]

     def transform(self) -> None:
         # before we try to do any transformation, get the tract data
@@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad):
         logger.debug("Loading FUDS data as GeoDataFrame for transform")
         raw_df = gpd.read_file(
-            filename=self.DOWNLOAD_FILE_NAME,
+            filename=self.fuds_source,
             low_memory=False,
         )
View file
@@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
         data. A basic version of that patching is included here for classes that can use it.
         """
+        data_path, tmp_path = mock_paths
+        sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+        sources_path.mkdir(parents=True, exist_ok=True)
+
         with mock.patch(
-            "data_pipeline.utils.requests"
+            "data_pipeline.etl.downloader.requests"
         ) as requests_mock, mock.patch(
+            "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+        ) as sources_mock, mock.patch(
             "data_pipeline.etl.score.etl_utils.get_state_fips_codes"
         ) as mock_get_state_fips_codes:
-            tmp_path = mock_paths[1]

+            # requests mock
             def fake_get(url, *args, **kwargs):
                 file_path = url.split("/")[-1]
                 with open(
@@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
                 return response_mock

             requests_mock.get = fake_get
+
+            # fips codes mock
             mock_get_state_fips_codes.return_value = [
                 x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
             ]

+            # sources mock
+            sources_mock.return_value = sources_path
+
             # Instantiate the ETL class.
             etl = self._get_instance_of_etl_class()

             # Monkey-patch the temporary directory to the one used in the test
             etl.TMP_PATH = tmp_path
+            etl.SOURCES_PATH = data_path / "sources"
+
             # Run the extract method.
             etl.extract()

+            def fake_get_sources_path() -> pathlib.PosixPath:
+                return sources_path
+
+            mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
+
             return etl

     def test_init(self, mock_etl, mock_paths):
View file
@@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
             mock_paths=mock_paths,
         )
         df = gpd.read_file(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
         )
         assert df.shape[0] == 30
View file
@@ -5,6 +5,7 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource

 logger = get_module_logger(__name__)
@@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
             self.EXAMPLE_FIELD_NAME,
         ]

+    def get_data_sources(self) -> [DataSource]:
+        return []
+
     def extract(self):
         # Pretend to download zip from external URL, write it to CSV.
         zip_file_path = (
@@ -42,11 +46,11 @@
         )

         with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-            zip_ref.extractall(self.get_tmp_path())
+            zip_ref.extractall(self.get_sources_path())

     def transform(self):
         df: pd.DataFrame = pd.read_csv(
-            self.get_tmp_path() / "input.csv",
+            self.get_sources_path() / "input.csv",
             dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
             low_memory=False,
         )
View file
@@ -124,12 +124,18 @@ class TestETL:
         data. A basic version of that patching is included here for classes that can use it.
         """
+        data_path, tmp_path = mock_paths
+        sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+        sources_path.mkdir(parents=True, exist_ok=True)
+
         with mock.patch(
-            "data_pipeline.utils.requests"
+            "data_pipeline.etl.downloader.requests"
         ) as requests_mock, mock.patch(
+            "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+        ) as sources_mock, mock.patch(
             "data_pipeline.etl.score.etl_utils.get_state_fips_codes"
         ) as mock_get_state_fips_codes:
-            tmp_path = mock_paths[1]
             if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
                 zip_file_fixture_src = (
                     self._DATA_DIRECTORY_FOR_TEST
@@ -145,6 +151,7 @@ class TestETL:
                     "rb",
                 ) as file:
                     file_contents = file.read()
+
                 response_mock = requests.Response()
                 response_mock.status_code = 200
                 # pylint: disable=protected-access
@@ -154,15 +161,25 @@ class TestETL:
             mock_get_state_fips_codes.return_value = [
                 x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
             ]

+            # sources mock
+            sources_mock.return_value = sources_path
+
             # Instantiate the ETL class.
             etl = self._get_instance_of_etl_class()

             # Monkey-patch the temporary directory to the one used in the test
             etl.TMP_PATH = tmp_path
+            etl.SOURCES_PATH = data_path / "sources"
+
             # Run the extract method.
             etl.extract()

+            def fake_get_sources_path() -> pathlib.PosixPath:
+                return sources_path
+
+            mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
+
             return etl

     def test_init_base(self, mock_etl, mock_paths):
@@ -263,17 +280,12 @@ class TestETL:
         file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
         """
         if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
-            tmp_path = mock_paths[1]
-
-            _ = self._setup_etl_instance_and_run_extract(
+            etl = self._setup_etl_instance_and_run_extract(
                 mock_etl=mock_etl,
                 mock_paths=mock_paths,
             )

-            assert (
-                tmp_path
-                / self._EXTRACT_TMP_FOLDER_NAME
-                / self._SAMPLE_DATA_FILE_NAME
-            ).exists()
+            assert (etl.get_sources_path()).exists()

     def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
         """Tests the extract method.
@@ -285,8 +297,11 @@ class TestETL:
             mock_etl=mock_etl,
             mock_paths=mock_paths,
         )
+        data_path, tmp_path = mock_paths

         tmp_df = pd.read_csv(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
         )

         snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
View file
@@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
             mock_paths=mock_paths,
         )
         tmp_df = pd.read_excel(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
        )
         assert tmp_df.shape == (15, 5)
View file
@@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):

     def test_init(self, mock_etl, mock_paths):
         """Tests that the mock NationalRiskIndexETL class instance was
-        initiliazed correctly.
-
-        Validates the following conditions:
-        - self.DATA_PATH points to the "data" folder in the temp directory
-        - self.TMP_PATH points to the "data/tmp" folder in the temp directory
-        - self.INPUT_PATH points to the correct path in the temp directory
-        - self.OUTPUT_PATH points to the correct path in the temp directory
+        initialized correctly.
         """
         # setup
         etl = NationalRiskIndexETL()
-        data_path, tmp_path = mock_paths
-        input_csv = (
-            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
-        )

         # validation
-        assert etl.INPUT_CSV == input_csv
         assert etl.GEOID_FIELD_NAME == "GEOID10"
         assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
         assert etl.NAME == "national_risk_index"