Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-22 17:44:20 -08:00)
Add ability to cache ETL data sources (#2169)
* Add a rough prototype allowing a developer to pre-download data sources for all ETLs
* Update code to be more production-ish
* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source
* Format source files with black
* Fix issues from pylint and get the tests working with the new folder structure
* Clean up files with black
* Fix unzip test
* Add caching notes to README
* Fix tests (linting and case sensitivity bug)
* Address PR comments and add API keys for census where missing
* Merging comparator changes from main into this branch for the sake of the PR
* Add note on using cache (-u) during pipeline
Parent: 4d9c1dd11e
Commit: 6f39033dde
52 changed files with 1787 additions and 686 deletions
README.md

```diff
@@ -92,7 +92,6 @@ If you want to run specific data tasks, you can open a terminal window, navigate
 - Generate Map Tiles: `docker run --rm -it -v ${PWD}/data/data-pipeline/data_pipeline/data:/data_pipeline/data j40_data_pipeline python3 -m data_pipeline.application generate-map-tiles`

 To learn more about these commands and when they should be run, refer to [Running for Local Development](#running-for-local-development).

 </details>

 ---
@@ -136,6 +135,9 @@ Once you've downloaded the census data, run the following commands – in order
 Many commands have options. For example, you can run a single dataset with `etl-run` by passing the command line parameter `-d name-of-dataset-to-run`. Please use the `--help` option to find out more.

+> :bulb: **NOTE**
+> One important command line option is enabling cached data sources. Pass the command line parameter `-u` to many commands (e.g. `etl-run`) to use locally cached data sources within the ETL portion of the pipeline. This will ensure that you don't download many GB of data with each run of the data pipeline.
+
 ## How Scoring Works

 Scores are generated by running the `score-run` command via Poetry or Docker. This command executes [`data_pipeline/etl/score/etl_score.py`](data_pipeline/etl/score/etl_score.py). During execution,
```
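For reference, a minimal sketch of the equivalent programmatic call is below. It assumes only the `etl_runner` signature introduced in this commit; the dataset name is illustrative, and passing `None` runs every ETL.

```python
from data_pipeline.etl.runner import etl_runner

# Equivalent of `etl-run -d <dataset> -u`: re-use any data sources already
# cached under the pipeline's data/sources directory instead of re-downloading.
# "calenviroscreen4" is an illustrative dataset name, not necessarily a real key.
etl_runner(dataset_to_run="calenviroscreen4", use_cache=True)
```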
data/data-pipeline/data_pipeline/application.py

```diff
@@ -7,6 +7,9 @@ from data_pipeline.etl.runner import etl_runner
 from data_pipeline.etl.runner import score_generate
 from data_pipeline.etl.runner import score_geo
 from data_pipeline.etl.runner import score_post
+from data_pipeline.etl.runner import get_data_sources
+from data_pipeline.etl.runner import extract_data_sources as extract_ds
+from data_pipeline.etl.runner import clear_data_source_cache as clear_ds_cache
 from data_pipeline.etl.sources.census.etl_utils import check_census_data_source
 from data_pipeline.etl.sources.census.etl_utils import (
     reset_data_directories as census_reset,
@@ -79,7 +82,14 @@ def data_cleanup():
     is_flag=True,
     help="Upload to AWS S3 a zipped archive of the census data.",
 )
-def census_data_download(zip_compress):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def census_data_download(zip_compress, use_cache):
     """CLI command to download all census shape files from the Census FTP and extract the geojson
     to generate national and by state Census Block Group CSVs"""
     log_title("Download Census Data ")
@@ -88,7 +98,7 @@ def census_data_download(zip_compress):
     census_reset(data_path)

     log_info("Downloading census data")
-    etl_runner("census")
+    etl_runner("census", use_cache)

     if zip_compress:
         log_info("Zipping census data")
@@ -129,7 +139,14 @@ def pull_census_data(data_source: str):
     type=str,
     help=dataset_cli_help,
 )
-def etl_run(dataset: str):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def etl_run(dataset: str, use_cache: bool):
     """Run a specific or all ETL processes

     Args:
@@ -141,7 +158,7 @@ def etl_run(dataset: str):
     log_title("Run ETL")

     log_info("Running dataset(s)")
-    etl_runner(dataset)
+    etl_runner(dataset, use_cache)

     log_goodbye()
     sys.exit()
@@ -167,7 +184,14 @@ def score_run():
 @cli.command(
     help="Run ETL + Score Generation",
 )
-def score_full_run():
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def score_full_run(use_cache: bool):
     """CLI command to run ETL and generate the score in one command"""
     log_title("Score Full Run", "Run ETL and Generate Score (no tiles)")

@@ -177,7 +201,7 @@ def score_full_run():
     temp_folder_cleanup()

     log_info("Running all ETLs")
-    etl_runner()
+    etl_runner(use_cache=use_cache)

     log_info("Generating score")
     score_generate()
@@ -297,7 +321,14 @@ def generate_map_tiles(generate_tribal_layer):
     type=str,
     help=dataset_cli_help,
 )
-def data_full_run(check: bool, data_source: str):
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def data_full_run(check: bool, data_source: str, use_cache: bool):
     """CLI command to run ETL, score, JSON combine and generate tiles in one command

     Args:
@@ -330,10 +361,10 @@ def data_full_run(check: bool, data_source: str):

     if data_source == "local":
         log_info("Downloading census data")
-        etl_runner("census")
+        etl_runner("census", use_cache)

     log_info("Running all ETLs")
-    etl_runner()
+    etl_runner(use_cache=use_cache)

     log_info("Generating score")
     score_generate()
@@ -357,6 +388,103 @@ def data_full_run(check: bool, data_source: str):
     sys.exit()


+@cli.command(
+    help="Print data sources for all ETL processes (or a specific one)",
+)
+@click.option(
+    "-d",
+    "--dataset",
+    required=False,
+    type=str,
+    help=dataset_cli_help,
+)
+def print_data_sources(dataset: str):
+    """Print data sources for all ETL processes (or a specific one)
+
+    Args:
+        dataset (str): Name of the ETL module to be run (optional)
+
+    Returns:
+        None
+    """
+    log_title("Print ETL Datasources")
+
+    log_info("Retrieving dataset(s)")
+    sources = get_data_sources(dataset)
+
+    log_info(f"Discovered {len(sources)} files")
+
+    for s in sources:
+        log_info(s)
+
+    log_goodbye()
+    sys.exit()
+
+
+@cli.command(
+    help="Fetch data sources for all ETL processes (or a specific one)",
+)
+@click.option(
+    "-d",
+    "--dataset",
+    required=False,
+    type=str,
+    help=dataset_cli_help,
+)
+@click.option(
+    "-u",
+    "--use-cache",
+    is_flag=True,
+    default=False,
+    help="Check if data source has been downloaded already, and if it has, use the cached version of the data source.",
+)
+def extract_data_sources(dataset: str, use_cache: bool):
+    """Extract and cache data source(s) for all ETL processes (or a specific one)
+
+    Args:
+        dataset (str): Name of the ETL module whose data sources you wish to fetch
+        use_cache (bool): Use this flag if you wish to use the cached data sources (if they exist)
+
+    Returns:
+        None
+    """
+    log_title("Fetch ETL Datasources")
+
+    log_info("Fetching data source(s)")
+    extract_ds(dataset, use_cache)
+
+    log_goodbye()
+    sys.exit()
+
+
+@cli.command(
+    help="Clear data source cache for all ETL processes (or a specific one)",
+)
+@click.option(
+    "-d",
+    "--dataset",
+    required=False,
+    type=str,
+    help=dataset_cli_help,
+)
+def clear_data_source_cache(dataset: str):
+    """Clear data source(s) cache for all ETL processes (or a specific one)
+
+    Args:
+        dataset (str): Name of the ETL module whose cache you wish to clear
+
+    Returns:
+        None
+    """
+    log_title("Fetch ETL Datasources")
+
+    log_info("Clear data source cache")
+    clear_ds_cache(dataset)
+
+    log_goodbye()
+    sys.exit()
+
+
 def log_title(title: str, subtitle: str = None):
     """Logs a title in our fancy title format"""
     logger.info("-" * LOG_LINE_WIDTH)
```
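The `-u/--use-cache` flag declared on each command above only threads a boolean through to `etl_runner`. A stripped-down sketch of that wiring is shown here; it is illustrative only, not the project's actual module, and the command name is hypothetical.

```python
import click

from data_pipeline.etl.runner import etl_runner


@click.command(help="Run ETL for a dataset, optionally re-using cached data sources")
@click.option("-d", "--dataset", required=False, type=str)
@click.option("-u", "--use-cache", is_flag=True, default=False)
def etl_run_sketch(dataset: str, use_cache: bool):
    # click maps --use-cache onto the use_cache parameter; the flag stays
    # False unless -u/--use-cache is passed on the command line.
    etl_runner(dataset, use_cache)


if __name__ == "__main__":
    etl_run_sketch()  # pylint: disable=no-value-for-parameter
```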
data/data-pipeline/data_pipeline/etl/base.py

```diff
@@ -2,7 +2,9 @@ import enum
 import pathlib
 import sys
 import typing
+import shutil
 from typing import Optional
+from abc import ABC, abstractmethod

 import pandas as pd
 from data_pipeline.config import settings
@@ -13,7 +15,7 @@ from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import remove_all_from_dir
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource

 logger = get_module_logger(__name__)

@@ -25,7 +27,7 @@ class ValidGeoLevel(enum.Enum):
     CENSUS_BLOCK_GROUP = enum.auto()


-class ExtractTransformLoad:
+class ExtractTransformLoad(ABC):
     """
     A class used to instantiate an ETL object to retrieve and process data from
     datasets.
@@ -45,6 +47,7 @@ class ExtractTransformLoad:
     # Directories
     DATA_PATH: pathlib.Path = settings.DATA_PATH
     TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
+    SOURCES_PATH: pathlib.Path = DATA_PATH / "sources"
     CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
     DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
     DATASET_CONFIG: Optional[dict] = None
@@ -177,45 +180,60 @@ class ExtractTransformLoad:
         output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
         return output_file_path

-    def get_tmp_path(self) -> pathlib.Path:
-        """Returns the temporary path associated with this ETL class."""
-        # Note: the temporary path will be defined on `init`, because it uses the class
-        # of the instance which is often a child class.
-        tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
+    def get_sources_path(self) -> pathlib.Path:
+        """Returns the sources path associated with this ETL class. The sources path
+        is the home for cached data sources used by this ETL."""
+
+        sources_path = self.SOURCES_PATH / str(self.__class__.__name__)

         # Create directory if it doesn't exist
-        tmp_path.mkdir(parents=True, exist_ok=True)
+        sources_path.mkdir(parents=True, exist_ok=True)

-        return tmp_path
+        return sources_path

-    def extract(
-        self,
-        source_url: str = None,
-        extract_path: pathlib.Path = None,
-        verify: Optional[bool] = True,
-    ) -> None:
-        """Extract the data from a remote source. By default it provides code
-        to get the file from a source url, unzips it and stores it on an
-        extract_path."""
-
-        if source_url is None:
-            source_url = self.SOURCE_URL
-
-        if extract_path is None:
-            extract_path = self.get_tmp_path()
-
-        unzip_file_from_url(
-            file_url=source_url,
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=extract_path,
-            verify=verify,
-        )
+    @abstractmethod
+    def get_data_sources(self) -> [DataSource]:
+        pass
+
+    def _fetch(self) -> None:
+        """Fetch all data sources for this ETL. When data sources are fetched, they
+        are stored in a cache directory for consistency between runs."""
+        for ds in self.get_data_sources():
+            ds.fetch()
+
+    def clear_data_source_cache(self) -> None:
+        """Clears the cache for this ETLs data source(s)"""
+        shutil.rmtree(self.get_sources_path())
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+        """Extract (download) data from a remote source, and validate
+        that data. By default, this method fetches data from the set of
+        data sources returned by get_data_sources.
+
+        If use_cached_data_sources is true, this method attempts to use cached data
+        rather than re-downloading from the original source. The cache algorithm is very
+        simple: it just looks to see if the directory has any contents. If so, it uses
+        that content. If not, it downloads all data sources.
+
+        Subclasses should call super() before performing any work if they wish to take
+        advantage of the automatic downloading and caching ability of this superclass.
+        """
+
+        if use_cached_data_sources and any(self.get_sources_path().iterdir()):
+            logger.info(
+                f"Using cached data sources for {self.__class__.__name__}"
+            )
+        else:
+            self.clear_data_source_cache()
+            self._fetch()
+
+        # the rest of the work should be performed here
+
+    @abstractmethod
     def transform(self) -> None:
         """Transform the data extracted into a format that can be consumed by the
         score generator"""
-        raise NotImplementedError
+        pass

     def validate(self) -> None:
         """Validates the output.
@@ -380,3 +398,14 @@ class ExtractTransformLoad:
     def cleanup(self) -> None:
         """Clears out any files stored in the TMP folder"""
         remove_all_from_dir(self.get_tmp_path())
+
+    def get_tmp_path(self) -> pathlib.Path:
+        """Returns the temporary path associated with this ETL class."""
+        # Note: the temporary path will be defined on `init`, because it uses the class
+        # of the instance which is often a child class.
+        tmp_path = self.DATA_PATH / "tmp" / str(self.__class__.__name__)
+
+        # Create directory if it doesn't exist
+        tmp_path.mkdir(parents=True, exist_ok=True)
+
+        return tmp_path
```
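A minimal sketch of how a concrete ETL plugs into this new contract is below. The class name, URL, and file name are hypothetical; real examples appear in the source ETL changes later in this diff.

```python
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource, ZIPDataSource


class ExampleETL(ExtractTransformLoad):
    """Hypothetical subclass illustrating the get_data_sources / extract contract."""

    def __init__(self):
        # Hypothetical upstream archive; real ETLs point at their actual sources.
        self.example_url = "https://example.com/some_dataset.zip"
        self.example_source = self.get_sources_path() / "some_dataset.csv"

    def get_data_sources(self) -> [DataSource]:
        # Declare where the data lives and where the cached copy should land.
        return [
            ZIPDataSource(
                source=self.example_url,
                destination=self.get_sources_path(),
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # Let the superclass decide between the cache and a fresh download,
        # then read the now-local file, e.g. pd.read_csv(self.example_source).
        super().extract(use_cached_data_sources)

    def transform(self) -> None:
        pass

    def load(self) -> None:
        pass
```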
data/data-pipeline/data_pipeline/etl/datasource.py (new file, 124 lines)

```python
"""This module defines a set of classes that can be used to fetch data
from a remote source. They are meant to be used in conjunction with ETLs
or other classes that require downloading data.

There are three types of data sources defined in this file:

FileDataSource – meant to be used when you have a single file to
retrieve from a remote location and save to a destination.

ZIPDataSource – used when you need to fetch and unzip a file, and save
the contents of that file to a destination.

CensusDataSource – used to download data from the Census API and store
the contents to a destination.

DataSource subclasses must implement the fetch method to define how
they will reach out to a remote source, download the data, and save
that data to the destination.
"""

from pathlib import Path
from typing import List
from dataclasses import dataclass
from abc import ABC, abstractmethod

from data_pipeline.etl.downloader import Downloader
from data_pipeline.etl.sources.census_acs.etl_utils import (
    retrieve_census_acs_data,
)


@dataclass
class DataSource(ABC):
    """A data source represents any source of data that is fetchable
    from a remote location.

    Attributes:
        source : str
            the location of this data source, as a url
        destination : Path
            the Path where the data source should be saved locally upon being fetched
    """

    source: str
    destination: Path

    @abstractmethod
    def fetch(self) -> None:
        pass


@dataclass
class FileDataSource(DataSource):
    """A data source representing a single file.

    This single file will be fetched from the source and saved to a single
    destination.
    """

    def fetch(self) -> None:
        """Fetches a single file from a source and saves it to a destination."""
        self.destination.parent.mkdir(parents=True, exist_ok=True)
        Downloader.download_file_from_url(
            file_url=self.source,
            download_file_name=self.destination,
            verify=True,
        )

    def __str__(self):
        return f"File – {self.source}"


@dataclass
class ZIPDataSource(DataSource):
    """A data source representing ZIP files.

    Zip files will be fetched and placed in the destination folder, then unzipped.
    """

    def fetch(self) -> None:
        self.destination.mkdir(parents=True, exist_ok=True)
        Downloader.download_zip_file_from_url(
            file_url=self.source,
            unzipped_file_path=self.destination,
            verify=True,
        )

    def __str__(self):
        return f"Zip – {self.source}"


@dataclass
class CensusDataSource(DataSource):
    """A data source representing census data.

    Data will be fetched using the Census API and saved to the destination file. Source is ignored.
    """

    acs_year: int
    variables: List[str]
    tract_output_field_name: str
    data_path_for_fips_codes: Path
    acs_type: str

    def fetch(self) -> None:
        df = retrieve_census_acs_data(
            acs_year=self.acs_year,
            variables=self.variables,
            tract_output_field_name=self.tract_output_field_name,
            data_path_for_fips_codes=self.data_path_for_fips_codes,
            acs_type=self.acs_type,
        )

        self.destination.parent.mkdir(parents=True, exist_ok=True)

        # Write CSV representation of census data
        df.to_csv(self.destination, index=False)

    def __str__(self):
        return f"Census – {self.acs_type}, {self.acs_year}"
```

data/data-pipeline/data_pipeline/etl/downloader.py (new file, 95 lines)

```python
import uuid
import urllib3
import requests
import zipfile
import shutil

from pathlib import Path
from data_pipeline.config import settings


class Downloader:
    """A simple class to encapsulate the download capabilities of the application"""

    @classmethod
    def download_file_from_url(
        cls,
        file_url: str,
        download_file_name: Path,
        verify: bool = True,
    ) -> str:
        """Downloads a file from a remote URL location and returns the file location.

        Args:
            file_url (str): URL where the zip file is located
            download_file_name (pathlib.Path): file path where the file will be downloaded (called downloaded.zip by default)
            verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
            error (optional, default to False)

        Returns:
            None
        """
        # disable https warning
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        download_file_name.parent.mkdir(parents=True, exist_ok=True)

        response = requests.get(
            file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
        )
        if response.status_code == 200:
            file_contents = response.content
        else:
            raise Exception(
                f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
            )

        # Write the contents to disk.
        file = open(download_file_name, "wb")
        file.write(file_contents)
        file.close()

        return download_file_name

    @classmethod
    def download_zip_file_from_url(
        cls,
        file_url: str,
        unzipped_file_path: Path,
        verify: bool = True,
    ) -> None:
        """Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after

        Args:
            file_url (str): URL where the zip file is located
            unzipped_file_path (pathlib.Path): directory and name of the extracted file
            verify (bool): A flag to check if the certificate is valid. If truthy, an invalid certificate will throw an
            error (optional, default to False)

        Returns:
            None
        """
        # dir_id allows us to evade race conditions on parallel ETLs
        dir_id = uuid.uuid4()

        zip_download_path = (
            settings.DATA_PATH
            / "tmp"
            / "downloads"
            / f"{dir_id}"
            / "download.zip"
        )

        zip_file_path = Downloader.download_file_from_url(
            file_url=file_url,
            download_file_name=zip_download_path,
            verify=verify,
        )

        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall(unzipped_file_path)

        # cleanup temporary file and directory
        shutil.rmtree(zip_download_path.parent)
```
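A small usage sketch of the new classes, assuming only what the two files above define; the URLs and paths are placeholders.

```python
from pathlib import Path

from data_pipeline.etl.datasource import FileDataSource, ZIPDataSource

# Fetch a single CSV to an explicit file path (placeholder URL).
csv_source = FileDataSource(
    source="https://example.com/data/us_a.csv",
    destination=Path("data/sources/ExampleETL/US_A.CSV"),
)

# Fetch a zip archive and unpack it into a directory (placeholder URL).
zip_source = ZIPDataSource(
    source="https://example.com/data/archive.zip",
    destination=Path("data/sources/ExampleETL"),
)

for ds in (csv_source, zip_source):
    print(ds)   # __str__ gives "File – ..." / "Zip – ..."
    ds.fetch()  # both delegate the actual HTTP work to Downloader
```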
data/data-pipeline/data_pipeline/etl/runner.py

```diff
@@ -2,10 +2,14 @@ import concurrent.futures
 import importlib
 import typing

+from functools import reduce
+
 from data_pipeline.etl.score.etl_score import ScoreETL
 from data_pipeline.etl.score.etl_score_geo import GeoScoreETL
 from data_pipeline.etl.score.etl_score_post import PostScoreETL
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource

 from . import constants
@@ -40,20 +44,26 @@ def _get_datasets_to_run(dataset_to_run: str) -> typing.List[dict]:
     return dataset_list


-def _run_one_dataset(dataset: dict) -> None:
-    """Runs one etl process."""
-
-    logger.info(f"Running ETL for {dataset['name']}")
+def _get_dataset(dataset: dict) -> ExtractTransformLoad:
+    """Instantiates a dataset object from a dictionary description of that object's class"""

     etl_module = importlib.import_module(
         f"data_pipeline.etl.sources.{dataset['module_dir']}.etl"
     )
     etl_class = getattr(etl_module, dataset["class_name"])
     etl_instance = etl_class()

+    return etl_instance
+
+
+def _run_one_dataset(dataset: dict, use_cache: bool = False) -> None:
+    """Runs one etl process."""
+
+    logger.info(f"Running ETL for {dataset['name']}")
+    etl_instance = _get_dataset(dataset)
+
     # run extract
     logger.debug(f"Extracting {dataset['name']}")
-    etl_instance.extract()
+    etl_instance.extract(use_cache)

     # run transform
     logger.debug(f"Transforming {dataset['name']}")
@@ -74,11 +84,12 @@ def _run_one_dataset(dataset: dict) -> None:
     logger.info(f"Finished ETL for dataset {dataset['name']}")


-def etl_runner(dataset_to_run: str = None) -> None:
+def etl_runner(dataset_to_run: str = None, use_cache: bool = False) -> None:
     """Runs all etl processes or a specific one

     Args:
         dataset_to_run (str): Run a specific ETL process. If missing, runs all processes (optional)
+        use_cache (bool): Use the cached data sources – if they exist – rather than downloading them all from scratch

     Returns:
         None
@@ -105,7 +116,9 @@ def etl_runner(dataset_to_run: str = None) -> None:
         logger.info("Running concurrent ETL jobs")
         with concurrent.futures.ThreadPoolExecutor() as executor:
             futures = {
-                executor.submit(_run_one_dataset, dataset=dataset)
+                executor.submit(
+                    _run_one_dataset, dataset=dataset, use_cache=use_cache
+                )
                 for dataset in concurrent_datasets
             }

@@ -119,7 +132,50 @@ def etl_runner(dataset_to_run: str = None) -> None:
     if high_memory_datasets:
         logger.info("Running high-memory ETL jobs")
         for dataset in high_memory_datasets:
-            _run_one_dataset(dataset=dataset)
+            _run_one_dataset(dataset=dataset, use_cache=use_cache)
+
+
+def get_data_sources(dataset_to_run: str = None) -> [DataSource]:
+
+    dataset_list = _get_datasets_to_run(dataset_to_run)
+
+    sources = []
+
+    for dataset in dataset_list:
+        etl_instance = _get_dataset(dataset)
+        sources.append(etl_instance.get_data_sources())
+
+    sources = reduce(
+        list.__add__, sources
+    )  # flatten the list of lists into a single list
+
+    return sources
+
+
+def extract_data_sources(
+    dataset_to_run: str = None, use_cache: bool = False
+) -> None:
+
+    dataset_list = _get_datasets_to_run(dataset_to_run)
+
+    for dataset in dataset_list:
+        etl_instance = _get_dataset(dataset)
+        logger.info(
+            f"Extracting data set for {etl_instance.__class__.__name__}"
+        )
+        etl_instance.extract(use_cache)
+
+
+def clear_data_source_cache(dataset_to_run: str = None) -> None:
+
+    dataset_list = _get_datasets_to_run(dataset_to_run)
+
+    for dataset in dataset_list:
+        etl_instance = _get_dataset(dataset)
+        logger.info(
+            f"Clearing data set cache for {etl_instance.__class__.__name__}"
+        )
+        etl_instance.clear_data_source_cache()


 def score_generate() -> None:
```
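The three new module-level helpers mirror the new CLI commands. A brief sketch of calling them directly, with an illustrative dataset name:

```python
from data_pipeline.etl.runner import (
    get_data_sources,
    extract_data_sources,
    clear_data_source_cache,
)

# List every DataSource declared by every ETL (or one ETL if a name is given).
for source in get_data_sources():
    print(source)

# Pre-download (and cache) the sources for a single, illustrative dataset.
extract_data_sources("calenviroscreen4", use_cache=False)

# Wipe that dataset's cached sources so the next run starts clean.
clear_data_source_cache("calenviroscreen4")
```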
data/data-pipeline/data_pipeline/etl/score/etl_score.py

```diff
@@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
 from data_pipeline.score import field_names
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+

 logger = get_module_logger(__name__)

@@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):

         self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of running the ETLs
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
         # EJSCreen csv Load
         ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
         self.ejscreen_df = pd.read_csv(
```
data/data-pipeline/data_pipeline/etl/score/etl_score_geo.py

```diff
@@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
+from data_pipeline.etl.datasource import DataSource

 logger = get_module_logger(__name__)

@@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
         self.geojson_score_usa_high: gpd.GeoDataFrame
         self.geojson_score_usa_low: gpd.GeoDataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",
```
data/data-pipeline/data_pipeline/etl/score/etl_score_post.py

```diff
@@ -2,7 +2,9 @@ import json
 from pathlib import Path

 import numpy as np
+from numpy import float64
 import pandas as pd

 from data_pipeline.content.schemas.download_schemas import CodebookConfig
 from data_pipeline.content.schemas.download_schemas import CSVConfig
 from data_pipeline.content.schemas.download_schemas import ExcelConfig
@@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
-from numpy import float64
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.downloader import Downloader

 from . import constants

@@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
         self.yaml_global_config_sort_by_label = "sort_by_label"
         # End YAML definition constants

+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the score
+
     def _extract_counties(self, county_path: Path) -> pd.DataFrame:
         logger.debug("Reading Counties CSV")
         return pd.read_csv(
@@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad):

         return df

-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",
             census_data_source=self.DATA_SOURCE,
         )

-        super().extract(
-            constants.CENSUS_COUNTIES_ZIP_URL,
-            constants.TMP_PATH,
+        # TODO: we could probably add this to the data sources for this file
+        Downloader.download_zip_file_from_url(
+            constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
         )

         self.input_counties_df = self._extract_counties(
             constants.CENSUS_COUNTIES_FILE_NAME
         )
```
data/data-pipeline/data_pipeline/etl/score/etl_utils.py

```diff
@@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
 from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
+from data_pipeline.etl.downloader import Downloader
 from data_pipeline.utils import get_module_logger

 from . import constants
@@ -48,7 +48,7 @@ def check_score_data_source(
     # download from s3 if census_data_source is aws
     if score_data_source == "aws":
         logger.debug("Fetching Score Tile data from AWS S3")
-        download_file_from_url(
+        Downloader.download_file_from_url(
             file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
         )
     else:
```
data/data-pipeline/data_pipeline/etl/sources/calenviroscreen/etl.py

```diff
@@ -1,23 +1,36 @@
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)


 class CalEnviroScreenETL(ExtractTransformLoad):
+    """California environmental screen
+
+    TODO: Need good description
+    """
+
     def __init__(self):
-        self.CALENVIROSCREEN_FTP_URL = (
+
+        # fetch
+        self.calenviroscreen_ftp_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/CalEnviroScreen_4.0_2021.zip"
         )
-        self.CALENVIROSCREEN_CSV = (
-            self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
-        )
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"

-        # Definining some variable names
+        # input
+        self.calenviroscreen_source = (
+            self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv"
+        )
+
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
+
+        # Defining some variable names
         self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
         self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
             "calenviroscreen_percentile"
@@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.calenviroscreen_ftp_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
         super().extract(
-            self.CALENVIROSCREEN_FTP_URL,
-            self.get_tmp_path(),
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df = pd.read_csv(
+            self.calenviroscreen_source, dtype={"Census Tract": "string"}
         )

     def transform(self) -> None:
         # Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
         # https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
         # Load comparison index (CalEnviroScreen 4)
-        self.df = pd.read_csv(
-            self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
-        )
-
         self.df.rename(
             columns={
@@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad):

     def load(self) -> None:
         # write nationwide csv
-        self.CSV_PATH.mkdir(parents=True, exist_ok=True)
-        self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+        self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False)
```
@ -7,8 +7,9 @@ from data_pipeline.etl.base import ValidGeoLevel
|
||||||
from data_pipeline.etl.score.etl_utils import (
|
from data_pipeline.etl.score.etl_utils import (
|
||||||
compare_to_list_of_expected_state_fips_codes,
|
compare_to_list_of_expected_state_fips_codes,
|
||||||
)
|
)
|
||||||
|
from data_pipeline.etl.datasource import DataSource
|
||||||
|
from data_pipeline.etl.datasource import FileDataSource
|
||||||
from data_pipeline.score import field_names
|
from data_pipeline.score import field_names
|
||||||
from data_pipeline.utils import download_file_from_url
|
|
||||||
from data_pipeline.utils import get_module_logger
|
from data_pipeline.utils import get_module_logger
|
||||||
from data_pipeline.config import settings
|
from data_pipeline.config import settings
|
||||||
|
|
||||||
|
@ -17,59 +18,74 @@ logger = get_module_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class CDCLifeExpectancy(ExtractTransformLoad):
|
class CDCLifeExpectancy(ExtractTransformLoad):
|
||||||
|
"""#TODO: create description"""
|
||||||
|
|
||||||
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
|
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
|
||||||
PUERTO_RICO_EXPECTED_IN_DATA = False
|
PUERTO_RICO_EXPECTED_IN_DATA = False
|
||||||
|
|
||||||
NAME = "cdc_life_expectancy"
|
NAME = "cdc_life_expectancy"
|
||||||
|
|
||||||
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
|
|
||||||
USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
|
|
||||||
else:
|
|
||||||
USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
|
|
||||||
|
|
||||||
LOAD_YAML_CONFIG: bool = False
|
LOAD_YAML_CONFIG: bool = False
|
||||||
LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
|
LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
|
||||||
INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
|
INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
|
||||||
|
|
||||||
STATES_MISSING_FROM_USA_FILE = ["23", "55"]
|
STATES_MISSING_FROM_USA_FILE = ["23", "55"]
|
||||||
|
|
||||||
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
|
|
||||||
# USA" file. Load these separately.
|
|
||||||
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
|
|
||||||
WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
|
|
||||||
MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
|
|
||||||
else:
|
|
||||||
WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
|
|
||||||
MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
|
|
||||||
|
|
||||||
TRACT_INPUT_COLUMN_NAME = "Tract ID"
|
TRACT_INPUT_COLUMN_NAME = "Tract ID"
|
||||||
STATE_INPUT_COLUMN_NAME = "STATE2KX"
|
STATE_INPUT_COLUMN_NAME = "STATE2KX"
|
||||||
|
|
||||||
raw_df: pd.DataFrame
|
raw_df: pd.DataFrame # result of extraction
|
||||||
output_df: pd.DataFrame
|
output_df: pd.DataFrame # result of transformation
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
||||||
|
# fetch
|
||||||
|
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
|
||||||
|
self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
|
||||||
|
else:
|
||||||
|
self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
|
||||||
|
|
||||||
|
# For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately.
|
||||||
|
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
|
||||||
|
self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
|
||||||
|
self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
|
||||||
|
else:
|
||||||
|
self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
|
||||||
|
self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
|
||||||
|
|
||||||
|
# input
|
||||||
|
self.usa_source = self.get_sources_path() / "US_A.CSV"
|
||||||
|
self.maine_source = self.get_sources_path() / "ME_A.CSV"
|
||||||
|
self.wisconsin_source = self.get_sources_path() / "WI_A.CSV"
|
||||||
|
|
||||||
|
# output
|
||||||
self.OUTPUT_PATH: Path = (
|
self.OUTPUT_PATH: Path = (
|
||||||
self.DATA_PATH / "dataset" / "cdc_life_expectancy"
|
self.DATA_PATH / "dataset" / "cdc_life_expectancy"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Constants for output
|
self.COLUMNS_TO_KEEP = [ # the columns to save on output
|
||||||
self.COLUMNS_TO_KEEP = [
|
|
||||||
self.GEOID_TRACT_FIELD_NAME,
|
self.GEOID_TRACT_FIELD_NAME,
|
||||||
field_names.LIFE_EXPECTANCY_FIELD,
|
field_names.LIFE_EXPECTANCY_FIELD,
|
||||||
]
|
]
|
||||||
|
|
||||||
def _download_and_prep_data(
|
def get_data_sources(self) -> [DataSource]:
|
||||||
self, file_url: str, download_file_name: pathlib.Path
|
return [
|
||||||
) -> pd.DataFrame:
|
FileDataSource(
|
||||||
download_file_from_url(
|
source=self.usa_file_url, destination=self.usa_source
|
||||||
file_url=file_url,
|
),
|
||||||
download_file_name=download_file_name,
|
FileDataSource(
|
||||||
verify=True,
|
source=self.maine_file_url, destination=self.maine_source
|
||||||
)
|
),
|
||||||
|
FileDataSource(
|
||||||
|
source=self.wisconsin_file_url,
|
||||||
|
destination=self.wisconsin_source,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame:
|
||||||
|
|
||||||
df = pd.read_csv(
|
df = pd.read_csv(
|
||||||
filepath_or_buffer=download_file_name,
|
filepath_or_buffer=file_name,
|
||||||
dtype={
|
dtype={
|
||||||
# The following need to remain as strings for all of their digits, not get converted to numbers.
|
# The following need to remain as strings for all of their digits, not get converted to numbers.
|
||||||
self.TRACT_INPUT_COLUMN_NAME: "string",
|
self.TRACT_INPUT_COLUMN_NAME: "string",
|
||||||
|
@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
def extract(self) -> None:
|
def extract(self, use_cached_data_sources: bool = False) -> None:
|
||||||
|
|
||||||
all_usa_raw_df = self._download_and_prep_data(
|
super().extract(
|
||||||
file_url=self.USA_FILE_URL,
|
use_cached_data_sources
|
||||||
download_file_name=self.get_tmp_path() / "US_A.CSV",
|
) # download and extract data sources
|
||||||
)
|
|
||||||
|
all_usa_raw_df = self._read_data(self.usa_source)
|
||||||
|
|
||||||
# Check which states are missing
|
# Check which states are missing
|
||||||
states_in_life_expectancy_usa_file = list(
|
states_in_life_expectancy_usa_file = list(
|
||||||
|
@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
||||||
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
|
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug("Downloading data for Maine")
|
maine_raw_df = self._read_data(
|
||||||
maine_raw_df = self._download_and_prep_data(
|
self.maine_source,
|
||||||
file_url=self.MAINE_FILE_URL,
|
|
||||||
download_file_name=self.get_tmp_path() / "maine.csv",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug("Downloading data for Wisconsin")
|
wisconsin_raw_df = self._read_data(self.wisconsin_source)
|
||||||
wisconsin_raw_df = self._download_and_prep_data(
|
|
||||||
file_url=self.WISCONSIN_FILE_URL,
|
|
||||||
download_file_name=self.get_tmp_path() / "wisconsin.csv",
|
|
||||||
)
|
|
||||||
|
|
||||||
combined_df = pd.concat(
|
combined_df = pd.concat(
|
||||||
objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],
|
objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],
|
||||||
|
|
|
@@ -4,14 +4,17 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource

 logger = get_module_logger(__name__)


 class CDCPlacesETL(ExtractTransformLoad):
+    """#TODO: Need description"""
+
     NAME = "cdc_places"
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
@@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad):
     CDC_MEASURE_FIELD_NAME = "Measure"

     def __init__(self):
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"

+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.CDC_PLACES_URL = (
+            self.cdc_places_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
             )
         else:
-            self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
+            self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"

+        # input
+        self.places_source = self.get_sources_path() / "census_tract.csv"
+
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"

         self.COLUMNS_TO_KEEP: typing.List[str] = [
             self.GEOID_TRACT_FIELD_NAME,
@@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        file_path = download_file_from_url(
-            file_url=self.CDC_PLACES_URL,
-            download_file_name=self.get_tmp_path() / "census_tract.csv",
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.cdc_places_url, destination=self.places_source
             )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         self.df = pd.read_csv(
-            filepath_or_buffer=file_path,
+            filepath_or_buffer=self.places_source,
             dtype={self.CDC_GEOID_FIELD_NAME: "string"},
             low_memory=False,
         )

     def transform(self) -> None:

         # Rename GEOID field
         self.df.rename(
             columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},

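Note: the CDCPlacesETL hunks above show the pattern this commit repeats across the ETLs: `get_data_sources()` declares each raw file (remote URL plus a destination under `get_sources_path()`), and `extract(use_cached_data_sources)` delegates the download to the base class so a cached copy can be reused. A minimal sketch of a new ETL written in that shape follows; the class name, URL, and file names here are illustrative placeholders, not part of this patch.

    import pandas as pd

    from data_pipeline.etl.base import ExtractTransformLoad
    from data_pipeline.etl.datasource import DataSource
    from data_pipeline.etl.datasource import FileDataSource


    class ExampleETL(ExtractTransformLoad):
        """Illustrative only: shows the get_data_sources()/extract() shape."""

        NAME = "example_dataset"

        def __init__(self):
            # fetch: where the raw data lives remotely (placeholder URL)
            self.example_url = "https://example.com/raw.csv"
            # input: where the fetched file lands under the sources path
            self.example_source = self.get_sources_path() / "raw.csv"
            # output: where the transformed dataset is written
            self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "example_dataset"
            self.df: pd.DataFrame

        def get_data_sources(self) -> [DataSource]:
            # Declare the raw file; the base class decides whether to download
            # it or reuse a locally cached copy.
            return [
                FileDataSource(
                    source=self.example_url, destination=self.example_source
                )
            ]

        def extract(self, use_cached_data_sources: bool = False) -> None:
            super().extract(
                use_cached_data_sources
            )  # download and extract data sources
            self.df = pd.read_csv(self.example_source)
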
@@ -1,6 +1,8 @@
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -11,22 +13,28 @@ logger = get_module_logger(__name__)
 class CDCSVIIndex(ExtractTransformLoad):
     """CDC SVI Index class ingests 2018 dataset located
     here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html

     Please see the README in this module for further details.
     """

     def __init__(self):
-        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"

+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.CDC_SVI_INDEX_URL = (
+            self.cdc_svi_index_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "cdc_svi_index/SVI2018_US.csv"
             )
         else:
-            self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
+            self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"

+        # input
+        self.svi_source = self.get_sources_path() / "SVI2018_US.csv"
+
+        # output
+        self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"

         self.CDC_RPL_THEMES_THRESHOLD = 0.90

         self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS"

         self.COLUMNS_TO_KEEP = [
@@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.cdc_svi_index_url, destination=self.svi_source
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         self.df = pd.read_csv(
-            filepath_or_buffer=self.CDC_SVI_INDEX_URL,
+            filepath_or_buffer=self.svi_source,
             dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
             low_memory=False,
         )
@@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad):
         )

     def load(self) -> None:
-        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

+        self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
         self.df[self.COLUMNS_TO_KEEP].to_csv(
             path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
         )

@@ -8,7 +8,8 @@ import geopandas as gpd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)

@@ -20,7 +21,7 @@ class GeoFileType(Enum):


 class CensusETL(ExtractTransformLoad):
-    SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
+    # SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
     GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
     CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
     GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
     GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"

     def __init__(self):
+
+        self.shape_file_path = self.get_sources_path() / "shp"
+
         # the fips_states_2010.csv is generated from data here
         # https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
         self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
         file_path: Path
         if file_type == GeoFileType.SHP:
             file_path = Path(
-                self.SHP_BASE_PATH
+                self.shape_file_path
                 / fips_code
                 / f"tl_2010_{fips_code}_tract10.shp"
             )
@@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
         file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
         return file_path

-    def _extract_shp(self, fips_code: str) -> None:
-        """Download the SHP file for the provided FIPS code
+    def get_data_sources(self) -> [DataSource]:

-        Args:
-            fips_code (str): the FIPS code for the region of interest
+        sources = []

-        Returns:
-            None
-        """
-        shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
+        for fips_code in self.STATE_FIPS_CODES:

-        # check if file exists
-        if not shp_file_path.is_file():
             tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
-            unzip_file_from_url(
-                tract_state_url,
-                self.TMP_PATH,
-                self.DATA_PATH / "census" / "shp" / fips_code,
+            destination_path = self.shape_file_path / fips_code
+
+            sources.append(
+                ZIPDataSource(
+                    source=tract_state_url, destination=destination_path
+                )
             )

-    def extract(self) -> None:
-        logger.debug("Extracting census data")
-        for index, fips_code in enumerate(self.STATE_FIPS_CODES):
-            logger.debug(
-                f"Extracting shape for FIPS {fips_code} – {index+1} of {len(self.STATE_FIPS_CODES)}"
-            )
-            self._extract_shp(fips_code)
+        return sources

     def _transform_to_geojson(self, fips_code: str) -> None:
         """Convert the downloaded SHP file for the associated FIPS to geojson

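Note: the CensusETL hunk above replaces the per-state `_extract_shp` download with one `ZIPDataSource` per FIPS code and leaves the fetching to the base class. The behaviour that `use_cached_data_sources` enables can be pictured with a small, purely illustrative helper; this is a sketch of the caching idea only, not the `DataSource` implementation added by this commit, and the function name is hypothetical.

    # Illustrative sketch only -- not the DataSource code in this patch.
    # It shows the caching idea behind `use_cached_data_sources`: skip the
    # network fetch when the destination file already exists on disk.
    import shutil
    import urllib.request
    from pathlib import Path


    def fetch_if_missing(source: str, destination: Path, use_cache: bool) -> Path:
        """Download `source` to `destination` unless a cached copy can be reused."""
        if use_cache and destination.exists():
            # Cached data source: reuse the file already under the sources path.
            return destination
        destination.parent.mkdir(parents=True, exist_ok=True)
        with urllib.request.urlopen(source) as response, destination.open("wb") as out:
            shutil.copyfileobj(response, out)
        return destination
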
@@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
             else:
                 fips = row[0].strip()
                 fips_state_list.append(fips)

     return fips_state_list

+

@@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.sources.census_acs.etl_imputations import (
     calculate_income_measures,
 )
-from data_pipeline.etl.sources.census_acs.etl_utils import (
-    retrieve_census_acs_data,
-)
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import CensusDataSource

 logger = get_module_logger(__name__)

@@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad):
     MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1

     def __init__(self):
+
+        self.census_acs_source = self.get_sources_path() / "acs.csv"
+
         self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
         self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
         self.EMPLOYMENT_FIELDS = [
@@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad):

         self.df: pd.DataFrame

+    def get_data_sources(self) -> [DataSource]:
+        # Define the variables to retrieve
+        variables = (
+            [
+                self.MEDIAN_INCOME_FIELD,
+                self.MEDIAN_HOUSE_VALUE_FIELD,
+            ]
+            + self.EMPLOYMENT_FIELDS
+            + self.LINGUISTIC_ISOLATION_FIELDS
+            + self.POVERTY_FIELDS
+            + self.EDUCATIONAL_FIELDS
+            + self.RE_FIELDS
+            + self.COLLEGE_ATTENDANCE_FIELDS
+            + self.AGE_INPUT_FIELDS
+        )
+
+        return [
+            CensusDataSource(
+                source=None,
+                destination=self.census_acs_source,
+                acs_year=self.ACS_YEAR,
+                variables=variables,
+                tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
+                data_path_for_fips_codes=self.DATA_PATH,
+                acs_type="acs5",
+            )
+        ]
+
     # pylint: disable=too-many-arguments
     def _merge_geojson(
         self,
@@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad):
             )
         )

-    def extract(self) -> None:
-        # Define the variables to retrieve
-        variables = (
-            [
-                self.MEDIAN_INCOME_FIELD,
-                self.MEDIAN_HOUSE_VALUE_FIELD,
-            ]
-            + self.EMPLOYMENT_FIELDS
-            + self.LINGUISTIC_ISOLATION_FIELDS
-            + self.POVERTY_FIELDS
-            + self.EDUCATIONAL_FIELDS
-            + self.RE_FIELDS
-            + self.COLLEGE_ATTENDANCE_FIELDS
-            + self.AGE_INPUT_FIELDS
-        )
+    def extract(self, use_cached_data_sources: bool = False) -> None:

-        self.df = retrieve_census_acs_data(
-            acs_year=self.ACS_YEAR,
-            variables=variables,
-            tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
-            data_path_for_fips_codes=self.DATA_PATH,
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df = pd.read_csv(
+            self.census_acs_source,
+            dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
         )

     def transform(self) -> None:

@@ -1,10 +1,9 @@
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.sources.census_acs.etl_utils import (
-    retrieve_census_acs_data,
-)
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import CensusDataSource

 logger = get_module_logger(__name__)

@@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad):
     """

     def __init__(self):
+
+        self.census_acs_source = self.get_sources_path() / "acs_2010.csv"
+
         self.ACS_YEAR = 2010
         self.ACS_TYPE = "acs5"
         self.OUTPUT_PATH = (
@@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
         # Define the variables to retrieve
         variables = (
             self.UNEMPLOYED_FIELDS
@@ -107,14 +109,27 @@ class CensusACS2010ETL(ExtractTransformLoad):
             + self.POVERTY_FIELDS
         )

-        # Use the method defined on CensusACSETL to reduce coding redundancy.
-        self.df = retrieve_census_acs_data(
+        return [
+            CensusDataSource(
+                source=None,
+                destination=self.census_acs_source,
                 acs_year=self.ACS_YEAR,
                 variables=variables,
                 tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
                 data_path_for_fips_codes=self.DATA_PATH,
                 acs_type=self.ACS_TYPE,
             )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df = pd.read_csv(
+            self.census_acs_source, dtype={"GEOID10_TRACT": "string"}
+        )

     def transform(self) -> None:
         df = self.df

@@ -1,14 +1,16 @@
+import os
 import json
 from pathlib import Path

 import numpy as np
 import pandas as pd
-import requests
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
+from data_pipeline.etl.datasource import FileDataSource

 logger = get_module_logger(__name__)

@@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
             / f"census_acs_median_income_{self.ACS_YEAR}"
         )

+        self.GEOCORR_ALL_STATES_URL = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/geocorr2014_all_states_tracts_only.csv.zip"
+        )
+        self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr"
+        self.GEOCORR_ALL_STATES_SOURCE = (
+            self.GEOCORR_ALL_STATES_PATH
+            / "geocorr2014_all_states_tracts_only.csv"
+        )
+
         # Set constants for Geocorr MSAs data.
         self.PLACE_FIELD_NAME: str = "Census Place Name"
         self.COUNTY_FIELD_NAME: str = "County Name"
@@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
             f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E"
             + "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area"
         )
+        self.MSA_MEDIAN_INCOME_SOURCE = (
+            self.get_sources_path() / "msa" / "msa_median_income.json"
+        )
         self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)"

         # Set constants for state median incomes
         self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state"
+        self.STATE_MEDIAN_INCOME_SOURCE = (
+            self.get_sources_path() / "state" / "state_median_income.json"
+        )
         self.STATE_GEOID_FIELD_NAME: str = "GEOID2"
         self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)"

@@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         self.PUERTO_RICO_S3_LINK: str = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv"
         )
+        self.PUERTO_RICO_ALL_STATES_SOURCE = (
+            self.get_sources_path() / "pr_tracts" / "pr_tracts.csv"
+        )
+
+        census_api_key = os.environ.get("CENSUS_API_KEY")
+        if census_api_key:
+            self.MSA_MEDIAN_INCOME_URL = (
+                self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}"
+            )
+            self.STATE_MEDIAN_INCOME_URL = (
+                self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}"
+            )

         # Constants for output
         self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
@@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         self.state_median_incomes: dict
         self.pr_tracts: pd.DataFrame

+    def get_data_sources(self) -> [DataSource]:
+
+        return [
+            ZIPDataSource(
+                source=self.GEOCORR_ALL_STATES_URL,
+                destination=self.GEOCORR_ALL_STATES_PATH,
+            ),
+            FileDataSource(
+                source=self.PUERTO_RICO_S3_LINK,
+                destination=self.PUERTO_RICO_ALL_STATES_SOURCE,
+            ),
+            FileDataSource(
+                source=self.MSA_MEDIAN_INCOME_URL,
+                destination=self.MSA_MEDIAN_INCOME_SOURCE,
+            ),
+            FileDataSource(
+                source=self.STATE_MEDIAN_INCOME_URL,
+                destination=self.STATE_MEDIAN_INCOME_SOURCE,
+            ),
+        ]
+
     def _transform_geocorr(self) -> pd.DataFrame:
         # Transform the geocorr data
         geocorr_df = self.raw_geocorr_df
@@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         )
         return state_median_incomes_df

-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
         # Load and clean GEOCORR data
         # Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
         # The specific query used is the following, which takes a couple of minutes to run:
@@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
         # - Core based statistical area (CBSA)
         # - CBSA Type (Metro or Micro)
         logger.debug("Starting download of 1.5MB Geocorr information.")
-        unzip_file_from_url(
-            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/geocorr2014_all_states_tracts_only.csv.zip",
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=self.get_tmp_path() / "geocorr",
-        )
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         self.raw_geocorr_df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path()
-            / "geocorr"
-            / "geocorr2014_all_states_tracts_only.csv",
+            filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE,
             # Skip second row, which has descriptions.
             skiprows=[1],
             # The following need to remain as strings for all of their digits, not get converted to numbers.
@@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
             low_memory=False,
         )

-        logger.debug("Pulling PR tract list down.")
-        # This step is necessary because PR is not in geocorr at the level that gets joined
-        pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
-        download_file_from_url(
-            file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
-        )
         self.pr_tracts = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path()
-            / "pr_tracts"
-            / "pr_tracts.csv",
+            filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE,
             # The following need to remain as strings for all of their digits, not get converted to numbers.
             dtype={"GEOID10_TRACT": str},
             low_memory=False,
         )
         self.pr_tracts["State Abbreviation"] = "PR"

-        # Download MSA median incomes
-        logger.debug("Starting download of MSA median incomes.")
-        download = requests.get(
-            self.MSA_MEDIAN_INCOME_URL,
-            verify=None,
-            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
-        )
-        self.msa_median_incomes = json.loads(download.content)
+        with self.MSA_MEDIAN_INCOME_SOURCE.open() as source:
+            self.msa_median_incomes = json.load(source)

-        # Download state median incomes
-        logger.debug("Starting download of state median incomes.")
-        download_state = requests.get(
-            self.STATE_MEDIAN_INCOME_URL,
-            verify=None,
-            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
-        )
-        self.state_median_incomes = json.loads(download_state.content)
+        with self.STATE_MEDIAN_INCOME_SOURCE.open() as source:
+            self.state_median_incomes = json.load(source)
         ## NOTE we already have PR's MI here

     def transform(self) -> None:

@@ -1,13 +1,14 @@
 import json
 from typing import List
+import os

 import numpy as np
 import pandas as pd
-import requests
-from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource

 pd.options.mode.chained_assignment = "raise"

@@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad):
             + "&for=tract:*&in=state:{}%20county:{}"
         )

+        census_api_key = os.environ.get("CENSUS_API_KEY")
+        if census_api_key:
+            self.API_URL = self.API_URL + f"&key={census_api_key}"
+
         self.final_race_fields: List[str] = []

         self.df: pd.DataFrame
         self.df_vi: pd.DataFrame
         self.df_all: pd.DataFrame

-    def extract(self) -> None:
-        dfs = []
-        dfs_vi = []
+    def get_data_sources(self) -> [DataSource]:
+
+        sources = []

         for island in self.ISLAND_TERRITORIES:
-            logger.debug(
-                f"Downloading data for state/territory {island['state_abbreviation']}"
-            )
             for county in island["county_fips"]:

                 api_url = self.API_URL.format(
                     self.DECENNIAL_YEAR,
                     island["state_abbreviation"],
@@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad):
                     island["fips"],
                     county,
                 )
-                logger.debug(f"CENSUS: Requesting {api_url}")
-                download = requests.get(
-                    api_url,
-                    timeout=settings.REQUESTS_DEFAULT_TIMOUT,
+
+                sources.append(
+                    FileDataSource(
+                        source=api_url,
+                        destination=self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json",
+                    )
                 )

+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        dfs = []
+        dfs_vi = []
+        for island in self.ISLAND_TERRITORIES:
+            logger.debug(
+                f"Downloading data for state/territory {island['state_abbreviation']}"
+            )
+            for county in island["county_fips"]:
+
                 try:
-                    df = json.loads(download.content)
+                    filepath = (
+                        self.get_sources_path()
+                        / str(self.DECENNIAL_YEAR)
+                        / island["state_abbreviation"]
+                        / island["fips"]
+                        / county
+                        / "census.json"
+                    )
+                    df = json.load(filepath.open())
                 except ValueError as e:
                     logger.error(
-                        f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
+                        f"Could not load content in census decennial ETL because {e}."
                     )

                 # First row is the header

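Note: the CensusDecennialETL hunks above swap the per-county `requests.get` calls for one `FileDataSource` per territory and county, so `extract()` only reads the JSON files already sitting under `get_sources_path()`. Reading one of those cached Census API responses back into a DataFrame looks roughly like the sketch below, under the assumption that each file holds the usual Census API list-of-lists payload with the header in the first row; the helper name is illustrative.

    import json
    from pathlib import Path

    import pandas as pd


    def load_cached_census_json(filepath: Path) -> pd.DataFrame:
        """Turn a cached Census API response (list of rows) into a DataFrame."""
        with filepath.open() as source:
            rows = json.load(source)
        # First row is the header, the remaining rows are data.
        return pd.DataFrame(rows[1:], columns=rows[0])
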
@@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)

@@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False

     def __init__(self):
+
+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.SOURCE_URL = (
+            self.child_opportunity_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "child_opportunity_index/raw.zip"
             )
         else:
-            self.SOURCE_URL = (
+            self.child_opportunity_url = (
                 "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
                 "3a0ededa30a0?format=csv"
             )

+        # input
+        self.child_opportunity_index_source = (
+            self.get_sources_path() / "raw.csv"
+        )
+
+        # output
+
         # TODO: Decide about nixing this
         self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME

@@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
         self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
         self.READING_INPUT_FIELD = "ED_READING"

+        self.raw_df: pd.DataFrame
         self.output_df: pd.DataFrame

-    def extract(self) -> None:
-        super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.child_opportunity_url,
+                destination=self.get_sources_path(),
             )
+        ]

-    def transform(self) -> None:
-        raw_df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path() / "raw.csv",
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.raw_df = pd.read_csv(
+            filepath_or_buffer=self.child_opportunity_index_source,
             # The following need to remain as strings for all of their digits, not get
             # converted to numbers.
             dtype={
@@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad):
             low_memory=False,
         )

-        output_df = raw_df.rename(
+    def transform(self) -> None:
+
+        output_df = self.raw_df.rename(
             columns={
                 self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
                 self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD,

@@ -5,22 +5,35 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)


 class DOEEnergyBurden(ExtractTransformLoad):

     NAME = "doe_energy_burden"
-    SOURCE_URL: str = (
-        settings.AWS_JUSTICE40_DATASOURCES_URL
-        + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
-    )
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     LOAD_YAML_CONFIG: bool = True

     REVISED_ENERGY_BURDEN_FIELD_NAME: str

     def __init__(self):
+
+        # fetch
+        self.doe_energy_burden_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
+        )
+
+        # input
+        self.doe_energy_burden_source = (
+            self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
+        )
+
+        # output
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "doe_energy_burden"
         )
@@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad):
         self.raw_df: pd.DataFrame
         self.output_df: pd.DataFrame

-    def transform(self) -> None:
-        raw_df: pd.DataFrame = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path()
-            / "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.doe_energy_burden_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.raw_df = pd.read_csv(
+            filepath_or_buffer=self.doe_energy_burden_source,
             # The following need to remain as strings for all of their digits, not get converted to numbers.
             dtype={
                 self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
@@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad):
             low_memory=False,
         )

+    def transform(self) -> None:
+
         logger.debug("Renaming columns and ensuring output format is correct")
-        output_df = raw_df.rename(
+        output_df = self.raw_df.rename(
             columns={
                 self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
                 self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,

@@ -3,6 +3,8 @@
 import geopandas as gpd
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad):

     NAME = "travel_composite"

-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        SOURCE_URL = (
-            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
-            "dot_travel_composite/Shapefile_and_Metadata.zip"
-        )
-    else:
-        SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
-
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad):
     TRAVEL_BURDEN_FIELD_NAME: str

     def __init__(self):
+
+        # fetch
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.travel_composite_url = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "dot_travel_composite/Shapefile_and_Metadata.zip"
+            )
+        else:
+            self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
+
+        # input
         # define the full path for the input CSV file
-        self.INPUT_SHP = (
-            self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
+        self.disadvantage_layer_shape_source = (
+            self.get_sources_path()
+            / "DOT_Disadvantage_Layer_Final_April2022.shp"
         )

+        # output
         # this is the main dataframe
         self.df: pd.DataFrame

+        self.df_dot: pd.DataFrame
+
         # Start dataset-specific vars here
         ## Average of Transportation Indicator Percentiles (calculated)
         ## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
@@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad):
         self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
         self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"

+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.travel_composite_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source)
+
     def transform(self) -> None:
         """Reads the unzipped data file into memory and applies the following
         transformations to prepare it for the load() method:
@@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad):
         - Converts to CSV
         """

-        # read in the unzipped shapefile from data source
         # reformat it to be standard df, remove unassigned rows, and
         # then rename the Census Tract column for merging
-        df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
-        df_dot = df_dot.rename(
+        self.df_dot = self.df_dot.rename(
             columns={
                 self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                 self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
             }
         ).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])

         # Assign the final df to the class' output_df for the load method
-        self.output_df = df_dot
+        self.output_df = self.df_dot

@@ -1,12 +1,15 @@
 from pathlib import Path

-import geopandas as gpd
 import pandas as pd
+import geopandas as gpd

 from data_pipeline.config import settings
-from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
-from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
+from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries

 logger = get_module_logger(__name__)

@@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad):
         "55",
     ]

-    # Define these for easy code completion
     def __init__(self):
-        self.SOURCE_URL = (
+
+        # fetch
+        self.eamlis_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/eAMLIS export of all data.tsv.zip"
         )

+        # input
+        self.eamlis_source = (
+            self.get_sources_path() / "eAMLIS export of all data.tsv"
+        )
+
+        # output
         self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME

         self.OUTPUT_PATH: Path = (
@@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad):
         ]

         self.output_df: pd.DataFrame
+        self.df: pd.DataFrame

-    def transform(self) -> None:
-        df = pd.read_csv(
-            self.get_tmp_path() / "eAMLIS export of all data.tsv",
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.eamlis_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df = pd.read_csv(
+            self.eamlis_source,
             sep="\t",
             low_memory=False,
         )

+    def transform(self) -> None:
+
         gdf = gpd.GeoDataFrame(
-            df,
+            self.df,
             geometry=gpd.points_from_xy(
-                x=df["Longitude"],
-                y=df["Latitude"],
+                x=self.df["Longitude"],
+                y=self.df["Latitude"],
             ),
             crs="epsg:4326",
         )
@@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad):
         gdf_tracts = add_tracts_for_geometries(gdf)
         gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
         gdf_tracts[self.AML_BOOLEAN] = True
+
         self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]

@@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)

@@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad):
     INPUT_GEOID_TRACT_FIELD_NAME: str = "ID"

     def __init__(self):
-        self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
-        self.EJSCREEN_CSV = (
-            self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv"
+
+        # fetch
+        self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
+
+        # input
+        self.ejscreen_source = (
+            self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv"
         )
+
+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen"

         self.df: pd.DataFrame

         self.COLUMNS_TO_KEEP = [
@@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad):
         field_names.UST_FIELD,
     ]

-    def extract(self) -> None:
-        super().extract(
-            self.EJSCREEN_FTP_URL,
-            self.get_tmp_path(),
-            verify=False,  # EPA EJScreen end point has certificate issues often
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.ejscreen_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

-    def transform(self) -> None:
         self.df = pd.read_csv(
-            self.EJSCREEN_CSV,
+            self.ejscreen_source,
             dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
             # EJSCREEN writes the word "None" for NA data.
             na_values=["None"],
             low_memory=False,
         )

+    def transform(self) -> None:
+
         # rename ID to Tract ID
         self.output_df = self.df.rename(
             columns={

@@ -1,5 +1,6 @@
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
@@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
     # Note: while we normally set these properties in `__init__`,
     # we are setting them as class properties here so they can be accessed by the
     # class method `ejscreen_areas_of_concern_data_exists`.
-    LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
-    EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
-        LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
+    EJSCREEN_AREAS_OF_CONCERN_SOURCE = (
+        ExtractTransformLoad.DATA_PATH
+        / "sources"
+        / "EJSCREENAreasOfConcernETL"
+        / "ejscreen_areas_of_concerns_indicators.csv"
     )

     def __init__(self):
+
+        # output
         self.OUTPUT_PATH = (
             self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
         )
@@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
         # TO DO: Load from actual source; the issue is that this dataset is not public for now
         self.df: pd.DataFrame

+    def get_data_sources(self) -> [DataSource]:
+        """The source for this must be downloaded and saved manually. It is not publicly available"""
+        return []
+
     @classmethod
     def ejscreen_areas_of_concern_data_exists(cls):
         """Check whether or not the EJSCREEN areas of concern data exists.
@@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
         not reference this data.

         """
-        return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file()
+        return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file()

-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE)
         if self.ejscreen_areas_of_concern_data_exists():
             logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
             self.df = pd.read_csv(
-                filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
+                filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE,
                 dtype={
                     self.GEOID_FIELD_NAME: "string",
                 },

@@ -5,18 +5,27 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)


 class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
     def __init__(self):
-        self.DEFINITION_ALTERNATIVE_FILE_URL = (
+
+        # fetch
+        self.definition_alternative_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/alternative DAC definition.csv.zip"
         )

+        # input
+        self.definition_alternative_source = (
+            self.get_sources_path() / "J40 alternative DAC definition.csv"
+        )
+
+        # output
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
         )
@@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        unzip_file_from_url(
-            file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=self.get_tmp_path()
-            / "energy_definition_alternative_draft",
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.definition_alternative_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         self.df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path()
-            / "energy_definition_alternative_draft"
-            / "J40 alternative DAC definition.csv",
+            filepath_or_buffer=self.definition_alternative_source,
             # The following need to remain as strings for all of their digits, not get converted to numbers.
             dtype={
                 self.TRACT_INPUT_COLUMN_NAME: "string",
@@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
         )

     def transform(self) -> None:
+
         self.df = self.df.rename(
             columns={
                 self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,

@@ -4,8 +4,9 @@ import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)

@@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):

     def __init__(self):

+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+            self.aggregated_rsei_score_file_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
             )
         else:
-            self.AGGREGATED_RSEI_SCORE_FILE_URL = (
+            self.aggregated_rsei_score_file_url = (
                 "http://abt-rsei.s3.amazonaws.com/microdata2019/"
                 "census_agg/CensusMicroTracts2019_2019_aggregated.zip"
             )

+        # input
+        self.aggregated_rsei_score_source = (
+            self.get_sources_path()
+            / "CensusMicroTracts2019_2019_aggregated.csv"
+        )
+
+        # output
         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
         self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
         self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
@@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.aggregated_rsei_score_file_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         # the column headers from the above dataset are actually a census tract's data at this point
         # We will use this data structure later to specify the column names
         input_columns = [
@@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
             self.NCSCORE_INPUT_FIELD,
         ]

-        unzip_file_from_url(
-            file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=self.get_tmp_path() / "epa_rsei",
-        )
-
         self.df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path()
-            / "epa_rsei"
-            / "CensusMicroTracts2019_2019_aggregated.csv",
+            filepath_or_buffer=self.aggregated_rsei_score_source,
             # The following need to remain as strings for all of their digits, not get
             # converted to numbers.
             low_memory=False,
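Several of the ETLs in this commit (EPA RSEI above, plus HUD housing, HUD RECAP, mapping inequality, and the national risk index below) keep the existing `settings.DATASOURCE_RETRIEVAL_FROM_AWS` switch but now feed whichever URL it selects into a data source object instead of downloading inline. A hedged sketch of that choice-then-declare step, reusing the EPA RSEI URLs from the hunk above with a stand-in destination directory:

```python
# Sketch of the URL-selection pattern kept by these ETLs. The URLs are the
# ones shown in the EPA RSEI hunk above; the destination directory below is a
# stand-in for what the real ETL gets from self.get_sources_path().
import pathlib

from data_pipeline.config import settings
from data_pipeline.etl.datasource import ZIPDataSource

if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
    url = (
        f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
        "epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
    )
else:
    url = (
        "http://abt-rsei.s3.amazonaws.com/microdata2019/"
        "census_agg/CensusMicroTracts2019_2019_aggregated.zip"
    )

# Only the origin of the bytes changes; the local destination (and therefore
# the cache location) is the same either way.
sources_dir = pathlib.Path("data_pipeline/data/sources/epa_rsei")  # stand-in path
rsei_source = ZIPDataSource(source=url, destination=sources_dir)
```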
@@ -5,6 +5,8 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)

@@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad):
     NAME = "fsf_flood_risk"
     # These data were emailed to the J40 team while first street got
     # their official data sharing channels setup.
-    SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
+
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     LOAD_YAML_CONFIG: bool = True

@@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad):
     SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str

     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = (
-            self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv"
+
+        # fetch
+        self.flood_tract_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
         )

-        # this is the main dataframe
-        self.df: pd.DataFrame
+        # input
+        self.flood_tract_source = (
+            self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv"
+        )

         # Start dataset-specific vars here
         self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
@@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad):
         self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
         self.CLIP_PROPERTIES_COUNT = 250

+        self.df_fsf_flood: pd.DataFrame
+
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.flood_tract_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        # read in the unzipped csv data source then rename the
+        # Census Tract column for merging
+        self.df_fsf_flood = pd.read_csv(
+            self.flood_tract_source,
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
+            low_memory=False,
+        )
+
     def transform(self) -> None:
         """Reads the unzipped data file into memory and applies the following
         transformations to prepare it for the load() method:
@@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad):
        - Calculates share of properties at risk, left-clipping number of properties at 250
        """

-        # read in the unzipped csv data source then rename the
-        # Census Tract column for merging
-        df_fsf_flood: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
-            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
-            low_memory=False,
-        )
-
-        df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[
+        self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[
             self.INPUT_GEOID_TRACT_FIELD_NAME
         ].str.zfill(11)

-        df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[
+        self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[
             self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
         ].clip(lower=self.CLIP_PROPERTIES_COUNT)

-        df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = (
-            df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
-            / df_fsf_flood[self.COUNT_PROPERTIES]
+        self.df_fsf_flood[
+            self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY
+        ] = (
+            self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
+            / self.df_fsf_flood[self.COUNT_PROPERTIES]
         )
-        df_fsf_flood[
+        self.df_fsf_flood[
             self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
         ] = (
-            df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
-            / df_fsf_flood[self.COUNT_PROPERTIES]
+            self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
+            / self.df_fsf_flood[self.COUNT_PROPERTIES]
         )

         # Assign the final df to the class' output_df for the load method with rename
-        self.output_df = df_fsf_flood.rename(
+        self.output_df = self.df_fsf_flood.rename(
             columns={
                 self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
                 self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,
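Because the First Street ETLs now read their CSVs in `extract()` rather than `transform()`, the dataframe has to survive between the two methods; the commit does this by promoting the old local variables (`df_fsf_flood` here, `df_fsf_fire` below) to instance attributes declared in `__init__`. A toy sketch of the same hand-off, with names that are illustrative rather than taken from the repo:

```python
# Toy stand-in for an ETL whose extract() feeds transform(); nothing here is
# copied from the repository except the general shape.
import pandas as pd


class TwoPhaseExample:
    def __init__(self):
        # Declared up front so both phases share the same attribute.
        self.df_example: pd.DataFrame

    def extract(self) -> None:
        # In the real ETLs this reads the downloaded/cached source file.
        self.df_example = pd.DataFrame({"geoid": ["01001020100"], "count": [300]})

    def transform(self) -> None:
        # transform() can now assume extract() already populated the frame.
        self.df_example["count_clipped"] = self.df_example["count"].clip(lower=250)


etl = TwoPhaseExample()
etl.extract()
etl.transform()
```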
@@ -4,6 +4,8 @@ import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.utils import get_module_logger

 logger = get_module_logger(__name__)
@@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
     NAME = "fsf_wildfire_risk"
     # These data were emailed to the J40 team while first street got
     # their official data sharing channels setup.
-    SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
+
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad):
     SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str

     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv"

+        # fetch
+        self.fsf_fire_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
+        )
+
+        # input
+        self.fsf_fire_source = (
+            self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv"
+        )
+
+        # output
         # this is the main dataframe
         self.df: pd.DataFrame

+        self.df_fsf_fire: pd.DataFrame
+
         # Start dataset-specific vars here
         self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
         self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag"
         self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag"
         self.CLIP_PROPERTIES_COUNT = 250

+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.fsf_fire_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df_fsf_fire = pd.read_csv(
+            self.fsf_fire_source,
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
+            low_memory=False,
+        )
+
     def transform(self) -> None:
         """Reads the unzipped data file into memory and applies the following
         transformations to prepare it for the load() method:
@@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad):
        """
         # read in the unzipped csv data source then rename the
         # Census Tract column for merging
-        df_fsf_fire: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
-            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
-            low_memory=False,
-        )
-
-        df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[
+        self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[
             self.INPUT_GEOID_TRACT_FIELD_NAME
         ].str.zfill(11)

-        df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[
+        self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[
             self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
         ].clip(lower=self.CLIP_PROPERTIES_COUNT)

-        df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
-            df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
-            / df_fsf_fire[self.COUNT_PROPERTIES]
+        self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
+            self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
+            / self.df_fsf_fire[self.COUNT_PROPERTIES]
         )
-        df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = (
-            df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
-            / df_fsf_fire[self.COUNT_PROPERTIES]
+        self.df_fsf_fire[
+            self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS
+        ] = (
+            self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
+            / self.df_fsf_fire[self.COUNT_PROPERTIES]
         )

         # Assign the final df to the class' output_df for the load method with rename
-        self.output_df = df_fsf_fire.rename(
+        self.output_df = self.df_fsf_fire.rename(
             columns={
                 self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY,
                 self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS,
@@ -3,17 +3,33 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)


 class GeoCorrETL(ExtractTransformLoad):
+
     NAME = "geocorr"
+
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False

     def __init__(self):
+
+        # fetch
+        self.geocorr_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/geocorr_urban_rural.csv.zip"
+        )
+
+        # input
+        self.geocorr_source = (
+            self.get_sources_path() / "geocorr_urban_rural.csv"
+        )
+
+        # output
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"

         # Need to change hyperlink to S3
@@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad):
         # The source data for this notebook was downloaded from GeoCorr;
         # the instructions for generating the source data is here:
         # https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787
-        self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
+        # self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
         self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
         self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
         self.COLUMNS_TO_KEEP = [
@@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        unzip_file_from_url(
-            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/geocorr_urban_rural.csv.zip",
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.geocorr_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         self.df = pd.read_csv(
-            filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv",
+            filepath_or_buffer=self.geocorr_source,
             dtype={
                 self.GEOCORR_GEOID_FIELD_NAME: "string",
             },
@@ -3,12 +3,16 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)


 class HistoricRedliningETL(ExtractTransformLoad):
+
     NAME = "historic_redlining"
+
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
     EXPECTED_MISSING_STATES = [
         "10",
@@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad):
     ]
     PUERTO_RICO_EXPECTED_IN_DATA = False
     ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False
-    SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"

     def __init__(self):
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"

-        self.HISTORIC_REDLINING_FILE_PATH = (
-            self.get_tmp_path() / "HRS_2010.xlsx"
-        )
+        # fetch
+        self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
+
+        # input
+        self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx"

         self.REDLINING_SCALAR = "Tract-level redlining score"

@@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad):
             self.GEOID_TRACT_FIELD_NAME,
             self.REDLINING_SCALAR,
         ]

         self.df: pd.DataFrame
+        self.historic_redlining_data: pd.DataFrame
+
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.hrs_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.historic_redlining_data = pd.read_excel(self.hrs_source)

     def transform(self) -> None:
         # this is obviously temporary
-        historic_redlining_data = pd.read_excel(
-            self.HISTORIC_REDLINING_FILE_PATH
-        )
-        historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
-            historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
-        )
-        historic_redlining_data = historic_redlining_data.rename(
+        self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
+            self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
+        )
+        self.historic_redlining_data = self.historic_redlining_data.rename(
             columns={"HRS2010": self.REDLINING_SCALAR}
         )

-        logger.debug(f"{historic_redlining_data.columns}")
+        logger.debug(f"{self.historic_redlining_data.columns}")

         # Calculate lots of different score thresholds for convenience
         for threshold in [3.25, 3.5, 3.75]:
-            historic_redlining_data[
+            self.historic_redlining_data[
                 f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
-            ] = (historic_redlining_data[self.REDLINING_SCALAR] >= threshold)
+            ] = (
+                self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold
+            )
             ## NOTE We add to columns to keep here
             self.COLUMNS_TO_KEEP.append(
                 f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
             )

-        self.output_df = historic_redlining_data
+        self.output_df = self.historic_redlining_data
@@ -1,8 +1,9 @@
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
 from pandas.errors import EmptyDataError

 logger = get_module_logger(__name__)
@@ -10,36 +11,46 @@ logger = get_module_logger(__name__)

 class HousingTransportationETL(ExtractTransformLoad):
     def __init__(self):
-        self.HOUSING_FTP_URL = (
-            "https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
-        )
         self.OUTPUT_PATH = (
             self.DATA_PATH / "dataset" / "housing_and_transportation_index"
         )
         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+
+        housing_url = (
+            "https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
+        )
+
+        sources = []
+
+        for fips in get_state_fips_codes(self.DATA_PATH):
+            sources.append(
+                ZIPDataSource(
+                    source=f"{housing_url}{fips}",
+                    destination=self.get_sources_path(),
+                )
+            )
+
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         # Download each state / territory individually
         dfs = []
-        zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
         for fips in get_state_fips_codes(self.DATA_PATH):
-            logger.debug(
-                f"Downloading housing data for state/territory with FIPS code {fips}"
-            )
-
-            unzip_file_from_url(
-                f"{self.HOUSING_FTP_URL}{fips}",
-                self.get_tmp_path(),
-                zip_file_dir,
-            )
-
-            # New file name:
-            tmp_csv_file_path = (
-                zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv"
+            csv_source = (
+                self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv"
             )

             try:
-                tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
+                tmp_df = pd.read_csv(filepath_or_buffer=csv_source)
             except EmptyDataError:
                 logger.error(
                     f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"
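`get_data_sources()` does not have to return a fixed list: `HousingTransportationETL` above builds one `ZIPDataSource` per state or territory FIPS code. A small sketch of the same idea; the helper function and FIPS codes below are placeholders, while the base URL is the one shown in the hunk above:

```python
# Sketch of a dynamically built source list, mirroring the
# HousingTransportationETL hunk above. build_sources() and the default FIPS
# codes are illustrative; the real ETL uses get_state_fips_codes() and
# self.get_sources_path().
import pathlib

from data_pipeline.etl.datasource import ZIPDataSource


def build_sources(sources_dir: pathlib.Path, fips_codes=("01", "02", "04")):
    base_url = "https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
    # One archive per state/territory, all extracted into the same directory.
    return [
        ZIPDataSource(source=f"{base_url}{fips}", destination=sources_dir)
        for fips in fips_codes
    ]
```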
@@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource

 logger = get_module_logger(__name__)


 class HudHousingETL(ExtractTransformLoad):

     NAME = "hud_housing"
     GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT

     def __init__(self):
-        self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"

+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.HOUSING_FTP_URL = (
+            self.housing_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "hud_housing/2014thru2018-140-csv.zip"
             )
         else:
-            self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
+            self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"

+        # source
+
+        # output
+
+        self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
+
         self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()

@@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        super().extract(
-            self.HOUSING_FTP_URL,
-            self.HOUSING_ZIP_FILE_DIR,
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.housing_url, destination=self.get_sources_path()
+            )
+        ]

     def _read_chas_table(self, file_name):
-        # New file name:
-        tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name
+        tmp_csv_file_path = self.get_sources_path() / "140" / file_name
         tmp_df = pd.read_csv(
             filepath_or_buffer=tmp_csv_file_path,
             encoding="latin-1",
@@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad):

         return tmp_df

-    def transform(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         table_8 = self._read_chas_table("Table8.csv")
         table_3 = self._read_chas_table("Table3.csv")

@@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad):
             table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME
         )

+    def transform(self) -> None:
+
         # Calculate share that lacks indoor plumbing or kitchen
         # This is computed as
         # (
@@ -1,7 +1,9 @@
 import pandas as pd
-import requests
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.utils import get_module_logger


@@ -11,44 +13,51 @@ logger = get_module_logger(__name__)
 class HudRecapETL(ExtractTransformLoad):
     def __init__(self):

+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.HUD_RECAP_CSV_URL = (
+            self.hud_recap_csv_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
             )
         else:
-            self.HUD_RECAP_CSV_URL = (
+            self.hud_recap_csv_url = (
                 "https://opendata.arcgis.com/api/v3/datasets/"
                 "56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
             )

-        self.HUD_RECAP_CSV = (
-            self.get_tmp_path()
+        # input
+        self.hud_recap_source = (
+            self.get_sources_path()
             / "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
         )

+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"

-        # Definining some variable names
+        # Defining some variable names
         self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
             "hud_recap_priority_community"
         )

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        download = requests.get(
-            self.HUD_RECAP_CSV_URL,
-            verify=None,
-            timeout=settings.REQUESTS_DEFAULT_TIMOUT,
-        )
-        file_contents = download.content
-        csv_file = open(self.HUD_RECAP_CSV, "wb")
-        csv_file.write(file_contents)
-        csv_file.close()
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.hud_recap_csv_url, destination=self.hud_recap_source
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        # Load comparison index (CalEnviroScreen 4)
+        self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})

     def transform(self) -> None:
-        # Load comparison index (CalEnviroScreen 4)
-        self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})

         self.df.rename(
             columns={
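`HudRecapETL` is one of the ETLs that switches to `FileDataSource` because its source is a plain CSV rather than an archive. Judging only from how the constructors are called in this diff (the class definitions themselves live elsewhere in the commit), `FileDataSource` appears to take the destination file path, while `ZIPDataSource` takes the directory the archive is extracted into:

```python
# Inferred from usage in this diff, not from the DataSource class definitions:
# FileDataSource points at the exact file to write, ZIPDataSource at the
# directory to unzip into. URLs and the directory below are hypothetical.
import pathlib

from data_pipeline.etl.datasource import FileDataSource, ZIPDataSource

sources_dir = pathlib.Path("data_pipeline/data/sources/example")  # stand-in for get_sources_path()

csv_source = FileDataSource(
    source="https://example.com/data.csv",  # hypothetical URL
    destination=sources_dir / "data.csv",   # a file path
)
zip_source = ZIPDataSource(
    source="https://example.com/data.zip",  # hypothetical URL
    destination=sources_dir,                # a directory
)
```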
@@ -2,6 +2,8 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger

@@ -10,16 +12,25 @@ logger = get_module_logger(__name__)

 class MappingForEJETL(ExtractTransformLoad):
     def __init__(self):
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"

-        self.MAPPING_FOR_EJ_VA_URL = (
+        # fetch
+        self.mapping_for_ej_va_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
         )
-        self.MAPPING_FOR_EJ_CO_URL = (
+        self.mapping_for_ej_co_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
         )
-        self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
-        self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"
+
+        # input
+        self.va_shp_file_source = (
+            self.get_sources_path() / "mej_virginia_7_1.shp"
+        )
+        self.co_shp_file_source = (
+            self.get_sources_path() / "mej_colorado_final.shp"
+        )
+
+        # output
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"

         # Defining variables
         self.COLUMNS_TO_KEEP = [
@@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
-        super().extract(
-            self.MAPPING_FOR_EJ_VA_URL,
-            self.get_tmp_path(),
-        )
-        super().extract(
-            self.MAPPING_FOR_EJ_CO_URL,
-            self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.mapping_for_ej_va_url,
+                destination=self.get_sources_path(),
+            ),
+            ZIPDataSource(
+                source=self.mapping_for_ej_co_url,
+                destination=self.get_sources_path(),
+            ),
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

-    def transform(self) -> None:
         # Join (here, it's just concatenating) the two dataframes from
         # CO and VA
         self.df = pd.concat(
             [
-                gpd.read_file(self.VA_SHP_FILE_PATH),
-                gpd.read_file(self.CO_SHP_FILE_PATH),
+                gpd.read_file(self.va_shp_file_source),
+                gpd.read_file(self.co_shp_file_source),
             ]
         )

+    def transform(self) -> None:
+
         # Fill Census tract to get it to be 11 digits, incl. leading 0s
         # Note that VA and CO should never have leading 0s, so this isn't
         # strictly necessary, but if in the future, there are more states
@@ -3,8 +3,9 @@ import pathlib
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings

@@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad):

     Information on the mapping of this data to census tracts is available at
     https://github.com/americanpanorama/Census_HOLC_Research.

     """

     def __init__(self):

+        # fetch
         if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.MAPPING_INEQUALITY_CSV_URL = (
+            self.mapping_inequality_csv_url = (
                 f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                 "mapping_inequality/holc_tract_lookup.csv"
             )
         else:
-            self.MAPPING_INEQUALITY_CSV_URL = (
+            self.mapping_inequality_csv_url = (
                 "https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
                 "main/2010_Census_Tracts/holc_tract_lookup.csv"
             )
-        self.MAPPING_INEQUALITY_CSV = (
-            self.get_tmp_path() / "holc_tract_lookup.csv"
-        )
-        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"

-        self.HOLC_MANUAL_MAPPING_CSV_PATH = (
+        # input
+        self.mapping_inequality_source = (
+            self.get_sources_path() / "holc_tract_lookup.csv"
+        )
+        self.holc_manual_mapping_source = (  # here be dragons – this file is pulled from a different place than most
             pathlib.Path(__file__).parent
             / "data"
             / "holc_grades_manually_mapped.csv"
         )

+        # output
+        self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
+
         # Some input field names. From documentation: 'Census Tracts were intersected
         # with HOLC Polygons. Census information can be joined via the "geoid" field.
         # There are two field "holc_prop" and "tract_prop" which give the proportion
@@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad):
         ]

         self.df: pd.DataFrame
+        self.holc_manually_mapped_df: pd.DataFrame

-    def extract(self) -> None:
-        download_file_from_url(
-            file_url=self.MAPPING_INEQUALITY_CSV_URL,
-            download_file_name=self.MAPPING_INEQUALITY_CSV,
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.mapping_inequality_csv_url,
+                destination=self.mapping_inequality_source,
+            )
+        ]

-    def transform(self) -> None:
-        df: pd.DataFrame = pd.read_csv(
-            self.MAPPING_INEQUALITY_CSV,
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df = pd.read_csv(
+            self.mapping_inequality_source,
             dtype={self.TRACT_INPUT_FIELD: "string"},
             low_memory=False,
         )

+        # Some data needs to be manually mapped to its grade.
+        # TODO: Investigate more data that may need to be manually mapped.
+        self.holc_manually_mapped_df = pd.read_csv(
+            filepath_or_buffer=self.holc_manual_mapping_source,
+            low_memory=False,
+        )
+
+    def transform(self) -> None:
+
         # rename Tract ID
-        df.rename(
+        self.df.rename(
             columns={
                 self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
             },
@@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad):
         # Keep the first character, which is the HOLC grade (A, B, C, D).
         # TODO: investigate why this dataframe triggers these pylint errors.
         # pylint: disable=unsupported-assignment-operation, unsubscriptable-object
-        df[self.HOLC_GRADE_DERIVED_FIELD] = df[
+        self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[
             self.HOLC_GRADE_AND_ID_FIELD
         ].str[0:1]

         # Remove nonsense when the field has no grade or invalid grades.
         valid_grades = ["A", "B", "C", "D"]
-        df.loc[
+        self.df.loc[
             # pylint: disable=unsubscriptable-object
-            ~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
+            ~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
             self.HOLC_GRADE_DERIVED_FIELD,
         ] = None

-        # Some data needs to be manually mapped to its grade.
-        # TODO: Investigate more data that may need to be manually mapped.
-        holc_manually_mapped_df = pd.read_csv(
-            filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
-            low_memory=False,
-        )
-
         # Join on the existing data
-        merged_df = df.merge(
-            right=holc_manually_mapped_df,
+        merged_df = self.df.merge(
+            right=self.holc_manually_mapped_df,
             on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
             how="left",
         )
@@ -4,6 +4,8 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger

@@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MARYLAND_EJSCREEN_URL = (
+
+        # fetch
+        self.maryland_ejscreen_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
         )

-        self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
+        # input
+        self.shape_files_source = self.get_sources_path() / "mdejscreen"
+
+        # output
         self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"

         self.COLUMNS_TO_KEEP = [
@@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
         ]

         self.df: pd.DataFrame
+        self.dfs_list: pd.DataFrame

-    def extract(self) -> None:
-        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
-        super().extract(
-            self.MARYLAND_EJSCREEN_URL,
-            self.get_tmp_path(),
-        )
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.maryland_ejscreen_url,
+                destination=self.get_sources_path(),
+            )
+        ]

-    def transform(self) -> None:
-        list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
+    def extract(self, use_cached_data_sources: bool = False) -> None:

-        # Ignore counties becauses this is not the level of measurement
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        logger.debug("Downloading 207MB Maryland EJSCREEN Data")
+        list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))
+
+        # Ignore counties because this is not the level of measurement
         # that is consistent with our current scoring and ranking methodology.
-        dfs_list = [
+        self.dfs_list = [
             gpd.read_file(f)
             for f in list_of_files
             if not f.endswith("CountiesEJScore.shp")
         ]

+    def transform(self) -> None:
+
         # Set the Census tract as the index and drop the geometry column
         # that produces the census tract boundaries.
         # The latter is because Geopandas raises an exception if there
         # are duplicate geometry columns.
         # Moreover, since the unit of measurement is at the tract level
         # we can consistantly merge this with other datasets
-        dfs_list = [
+        self.dfs_list = [
             df.set_index("Census_Tra").drop("geometry", axis=1)
-            for df in dfs_list
+            for df in self.dfs_list
         ]
         # pylint: disable=unsubscriptable-object
-        self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
+        self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))

         # Reset index so that we no longer have the tract as our index
         self.df = self.df.reset_index()
@@ -1,6 +1,8 @@
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger

@@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
     """

     def __init__(self):
-        self.MICHIGAN_EJSCREEN_S3_URL = (
+
+        # fetch
+        self.michigan_ejscreen_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/michigan_ejscore_12212021.csv"
         )

+        # input
+        self.michigan_ejscreen_source = (
+            self.get_sources_path() / "michigan_ejscore_12212021.csv"
+        )
+
+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"

         self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75

         self.COLUMNS_TO_KEEP = [
@@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):

         self.df: pd.DataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            FileDataSource(
+                source=self.michigan_ejscreen_url,
+                destination=self.michigan_ejscreen_source,
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         self.df = pd.read_csv(
-            filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
+            filepath_or_buffer=self.michigan_ejscreen_source,
             dtype={"GEO_ID": "string"},
             low_memory=False,
         )

     def transform(self) -> None:

         self.df.rename(
             columns={
                 "GEO_ID": self.GEOID_TRACT_FIELD_NAME,
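With the new signature, a single ETL can be exercised against sources that are already on disk by passing `use_cached_data_sources=True`. A sketch of that usage; the module path is assumed from the class name, and the cached files are expected to already sit under the ETL's sources path (otherwise `super().extract()` is expected to download them):

```python
# Sketch only: the import path is assumed from the class name shown in the
# diff above, and the run assumes the Michigan CSV is already cached under the
# ETL's get_sources_path() directory.
from data_pipeline.etl.sources.michigan_ejscreen.etl import MichiganEnviroScreenETL

etl = MichiganEnviroScreenETL()
etl.extract(use_cached_data_sources=True)  # reuse the local copy instead of re-downloading
etl.transform()
etl.load()
```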
@@ -4,6 +4,8 @@
 # pylint: disable=unsupported-assignment-operation
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
@@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
 
     NAME = "national_risk_index"
 
-    if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-        SOURCE_URL = (
-            f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
-            "national_risk_index/NRI_Table_CensusTracts.zip"
-        )
-    else:
-        SOURCE_URL = (
-            "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
-            "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
-        )
-
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     AGRIVALUE_LOWER_BOUND = 408000
 
     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
 
+        # fetch
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            self.risk_index_url = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "national_risk_index/NRI_Table_CensusTracts.zip"
+            )
+        else:
+            self.risk_index_url = (
+                "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
+                "NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
+            )
+
+        # source
+        self.risk_index_source = (
+            self.get_sources_path() / "NRI_Table_CensusTracts.csv"
+        )
+
+        # output
         # this is the main dataframe
         self.df: pd.DataFrame
+        self.df_nri: pd.DataFrame
 
         # Start dataset-specific vars here
         self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
@@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
         self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"
 
-    def extract(self) -> None:
-        """Unzips NRI dataset from the FEMA data source and writes the files
-        to the temporary data folder for use in the transform() method
-        """
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.risk_index_url, destination=self.get_sources_path()
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
 
         super().extract(
-            source_url=self.SOURCE_URL,
-            extract_path=self.get_tmp_path(),
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        # read in the unzipped csv from NRI data source then rename the
+        # Census Tract column for merging
+        self.df_nri = pd.read_csv(
+            self.risk_index_source,
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
+            na_values=["None"],
+            low_memory=False,
         )
 
     def transform(self) -> None:
@@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         Groups inside of that Tract
         """
 
-        # read in the unzipped csv from NRI data source then rename the
-        # Census Tract column for merging
-        df_nri: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
-            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
-            na_values=["None"],
-            low_memory=False,
-        )
-
-        df_nri.rename(
+        self.df_nri.rename(
             columns={
                 self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                 self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         agriculture_columns = [
             f"{x}_EALA"
             for x in disaster_categories
-            if f"{x}_EALA" in list(df_nri.columns)
+            if f"{x}_EALA" in list(self.df_nri.columns)
         ]
 
         population_columns = [
             f"{x}_EALP"
             for x in disaster_categories
-            if f"{x}_EALP" in list(df_nri.columns)
+            if f"{x}_EALP" in list(self.df_nri.columns)
         ]
 
         buildings_columns = [
             f"{x}_EALB"
             for x in disaster_categories
-            if f"{x}_EALB" in list(df_nri.columns)
+            if f"{x}_EALB" in list(self.df_nri.columns)
         ]
 
-        disaster_population_sum_series = df_nri[population_columns].sum(axis=1)
-
-        disaster_agriculture_sum_series = df_nri[agriculture_columns].sum(
+        disaster_population_sum_series = self.df_nri[population_columns].sum(
             axis=1
         )
 
-        disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1)
+        disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum(
+            axis=1
+        )
+
+        disaster_buildings_sum_series = self.df_nri[buildings_columns].sum(
+            axis=1
+        )
 
         # Population EAL Rate = Eal Valp / Population
-        df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
+        self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
             disaster_population_sum_series
-            / df_nri[self.POPULATION_INPUT_FIELD_NAME]
+            / self.df_nri[self.POPULATION_INPUT_FIELD_NAME]
         )
 
         # Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000)
         ## FORMULA ADJUSTMENT 2/17
         ## Because AGRIVALUE contains a lot of 0s, we are going to consider
         ## 90th percentile only for places that have some agrivalue at all
-        df_nri[
+        self.df_nri[
             self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
-        ] = disaster_agriculture_sum_series / df_nri[
+        ] = disaster_agriculture_sum_series / self.df_nri[
             self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME
         ].clip(
             lower=self.AGRIVALUE_LOWER_BOUND
@@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         ## Check that this clip worked -- that the only place the value has changed is when the clip took effect
         base_expectation = (
             disaster_agriculture_sum_series
-            / df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
+            / self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
         )
         assert (
-            df_nri[
-                df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            self.df_nri[
+                self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
                 != base_expectation
             ][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
             <= self.AGRIVALUE_LOWER_BOUND
@@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         )
 
         assert (
-            df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
+            self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
             != base_expectation
         ).sum() > 0, "Clipping the agrivalue did nothing!"
 
         # This produces a boolean that is True in the case of non-zero agricultural value
-        df_nri[self.CONTAINS_AGRIVALUE] = (
-            df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
+        self.df_nri[self.CONTAINS_AGRIVALUE] = (
+            self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
         )
 
         # divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)).
-        df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
+        self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
             disaster_buildings_sum_series
-            / df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
+            / self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
         )
 
         # Round all float columns to just 10 digits.
         # Note: `round` is smart enough to only apply to float columns.
-        df_nri = df_nri.round(10)
+        self.df_nri = self.df_nri.round(10)
 
         # Assign the final df to the class' output_df for the load method
-        self.output_df = df_nri
+        self.output_df = self.df_nri
 
     def load(self) -> None:
         # Suppress scientific notation.
@@ -3,6 +3,8 @@
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
 
@@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
     """ETL class for the Nature Deprived Communities dataset"""
 
     NAME = "nlcd_nature_deprived"
-    SOURCE_URL = (
-        settings.AWS_JUSTICE40_DATASOURCES_URL
-        + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
-    )
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
     LOAD_YAML_CONFIG: bool = True
@@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad):
     TRACT_PERCENT_CROPLAND_FIELD_NAME: str
 
     def __init__(self):
-        # define the full path for the input CSV file
-        self.INPUT_CSV = (
-            self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
+
+        # fetch
+        self.nature_deprived_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL
+            + "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
         )
 
+        # source
+        # define the full path for the input CSV file
+        self.nature_deprived_source = (
+            self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
+        )
+
+        # output
         # this is the main dataframe
         self.df: pd.DataFrame
 
+        self.df_ncld: pd.DataFrame
+
         # Start dataset-specific vars here
         self.PERCENT_NATURAL_FIELD_NAME = "PctNatural"
         self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv"
@@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad):
         # for area. This does indeed remove tracts from the 90th+ percentile later on
         self.TRACT_ACRES_LOWER_BOUND = 35
 
-    def transform(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.nature_deprived_url,
+                destination=self.get_sources_path(),
+            )
+        ]
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
         """Reads the unzipped data file into memory and applies the following
         transformations to prepare it for the load() method:
 
         - Renames columns as needed
         """
 
-        df_ncld: pd.DataFrame = pd.read_csv(
-            self.INPUT_CSV,
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        self.df_ncld = pd.read_csv(
+            self.nature_deprived_source,
             dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
             low_memory=False,
         )
 
-        df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
-            df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND
+    def transform(self) -> None:
+
+        self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
+            self.df_ncld[self.TRACT_ACRES_FIELD_NAME]
+            >= self.TRACT_ACRES_LOWER_BOUND
         )
-        df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
-            100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
+        self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
+            100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
        )

        # Assign the final df to the class' output_df for the load method with rename
-        self.output_df = df_ncld.rename(
+        self.output_df = self.df_ncld.rename(
            columns={
                self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME,
                self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME,
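Zipped sources follow the same shape with `ZIPDataSource`: the archive's destination is the ETL's sources folder, and `extract()` then reads the unpacked file from there. A condensed sketch, assuming a hypothetical archive and file name:

```python
import pandas as pd

from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleZipETL(ExtractTransformLoad):
    """Hypothetical ETL for a zipped CSV, following the pattern above."""

    def __init__(self):
        # placeholder archive on the Justice40 data bucket
        self.example_url = (
            settings.AWS_JUSTICE40_DATASOURCES_URL + "/example_archive.csv.zip"
        )
        # the CSV expected inside the unpacked archive (placeholder name)
        self.example_source = self.get_sources_path() / "example_archive.csv"
        self.df: pd.DataFrame

    def get_data_sources(self) -> [DataSource]:
        # The archive is downloaded and unpacked into the ETL's sources directory.
        return [
            ZIPDataSource(
                source=self.example_url,
                destination=self.get_sources_path(),
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        super().extract(
            use_cached_data_sources
        )  # download and extract data sources
        self.df = pd.read_csv(self.example_source, low_memory=False)
```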
@@ -3,9 +3,10 @@ import functools
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
 
 logger = get_module_logger(__name__)
 
@@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad):
     PUERTO_RICO_EXPECTED_IN_DATA = False
 
     def __init__(self):
+
+        # fetch
+        self.poverty_url = (
+            settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip"
+        )
+
+        # source
+        self.poverty_sources = [
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_1990_sample.csv",
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_2000_sample.csv",
+            self.get_sources_path()
+            / "ltdb_std_all_sample"
+            / "ltdb_std_2010_sample.csv",
+        ]
+
+        # output
         self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
 
         # Need to change hyperlink to S3
@@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad):
 
         self.df: pd.DataFrame
 
+    def get_data_sources(self) -> [DataSource]:
+        return [
+            ZIPDataSource(
+                source=self.poverty_url, destination=self.get_sources_path()
+            )
+        ]
+
     def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
         df = functools.reduce(
             lambda df_a, df_b: pd.merge(
@@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad):
 
         return df
 
-    def extract(self) -> None:
-        unzipped_file_path = self.get_tmp_path()
+    def extract(self, use_cached_data_sources: bool = False) -> None:
 
-        unzip_file_from_url(
-            file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
-            + "/LTDB_Std_All_Sample.zip",
-            download_path=self.get_tmp_path(),
-            unzipped_file_path=unzipped_file_path,
-        )
-
-        file_names = [
-            "ltdb_std_1990_sample.csv",
-            "ltdb_std_2000_sample.csv",
-            "ltdb_std_2010_sample.csv",
-        ]
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
 
         temporary_input_dfs = []
 
-        for file_name in file_names:
+        for file_name in self.poverty_sources:
             temporary_input_df = pd.read_csv(
-                filepath_or_buffer=unzipped_file_path
-                / f"ltdb_std_all_sample/{file_name}",
+                filepath_or_buffer=file_name,
                 dtype={
                     self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
                     self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",
@@ -1,6 +1,8 @@
 import geopandas as gpd
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.utils import get_module_logger
 
 logger = get_module_logger(__name__)
@@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad):
     """
 
     def __init__(self):
-        self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
-        self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
+
+        # input
+        self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv"
+
+        # output
         self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
         self.df: gpd.GeoDataFrame
+
+        self.tes_state_dfs = []
+
+        # config
         self.states = [
             "al",
             "az",
@@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad):
         "wy",
     ]
 
-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+
+        tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
+
+        sources = []
         for state in self.states:
+            sources.append(
+                ZIPDataSource(
+                    source=f"{tes_url}{state}.zip.zip",
+                    destination=self.get_sources_path() / state,
+                )
+            )
+
+        return sources
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
         super().extract(
-            f"{self.TES_URL}{state}.zip.zip",
-            f"{self.get_tmp_path()}/{state}",
+            use_cached_data_sources
+        )  # download and extract data sources
+
+        for state in self.states:
+            self.tes_state_dfs.append(
+                gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp")
             )
 
     def transform(self) -> None:
-        tes_state_dfs = []
-        for state in self.states:
-            tes_state_dfs.append(
-                gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
-            )
         self.df = gpd.GeoDataFrame(
-            pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
+            pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs
         )
 
         # rename ID to Tract ID
@@ -4,63 +4,57 @@ import geopandas as gpd
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import ZIPDataSource
 from data_pipeline.score import field_names
 from data_pipeline.utils import get_module_logger
-from data_pipeline.utils import unzip_file_from_url
 
 logger = get_module_logger(__name__)
 
 
 class TribalETL(ExtractTransformLoad):
     def __init__(self):
 
+        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
+
         self.GEOGRAPHIC_BASE_PATH = (
             self.DATA_PATH / "tribal" / "geographic_data"
         )
-        self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
         self.NATIONAL_TRIBAL_GEOJSON_PATH = (
             self.GEOGRAPHIC_BASE_PATH / "usa.json"
         )
 
         self.USA_TRIBAL_DF_LIST = []
 
-    def extract(self) -> None:
-        """Extract the tribal geojson zip files from Justice40 S3 data folder
-
-        Returns:
-            None
-        """
-
-        bia_shapefile_zip_url = (
+    def get_data_sources(self) -> [DataSource]:
+
+        national_lar_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/BIA_National_LAR_updated_20220929.zip"
         )
 
-        tsa_and_aian_geojson_zip_url = (
+        tsa_and_aian_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/BIA_TSA_and_AIAN_json.zip"
         )
 
-        alaska_geojson_url = (
+        alaska_native_villages_url = (
             settings.AWS_JUSTICE40_DATASOURCES_URL
             + "/Alaska_Native_Villages_json.zip"
         )
 
-        unzip_file_from_url(
-            bia_shapefile_zip_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
-        )
-
-        unzip_file_from_url(
-            tsa_and_aian_geojson_zip_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
-        )
-
-        unzip_file_from_url(
-            alaska_geojson_url,
-            self.TMP_PATH,
-            self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
-        )
+        return [
+            ZIPDataSource(
+                national_lar_url,
+                destination=self.get_sources_path() / "bia_national_lar",
+            ),
+            ZIPDataSource(
+                source=tsa_and_aian_url,
+                destination=self.get_sources_path() / "tsa_and_aian",
+            ),
+            ZIPDataSource(
+                source=alaska_native_villages_url,
+                destination=self.get_sources_path() / "alaska_native_villages",
+            ),
+        ]
 
     def _transform_bia_national_lar(self, path: Path) -> None:
         """Transform the Tribal BIA National Lar Geodataframe and appends it to the
@@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad):
         """
         # Set the filepaths:
         bia_national_lar_shapefile = (
-            self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
+            self.get_sources_path() / "bia_national_lar"
         )
 
         bia_aian_supplemental_geojson = (
-            self.GEOGRAPHIC_BASE_PATH
+            self.get_sources_path()
             / "tsa_and_aian"
             / "BIA_AIAN_Supplemental.json"
         )
 
         bia_tsa_geojson = (
-            self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
+            self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json"
         )
 
         alaska_native_villages_geojson = (
-            self.GEOGRAPHIC_BASE_PATH
+            self.get_sources_path()
             / "alaska_native_villages"
             / "AlaskaNativeVillages.gdb.geojson"
         )
@@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad):
             "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
         )
 
+        # note – this works a little different than many of the ETLs. The file
+        # being written here is used again downstream, so it's placed in a
+        # special directory.
         logger.debug("Writing national geojson file")
+        self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True)
         usa_tribal_df.to_file(
             self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
         )
@@ -4,6 +4,7 @@ import geopandas as gpd
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
 from data_pipeline.etl.sources.geo_utils import get_tract_geojson
@@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad):
         self.census_tract_gdf: gpd.GeoDataFrame
         self.tribal_gdf: gpd.GeoDataFrame
 
+    def get_data_sources(self) -> [DataSource]:
+        return []  # this uses already retrieved / calculated data
+
     @staticmethod
     def _create_string_from_list(series: pd.Series) -> str:
         """Helper method that creates a sorted string list (for tribal names)."""
@@ -89,7 +93,12 @@ class TribalOverlapETL(ExtractTransformLoad):
 
         return percentage_float
 
-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources
+
         self.census_tract_gdf = get_tract_geojson()
         self.tribal_gdf = get_tribal_geojson()
 
@@ -4,9 +4,10 @@ import geopandas as gpd
 import numpy as np
 import pandas as pd
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.datasource import FileDataSource
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
-from data_pipeline.utils import download_file_from_url
 from data_pipeline.utils import get_module_logger
 from data_pipeline.config import settings
 
@@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad):
 
     def __init__(self):
 
-        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
-            self.FILE_URL = (
-                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
-                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
-                "all_data_reported_to_Congress_in_FY2020.geojson"
-            )
-        else:
-            self.FILE_URL: str = (
-                "https://opendata.arcgis.com/api/v3/datasets/"
-                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
-                "data?format=geojson&spatialRefId=4326&where=1%3D1"
-            )
-
         self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"
 
         # Constants for output
@@ -50,17 +38,27 @@ class USArmyFUDS(ExtractTransformLoad):
             self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
             self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
         ]
-        self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"
+        self.fuds_source = self.get_sources_path() / "fuds.geojson"
 
         self.raw_df: gpd.GeoDataFrame
         self.output_df: pd.DataFrame
 
-    def extract(self) -> None:
-        download_file_from_url(
-            file_url=self.FILE_URL,
-            download_file_name=self.DOWNLOAD_FILE_NAME,
-            verify=True,
+    def get_data_sources(self) -> [DataSource]:
+
+        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
+            fuds_url = (
+                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
+                "us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
+                "all_data_reported_to_Congress_in_FY2020.geojson"
            )
+        else:
+            fuds_url: str = (
+                "https://opendata.arcgis.com/api/v3/datasets/"
+                "3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
+                "data?format=geojson&spatialRefId=4326&where=1%3D1"
+            )
+
+        return [FileDataSource(source=fuds_url, destination=self.fuds_source)]
 
     def transform(self) -> None:
         # before we try to do any transformation, get the tract data
@@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad):
 
         logger.debug("Loading FUDS data as GeoDataFrame for transform")
         raw_df = gpd.read_file(
-            filename=self.DOWNLOAD_FILE_NAME,
+            filename=self.fuds_source,
             low_memory=False,
         )
 
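The NRI and FUDS hunks keep the existing toggle between the Justice40 mirror and the upstream host, but it now lives inside `get_data_sources()` (or `__init__`) rather than in class attributes. A condensed sketch of that shape follows; everything except `settings.DATASOURCE_RETRIEVAL_FROM_AWS` and the data source classes is a placeholder.

```python
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource


class ExampleMirroredETL(ExtractTransformLoad):
    """Hypothetical ETL whose source can come from the mirror or upstream."""

    def __init__(self):
        self.example_source = self.get_sources_path() / "example.geojson"

    def get_data_sources(self) -> [DataSource]:
        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
            # copy hosted on the Justice40 data bucket (placeholder path)
            example_url = (
                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                "example/example.geojson"
            )
        else:
            # original upstream location (placeholder URL)
            example_url = "https://example.com/example.geojson"

        return [
            FileDataSource(source=example_url, destination=self.example_source)
        ]
```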
@@ -54,13 +54,19 @@ class TestCDCLifeExpectency(TestETL):
         data. A basic version of that patching is included here for classes that can use it.
         """
 
+        data_path, tmp_path = mock_paths
+        sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+        sources_path.mkdir(parents=True, exist_ok=True)
+
         with mock.patch(
-            "data_pipeline.utils.requests"
+            "data_pipeline.etl.downloader.requests"
         ) as requests_mock, mock.patch(
+            "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+        ) as sources_mock, mock.patch(
             "data_pipeline.etl.score.etl_utils.get_state_fips_codes"
         ) as mock_get_state_fips_codes:
-            tmp_path = mock_paths[1]
 
+            # requests mock
             def fake_get(url, *args, **kwargs):
                 file_path = url.split("/")[-1]
                 with open(
@@ -77,17 +83,30 @@ class TestCDCLifeExpectency(TestETL):
                 return response_mock
 
             requests_mock.get = fake_get
 
+            # fips codes mock
             mock_get_state_fips_codes.return_value = [
                 x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
             ]
 
+            # sources mock
+            sources_mock.return_value = sources_path
+
             # Instantiate the ETL class.
             etl = self._get_instance_of_etl_class()
 
             # Monkey-patch the temporary directory to the one used in the test
             etl.TMP_PATH = tmp_path
+            etl.SOURCES_PATH = data_path / "sources"
 
             # Run the extract method.
             etl.extract()
 
+            def fake_get_sources_path() -> pathlib.PosixPath:
+                return sources_path
+
+            mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
+
         return etl
 
     def test_init(self, mock_etl, mock_paths):
@@ -28,7 +28,7 @@ class TestTravelCompositeETL(TestETL):
             mock_paths=mock_paths,
         )
         df = gpd.read_file(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
         )
         assert df.shape[0] == 30
@@ -5,6 +5,7 @@ from data_pipeline.config import settings
 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.base import ValidGeoLevel
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource
 
 logger = get_module_logger(__name__)
 
@@ -30,6 +31,9 @@ class ExampleETL(ExtractTransformLoad):
             self.EXAMPLE_FIELD_NAME,
         ]
 
+    def get_data_sources(self) -> [DataSource]:
+        return []
+
     def extract(self):
         # Pretend to download zip from external URL, write it to CSV.
         zip_file_path = (
@@ -42,11 +46,11 @@ class ExampleETL(ExtractTransformLoad):
         )
 
         with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-            zip_ref.extractall(self.get_tmp_path())
+            zip_ref.extractall(self.get_sources_path())
 
     def transform(self):
         df: pd.DataFrame = pd.read_csv(
-            self.get_tmp_path() / "input.csv",
+            self.get_sources_path() / "input.csv",
             dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
             low_memory=False,
         )
@@ -124,12 +124,18 @@ class TestETL:
         data. A basic version of that patching is included here for classes that can use it.
         """
 
+        data_path, tmp_path = mock_paths
+        sources_path = data_path / "sources" / self._ETL_CLASS.__name__
+        sources_path.mkdir(parents=True, exist_ok=True)
+
         with mock.patch(
-            "data_pipeline.utils.requests"
+            "data_pipeline.etl.downloader.requests"
         ) as requests_mock, mock.patch(
+            "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
+        ) as sources_mock, mock.patch(
             "data_pipeline.etl.score.etl_utils.get_state_fips_codes"
         ) as mock_get_state_fips_codes:
-            tmp_path = mock_paths[1]
             if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
                 zip_file_fixture_src = (
                     self._DATA_DIRECTORY_FOR_TEST
@@ -145,6 +151,7 @@ class TestETL:
                     "rb",
                 ) as file:
                     file_contents = file.read()
+
                 response_mock = requests.Response()
                 response_mock.status_code = 200
                 # pylint: disable=protected-access
@@ -154,15 +161,25 @@ class TestETL:
             mock_get_state_fips_codes.return_value = [
                 x[0:2] for x in self._FIXTURES_SHARED_TRACT_IDS
             ]
 
+            # sources mock
+            sources_mock.return_value = sources_path
+
             # Instantiate the ETL class.
             etl = self._get_instance_of_etl_class()
 
             # Monkey-patch the temporary directory to the one used in the test
             etl.TMP_PATH = tmp_path
+            etl.SOURCES_PATH = data_path / "sources"
 
             # Run the extract method.
             etl.extract()
 
+            def fake_get_sources_path() -> pathlib.PosixPath:
+                return sources_path
+
+            mock.patch.object(etl, "get_sources_path", wraps=fake_get_sources_path)
+
         return etl
 
     def test_init_base(self, mock_etl, mock_paths):
@@ -263,17 +280,12 @@ class TestETL:
         file was unzipped from a "fake" downloaded zip (located in data) in a temporary path.
         """
         if self._SAMPLE_DATA_ZIP_FILE_NAME is not None:
-            tmp_path = mock_paths[1]
-
-            _ = self._setup_etl_instance_and_run_extract(
+            etl = self._setup_etl_instance_and_run_extract(
                 mock_etl=mock_etl,
                 mock_paths=mock_paths,
             )
-            assert (
-                tmp_path
-                / self._EXTRACT_TMP_FOLDER_NAME
-                / self._SAMPLE_DATA_FILE_NAME
-            ).exists()
+            assert (etl.get_sources_path()).exists()
 
     def test_extract_produces_valid_data(self, snapshot, mock_etl, mock_paths):
         """Tests the extract method.
@@ -285,8 +297,11 @@ class TestETL:
             mock_etl=mock_etl,
             mock_paths=mock_paths,
         )
+
+        data_path, tmp_path = mock_paths
+
         tmp_df = pd.read_csv(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
         )
         snapshot.snapshot_dir = self._DATA_DIRECTORY_FOR_TEST
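The test changes above swap the real download layer for fixture files by patching `requests` inside the new downloader module and redirecting `get_sources_path()` to a per-test folder. A stripped-down sketch of that setup, with the fixture handling simplified and the helper name hypothetical:

```python
import pathlib
from unittest import mock

import requests


def run_extract_against_fixture(etl_class, data_path, fixture_file: pathlib.Path):
    """Simplified sketch of the patching used by the ETL test helpers."""
    sources_path = data_path / "sources" / etl_class.__name__
    sources_path.mkdir(parents=True, exist_ok=True)

    with mock.patch(
        "data_pipeline.etl.downloader.requests"
    ) as requests_mock, mock.patch(
        "data_pipeline.etl.base.ExtractTransformLoad.get_sources_path"
    ) as sources_mock:
        # Any URL the downloader requests gets the fixture file's bytes back.
        def fake_get(url, *args, **kwargs):
            response_mock = requests.Response()
            response_mock.status_code = 200
            # pylint: disable=protected-access
            response_mock._content = fixture_file.read_bytes()
            return response_mock

        requests_mock.get = fake_get

        # All source reads and writes land in the per-test sources folder.
        sources_mock.return_value = sources_path

        etl = etl_class()
        etl.extract()

    return etl
```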
@@ -29,7 +29,7 @@ class TestHistoricRedliningETL(TestETL):
             mock_paths=mock_paths,
         )
         tmp_df = pd.read_excel(
-            etl.get_tmp_path() / self._SAMPLE_DATA_FILE_NAME,
+            etl.get_sources_path() / self._SAMPLE_DATA_FILE_NAME,
             dtype={etl.GEOID_TRACT_FIELD_NAME: str},
         )
         assert tmp_df.shape == (15, 5)
@@ -36,23 +36,12 @@ class TestNationalRiskIndexETL(TestETL):
 
     def test_init(self, mock_etl, mock_paths):
         """Tests that the mock NationalRiskIndexETL class instance was
-        initiliazed correctly.
-
-        Validates the following conditions:
-        - self.DATA_PATH points to the "data" folder in the temp directory
-        - self.TMP_PATH points to the "data/tmp" folder in the temp directory
-        - self.INPUT_PATH points to the correct path in the temp directory
-        - self.OUTPUT_PATH points to the correct path in the temp directory
+        initialized correctly.
         """
         # setup
         etl = NationalRiskIndexETL()
-        data_path, tmp_path = mock_paths
-        input_csv = (
-            tmp_path / "NationalRiskIndexETL" / "NRI_Table_CensusTracts.csv"
-        )
 
         # validation
-        assert etl.INPUT_CSV == input_csv
         assert etl.GEOID_FIELD_NAME == "GEOID10"
         assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
         assert etl.NAME == "national_risk_index"