Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs
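As context, here is a minimal sketch of the data-source abstraction the diff below relies on, inferred from the classes it imports (`DataSource`, `FileDataSource`, `ZIPDataSource`). The `fetch()` method name and its bodies are assumptions for illustration only; the real implementations live in `data_pipeline/etl/datasource.py` and may differ.

```python
# Hypothetical sketch of the data-source classes referenced throughout the diff.
from dataclasses import dataclass
from pathlib import Path
import io
import zipfile

import requests


@dataclass
class DataSource:
    """A remote resource an ETL needs, plus where to store it locally."""

    source: str        # URL of the remote resource
    destination: Path  # local path under the ETL's sources directory

    def fetch(self) -> None:
        raise NotImplementedError


@dataclass
class FileDataSource(DataSource):
    """Downloads a single file to `destination`."""

    def fetch(self) -> None:
        self.destination.parent.mkdir(parents=True, exist_ok=True)
        response = requests.get(self.source, timeout=60)
        response.raise_for_status()
        self.destination.write_bytes(response.content)


@dataclass
class ZIPDataSource(DataSource):
    """Downloads a ZIP archive and extracts it into the `destination` directory."""

    def fetch(self) -> None:
        self.destination.mkdir(parents=True, exist_ok=True)
        response = requests.get(self.source, timeout=60)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(self.destination)
```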

* Update code to be more production-ready

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source
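A hedged sketch of what the new downloader could look like once every ETL declares its sources through `get_data_sources()`: collect each ETL class's declared sources and fetch them into their destinations under `get_sources_path()` ahead of a pipeline run. The function name `pre_download_all` and the `fetch()` call are hypothetical; only `get_data_sources()` and `get_sources_path()` appear in the diff.

```python
# Hypothetical pre-download driver; not the PR's actual downloader module.
from typing import Iterable, Type

from data_pipeline.etl.base import ExtractTransformLoad


def pre_download_all(etl_classes: Iterable[Type[ExtractTransformLoad]]) -> None:
    """Fetch every data source declared by the given ETL classes."""
    for etl_class in etl_classes:
        etl = etl_class()
        for data_source in etl.get_data_sources():
            # Each DataSource carries its own URL and local destination
            # (under etl.get_sources_path()), so fetching is uniform here.
            data_source.fetch()
```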

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add Census API keys where missing

* Merge comparator changes from main into this branch for the sake of the PR

* Add a note on using the cache flag (-u) during the pipeline
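One plausible reading of how the cache flag reaches the downloads, given that every converted ETL now calls `super().extract(use_cached_data_sources)` before reading from its sources path: skip the fetch when a cached copy already exists. This is a sketch under those assumptions, not the PR's actual base-class code; the -u pipeline flag mentioned above presumably toggles this boolean.

```python
# Illustrative only; the real logic lives in data_pipeline/etl/base.py.
from pathlib import Path
from typing import List


class CachingExtractMixin:
    """Sketch of how a base extract() might honor use_cached_data_sources."""

    def get_data_sources(self) -> List["DataSource"]:  # provided by each ETL subclass
        raise NotImplementedError

    def extract(self, use_cached_data_sources: bool = False) -> None:
        for data_source in self.get_data_sources():
            destination = Path(data_source.destination)
            if use_cached_data_sources and destination.exists():
                # A cached copy is already on disk; skip the download.
                continue
            data_source.fetch()
```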
Travis Newby, 2023-03-03 12:26:24 -06:00, committed by GitHub
commit 6f39033dde
52 changed files with 1787 additions and 686 deletions

View file

@ -1,23 +1,36 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
class CalEnviroScreenETL(ExtractTransformLoad):
"""California environmental screen
TODO: Need good description
"""
def __init__(self):
self.CALENVIROSCREEN_FTP_URL = (
# fetch
self.calenviroscreen_ftp_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/CalEnviroScreen_4.0_2021.zip"
)
self.CALENVIROSCREEN_CSV = (
self.get_tmp_path() / "CalEnviroScreen_4.0_2021.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Definining some variable names
# input
self.calenviroscreen_source = (
self.get_sources_path() / "CalEnviroScreen_4.0_2021.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "calenviroscreen4"
# Defining some variable names
self.CALENVIROSCREEN_SCORE_FIELD_NAME = "calenviroscreen_score"
self.CALENVIROSCREEN_PERCENTILE_FIELD_NAME = (
"calenviroscreen_percentile"
@ -32,19 +45,28 @@ class CalEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.calenviroscreen_ftp_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
self.CALENVIROSCREEN_FTP_URL,
self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.calenviroscreen_source, dtype={"Census Tract": "string"}
)
def transform(self) -> None:
# Data from https://calenviroscreen-oehha.hub.arcgis.com/#Data, specifically:
# https://oehha.ca.gov/media/downloads/calenviroscreen/document/calenviroscreen40resultsdatadictionaryd12021.zip
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(
self.CALENVIROSCREEN_CSV, dtype={"Census Tract": "string"}
)
self.df.rename(
columns={
@ -68,5 +90,5 @@ class CalEnviroScreenETL(ExtractTransformLoad):
def load(self) -> None:
# write nationwide csv
self.CSV_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.CSV_PATH / "data06.csv", index=False)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df.to_csv(self.OUTPUT_PATH / "data06.csv", index=False)

View file

@ -7,8 +7,9 @@ from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.score.etl_utils import (
compare_to_list_of_expected_state_fips_codes,
)
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -17,59 +18,74 @@ logger = get_module_logger(__name__)
class CDCLifeExpectancy(ExtractTransformLoad):
"""#TODO: create description"""
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
NAME = "cdc_life_expectancy"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
USA_FILE_URL = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
LOAD_YAML_CONFIG: bool = False
LIFE_EXPECTANCY_FIELD_NAME = "Life expectancy (years)"
INPUT_GEOID_TRACT_FIELD_NAME = "Tract ID"
STATES_MISSING_FROM_USA_FILE = ["23", "55"]
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
# USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
WISCONSIN_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
MAINE_FILE_URL: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
MAINE_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
TRACT_INPUT_COLUMN_NAME = "Tract ID"
STATE_INPUT_COLUMN_NAME = "STATE2KX"
raw_df: pd.DataFrame
output_df: pd.DataFrame
raw_df: pd.DataFrame # result of extraction
output_df: pd.DataFrame # result of transformation
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.usa_file_url = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/US_A.CSV"
else:
self.usa_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
# For some reason, LEEP does not include Maine or Wisconsin in its "All of USA" file. Load these separately.
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.wisconsin_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/WI_A.CSV"
self.maine_file_url: str = f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/cdc_file_expectancy/ME_A.CSV"
else:
self.wisconsin_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
self.maine_file_url: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/ME_A.CSV"
# input
self.usa_source = self.get_sources_path() / "US_A.CSV"
self.maine_source = self.get_sources_path() / "ME_A.CSV"
self.wisconsin_source = self.get_sources_path() / "WI_A.CSV"
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "cdc_life_expectancy"
)
# Constants for output
self.COLUMNS_TO_KEEP = [
self.COLUMNS_TO_KEEP = [ # the columns to save on output
self.GEOID_TRACT_FIELD_NAME,
field_names.LIFE_EXPECTANCY_FIELD,
]
def _download_and_prep_data(
self, file_url: str, download_file_name: pathlib.Path
) -> pd.DataFrame:
download_file_from_url(
file_url=file_url,
download_file_name=download_file_name,
verify=True,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.usa_file_url, destination=self.usa_source
),
FileDataSource(
source=self.maine_file_url, destination=self.maine_source
),
FileDataSource(
source=self.wisconsin_file_url,
destination=self.wisconsin_source,
),
]
def _read_data(self, file_name: pathlib.Path) -> pd.DataFrame:
df = pd.read_csv(
filepath_or_buffer=download_file_name,
filepath_or_buffer=file_name,
dtype={
# The following need to remain as strings for all of their digits, not get converted to numbers.
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -80,12 +96,13 @@ class CDCLifeExpectancy(ExtractTransformLoad):
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
all_usa_raw_df = self._download_and_prep_data(
file_url=self.USA_FILE_URL,
download_file_name=self.get_tmp_path() / "US_A.CSV",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
all_usa_raw_df = self._read_data(self.usa_source)
# Check which states are missing
states_in_life_expectancy_usa_file = list(
@ -101,17 +118,11 @@ class CDCLifeExpectancy(ExtractTransformLoad):
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
)
logger.debug("Downloading data for Maine")
maine_raw_df = self._download_and_prep_data(
file_url=self.MAINE_FILE_URL,
download_file_name=self.get_tmp_path() / "maine.csv",
maine_raw_df = self._read_data(
self.maine_source,
)
logger.debug("Downloading data for Wisconsin")
wisconsin_raw_df = self._download_and_prep_data(
file_url=self.WISCONSIN_FILE_URL,
download_file_name=self.get_tmp_path() / "wisconsin.csv",
)
wisconsin_raw_df = self._read_data(self.wisconsin_source)
combined_df = pd.concat(
objs=[all_usa_raw_df, maine_raw_df, wisconsin_raw_df],

View file

@ -4,14 +4,17 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
class CDCPlacesETL(ExtractTransformLoad):
"""#TODO: Need description"""
NAME = "cdc_places"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
@ -21,15 +24,21 @@ class CDCPlacesETL(ExtractTransformLoad):
CDC_MEASURE_FIELD_NAME = "Measure"
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_PLACES_URL = (
self.cdc_places_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_places/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release.csv"
)
else:
self.CDC_PLACES_URL = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
self.cdc_places_url = "https://chronicdata.cdc.gov/api/views/cwsq-ngmh/rows.csv?accessType=DOWNLOAD"
# input
self.places_source = self.get_sources_path() / "census_tract.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_places"
self.COLUMNS_TO_KEEP: typing.List[str] = [
self.GEOID_TRACT_FIELD_NAME,
@ -43,19 +52,27 @@ class CDCPlacesETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
file_path = download_file_from_url(
file_url=self.CDC_PLACES_URL,
download_file_name=self.get_tmp_path() / "census_tract.csv",
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_places_url, destination=self.places_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=file_path,
filepath_or_buffer=self.places_source,
dtype={self.CDC_GEOID_FIELD_NAME: "string"},
low_memory=False,
)
def transform(self) -> None:
# Rename GEOID field
self.df.rename(
columns={self.CDC_GEOID_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME},

View file

@ -1,6 +1,8 @@
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -11,22 +13,28 @@ logger = get_module_logger(__name__)
class CDCSVIIndex(ExtractTransformLoad):
"""CDC SVI Index class ingests 2018 dataset located
here: https://www.atsdr.cdc.gov/placeandhealth/svi/index.html
Please see the README in this module for further details.
"""
def __init__(self):
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.CDC_SVI_INDEX_URL = (
self.cdc_svi_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"cdc_svi_index/SVI2018_US.csv"
)
else:
self.CDC_SVI_INDEX_URL = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
self.cdc_svi_index_url = "https://svi.cdc.gov/Documents/Data/2018_SVI_Data/CSV/SVI2018_US.csv"
# input
self.svi_source = self.get_sources_path() / "SVI2018_US.csv"
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "cdc_svi_index"
self.CDC_RPL_THEMES_THRESHOLD = 0.90
self.CDC_SVI_INDEX_TRACTS_FIPS_CODE = "FIPS"
self.COLUMNS_TO_KEEP = [
@ -47,9 +55,21 @@ class CDCSVIIndex(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.cdc_svi_index_url, destination=self.svi_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.CDC_SVI_INDEX_URL,
filepath_or_buffer=self.svi_source,
dtype={self.CDC_SVI_INDEX_TRACTS_FIPS_CODE: "string"},
low_memory=False,
)
@ -107,8 +127,8 @@ class CDCSVIIndex(ExtractTransformLoad):
)
def load(self) -> None:
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
self.df[self.COLUMNS_TO_KEEP].to_csv(
path_or_buf=self.OUTPUT_PATH / "usa.csv", index=False
)

View file

@ -8,7 +8,8 @@ import geopandas as gpd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -20,7 +21,7 @@ class GeoFileType(Enum):
class CensusETL(ExtractTransformLoad):
SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
# SHP_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "shp"
GEOJSON_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
CSV_BASE_PATH = ExtractTransformLoad.DATA_PATH / "census" / "csv"
GEOJSON_PATH = ExtractTransformLoad.DATA_PATH / "census" / "geojson"
@ -29,6 +30,9 @@ class CensusETL(ExtractTransformLoad):
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def __init__(self):
self.shape_file_path = self.get_sources_path() / "shp"
# the fips_states_2010.csv is generated from data here
# https://www.census.gov/geographies/reference-files/time-series/geo/tallies.html
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
@ -50,7 +54,7 @@ class CensusETL(ExtractTransformLoad):
file_path: Path
if file_type == GeoFileType.SHP:
file_path = Path(
self.SHP_BASE_PATH
self.shape_file_path
/ fips_code
/ f"tl_2010_{fips_code}_tract10.shp"
)
@ -60,33 +64,22 @@ class CensusETL(ExtractTransformLoad):
file_path = Path(self.CSV_BASE_PATH / f"{fips_code}.csv")
return file_path
def _extract_shp(self, fips_code: str) -> None:
"""Download the SHP file for the provided FIPS code
def get_data_sources(self) -> [DataSource]:
Args:
fips_code (str): the FIPS code for the region of interest
sources = []
Returns:
None
"""
shp_file_path = self._path_for_fips_file(fips_code, GeoFileType.SHP)
for fips_code in self.STATE_FIPS_CODES:
# check if file exists
if not shp_file_path.is_file():
tract_state_url = f"https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_{fips_code}_tract10.zip"
unzip_file_from_url(
tract_state_url,
self.TMP_PATH,
self.DATA_PATH / "census" / "shp" / fips_code,
destination_path = self.shape_file_path / fips_code
sources.append(
ZIPDataSource(
source=tract_state_url, destination=destination_path
)
)
def extract(self) -> None:
logger.debug("Extracting census data")
for index, fips_code in enumerate(self.STATE_FIPS_CODES):
logger.debug(
f"Extracting shape for FIPS {fips_code} {index+1} of {len(self.STATE_FIPS_CODES)}"
)
self._extract_shp(fips_code)
return sources
def _transform_to_geojson(self, fips_code: str) -> None:
"""Convert the downloaded SHP file for the associated FIPS to geojson

View file

@ -56,6 +56,7 @@ def get_state_fips_codes(data_path: Path) -> list:
else:
fips = row[0].strip()
fips_state_list.append(fips)
return fips_state_list

View file

@ -8,12 +8,11 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_imputations import (
calculate_income_measures,
)
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -28,6 +27,9 @@ class CensusACSETL(ExtractTransformLoad):
MINIMUM_POPULATION_REQUIRED_FOR_IMPUTATION = 1
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs.csv"
self.TOTAL_UNEMPLOYED_FIELD = "B23025_005E"
self.TOTAL_IN_LABOR_FORCE = "B23025_003E"
self.EMPLOYMENT_FIELDS = [
@ -311,6 +313,34 @@ class CensusACSETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type="acs5",
)
]
# pylint: disable=too-many-arguments
def _merge_geojson(
self,
@ -339,27 +369,15 @@ class CensusACSETL(ExtractTransformLoad):
)
)
def extract(self) -> None:
# Define the variables to retrieve
variables = (
[
self.MEDIAN_INCOME_FIELD,
self.MEDIAN_HOUSE_VALUE_FIELD,
]
+ self.EMPLOYMENT_FIELDS
+ self.LINGUISTIC_ISOLATION_FIELDS
+ self.POVERTY_FIELDS
+ self.EDUCATIONAL_FIELDS
+ self.RE_FIELDS
+ self.COLLEGE_ATTENDANCE_FIELDS
+ self.AGE_INPUT_FIELDS
)
def extract(self, use_cached_data_sources: bool = False) -> None:
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source,
dtype={self.GEOID_TRACT_FIELD_NAME: "string"},
)
def transform(self) -> None:

View file

@ -1,10 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.census_acs.etl_utils import (
retrieve_census_acs_data,
)
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import CensusDataSource
logger = get_module_logger(__name__)
@ -18,6 +17,9 @@ class CensusACS2010ETL(ExtractTransformLoad):
"""
def __init__(self):
self.census_acs_source = self.get_sources_path() / "acs_2010.csv"
self.ACS_YEAR = 2010
self.ACS_TYPE = "acs5"
self.OUTPUT_PATH = (
@ -99,7 +101,7 @@ class CensusACS2010ETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
# Define the variables to retrieve
variables = (
self.UNEMPLOYED_FIELDS
@ -107,13 +109,26 @@ class CensusACS2010ETL(ExtractTransformLoad):
+ self.POVERTY_FIELDS
)
# Use the method defined on CensusACSETL to reduce coding redundancy.
self.df = retrieve_census_acs_data(
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
return [
CensusDataSource(
source=None,
destination=self.census_acs_source,
acs_year=self.ACS_YEAR,
variables=variables,
tract_output_field_name=self.GEOID_TRACT_FIELD_NAME,
data_path_for_fips_codes=self.DATA_PATH,
acs_type=self.ACS_TYPE,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.census_acs_source, dtype={"GEOID10_TRACT": "string"}
)
def transform(self) -> None:

View file

@ -1,14 +1,16 @@
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.datasource import FileDataSource
logger = get_module_logger(__name__)
@ -22,6 +24,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
/ f"census_acs_median_income_{self.ACS_YEAR}"
)
self.GEOCORR_ALL_STATES_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip"
)
self.GEOCORR_ALL_STATES_PATH = self.get_sources_path() / "geocorr"
self.GEOCORR_ALL_STATES_SOURCE = (
self.GEOCORR_ALL_STATES_PATH
/ "geocorr2014_all_states_tracts_only.csv"
)
# Set constants for Geocorr MSAs data.
self.PLACE_FIELD_NAME: str = "Census Place Name"
self.COUNTY_FIELD_NAME: str = "County Name"
@ -39,10 +51,16 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E"
+ "&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area"
)
self.MSA_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "msa" / "msa_median_income.json"
)
self.MSA_INCOME_FIELD_NAME: str = f"Median household income in the past 12 months (MSA; {self.ACS_YEAR} inflation-adjusted dollars)"
# Set constants for state median incomes
self.STATE_MEDIAN_INCOME_URL: str = f"https://api.census.gov/data/{self.ACS_YEAR}/acs/acs5?get=B19013_001E&for=state"
self.STATE_MEDIAN_INCOME_SOURCE = (
self.get_sources_path() / "state" / "state_median_income.json"
)
self.STATE_GEOID_FIELD_NAME: str = "GEOID2"
self.STATE_MEDIAN_INCOME_FIELD_NAME: str = f"Median household income (State; {self.ACS_YEAR} inflation-adjusted dollars)"
@ -50,6 +68,18 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.PUERTO_RICO_S3_LINK: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/PR_census_tracts.csv"
)
self.PUERTO_RICO_ALL_STATES_SOURCE = (
self.get_sources_path() / "pr_tracts" / "pr_tracts.csv"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.MSA_MEDIAN_INCOME_URL = (
self.MSA_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
self.STATE_MEDIAN_INCOME_URL = (
self.STATE_MEDIAN_INCOME_URL + f"&key={census_api_key}"
)
# Constants for output
self.AMI_REFERENCE_FIELD_NAME: str = "AMI Reference"
@ -76,6 +106,27 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
self.state_median_incomes: dict
self.pr_tracts: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.GEOCORR_ALL_STATES_URL,
destination=self.GEOCORR_ALL_STATES_PATH,
),
FileDataSource(
source=self.PUERTO_RICO_S3_LINK,
destination=self.PUERTO_RICO_ALL_STATES_SOURCE,
),
FileDataSource(
source=self.MSA_MEDIAN_INCOME_URL,
destination=self.MSA_MEDIAN_INCOME_SOURCE,
),
FileDataSource(
source=self.STATE_MEDIAN_INCOME_URL,
destination=self.STATE_MEDIAN_INCOME_SOURCE,
),
]
def _transform_geocorr(self) -> pd.DataFrame:
# Transform the geocorr data
geocorr_df = self.raw_geocorr_df
@ -223,7 +274,8 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
)
return state_median_incomes_df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
# Load and clean GEOCORR data
# Note: this data is generated by https://mcdc.missouri.edu/applications/geocorr2014.html, at the advice of the Census.
# The specific query used is the following, which takes a couple of minutes to run:
@ -239,18 +291,12 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
# - Core based statistical area (CBSA)
# - CBSA Type (Metro or Micro)
logger.debug("Starting download of 1.5MB Geocorr information.")
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr2014_all_states_tracts_only.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "geocorr",
)
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_geocorr_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "geocorr"
/ "geocorr2014_all_states_tracts_only.csv",
filepath_or_buffer=self.GEOCORR_ALL_STATES_SOURCE,
# Skip second row, which has descriptions.
skiprows=[1],
# The following need to remain as strings for all of their digits, not get converted to numbers.
@ -264,39 +310,19 @@ class CensusACSMedianIncomeETL(ExtractTransformLoad):
low_memory=False,
)
logger.debug("Pulling PR tract list down.")
# This step is necessary because PR is not in geocorr at the level that gets joined
pr_file = self.get_tmp_path() / "pr_tracts" / "pr_tracts.csv"
download_file_from_url(
file_url=self.PUERTO_RICO_S3_LINK, download_file_name=pr_file
)
self.pr_tracts = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "pr_tracts"
/ "pr_tracts.csv",
filepath_or_buffer=self.PUERTO_RICO_ALL_STATES_SOURCE,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={"GEOID10_TRACT": str},
low_memory=False,
)
self.pr_tracts["State Abbreviation"] = "PR"
# Download MSA median incomes
logger.debug("Starting download of MSA median incomes.")
download = requests.get(
self.MSA_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.msa_median_incomes = json.loads(download.content)
with self.MSA_MEDIAN_INCOME_SOURCE.open() as source:
self.msa_median_incomes = json.load(source)
# Download state median incomes
logger.debug("Starting download of state median incomes.")
download_state = requests.get(
self.STATE_MEDIAN_INCOME_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
self.state_median_incomes = json.loads(download_state.content)
with self.STATE_MEDIAN_INCOME_SOURCE.open() as source:
self.state_median_incomes = json.load(source)
## NOTE we already have PR's MI here
def transform(self) -> None:

View file

@ -1,13 +1,14 @@
import json
from typing import List
import os
import numpy as np
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
pd.options.mode.chained_assignment = "raise"
@ -342,20 +343,23 @@ class CensusDecennialETL(ExtractTransformLoad):
+ "&for=tract:*&in=state:{}%20county:{}"
)
census_api_key = os.environ.get("CENSUS_API_KEY")
if census_api_key:
self.API_URL = self.API_URL + f"&key={census_api_key}"
self.final_race_fields: List[str] = []
self.df: pd.DataFrame
self.df_vi: pd.DataFrame
self.df_all: pd.DataFrame
def extract(self) -> None:
dfs = []
dfs_vi = []
def get_data_sources(self) -> [DataSource]:
sources = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
api_url = self.API_URL.format(
self.DECENNIAL_YEAR,
island["state_abbreviation"],
@ -363,17 +367,48 @@ class CensusDecennialETL(ExtractTransformLoad):
island["fips"],
county,
)
logger.debug(f"CENSUS: Requesting {api_url}")
download = requests.get(
api_url,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
sources.append(
FileDataSource(
source=api_url,
destination=self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json",
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
dfs = []
dfs_vi = []
for island in self.ISLAND_TERRITORIES:
logger.debug(
f"Downloading data for state/territory {island['state_abbreviation']}"
)
for county in island["county_fips"]:
try:
df = json.loads(download.content)
filepath = (
self.get_sources_path()
/ str(self.DECENNIAL_YEAR)
/ island["state_abbreviation"]
/ island["fips"]
/ county
/ "census.json"
)
df = json.load(filepath.open())
except ValueError as e:
logger.error(
f"Could not load content in census decennial ETL because {e}. Content is {download.content}."
f"Could not load content in census decennial ETL because {e}."
)
# First row is the header

View file

@ -5,6 +5,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -38,17 +40,26 @@ class ChildOpportunityIndex(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.SOURCE_URL = (
self.child_opportunity_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"child_opportunity_index/raw.zip"
)
else:
self.SOURCE_URL = (
self.child_opportunity_url = (
"https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
"3a0ededa30a0?format=csv"
)
# input
self.child_opportunity_index_source = (
self.get_sources_path() / "raw.csv"
)
# output
# TODO: Decide about nixing this
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
@ -62,17 +73,25 @@ class ChildOpportunityIndex(ExtractTransformLoad):
self.IMPENETRABLE_SURFACES_INPUT_FIELD = "HE_GREEN"
self.READING_INPUT_FIELD = "ED_READING"
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.child_opportunity_url,
destination=self.get_sources_path(),
)
]
def transform(self) -> None:
raw_df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "raw.csv",
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.child_opportunity_index_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
dtype={
@ -81,7 +100,9 @@ class ChildOpportunityIndex(ExtractTransformLoad):
low_memory=False,
)
output_df = raw_df.rename(
def transform(self) -> None:
output_df = self.raw_df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,
self.EXTREME_HEAT_INPUT_FIELD: self.EXTREME_HEAT_FIELD,

View file

@ -5,22 +5,35 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class DOEEnergyBurden(ExtractTransformLoad):
NAME = "doe_energy_burden"
SOURCE_URL: str = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
REVISED_ENERGY_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
self.doe_energy_burden_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/DOE_LEAD_AMI_TRACT_2018_ALL.csv.zip"
)
# input
self.doe_energy_burden_source = (
self.get_sources_path() / "DOE_LEAD_AMI_TRACT_2018_ALL.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "doe_energy_burden"
)
@ -29,10 +42,22 @@ class DOEEnergyBurden(ExtractTransformLoad):
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
def transform(self) -> None:
raw_df: pd.DataFrame = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "DOE_LEAD_AMI_TRACT_2018_ALL.csv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.doe_energy_burden_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.raw_df = pd.read_csv(
filepath_or_buffer=self.doe_energy_burden_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.INPUT_GEOID_TRACT_FIELD_NAME: "string",
@ -40,8 +65,10 @@ class DOEEnergyBurden(ExtractTransformLoad):
low_memory=False,
)
def transform(self) -> None:
logger.debug("Renaming columns and ensuring output format is correct")
output_df = raw_df.rename(
output_df = self.raw_df.rename(
columns={
self.INPUT_ENERGY_BURDEN_FIELD_NAME: self.REVISED_ENERGY_BURDEN_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,

View file

@ -3,6 +3,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -15,14 +17,6 @@ class TravelCompositeETL(ExtractTransformLoad):
NAME = "travel_composite"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
SOURCE_URL = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -31,14 +25,29 @@ class TravelCompositeETL(ExtractTransformLoad):
TRAVEL_BURDEN_FIELD_NAME: str
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.travel_composite_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"dot_travel_composite/Shapefile_and_Metadata.zip"
)
else:
self.travel_composite_url = "https://www.transportation.gov/sites/dot.gov/files/Shapefile_and_Metadata.zip"
# input
# define the full path for the input CSV file
self.INPUT_SHP = (
self.get_tmp_path() / "DOT_Disadvantage_Layer_Final_April2022.shp"
self.disadvantage_layer_shape_source = (
self.get_sources_path()
/ "DOT_Disadvantage_Layer_Final_April2022.shp"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_dot: pd.DataFrame
# Start dataset-specific vars here
## Average of Transportation Indicator Percentiles (calculated)
## Calculated: Average of (EPL_TCB+EPL_NWKI+EPL_NOVEH+EPL_COMMUTE) excluding NULLS
@ -46,6 +55,22 @@ class TravelCompositeETL(ExtractTransformLoad):
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME = "Transp_TH"
self.INPUT_GEOID_TRACT_FIELD_NAME = "FIPS"
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.travel_composite_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_dot = gpd.read_file(self.disadvantage_layer_shape_source)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -54,15 +79,15 @@ class TravelCompositeETL(ExtractTransformLoad):
- Converts to CSV
"""
# read in the unzipped shapefile from data source
# reformat it to be standard df, remove unassigned rows, and
# then rename the Census Tract column for merging
df_dot: pd.DataFrame = gpd.read_file(self.INPUT_SHP)
df_dot = df_dot.rename(
self.df_dot = self.df_dot.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_TRAVEL_DISADVANTAGE_FIELD_NAME: self.TRAVEL_BURDEN_FIELD_NAME,
}
).dropna(subset=[self.GEOID_TRACT_FIELD_NAME])
# Assign the final df to the class' output_df for the load method
self.output_df = df_dot
self.output_df = self.df_dot

View file

@ -1,12 +1,15 @@
from pathlib import Path
import geopandas as gpd
import pandas as pd
import geopandas as gpd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
logger = get_module_logger(__name__)
@ -39,13 +42,20 @@ class AbandonedMineETL(ExtractTransformLoad):
"55",
]
# Define these for easy code completion
def __init__(self):
self.SOURCE_URL = (
# fetch
self.eamlis_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/eAMLIS export of all data.tsv.zip"
)
# input
self.eamlis_source = (
self.get_sources_path() / "eAMLIS export of all data.tsv"
)
# output
self.TRACT_INPUT_COLUMN_NAME = self.INPUT_GEOID_TRACT_FIELD_NAME
self.OUTPUT_PATH: Path = (
@ -58,18 +68,34 @@ class AbandonedMineETL(ExtractTransformLoad):
]
self.output_df: pd.DataFrame
self.df: pd.DataFrame
def transform(self) -> None:
df = pd.read_csv(
self.get_tmp_path() / "eAMLIS export of all data.tsv",
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.eamlis_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.eamlis_source,
sep="\t",
low_memory=False,
)
def transform(self) -> None:
gdf = gpd.GeoDataFrame(
df,
self.df,
geometry=gpd.points_from_xy(
x=df["Longitude"],
y=df["Latitude"],
x=self.df["Longitude"],
y=self.df["Latitude"],
),
crs="epsg:4326",
)
@ -77,4 +103,5 @@ class AbandonedMineETL(ExtractTransformLoad):
gdf_tracts = add_tracts_for_geometries(gdf)
gdf_tracts = gdf_tracts.drop_duplicates(self.GEOID_TRACT_FIELD_NAME)
gdf_tracts[self.AML_BOOLEAN] = True
self.output_df = gdf_tracts[self.COLUMNS_TO_KEEP]

View file

@ -3,6 +3,8 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,11 +17,18 @@ class EJSCREENETL(ExtractTransformLoad):
INPUT_GEOID_TRACT_FIELD_NAME: str = "ID"
def __init__(self):
self.EJSCREEN_FTP_URL = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
self.EJSCREEN_CSV = (
self.get_tmp_path() / "EJSCREEN_2021_USPR_Tracts.csv"
# fetch
self.ejscreen_url = "https://gaftp.epa.gov/EJSCREEN/2021/EJSCREEN_2021_USPR_Tracts.csv.zip"
# input
self.ejscreen_source = (
self.get_sources_path() / "EJSCREEN_2021_USPR_Tracts.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "ejscreen"
self.df: pd.DataFrame
self.COLUMNS_TO_KEEP = [
@ -43,22 +52,29 @@ class EJSCREENETL(ExtractTransformLoad):
field_names.UST_FIELD,
]
def extract(self) -> None:
super().extract(
self.EJSCREEN_FTP_URL,
self.get_tmp_path(),
verify=False, # EPA EJScreen end point has certificate issues often
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.ejscreen_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
self.df = pd.read_csv(
self.EJSCREEN_CSV,
self.ejscreen_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
# EJSCREEN writes the word "None" for NA data.
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
# rename ID to Tract ID
self.output_df = self.df.rename(
columns={

View file

@ -1,5 +1,6 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -9,12 +10,17 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# Note: while we normally set these properties in `__init__`,
# we are setting them as class properties here so they can be accessed by the
# class method `ejscreen_areas_of_concern_data_exists`.
LOCAL_CSV_PATH = ExtractTransformLoad.DATA_PATH / "local"
EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA = (
LOCAL_CSV_PATH / "ejscreen_areas_of_concerns_indicators.csv"
EJSCREEN_AREAS_OF_CONCERN_SOURCE = (
ExtractTransformLoad.DATA_PATH
/ "sources"
/ "EJSCREENAreasOfConcernETL"
/ "ejscreen_areas_of_concerns_indicators.csv"
)
def __init__(self):
# output
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "ejscreen_areas_of_concern"
)
@ -22,6 +28,10 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
# TO DO: Load from actual source; the issue is that this dataset is not public for now
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
"""The source for this must be downloaded and saved manually. It is not publicly available"""
return []
@classmethod
def ejscreen_areas_of_concern_data_exists(cls):
"""Check whether or not the EJSCREEN areas of concern data exists.
@ -35,13 +45,19 @@ class EJSCREENAreasOfConcernETL(ExtractTransformLoad):
not reference this data.
"""
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA.is_file()
return cls.EJSCREEN_AREAS_OF_CONCERN_SOURCE.is_file()
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
logger.info(self.EJSCREEN_AREAS_OF_CONCERN_SOURCE)
if self.ejscreen_areas_of_concern_data_exists():
logger.debug("Loading EJSCREEN Areas of Concern Data Locally")
self.df = pd.read_csv(
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE_DATA,
filepath_or_buffer=self.EJSCREEN_AREAS_OF_CONCERN_SOURCE,
dtype={
self.GEOID_FIELD_NAME: "string",
},

View file

@ -5,18 +5,27 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
def __init__(self):
self.DEFINITION_ALTERNATIVE_FILE_URL = (
# fetch
self.definition_alternative_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/alternative DAC definition.csv.zip"
)
# input
self.definition_alternative_source = (
self.get_sources_path() / "J40 alternative DAC definition.csv"
)
# output
self.OUTPUT_PATH: Path = (
self.DATA_PATH / "dataset" / "energy_definition_alternative_draft"
)
@ -48,18 +57,22 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=self.DEFINITION_ALTERNATIVE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path()
/ "energy_definition_alternative_draft",
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.definition_alternative_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "energy_definition_alternative_draft"
/ "J40 alternative DAC definition.csv",
filepath_or_buffer=self.definition_alternative_source,
# The following need to remain as strings for all of their digits, not get converted to numbers.
dtype={
self.TRACT_INPUT_COLUMN_NAME: "string",
@ -68,6 +81,7 @@ class EnergyDefinitionAlternativeDraft(ExtractTransformLoad):
)
def transform(self) -> None:
self.df = self.df.rename(
columns={
self.TRACT_INPUT_COLUMN_NAME: self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,8 +4,9 @@ import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -23,17 +24,25 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"epa_rsei/CensusMicroTracts2019_2019_aggregated.zip"
)
else:
self.AGGREGATED_RSEI_SCORE_FILE_URL = (
self.aggregated_rsei_score_file_url = (
"http://abt-rsei.s3.amazonaws.com/microdata2019/"
"census_agg/CensusMicroTracts2019_2019_aggregated.zip"
)
# input
self.aggregated_rsei_score_source = (
self.get_sources_path()
/ "CensusMicroTracts2019_2019_aggregated.csv"
)
# output
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "epa_rsei"
self.EPA_RSEI_SCORE_THRESHOLD_CUTOFF = 0.75
self.TRACT_INPUT_COLUMN_NAME = "GEOID10"
@ -64,7 +73,20 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.aggregated_rsei_score_file_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# the column headers from the above dataset are actually a census tract's data at this point
# We will use this data structure later to specify the column names
input_columns = [
@ -79,16 +101,8 @@ class EPARiskScreeningEnvironmentalIndicatorsETL(ExtractTransformLoad):
self.NCSCORE_INPUT_FIELD,
]
unzip_file_from_url(
file_url=self.AGGREGATED_RSEI_SCORE_FILE_URL,
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path() / "epa_rsei",
)
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path()
/ "epa_rsei"
/ "CensusMicroTracts2019_2019_aggregated.csv",
filepath_or_buffer=self.aggregated_rsei_score_source,
# The following need to remain as strings for all of their digits, not get
# converted to numbers.
low_memory=False,

View file

@ -5,6 +5,8 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class FloodRiskETL(ExtractTransformLoad):
NAME = "fsf_flood_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
LOAD_YAML_CONFIG: bool = True
@ -27,13 +29,16 @@ class FloodRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "fsf_flood" / "flood-tract2010.csv"
# fetch
self.flood_tract_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_flood.zip"
)
# this is the main dataframe
self.df: pd.DataFrame
# input
self.flood_tract_source = (
self.get_sources_path() / "fsf_flood" / "flood-tract2010.csv"
)
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
@ -41,6 +46,29 @@ class FloodRiskETL(ExtractTransformLoad):
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "mid_depth_100_year30"
self.CLIP_PROPERTIES_COUNT = 250
self.df_fsf_flood: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.flood_tract_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv data source then rename the
# Census Tract column for merging
self.df_fsf_flood = pd.read_csv(
self.flood_tract_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -49,35 +77,29 @@ class FloodRiskETL(ExtractTransformLoad):
- Calculates share of properties at risk, left-clipping number of properties at 250
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_flood: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = df_fsf_flood[
self.df_fsf_flood[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_flood[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_flood[self.COUNT_PROPERTIES] = df_fsf_flood[
self.df_fsf_flood[self.COUNT_PROPERTIES] = self.df_fsf_flood[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_flood[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_TODAY
] = (
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
df_fsf_flood[
self.df_fsf_flood[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS
] = (
df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_flood[self.COUNT_PROPERTIES]
self.df_fsf_flood[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_flood[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_flood.rename(
self.output_df = self.df_fsf_flood.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FLOODING_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FLOODING_IN_30_YEARS,

View file

@ -4,6 +4,8 @@ import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -15,7 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
NAME = "fsf_wildfire_risk"
# These data were emailed to the J40 team while first street got
# their official data sharing channels setup.
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,18 +31,48 @@ class WildfireRiskETL(ExtractTransformLoad):
SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "fsf_fire" / "fire-tract2010.csv"
# fetch
self.fsf_fire_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
)
# input
self.fsf_fire_source = (
self.get_sources_path() / "fsf_fire" / "fire-tract2010.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_fsf_fire: pd.DataFrame
# Start dataset-specific vars here
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME = "count_properties"
self.COUNT_PROPERTIES_AT_RISK_TODAY = "burnprob_year00_flag"
self.COUNT_PROPERTIES_AT_RISK_30_YEARS = "burnprob_year30_flag"
self.CLIP_PROPERTIES_COUNT = 250
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.fsf_fire_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_fsf_fire = pd.read_csv(
self.fsf_fire_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
def transform(self) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
@ -50,31 +82,28 @@ class WildfireRiskETL(ExtractTransformLoad):
"""
# read in the unzipped csv data source then rename the
# Census Tract column for merging
df_fsf_fire: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = df_fsf_fire[
self.df_fsf_fire[self.GEOID_TRACT_FIELD_NAME] = self.df_fsf_fire[
self.INPUT_GEOID_TRACT_FIELD_NAME
].str.zfill(11)
df_fsf_fire[self.COUNT_PROPERTIES] = df_fsf_fire[
self.df_fsf_fire[self.COUNT_PROPERTIES] = self.df_fsf_fire[
self.COUNT_PROPERTIES_NATIVE_FIELD_NAME
].clip(lower=self.CLIP_PROPERTIES_COUNT)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_TODAY] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_TODAY]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
df_fsf_fire[self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS] = (
df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ df_fsf_fire[self.COUNT_PROPERTIES]
self.df_fsf_fire[
self.SHARE_OF_PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS
] = (
self.df_fsf_fire[self.COUNT_PROPERTIES_AT_RISK_30_YEARS]
/ self.df_fsf_fire[self.COUNT_PROPERTIES]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_fsf_fire.rename(
self.output_df = self.df_fsf_fire.rename(
columns={
self.COUNT_PROPERTIES_AT_RISK_TODAY: self.PROPERTIES_AT_RISK_FROM_FIRE_TODAY,
self.COUNT_PROPERTIES_AT_RISK_30_YEARS: self.PROPERTIES_AT_RISK_FROM_FIRE_IN_30_YEARS,

View file

@ -3,17 +3,33 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class GeoCorrETL(ExtractTransformLoad):
NAME = "geocorr"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.geocorr_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip"
)
# input
self.geocorr_source = (
self.get_sources_path() / "geocorr_urban_rural.csv"
)
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "geocorr"
# Need to change hyperlink to S3
@ -23,7 +39,7 @@ class GeoCorrETL(ExtractTransformLoad):
# The source data for this notebook was downloaded from GeoCorr;
# the instructions for generating the source data is here:
# https://github.com/usds/justice40-tool/issues/355#issuecomment-920241787
self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
# self.GEOCORR_PLACES_URL = "https://justice40-data.s3.amazonaws.com/data-sources/geocorr_urban_rural.csv.zip"
self.GEOCORR_GEOID_FIELD_NAME = "GEOID10_TRACT"
self.URBAN_HEURISTIC_FIELD_NAME = "Urban Heuristic Flag"
self.COLUMNS_TO_KEEP = [
@ -33,16 +49,21 @@ class GeoCorrETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/geocorr_urban_rural.csv.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.geocorr_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.get_tmp_path() / "geocorr_urban_rural.csv",
filepath_or_buffer=self.geocorr_source,
dtype={
self.GEOCORR_GEOID_FIELD_NAME: "string",
},

View file

@ -3,12 +3,16 @@ from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HistoricRedliningETL(ExtractTransformLoad):
NAME = "historic_redlining"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
EXPECTED_MISSING_STATES = [
"10",
@ -25,14 +29,14 @@ class HistoricRedliningETL(ExtractTransformLoad):
]
PUERTO_RICO_EXPECTED_IN_DATA = False
ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = False
SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "historic_redlining"
self.HISTORIC_REDLINING_FILE_PATH = (
self.get_tmp_path() / "HRS_2010.xlsx"
)
# fetch
self.hrs_url = settings.AWS_JUSTICE40_DATASOURCES_URL + "/HRS_2010.zip"
# input
self.hrs_source = self.get_sources_path() / "HRS_2010.xlsx"
self.REDLINING_SCALAR = "Tract-level redlining score"
@ -40,30 +44,47 @@ class HistoricRedliningETL(ExtractTransformLoad):
self.GEOID_TRACT_FIELD_NAME,
self.REDLINING_SCALAR,
]
self.df: pd.DataFrame
self.historic_redlining_data: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.hrs_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.historic_redlining_data = pd.read_excel(self.hrs_source)
def transform(self) -> None:
# this is obviously temporary
historic_redlining_data = pd.read_excel(
self.HISTORIC_REDLINING_FILE_PATH
self.historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
self.historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data[self.GEOID_TRACT_FIELD_NAME] = (
historic_redlining_data["GEOID10"].astype(str).str.zfill(11)
)
historic_redlining_data = historic_redlining_data.rename(
self.historic_redlining_data = self.historic_redlining_data.rename(
columns={"HRS2010": self.REDLINING_SCALAR}
)
logger.debug(f"{historic_redlining_data.columns}")
logger.debug(f"{self.historic_redlining_data.columns}")
# Calculate lots of different score thresholds for convenience
for threshold in [3.25, 3.5, 3.75]:
historic_redlining_data[
self.historic_redlining_data[
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
] = (historic_redlining_data[self.REDLINING_SCALAR] >= threshold)
] = (
self.historic_redlining_data[self.REDLINING_SCALAR] >= threshold
)
## NOTE: We add to the columns to keep here
self.COLUMNS_TO_KEEP.append(
f"{self.REDLINING_SCALAR} meets or exceeds {round(threshold, 2)}"
)
self.output_df = historic_redlining_data
self.output_df = self.historic_redlining_data

View file

@ -1,8 +1,9 @@
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
from pandas.errors import EmptyDataError
logger = get_module_logger(__name__)
@ -10,36 +11,46 @@ logger = get_module_logger(__name__)
class HousingTransportationETL(ExtractTransformLoad):
def __init__(self):
self.HOUSING_FTP_URL = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
self.OUTPUT_PATH = (
self.DATA_PATH / "dataset" / "housing_and_transportation_index"
)
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
housing_url = (
"https://htaindex.cnt.org/download/download.php?focus=tract&geoid="
)
sources = []
for fips in get_state_fips_codes(self.DATA_PATH):
sources.append(
ZIPDataSource(
source=f"{housing_url}{fips}",
destination=self.get_sources_path(),
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Download each state / territory individually
dfs = []
zip_file_dir = self.get_tmp_path() / "housing_and_transportation_index"
for fips in get_state_fips_codes(self.DATA_PATH):
logger.debug(
f"Downloading housing data for state/territory with FIPS code {fips}"
)
unzip_file_from_url(
f"{self.HOUSING_FTP_URL}{fips}",
self.get_tmp_path(),
zip_file_dir,
)
# New file name:
tmp_csv_file_path = (
zip_file_dir / f"htaindex2019_data_tracts_{fips}.csv"
csv_source = (
self.get_sources_path() / f"htaindex2019_data_tracts_{fips}.csv"
)
try:
tmp_df = pd.read_csv(filepath_or_buffer=tmp_csv_file_path)
tmp_df = pd.read_csv(filepath_or_buffer=csv_source)
except EmptyDataError:
logger.error(
f"Could not read Housing and Transportation data for state/territory with FIPS code {fips}"

View file

@ -3,24 +3,33 @@ from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
logger = get_module_logger(__name__)
class HudHousingETL(ExtractTransformLoad):
NAME = "hud_housing"
GEO_LEVEL: ValidGeoLevel = ValidGeoLevel.CENSUS_TRACT
def __init__(self):
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HOUSING_FTP_URL = (
self.housing_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_housing/2014thru2018-140-csv.zip"
)
else:
self.HOUSING_FTP_URL = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
self.housing_url = "https://www.huduser.gov/portal/datasets/cp/2014thru2018-140-csv.zip"
# source
# output
self.GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"
self.HOUSING_ZIP_FILE_DIR = self.get_tmp_path()
@ -55,15 +64,16 @@ class HudHousingETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.HOUSING_FTP_URL,
self.HOUSING_ZIP_FILE_DIR,
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.housing_url, destination=self.get_sources_path()
)
]
def _read_chas_table(self, file_name):
# New file name:
tmp_csv_file_path = self.HOUSING_ZIP_FILE_DIR / "140" / file_name
tmp_csv_file_path = self.get_sources_path() / "140" / file_name
tmp_df = pd.read_csv(
filepath_or_buffer=tmp_csv_file_path,
encoding="latin-1",
@ -78,7 +88,12 @@ class HudHousingETL(ExtractTransformLoad):
return tmp_df
def transform(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
table_8 = self._read_chas_table("Table8.csv")
table_3 = self._read_chas_table("Table3.csv")
@ -86,6 +101,8 @@ class HudHousingETL(ExtractTransformLoad):
table_3, how="outer", on=self.GEOID_TRACT_FIELD_NAME
)
def transform(self) -> None:
# Calculate share that lacks indoor plumbing or kitchen
# This is computed as
# (

View file

@ -1,7 +1,9 @@
import pandas as pd
import requests
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.utils import get_module_logger
@ -11,44 +13,51 @@ logger = get_module_logger(__name__)
class HudRecapETL(ExtractTransformLoad):
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"hud_recap/Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
else:
self.HUD_RECAP_CSV_URL = (
self.hud_recap_csv_url = (
"https://opendata.arcgis.com/api/v3/datasets/"
"56de4edea8264fe5a344da9811ef5d6e_0/downloads/data?format=csv&spatialRefId=4326"
)
self.HUD_RECAP_CSV = (
self.get_tmp_path()
# input
self.hud_recap_source = (
self.get_sources_path()
/ "Racially_or_Ethnically_Concentrated_Areas_of_Poverty__R_ECAPs_.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "hud_recap"
# Definining some variable names
# Defining some variable names
self.HUD_RECAP_PRIORITY_COMMUNITY_FIELD_NAME = (
"hud_recap_priority_community"
)
self.df: pd.DataFrame
def extract(self) -> None:
download = requests.get(
self.HUD_RECAP_CSV_URL,
verify=None,
timeout=settings.REQUESTS_DEFAULT_TIMOUT,
)
file_contents = download.content
csv_file = open(self.HUD_RECAP_CSV, "wb")
csv_file.write(file_contents)
csv_file.close()
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.hud_recap_csv_url, destination=self.hud_recap_source
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.hud_recap_source, dtype={"GEOID": "string"})
def transform(self) -> None:
# Load comparison index (CalEnviroScreen 4)
self.df = pd.read_csv(self.HUD_RECAP_CSV, dtype={"GEOID": "string"})
self.df.rename(
columns={
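# The two data-source flavors used in this commit, reduced to a sketch on a
# hypothetical ExtractTransformLoad subclass (URLs and file names are
# placeholders): a FileDataSource is written to the exact destination file
# path, while a ZIPDataSource is downloaded and then extracted into the
# destination directory.
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleMixedSourcesETL(ExtractTransformLoad):
    def get_data_sources(self) -> [DataSource]:
        return [
            FileDataSource(
                source="https://example.com/data.csv",  # placeholder
                destination=self.get_sources_path() / "data.csv",  # exact file path
            ),
            ZIPDataSource(
                source="https://example.com/archive.zip",  # placeholder
                destination=self.get_sources_path(),  # directory to extract into
            ),
        ]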

View file

@ -2,6 +2,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -10,16 +12,25 @@ logger = get_module_logger(__name__)
class MappingForEJETL(ExtractTransformLoad):
def __init__(self):
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
self.MAPPING_FOR_EJ_VA_URL = (
# fetch
self.mapping_for_ej_va_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/VA_mej.zip"
)
self.MAPPING_FOR_EJ_CO_URL = (
self.mapping_for_ej_co_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/CO_mej.zip"
)
self.VA_SHP_FILE_PATH = self.get_tmp_path() / "mej_virginia_7_1.shp"
self.CO_SHP_FILE_PATH = self.get_tmp_path() / "mej_colorado_final.shp"
# input
self.va_shp_file_source = (
self.get_sources_path() / "mej_virginia_7_1.shp"
)
self.co_shp_file_source = (
self.get_sources_path() / "mej_colorado_final.shp"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_for_ej"
# Defining variables
self.COLUMNS_TO_KEEP = [
@ -38,26 +49,35 @@ class MappingForEJETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
super().extract(
self.MAPPING_FOR_EJ_VA_URL,
self.get_tmp_path(),
)
super().extract(
self.MAPPING_FOR_EJ_CO_URL,
self.get_tmp_path(),
)
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.mapping_for_ej_va_url,
destination=self.get_sources_path(),
),
ZIPDataSource(
source=self.mapping_for_ej_co_url,
destination=self.get_sources_path(),
),
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
# Join (here, it's just concatenating) the two dataframes from
# CO and VA
self.df = pd.concat(
[
gpd.read_file(self.VA_SHP_FILE_PATH),
gpd.read_file(self.CO_SHP_FILE_PATH),
gpd.read_file(self.va_shp_file_source),
gpd.read_file(self.co_shp_file_source),
]
)
def transform(self) -> None:
# Fill Census tract to get it to be 11 digits, incl. leading 0s
# Note that VA and CO should never have leading 0s, so this isn't
# strictly necessary, but if in the future, there are more states

View file

@ -3,8 +3,9 @@ import pathlib
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -19,31 +20,35 @@ class MappingInequalityETL(ExtractTransformLoad):
Information on the mapping of this data to census tracts is available at
https://github.com/americanpanorama/Census_HOLC_Research.
"""
def __init__(self):
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"mapping_inequality/holc_tract_lookup.csv"
)
else:
self.MAPPING_INEQUALITY_CSV_URL = (
self.mapping_inequality_csv_url = (
"https://raw.githubusercontent.com/americanpanorama/Census_HOLC_Research/"
"main/2010_Census_Tracts/holc_tract_lookup.csv"
)
self.MAPPING_INEQUALITY_CSV = (
self.get_tmp_path() / "holc_tract_lookup.csv"
)
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
self.HOLC_MANUAL_MAPPING_CSV_PATH = (
# input
self.mapping_inequality_source = (
self.get_sources_path() / "holc_tract_lookup.csv"
)
self.holc_manual_mapping_source = (  # here be dragons: this file is pulled from a different place than most
pathlib.Path(__file__).parent
/ "data"
/ "holc_grades_manually_mapped.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "mapping_inequality"
# Some input field names. From documentation: 'Census Tracts were intersected
# with HOLC Polygons. Census information can be joined via the "geoid" field.
# There are two field "holc_prop" and "tract_prop" which give the proportion
@ -73,22 +78,39 @@ class MappingInequalityETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.holc_manually_mapped_df: pd.DataFrame
def extract(self) -> None:
download_file_from_url(
file_url=self.MAPPING_INEQUALITY_CSV_URL,
download_file_name=self.MAPPING_INEQUALITY_CSV,
)
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.mapping_inequality_csv_url,
destination=self.mapping_inequality_source,
)
]
def transform(self) -> None:
df: pd.DataFrame = pd.read_csv(
self.MAPPING_INEQUALITY_CSV,
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
self.mapping_inequality_source,
dtype={self.TRACT_INPUT_FIELD: "string"},
low_memory=False,
)
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
self.holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.holc_manual_mapping_source,
low_memory=False,
)
def transform(self) -> None:
# rename Tract ID
df.rename(
self.df.rename(
columns={
self.TRACT_INPUT_FIELD: self.GEOID_TRACT_FIELD_NAME,
},
@ -98,28 +120,21 @@ class MappingInequalityETL(ExtractTransformLoad):
# Keep the first character, which is the HOLC grade (A, B, C, D).
# TODO: investigate why this dataframe triggers these pylint errors.
# pylint: disable=unsupported-assignment-operation, unsubscriptable-object
df[self.HOLC_GRADE_DERIVED_FIELD] = df[
self.df[self.HOLC_GRADE_DERIVED_FIELD] = self.df[
self.HOLC_GRADE_AND_ID_FIELD
].str[0:1]
# Remove nonsense when the field has no grade or invalid grades.
valid_grades = ["A", "B", "C", "D"]
df.loc[
self.df.loc[
# pylint: disable=unsubscriptable-object
~df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
~self.df[self.HOLC_GRADE_DERIVED_FIELD].isin(valid_grades),
self.HOLC_GRADE_DERIVED_FIELD,
] = None
# Some data needs to be manually mapped to its grade.
# TODO: Investigate more data that may need to be manually mapped.
holc_manually_mapped_df = pd.read_csv(
filepath_or_buffer=self.HOLC_MANUAL_MAPPING_CSV_PATH,
low_memory=False,
)
# Join on the existing data
merged_df = df.merge(
right=holc_manually_mapped_df,
merged_df = self.df.merge(
right=self.holc_manually_mapped_df,
on=[self.HOLC_GRADE_AND_ID_FIELD, self.CITY_INPUT_FIELD],
how="left",
)
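# The shape of the MappingInequality change above, sketched on a hypothetical
# ExtractTransformLoad subclass with placeholder attribute and column names:
# all file reads now live in extract() next to the download step, and
# transform() only reshapes DataFrames that are already in memory.
import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad


class ExampleSplitETL(ExtractTransformLoad):
    def extract(self, use_cached_data_sources: bool = False) -> None:
        super().extract(
            use_cached_data_sources
        )  # download and extract data sources
        self.df = pd.read_csv(self.primary_source, low_memory=False)  # placeholder path
        self.lookup_df = pd.read_csv(self.lookup_source, low_memory=False)  # placeholder path

    def transform(self) -> None:
        # in-memory work only from here on
        self.df = self.df.merge(self.lookup_df, on="join_key", how="left")  # placeholder key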

View file

@ -4,6 +4,8 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -17,11 +19,16 @@ class MarylandEJScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MARYLAND_EJSCREEN_URL = (
# fetch
self.maryland_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/MD_EJScreen.zip"
)
self.SHAPE_FILES_PATH = self.get_tmp_path() / "mdejscreen"
# input
self.shape_files_source = self.get_sources_path() / "mdejscreen"
# output
self.OUTPUT_CSV_PATH = self.DATA_PATH / "dataset" / "maryland_ejscreen"
self.COLUMNS_TO_KEEP = [
@ -31,37 +38,47 @@ class MarylandEJScreenETL(ExtractTransformLoad):
]
self.df: pd.DataFrame
self.dfs_list: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.maryland_ejscreen_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
def extract(self) -> None:
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
super().extract(
self.MARYLAND_EJSCREEN_URL,
self.get_tmp_path(),
)
use_cached_data_sources
) # download and extract data sources
def transform(self) -> None:
list_of_files = list(glob(str(self.SHAPE_FILES_PATH) + "/*.shp"))
logger.debug("Downloading 207MB Maryland EJSCREEN Data")
list_of_files = list(glob(str(self.shape_files_source) + "/*.shp"))
# Ignore counties becauses this is not the level of measurement
# Ignore counties because this is not the level of measurement
# that is consistent with our current scoring and ranking methodology.
dfs_list = [
self.dfs_list = [
gpd.read_file(f)
for f in list_of_files
if not f.endswith("CountiesEJScore.shp")
]
def transform(self) -> None:
# Set the Census tract as the index and drop the geometry column
# that produces the census tract boundaries.
# The latter is because Geopandas raises an exception if there
# are duplicate geometry columns.
# Moreover, since the unit of measurement is at the tract level
# we can consistently merge this with other datasets
dfs_list = [
self.dfs_list = [
df.set_index("Census_Tra").drop("geometry", axis=1)
for df in dfs_list
for df in self.dfs_list
]
# pylint: disable=unsubscriptable-object
self.df = gpd.GeoDataFrame(pd.concat(dfs_list, axis=1))
self.df = gpd.GeoDataFrame(pd.concat(self.dfs_list, axis=1))
# Reset index so that we no longer have the tract as our index
self.df = self.df.reset_index()

View file

@ -1,6 +1,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
@ -15,12 +17,21 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
"""
def __init__(self):
self.MICHIGAN_EJSCREEN_S3_URL = (
# fetch
self.michigan_ejscreen_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/michigan_ejscore_12212021.csv"
)
# input
self.michigan_ejscreen_source = (
self.get_sources_path() / "michigan_ejscore_12212021.csv"
)
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "michigan_ejscreen"
self.MICHIGAN_EJSCREEN_PRIORITY_COMMUNITY_THRESHOLD: float = 0.75
self.COLUMNS_TO_KEEP = [
@ -32,14 +43,28 @@ class MichiganEnviroScreenETL(ExtractTransformLoad):
self.df: pd.DataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
FileDataSource(
source=self.michigan_ejscreen_url,
destination=self.michigan_ejscreen_source,
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df = pd.read_csv(
filepath_or_buffer=self.MICHIGAN_EJSCREEN_S3_URL,
filepath_or_buffer=self.michigan_ejscreen_source,
dtype={"GEO_ID": "string"},
low_memory=False,
)
def transform(self) -> None:
self.df.rename(
columns={
"GEO_ID": self.GEOID_TRACT_FIELD_NAME,

View file

@ -4,6 +4,8 @@
# pylint: disable=unsupported-assignment-operation
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -16,17 +18,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
NAME = "national_risk_index"
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
SOURCE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
SOURCE_URL = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -46,11 +37,28 @@ class NationalRiskIndexETL(ExtractTransformLoad):
AGRIVALUE_LOWER_BOUND = 408000
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
# fetch
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.risk_index_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"national_risk_index/NRI_Table_CensusTracts.zip"
)
else:
self.risk_index_url = (
"https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload/"
"NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
# source
self.risk_index_source = (
self.get_sources_path() / "NRI_Table_CensusTracts.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_nri: pd.DataFrame
# Start dataset-specific vars here
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
@ -65,14 +73,26 @@ class NationalRiskIndexETL(ExtractTransformLoad):
self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"
def extract(self) -> None:
"""Unzips NRI dataset from the FEMA data source and writes the files
to the temporary data folder for use in the transform() method
"""
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.risk_index_url, destination=self.get_sources_path()
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
use_cached_data_sources
) # download and extract data sources
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
self.df_nri = pd.read_csv(
self.risk_index_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
def transform(self) -> None:
@ -84,16 +104,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
Groups inside of that Tract
"""
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
df_nri: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
df_nri.rename(
self.df_nri.rename(
columns={
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
@ -123,42 +134,46 @@ class NationalRiskIndexETL(ExtractTransformLoad):
agriculture_columns = [
f"{x}_EALA"
for x in disaster_categories
if f"{x}_EALA" in list(df_nri.columns)
if f"{x}_EALA" in list(self.df_nri.columns)
]
population_columns = [
f"{x}_EALP"
for x in disaster_categories
if f"{x}_EALP" in list(df_nri.columns)
if f"{x}_EALP" in list(self.df_nri.columns)
]
buildings_columns = [
f"{x}_EALB"
for x in disaster_categories
if f"{x}_EALB" in list(df_nri.columns)
if f"{x}_EALB" in list(self.df_nri.columns)
]
disaster_population_sum_series = df_nri[population_columns].sum(axis=1)
disaster_agriculture_sum_series = df_nri[agriculture_columns].sum(
disaster_population_sum_series = self.df_nri[population_columns].sum(
axis=1
)
disaster_buildings_sum_series = df_nri[buildings_columns].sum(axis=1)
disaster_agriculture_sum_series = self.df_nri[agriculture_columns].sum(
axis=1
)
disaster_buildings_sum_series = self.df_nri[buildings_columns].sum(
axis=1
)
# Population EAL Rate = Eal Valp / Population
df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME] = (
disaster_population_sum_series
/ df_nri[self.POPULATION_INPUT_FIELD_NAME]
/ self.df_nri[self.POPULATION_INPUT_FIELD_NAME]
)
# Agriculture EAL Rate = Eal Vala / max(Agrivalue, 408000)
## FORMULA ADJUSTMENT 2/17
## Because AGRIVALUE contains a lot of 0s, we are going to consider
## 90th percentile only for places that have some agrivalue at all
df_nri[
self.df_nri[
self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME
] = disaster_agriculture_sum_series / df_nri[
] = disaster_agriculture_sum_series / self.df_nri[
self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME
].clip(
lower=self.AGRIVALUE_LOWER_BOUND
@ -167,11 +182,11 @@ class NationalRiskIndexETL(ExtractTransformLoad):
## Check that this clip worked -- that the only place the value has changed is when the clip took effect
base_expectation = (
disaster_agriculture_sum_series
/ df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME]
)
assert (
df_nri[
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
][self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME].max()
<= self.AGRIVALUE_LOWER_BOUND
@ -181,27 +196,27 @@ class NationalRiskIndexETL(ExtractTransformLoad):
)
assert (
df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
self.df_nri[self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME]
!= base_expectation
).sum() > 0, "Clipping the agrivalue did nothing!"
# This produces a boolean that is True in the case of non-zero agricultural value
df_nri[self.CONTAINS_AGRIVALUE] = (
df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
self.df_nri[self.CONTAINS_AGRIVALUE] = (
self.df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
)
# divide EAL_VALB (Expected Annual Loss - Building Value) by BUILDVALUE (Building Value ($)).
df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
self.df_nri[self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME] = (
disaster_buildings_sum_series
/ df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
/ self.df_nri[self.BUILDING_VALUE_INPUT_FIELD_NAME]
)
# Round all float columns to just 10 digits.
# Note: `round` is smart enough to only apply to float columns.
df_nri = df_nri.round(10)
self.df_nri = self.df_nri.round(10)
# Assign the final df to the class' output_df for the load method
self.output_df = df_nri
self.output_df = self.df_nri
def load(self) -> None:
# Suppress scientific notation.

View file

@ -3,6 +3,8 @@
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
@ -13,10 +15,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
"""ETL class for the Nature Deprived Communities dataset"""
NAME = "nlcd_nature_deprived"
SOURCE_URL = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
PUERTO_RICO_EXPECTED_IN_DATA = False
LOAD_YAML_CONFIG: bool = True
@ -29,14 +28,25 @@ class NatureDeprivedETL(ExtractTransformLoad):
TRACT_PERCENT_CROPLAND_FIELD_NAME: str
def __init__(self):
# define the full path for the input CSV file
self.INPUT_CSV = (
self.get_tmp_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
# fetch
self.nature_deprived_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/usa_conus_nat_dep__compiled_by_TPL.csv.zip"
)
# source
# define the full path for the input CSV file
self.nature_deprived_source = (
self.get_sources_path() / "usa_conus_nat_dep__compiled_by_TPL.csv"
)
# output
# this is the main dataframe
self.df: pd.DataFrame
self.df_ncld: pd.DataFrame
# Start dataset-specific vars here
self.PERCENT_NATURAL_FIELD_NAME = "PctNatural"
self.PERCENT_IMPERVIOUS_FIELD_NAME = "PctImperv"
@ -47,28 +57,43 @@ class NatureDeprivedETL(ExtractTransformLoad):
# for area. This does indeed remove tracts from the 90th+ percentile later on
self.TRACT_ACRES_LOWER_BOUND = 35
def transform(self) -> None:
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.nature_deprived_url,
destination=self.get_sources_path(),
)
]
def extract(self, use_cached_data_sources: bool = False) -> None:
"""Reads the unzipped data file into memory and applies the following
transformations to prepare it for the load() method:
- Renames columns as needed
"""
df_ncld: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
super().extract(
use_cached_data_sources
) # download and extract data sources
self.df_ncld = pd.read_csv(
self.nature_deprived_source,
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: str},
low_memory=False,
)
df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
df_ncld[self.TRACT_ACRES_FIELD_NAME] >= self.TRACT_ACRES_LOWER_BOUND
def transform(self) -> None:
self.df_ncld[self.ELIGIBLE_FOR_NATURE_DEPRIVED_FIELD_NAME] = (
self.df_ncld[self.TRACT_ACRES_FIELD_NAME]
>= self.TRACT_ACRES_LOWER_BOUND
)
df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
self.df_ncld[self.TRACT_PERCENT_NON_NATURAL_FIELD_NAME] = (
100 - self.df_ncld[self.PERCENT_NATURAL_FIELD_NAME]
)
# Assign the final df to the class' output_df for the load method with rename
self.output_df = df_ncld.rename(
self.output_df = self.df_ncld.rename(
columns={
self.PERCENT_IMPERVIOUS_FIELD_NAME: self.TRACT_PERCENT_IMPERVIOUS_FIELD_NAME,
self.PERCENT_CROPLAND_FIELD_NAME: self.TRACT_PERCENT_CROPLAND_FIELD_NAME,

View file

@ -3,9 +3,10 @@ import functools
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
@ -23,6 +24,26 @@ class PersistentPovertyETL(ExtractTransformLoad):
PUERTO_RICO_EXPECTED_IN_DATA = False
def __init__(self):
# fetch
self.poverty_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL + "/LTDB_Std_All_Sample.zip"
)
# source
self.poverty_sources = [
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_1990_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2000_sample.csv",
self.get_sources_path()
/ "ltdb_std_all_sample"
/ "ltdb_std_2010_sample.csv",
]
# output
self.OUTPUT_PATH = self.DATA_PATH / "dataset" / "persistent_poverty"
# Need to change hyperlink to S3
@ -44,6 +65,13 @@ class PersistentPovertyETL(ExtractTransformLoad):
self.df: pd.DataFrame
def get_data_sources(self) -> [DataSource]:
return [
ZIPDataSource(
source=self.poverty_url, destination=self.get_sources_path()
)
]
def _join_input_dfs(self, dfs: list) -> pd.DataFrame:
df = functools.reduce(
lambda df_a, df_b: pd.merge(
@ -75,28 +103,17 @@ class PersistentPovertyETL(ExtractTransformLoad):
return df
def extract(self) -> None:
unzipped_file_path = self.get_tmp_path()
def extract(self, use_cached_data_sources: bool = False) -> None:
unzip_file_from_url(
file_url=settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/LTDB_Std_All_Sample.zip",
download_path=self.get_tmp_path(),
unzipped_file_path=unzipped_file_path,
)
file_names = [
"ltdb_std_1990_sample.csv",
"ltdb_std_2000_sample.csv",
"ltdb_std_2010_sample.csv",
]
super().extract(
use_cached_data_sources
) # download and extract data sources
temporary_input_dfs = []
for file_name in file_names:
for file_name in self.poverty_sources:
temporary_input_df = pd.read_csv(
filepath_or_buffer=unzipped_file_path
/ f"ltdb_std_all_sample/{file_name}",
filepath_or_buffer=file_name,
dtype={
self.GEOID_TRACT_INPUT_FIELD_NAME_1: "string",
self.GEOID_TRACT_INPUT_FIELD_NAME_2: "string",

View file

@ -1,6 +1,8 @@
import geopandas as gpd
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.utils import get_module_logger
logger = get_module_logger(__name__)
@ -20,10 +22,17 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"""
def __init__(self):
self.TES_URL = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
self.TES_CSV = self.get_tmp_path() / "tes_2021_data.csv"
# input
self.TES_CSV = self.get_sources_path() / "tes_2021_data.csv"
# output
self.CSV_PATH = self.DATA_PATH / "dataset" / "tree_equity_score"
self.df: gpd.GeoDataFrame
self.tes_state_dfs = []
# config
self.states = [
"al",
"az",
@ -76,21 +85,36 @@ class TreeEquityScoreETL(ExtractTransformLoad):
"wy",
]
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
tes_url = "https://national-tes-data-share.s3.amazonaws.com/national_tes_share/"
sources = []
for state in self.states:
super().extract(
f"{self.TES_URL}{state}.zip.zip",
f"{self.get_tmp_path()}/{state}",
sources.append(
ZIPDataSource(
source=f"{tes_url}{state}.zip.zip",
destination=self.get_sources_path() / state,
)
)
return sources
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
for state in self.states:
self.tes_state_dfs.append(
gpd.read_file(f"{self.get_sources_path()}/{state}/{state}.shp")
)
def transform(self) -> None:
tes_state_dfs = []
for state in self.states:
tes_state_dfs.append(
gpd.read_file(f"{self.get_tmp_path()}/{state}/{state}.shp")
)
self.df = gpd.GeoDataFrame(
pd.concat(tes_state_dfs), crs=tes_state_dfs[0].crs
pd.concat(self.tes_state_dfs), crs=self.tes_state_dfs[0].crs
)
# rename ID to Tract ID

View file

@ -4,63 +4,57 @@ import geopandas as gpd
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource
from data_pipeline.score import field_names
from data_pipeline.utils import get_module_logger
from data_pipeline.utils import unzip_file_from_url
logger = get_module_logger(__name__)
class TribalETL(ExtractTransformLoad):
def __init__(self):
self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.GEOGRAPHIC_BASE_PATH = (
self.DATA_PATH / "tribal" / "geographic_data"
)
self.CSV_BASE_PATH = self.DATA_PATH / "tribal" / "csv"
self.NATIONAL_TRIBAL_GEOJSON_PATH = (
self.GEOGRAPHIC_BASE_PATH / "usa.json"
)
self.USA_TRIBAL_DF_LIST = []
def extract(self) -> None:
"""Extract the tribal geojson zip files from Justice40 S3 data folder
def get_data_sources(self) -> [DataSource]:
Returns:
None
"""
bia_shapefile_zip_url = (
national_lar_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_National_LAR_updated_20220929.zip"
)
tsa_and_aian_geojson_zip_url = (
tsa_and_aian_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/BIA_TSA_and_AIAN_json.zip"
)
alaska_geojson_url = (
alaska_native_villages_url = (
settings.AWS_JUSTICE40_DATASOURCES_URL
+ "/Alaska_Native_Villages_json.zip"
)
unzip_file_from_url(
bia_shapefile_zip_url,
self.TMP_PATH,
self.GEOGRAPHIC_BASE_PATH / "bia_national_lar",
)
unzip_file_from_url(
tsa_and_aian_geojson_zip_url,
self.TMP_PATH,
self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian",
)
unzip_file_from_url(
alaska_geojson_url,
self.TMP_PATH,
self.GEOGRAPHIC_BASE_PATH / "alaska_native_villages",
)
return [
ZIPDataSource(
national_lar_url,
destination=self.get_sources_path() / "bia_national_lar",
),
ZIPDataSource(
source=tsa_and_aian_url,
destination=self.get_sources_path() / "tsa_and_aian",
),
ZIPDataSource(
source=alaska_native_villages_url,
destination=self.get_sources_path() / "alaska_native_villages",
),
]
def _transform_bia_national_lar(self, path: Path) -> None:
"""Transform the Tribal BIA National Lar Geodataframe and appends it to the
@ -187,21 +181,21 @@ class TribalETL(ExtractTransformLoad):
"""
# Set the filepaths:
bia_national_lar_shapefile = (
self.GEOGRAPHIC_BASE_PATH / "bia_national_lar"
self.get_sources_path() / "bia_national_lar"
)
bia_aian_supplemental_geojson = (
self.GEOGRAPHIC_BASE_PATH
self.get_sources_path()
/ "tsa_and_aian"
/ "BIA_AIAN_Supplemental.json"
)
bia_tsa_geojson = (
self.GEOGRAPHIC_BASE_PATH / "tsa_and_aian" / "BIA_TSA.json"
self.get_sources_path() / "tsa_and_aian" / "BIA_TSA.json"
)
alaska_native_villages_geojson = (
self.GEOGRAPHIC_BASE_PATH
self.get_sources_path()
/ "alaska_native_villages"
/ "AlaskaNativeVillages.gdb.geojson"
)
@ -225,7 +219,11 @@ class TribalETL(ExtractTransformLoad):
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
# note this works a little different than many of the ETLs. The file
# being written here is used again downstream, so it's placed in a
# special directory.
logger.debug("Writing national geojson file")
self.GEOGRAPHIC_BASE_PATH.mkdir(parents=True, exist_ok=True)
usa_tribal_df.to_file(
self.NATIONAL_TRIBAL_GEOJSON_PATH, driver="GeoJSON"
)
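# When a single ETL pulls several archives, as TribalETL does above, each
# ZIPDataSource gets its own subfolder under get_sources_path() so the
# extracted files stay grouped by dataset. Sketched with placeholder names
# and URLs on a hypothetical subclass:
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import ZIPDataSource


class ExampleMultiArchiveETL(ExtractTransformLoad):
    def get_data_sources(self) -> [DataSource]:
        return [
            ZIPDataSource(
                source="https://example.com/dataset_a.zip",  # placeholder
                destination=self.get_sources_path() / "dataset_a",
            ),
            ZIPDataSource(
                source="https://example.com/dataset_b.zip",  # placeholder
                destination=self.get_sources_path() / "dataset_b",
            ),
        ]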

View file

@ -4,6 +4,7 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.etl.sources.geo_utils import get_tract_geojson
@ -67,6 +68,9 @@ class TribalOverlapETL(ExtractTransformLoad):
self.census_tract_gdf: gpd.GeoDataFrame
self.tribal_gdf: gpd.GeoDataFrame
def get_data_sources(self) -> [DataSource]:
return [] # this uses already retrieved / calculated data
@staticmethod
def _create_string_from_list(series: pd.Series) -> str:
"""Helper method that creates a sorted string list (for tribal names)."""
@ -89,7 +93,12 @@ class TribalOverlapETL(ExtractTransformLoad):
return percentage_float
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
self.census_tract_gdf = get_tract_geojson()
self.tribal_gdf = get_tribal_geojson()

View file

@ -4,9 +4,10 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource
from data_pipeline.etl.base import ValidGeoLevel
from data_pipeline.etl.sources.geo_utils import add_tracts_for_geometries
from data_pipeline.utils import download_file_from_url
from data_pipeline.utils import get_module_logger
from data_pipeline.config import settings
@ -28,19 +29,6 @@ class USArmyFUDS(ExtractTransformLoad):
def __init__(self):
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
self.FILE_URL = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
"all_data_reported_to_Congress_in_FY2020.geojson"
)
else:
self.FILE_URL: str = (
"https://opendata.arcgis.com/api/v3/datasets/"
"3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
"data?format=geojson&spatialRefId=4326&where=1%3D1"
)
self.OUTPUT_PATH: Path = self.DATA_PATH / "dataset" / "us_army_fuds"
# Constants for output
@ -50,17 +38,27 @@ class USArmyFUDS(ExtractTransformLoad):
self.INELIGIBLE_FUDS_COUNT_FIELD_NAME,
self.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
]
self.DOWNLOAD_FILE_NAME = self.get_tmp_path() / "fuds.geojson"
self.fuds_source = self.get_sources_path() / "fuds.geojson"
self.raw_df: gpd.GeoDataFrame
self.output_df: pd.DataFrame
def extract(self) -> None:
download_file_from_url(
file_url=self.FILE_URL,
download_file_name=self.DOWNLOAD_FILE_NAME,
verify=True,
)
def get_data_sources(self) -> [DataSource]:
if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
fuds_url = (
f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
"us_army_fuds/Formerly_Used_Defense_Sites_(FUDS)_"
"all_data_reported_to_Congress_in_FY2020.geojson"
)
else:
fuds_url: str = (
"https://opendata.arcgis.com/api/v3/datasets/"
"3f8354667d5b4b1b8ad7a6e00c3cf3b1_1/downloads/"
"data?format=geojson&spatialRefId=4326&where=1%3D1"
)
return [FileDataSource(source=fuds_url, destination=self.fuds_source)]
def transform(self) -> None:
# before we try to do any transformation, get the tract data
@ -68,7 +66,7 @@ class USArmyFUDS(ExtractTransformLoad):
logger.debug("Loading FUDS data as GeoDataFrame for transform")
raw_df = gpd.read_file(
filename=self.DOWNLOAD_FILE_NAME,
filename=self.fuds_source,
low_memory=False,
)
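# Several ETLs in this commit (HUD housing, HUD RECAP, NRI, US Army FUDS)
# choose their fetch URL with settings.DATASOURCE_RETRIEVAL_FROM_AWS, using the
# Justice40 S3 mirror when it is set and the upstream provider otherwise.
# USArmyFUDS resolves this inside get_data_sources(); sketched here on a
# hypothetical subclass with placeholder URLs and destination:
from data_pipeline.config import settings
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.datasource import FileDataSource


class ExampleMirroredSourceETL(ExtractTransformLoad):
    def get_data_sources(self) -> [DataSource]:
        if settings.DATASOURCE_RETRIEVAL_FROM_AWS:
            data_url = (
                f"{settings.AWS_JUSTICE40_DATASOURCES_URL}/raw-data-sources/"
                "example_dataset/example.geojson"  # placeholder mirror path
            )
        else:
            data_url = "https://example.org/example.geojson"  # placeholder upstream URL
        return [
            FileDataSource(
                source=data_url,
                destination=self.get_sources_path() / "example.geojson",  # placeholder
            )
        ]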