Add ability to cache ETL data sources (#2169)

* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
Commit 6f39033dde by Travis Newby, 2023-03-03 12:26:24 -06:00 (committed by GitHub)
52 changed files with 1787 additions and 686 deletions
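
Taken together, the changes below give every ETL a get_data_sources() hook that declares its remote inputs and thread a use_cached_data_sources flag through extract(), so a pre-downloaded cache can be reused instead of re-fetching on every run. A minimal sketch of the pattern, assuming an illustrative ExampleETL, a hypothetical URL and destination path, and a FileDataSource with source/destination fields; only DataSource, the method signatures, and DATA_PATH actually appear in this diff:

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.datasource import DataSource, FileDataSource  # FileDataSource is assumed


class ExampleETL(ExtractTransformLoad):
    """Illustrative ETL showing the caching hooks introduced by this commit."""

    def get_data_sources(self) -> [DataSource]:
        # Declare this ETL's remote inputs so they can be pre-downloaded and
        # cached up front; the score ETLs in this diff return [] because their
        # inputs are produced by earlier steps of the pipeline.
        return [
            FileDataSource(
                source="https://example.gov/some_dataset.csv",  # hypothetical URL
                destination=self.DATA_PATH / "sources" / "example" / "some_dataset.csv",
            )
        ]

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # The base-class extract fetches the declared sources, or reuses the
        # previously downloaded copies when use_cached_data_sources is True.
        super().extract(use_cached_data_sources)
        # ...then read the now-local files into dataframes as usual.

Per the caching notes added to the README, the cache is then presumably opted into with the -u flag when running the pipeline.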

View file

@@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
from data_pipeline.score import field_names
from data_pipeline.score.score_runner import ScoreRunner
from data_pipeline.utils import get_module_logger
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):
self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of running the ETLs
def extract(self, use_cached_data_sources: bool = False) -> None:
# EJSCreen csv Load
ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
self.ejscreen_df = pd.read_csv(

View file

@@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from data_pipeline.etl.datasource import DataSource
logger = get_module_logger(__name__)
@@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
self.geojson_score_usa_high: gpd.GeoDataFrame
self.geojson_score_usa_low: gpd.GeoDataFrame
def extract(self) -> None:
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
def extract(self, use_cached_data_sources: bool = False) -> None:
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",

View file

@@ -2,7 +2,9 @@ import json
from pathlib import Path
import numpy as np
from numpy import float64
import pandas as pd
from data_pipeline.content.schemas.download_schemas import CodebookConfig
from data_pipeline.content.schemas.download_schemas import CSVConfig
from data_pipeline.content.schemas.download_schemas import ExcelConfig
@@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
from data_pipeline.utils import load_dict_from_yaml_object_fields
from data_pipeline.utils import load_yaml_dict_from_file
from data_pipeline.utils import zip_files
from numpy import float64
from data_pipeline.etl.datasource import DataSource
from data_pipeline.etl.downloader import Downloader
from . import constants
@@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
self.yaml_global_config_sort_by_label = "sort_by_label"
# End YAML definition constants
def get_data_sources(self) -> [DataSource]:
return (
[]
) # we have all prerequisite sources locally as a result of generating the score
def _extract_counties(self, county_path: Path) -> pd.DataFrame:
logger.debug("Reading Counties CSV")
return pd.read_csv(
@@ -97,17 +105,23 @@
return df
def extract(self) -> None:
def extract(self, use_cached_data_sources: bool = False) -> None:
super().extract(
use_cached_data_sources
) # download and extract data sources
# check census data
check_census_data_source(
census_data_path=self.DATA_PATH / "census",
census_data_source=self.DATA_SOURCE,
)
super().extract(
constants.CENSUS_COUNTIES_ZIP_URL,
constants.TMP_PATH,
# TODO we could probably add this to the data sources for this file
Downloader.download_zip_file_from_url(
constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
)
self.input_counties_df = self._extract_counties(
constants.CENSUS_COUNTIES_FILE_NAME
)
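
The calls above, and the one in the next file, suggest the new Downloader exposes its operations as class-level helpers. A usage sketch with illustrative URLs and paths; only the method names and keyword arguments come from this diff:

from pathlib import Path

from data_pipeline.etl.downloader import Downloader

tmp_path = Path("/tmp/data_pipeline")  # illustrative working directory

# Download (and presumably extract) a ZIP archive into the working
# directory, mirroring the counties call above.
Downloader.download_zip_file_from_url(
    "https://example.gov/national_county.zip",  # hypothetical URL
    tmp_path,
)

# Download a single file to a specific local name, mirroring the
# score-tile fetch in the next file.
Downloader.download_file_from_url(
    file_url="https://example-bucket.s3.amazonaws.com/tile_score.csv",  # hypothetical URL
    download_file_name=tmp_path / "tile_score.csv",
)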

View file

@@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.score import field_names
from data_pipeline.utils import download_file_from_url
from data_pipeline.etl.downloader import Downloader
from data_pipeline.utils import get_module_logger
from . import constants
@@ -48,7 +48,7 @@ def check_score_data_source(
# download from s3 if census_data_source is aws
if score_data_source == "aws":
logger.debug("Fetching Score Tile data from AWS S3")
download_file_from_url(
Downloader.download_file_from_url(
file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
)
else: