Mirror of https://github.com/DOI-DO/j40-cejst-2.git, synced 2025-08-03 09:34:19 -07:00
Add ability to cache ETL data sources (#2169)
* Add a rough prototype allowing a developer to pre-download data sources for all ETLs
* Update code to be more production-ish
* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source
* Format source files with black
* Fix issues from pylint and get the tests working with the new folder structure
* Clean up files with black
* Fix unzip test
* Add caching notes to README
* Fix tests (linting and case sensitivity bug)
* Address PR comments and add API keys for census where missing
* Merging comparator changes from main into this branch for the sake of the PR
* Add note on using cache (-u) during pipeline
This commit is contained in:
parent 4d9c1dd11e
commit 6f39033dde

52 changed files with 1787 additions and 686 deletions
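The hunks below wire one pattern through every ETL: each class declares its remote inputs via get_data_sources(), extract() gains a use_cached_data_sources flag, and a new Downloader class houses all download operations. As a rough illustration of the "pre-download data sources for all ETLs" idea from the commit message, a helper like the following could walk the declared sources and fetch each one ahead of a pipeline run. This is a minimal sketch: the cache_data_sources function and the DataSource.fetch() hook are assumptions for illustration, not code from this commit.

from typing import Iterable, List

from data_pipeline.etl.datasource import DataSource


def cache_data_sources(etls: Iterable) -> None:
    """Fetch every declared source once so later runs can use the cache flag (-u)."""
    fetched = set()
    for etl in etls:
        sources: List[DataSource] = etl.get_data_sources()
        for data_source in sources:
            key = repr(data_source)
            if key not in fetched:  # avoid re-downloading a source shared by several ETLs
                fetched.add(key)
                data_source.fetch()  # assumed DataSource download hook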
@@ -22,6 +22,8 @@ from data_pipeline.etl.sources.us_army_fuds.etl import USArmyFUDS
 from data_pipeline.score import field_names
 from data_pipeline.score.score_runner import ScoreRunner
 from data_pipeline.utils import get_module_logger
+from data_pipeline.etl.datasource import DataSource

 logger = get_module_logger(__name__)
@@ -55,7 +57,13 @@ class ScoreETL(ExtractTransformLoad):
         self.ISLAND_DEMOGRAPHIC_BACKFILL_FIELDS: List[str] = []

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of running the ETLs
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:

         # EJSCreen csv Load
         ejscreen_csv = constants.DATA_PATH / "dataset" / "ejscreen" / "usa.csv"
         self.ejscreen_df = pd.read_csv(
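ScoreETL (like the other score-stage classes further down) returns an empty list here because everything it reads is produced by earlier pipeline steps rather than downloaded. The base-class side of this contract is not shown in this excerpt; a minimal sketch of how extract() might act on the new flag, assuming DataSource exposes a fetch() method, is below. The real ExtractTransformLoad.extract() may differ.

from typing import List

from data_pipeline.etl.datasource import DataSource


class ExtractTransformLoadSketch:
    """Stand-in for the real base class, shown only to illustrate the dispatch."""

    def get_data_sources(self) -> List[DataSource]:
        raise NotImplementedError  # each concrete ETL declares its remote inputs

    def extract(self, use_cached_data_sources: bool = False) -> None:
        # With the cache flag set, reuse files downloaded on a previous run;
        # otherwise fetch every declared source before the ETL reads it locally.
        if use_cached_data_sources:
            return
        for data_source in self.get_data_sources():
            data_source.fetch()  # assumed DataSource download hook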
@@ -15,6 +15,7 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
+from data_pipeline.etl.datasource import DataSource

 logger = get_module_logger(__name__)
@@ -68,7 +69,13 @@ class GeoScoreETL(ExtractTransformLoad):
         self.geojson_score_usa_high: gpd.GeoDataFrame
         self.geojson_score_usa_low: gpd.GeoDataFrame

-    def extract(self) -> None:
+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the previous steps in the pipeline
+
+    def extract(self, use_cached_data_sources: bool = False) -> None:

         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",
@@ -2,7 +2,9 @@ import json
 from pathlib import Path

 import numpy as np
+from numpy import float64
 import pandas as pd

 from data_pipeline.content.schemas.download_schemas import CodebookConfig
 from data_pipeline.content.schemas.download_schemas import CSVConfig
 from data_pipeline.content.schemas.download_schemas import ExcelConfig
@@ -16,7 +18,8 @@ from data_pipeline.utils import get_module_logger
 from data_pipeline.utils import load_dict_from_yaml_object_fields
 from data_pipeline.utils import load_yaml_dict_from_file
 from data_pipeline.utils import zip_files
-from numpy import float64
+from data_pipeline.etl.datasource import DataSource
+from data_pipeline.etl.downloader import Downloader

 from . import constants
@@ -61,6 +64,11 @@ class PostScoreETL(ExtractTransformLoad):
         self.yaml_global_config_sort_by_label = "sort_by_label"
         # End YAML definition constants

+    def get_data_sources(self) -> [DataSource]:
+        return (
+            []
+        )  # we have all prerequisite sources locally as a result of generating the score
+
     def _extract_counties(self, county_path: Path) -> pd.DataFrame:
         logger.debug("Reading Counties CSV")
         return pd.read_csv(
@@ -97,17 +105,23 @@ class PostScoreETL(ExtractTransformLoad):

         return df

-    def extract(self) -> None:
+    def extract(self, use_cached_data_sources: bool = False) -> None:
+
+        super().extract(
+            use_cached_data_sources
+        )  # download and extract data sources

         # check census data
         check_census_data_source(
             census_data_path=self.DATA_PATH / "census",
             census_data_source=self.DATA_SOURCE,
         )

-        super().extract(
-            constants.CENSUS_COUNTIES_ZIP_URL,
-            constants.TMP_PATH,
+        # TODO would could probably add this to the data sources for this file
+        Downloader.download_zip_file_from_url(
+            constants.CENSUS_COUNTIES_ZIP_URL, constants.TMP_PATH
         )

         self.input_counties_df = self._extract_counties(
             constants.CENSUS_COUNTIES_FILE_NAME
         )
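Here the counties ZIP is fetched through the new Downloader rather than through the old URL-accepting super().extract(). The Downloader implementation itself is not part of this excerpt; a plausible download-and-unzip helper, assuming the requests library and guessing every parameter name beyond the two positional arguments used at the call site (URL, destination directory), might look like this sketch:

import zipfile
from pathlib import Path

import requests


class DownloaderSketch:
    """Sketch only; not the repository's Downloader implementation."""

    @classmethod
    def download_zip_file_from_url(cls, file_url: str, unzip_path: Path) -> None:
        """Download the ZIP at file_url and extract its contents into unzip_path."""
        unzip_path.mkdir(parents=True, exist_ok=True)
        zip_path = unzip_path / "downloaded.zip"

        response = requests.get(file_url, timeout=60)
        response.raise_for_status()
        zip_path.write_bytes(response.content)

        with zipfile.ZipFile(zip_path) as zip_file:
            zip_file.extractall(unzip_path)
        zip_path.unlink()  # keep only the extracted files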
@@ -13,7 +13,7 @@ from data_pipeline.etl.score.constants import TILES_ISLAND_AREA_FIPS_CODES
 from data_pipeline.etl.score.constants import TILES_PUERTO_RICO_FIPS_CODE
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.score import field_names
-from data_pipeline.utils import download_file_from_url
+from data_pipeline.etl.downloader import Downloader
 from data_pipeline.utils import get_module_logger

 from . import constants
@@ -48,7 +48,7 @@ def check_score_data_source(
     # download from s3 if census_data_source is aws
     if score_data_source == "aws":
         logger.debug("Fetching Score Tile data from AWS S3")
-        download_file_from_url(
+        Downloader.download_file_from_url(
             file_url=TILE_SCORE_CSV_S3_URL, download_file_name=TILE_SCORE_CSV
         )
     else:
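The same utility-to-Downloader move also covers plain (non-ZIP) files: the score tile CSV is now fetched through Downloader.download_file_from_url. The keyword arguments file_url and download_file_name come from the call site above; everything else in this sketch (the use of requests, the directory handling, the lack of a return value) is an assumption about how such a method could be written, not the commit's actual code.

from pathlib import Path

import requests


class DownloaderSketch:
    """Sketch only; not the repository's Downloader implementation."""

    @classmethod
    def download_file_from_url(cls, file_url: str, download_file_name: Path) -> None:
        """Download file_url and write it to download_file_name."""
        download_file_name.parent.mkdir(parents=True, exist_ok=True)
        response = requests.get(file_url, timeout=60)
        response.raise_for_status()
        download_file_name.write_bytes(response.content)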