From d4898b8f5539ed6c6497c4caa00f00fe7ffb2fce Mon Sep 17 00:00:00 2001 From: Carlos Felix <63804190+carlosfelix2@users.noreply.github.com> Date: Tue, 7 Jan 2025 14:18:58 -0500 Subject: [PATCH] Improve download retry logic --- data/data-pipeline/data_pipeline/config.py | 3 ++- .../data_pipeline/etl/downloader.py | 25 ++++++++++++++++--- data/data-pipeline/data_pipeline/utils.py | 7 ++++-- data/data-pipeline/settings.toml | 2 ++ 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/data/data-pipeline/data_pipeline/config.py b/data/data-pipeline/data_pipeline/config.py index ee75e535..79a7ee3d 100644 --- a/data/data-pipeline/data_pipeline/config.py +++ b/data/data-pipeline/data_pipeline/config.py @@ -12,7 +12,8 @@ settings = Dynaconf( # set root dir settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent settings.DATA_PATH = settings.APP_ROOT / "data" -settings.REQUESTS_DEFAULT_TIMOUT = 3600 +settings.REQUESTS_DEFAULT_TIMOUT = 300 +settings.REQUESTS_DEFAULT_RETRIES = 3 # To set an environment use: # Linux/OSX: export ENV_FOR_DYNACONF=staging # Windows: set ENV_FOR_DYNACONF=staging diff --git a/data/data-pipeline/data_pipeline/etl/downloader.py b/data/data-pipeline/data_pipeline/etl/downloader.py index 4cc4f83e..3d8b4eaa 100644 --- a/data/data-pipeline/data_pipeline/etl/downloader.py +++ b/data/data-pipeline/data_pipeline/etl/downloader.py @@ -12,13 +12,26 @@ from tenacity import retry, stop_after_attempt, wait_exponential logger = get_module_logger(__name__) +def _log_retry_failure(retry_state): + logger.warning( + f"Failure downloading {retry_state.kwargs['file_url']}. Will retry." + ) + + class Downloader: """A simple class to encapsulate the download capabilities of the application""" + num_retries = ( + settings.REQUEST_RETRIES + if "REQUEST_RETRIES" in settings + else settings.REQUESTS_DEFAULT_RETRIES + ) + @classmethod @retry( - stop=stop_after_attempt(3), + stop=stop_after_attempt(num_retries), wait=wait_exponential(multiplier=1, min=4, max=10), + before_sleep=_log_retry_failure, ) def download_file_from_url( cls, @@ -43,9 +56,12 @@ class Downloader: download_file_name.parent.mkdir(parents=True, exist_ok=True) logger.debug(f"Downloading {file_url}") - response = requests.get( - file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT + timeout = ( + settings.REQUEST_TIMEOUT + if "REQUEST_TIMEOUT" in settings + else settings.REQUESTS_DEFAULT_TIMOUT ) + response = requests.get(file_url, verify=verify, timeout=timeout) if response.status_code == 200: file_contents = response.content logger.debug("Downloaded.") @@ -64,8 +80,9 @@ class Downloader: @classmethod @retry( - stop=stop_after_attempt(3), + stop=stop_after_attempt(num_retries), wait=wait_exponential(multiplier=1, min=4, max=10), + before_sleep=_log_retry_failure, ) def download_zip_file_from_url( cls, diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py index 8e5fe8fa..e9eff6b5 100644 --- a/data/data-pipeline/data_pipeline/utils.py +++ b/data/data-pipeline/data_pipeline/utils.py @@ -147,9 +147,12 @@ def download_file_from_url( if not os.path.isdir(download_file_name.parent): os.mkdir(download_file_name.parent) - response = requests.get( - file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT + timeout = ( + settings.REQUEST_TIMEOUT + if "REQUEST_TIMEOUT" in settings + else settings.REQUESTS_DEFAULT_TIMOUT ) + response = requests.get(file_url, verify=verify, timeout=timeout) if response.status_code == 200: file_contents = response.content else: diff --git a/data/data-pipeline/settings.toml b/data/data-pipeline/settings.toml index a30ec4c6..802ad515 100644 --- a/data/data-pipeline/settings.toml +++ b/data/data-pipeline/settings.toml @@ -2,6 +2,8 @@ AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources" AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-versions/2.0" DATASOURCE_RETRIEVAL_FROM_AWS = true +REQUEST_TIMEOUT = 120 +REQUEST_RETRIES = 2 [development]