Improve download retry logic

This commit is contained in:
Carlos Felix 2025-01-07 14:18:58 -05:00 committed by Carlos Felix
parent 9e33932600
commit d4898b8f55
4 changed files with 30 additions and 7 deletions

View file

@ -12,7 +12,8 @@ settings = Dynaconf(
# set root dir
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
settings.DATA_PATH = settings.APP_ROOT / "data"
settings.REQUESTS_DEFAULT_TIMOUT = 3600
settings.REQUESTS_DEFAULT_TIMOUT = 300
settings.REQUESTS_DEFAULT_RETRIES = 3
# To set an environment use:
# Linux/OSX: export ENV_FOR_DYNACONF=staging
# Windows: set ENV_FOR_DYNACONF=staging

View file

@ -12,13 +12,26 @@ from tenacity import retry, stop_after_attempt, wait_exponential
logger = get_module_logger(__name__)
def _log_retry_failure(retry_state):
    """Log a warning that a download attempt failed and will be retried.

    Intended for use as a tenacity ``before_sleep`` callback.

    Args:
        retry_state: the tenacity retry call state for the failed attempt;
            only its ``kwargs`` mapping is read here.
    """
    # The URL is only present in kwargs when the caller passed it by keyword.
    # Use .get() so a positional call does not raise KeyError from inside the
    # retry hook, which would abort the retry loop instead of logging.
    file_url = retry_state.kwargs.get("file_url", "<unknown URL>")
    logger.warning(f"Failure downloading {file_url}. Will retry.")
class Downloader:
"""A simple class to encapsulate the download capabilities of the application"""
num_retries = (
settings.REQUEST_RETRIES
if "REQUEST_RETRIES" in settings
else settings.REQUESTS_DEFAULT_RETRIES
)
@classmethod
@retry(
stop=stop_after_attempt(3),
stop=stop_after_attempt(num_retries),
wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=_log_retry_failure,
)
def download_file_from_url(
cls,
@ -43,9 +56,12 @@ class Downloader:
download_file_name.parent.mkdir(parents=True, exist_ok=True)
logger.debug(f"Downloading {file_url}")
response = requests.get(
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
timeout = (
settings.REQUEST_TIMEOUT
if "REQUEST_TIMEOUT" in settings
else settings.REQUESTS_DEFAULT_TIMOUT
)
response = requests.get(file_url, verify=verify, timeout=timeout)
if response.status_code == 200:
file_contents = response.content
logger.debug("Downloaded.")
@ -64,8 +80,9 @@ class Downloader:
@classmethod
@retry(
stop=stop_after_attempt(3),
stop=stop_after_attempt(num_retries),
wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=_log_retry_failure,
)
def download_zip_file_from_url(
cls,

View file

@ -147,9 +147,12 @@ def download_file_from_url(
if not os.path.isdir(download_file_name.parent):
os.mkdir(download_file_name.parent)
response = requests.get(
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
timeout = (
settings.REQUEST_TIMEOUT
if "REQUEST_TIMEOUT" in settings
else settings.REQUESTS_DEFAULT_TIMOUT
)
response = requests.get(file_url, verify=verify, timeout=timeout)
if response.status_code == 200:
file_contents = response.content
else:

View file

@ -2,6 +2,8 @@
AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-versions/2.0"
DATASOURCE_RETRIEVAL_FROM_AWS = true
REQUEST_TIMEOUT = 120
REQUEST_RETRIES = 2
[development]