Improve download retry logic

This commit is contained in:
Carlos Felix 2025-01-07 14:18:58 -05:00 committed by Carlos Felix
parent 9e33932600
commit d4898b8f55
4 changed files with 30 additions and 7 deletions

View file

@ -12,7 +12,8 @@ settings = Dynaconf(
# set root dir # set root dir
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
settings.DATA_PATH = settings.APP_ROOT / "data" settings.DATA_PATH = settings.APP_ROOT / "data"
settings.REQUESTS_DEFAULT_TIMOUT = 3600 settings.REQUESTS_DEFAULT_TIMOUT = 300
settings.REQUESTS_DEFAULT_RETRIES = 3
# To set an environment use: # To set an environment use:
# Linux/OSX: export ENV_FOR_DYNACONF=staging # Linux/OSX: export ENV_FOR_DYNACONF=staging
# Windows: set ENV_FOR_DYNACONF=staging # Windows: set ENV_FOR_DYNACONF=staging

View file

@ -12,13 +12,26 @@ from tenacity import retry, stop_after_attempt, wait_exponential
logger = get_module_logger(__name__) logger = get_module_logger(__name__)
def _log_retry_failure(retry_state):
    """Log a warning that a download attempt failed and will be retried.

    Used as the ``before_sleep`` callback of the ``@retry`` decorators on the
    Downloader methods, so it is invoked between a failed attempt and the
    next one.

    Args:
        retry_state: the retry call state handed to the callback. Only its
            ``kwargs`` mapping is read, to find the ``file_url`` keyword
            argument of the call being retried.
    """
    # The URL is only present in kwargs when the caller passed it by keyword.
    # Use a non-raising lookup so that a positional call cannot turn this
    # logging hook into a KeyError that masks the real download failure.
    file_url = retry_state.kwargs.get("file_url", "<unknown URL>")
    logger.warning(f"Failure downloading {file_url}. Will retry.")
class Downloader: class Downloader:
"""A simple class to encapsulate the download capabilities of the application""" """A simple class to encapsulate the download capabilities of the application"""
num_retries = (
settings.REQUEST_RETRIES
if "REQUEST_RETRIES" in settings
else settings.REQUESTS_DEFAULT_RETRIES
)
@classmethod @classmethod
@retry( @retry(
stop=stop_after_attempt(3), stop=stop_after_attempt(num_retries),
wait=wait_exponential(multiplier=1, min=4, max=10), wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=_log_retry_failure,
) )
def download_file_from_url( def download_file_from_url(
cls, cls,
@ -43,9 +56,12 @@ class Downloader:
download_file_name.parent.mkdir(parents=True, exist_ok=True) download_file_name.parent.mkdir(parents=True, exist_ok=True)
logger.debug(f"Downloading {file_url}") logger.debug(f"Downloading {file_url}")
response = requests.get( timeout = (
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT settings.REQUEST_TIMEOUT
if "REQUEST_TIMEOUT" in settings
else settings.REQUESTS_DEFAULT_TIMOUT
) )
response = requests.get(file_url, verify=verify, timeout=timeout)
if response.status_code == 200: if response.status_code == 200:
file_contents = response.content file_contents = response.content
logger.debug("Downloaded.") logger.debug("Downloaded.")
@ -64,8 +80,9 @@ class Downloader:
@classmethod @classmethod
@retry( @retry(
stop=stop_after_attempt(3), stop=stop_after_attempt(num_retries),
wait=wait_exponential(multiplier=1, min=4, max=10), wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=_log_retry_failure,
) )
def download_zip_file_from_url( def download_zip_file_from_url(
cls, cls,

View file

@ -147,9 +147,12 @@ def download_file_from_url(
if not os.path.isdir(download_file_name.parent): if not os.path.isdir(download_file_name.parent):
os.mkdir(download_file_name.parent) os.mkdir(download_file_name.parent)
response = requests.get( timeout = (
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT settings.REQUEST_TIMEOUT
if "REQUEST_TIMEOUT" in settings
else settings.REQUESTS_DEFAULT_TIMOUT
) )
response = requests.get(file_url, verify=verify, timeout=timeout)
if response.status_code == 200: if response.status_code == 200:
file_contents = response.content file_contents = response.content
else: else:

View file

@ -2,6 +2,8 @@
AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources" AWS_JUSTICE40_DATASOURCES_URL = "https://justice40-data.s3.amazonaws.com/data-sources"
AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-versions/2.0" AWS_JUSTICE40_DATAPIPELINE_URL = "https://justice40-data.s3.amazonaws.com/data-versions/2.0"
DATASOURCE_RETRIEVAL_FROM_AWS = true DATASOURCE_RETRIEVAL_FROM_AWS = true
REQUEST_TIMEOUT = 120
REQUEST_RETRIES = 2
[development] [development]