j40-cejst-2/data/data-pipeline/data_pipeline/etl/downloader.py
Travis Newby 6f39033dde
Add ability to cache ETL data sources (#2169)
* Add a rough prototype allowing a developer to pre-download data sources for all ETLs

* Update code to be more production-ish

* Move fetch to Extract part of ETL
* Create a downloader to house all downloading operations
* Remove unnecessary "name" in data source

* Format source files with black

* Fix issues from pylint and get the tests working with the new folder structure

* Clean up files with black

* Fix unzip test

* Add caching notes to README

* Fix tests (linting and case sensitivity bug)

* Address PR comments and add API keys for census where missing

* Merging comparator changes from main into this branch for the sake of the PR

* Add note on using cache (-u) during pipeline
2023-03-03 12:26:24 -06:00

95 lines
2.9 KiB
Python

import uuid
import urllib3
import requests
import zipfile
import shutil
from pathlib import Path
from data_pipeline.config import settings
class Downloader:
    """A simple class to encapsulate the download capabilities of the application"""

    @classmethod
    def download_file_from_url(
        cls,
        file_url: str,
        download_file_name: Path,
        verify: bool = True,
    ) -> Path:
        """Downloads a file from a remote URL location and returns the file location.

        The parent directory of ``download_file_name`` is created if it does
        not exist yet. The whole response body is held in memory before it is
        written to disk.

        Args:
            file_url (str): URL where the file is located
            download_file_name (pathlib.Path): file path where the file will be
                written
            verify (bool): A flag to check if the certificate is valid. If truthy,
                an invalid certificate will throw an error (optional, defaults
                to True)

        Returns:
            pathlib.Path: the path the file was written to (``download_file_name``)

        Raises:
            Exception: if the server answers with any status code other than 200
        """
        # Silence urllib3's InsecureRequestWarning. Note this is a
        # process-wide setting, not one scoped to this request.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        download_file_name.parent.mkdir(parents=True, exist_ok=True)

        # The setting name is spelled exactly as it is referenced here;
        # do not "correct" it without changing the settings definition too.
        response = requests.get(
            file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
        )

        if response.status_code != 200:
            raise Exception(
                f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
            )

        # write_bytes opens, writes and closes the file in one step, so the
        # handle cannot leak if the write fails part-way.
        download_file_name.write_bytes(response.content)

        return download_file_name

    @classmethod
    def download_zip_file_from_url(
        cls,
        file_url: str,
        unzipped_file_path: Path,
        verify: bool = True,
    ) -> None:
        """Downloads a zip file from a remote URL location and unzips it in a specific directory, removing the temporary file after

        Args:
            file_url (str): URL where the zip file is located
            unzipped_file_path (pathlib.Path): directory the archive is
                extracted into
            verify (bool): A flag to check if the certificate is valid. If truthy,
                an invalid certificate will throw an error (optional, defaults
                to True)

        Returns:
            None

        Raises:
            Exception: if the download answers with a non-200 status code
            zipfile.BadZipFile: if the downloaded file is not a valid zip archive
        """
        # dir_id allows us to evade race conditions on parallel ETLs
        dir_id = uuid.uuid4()

        zip_download_path = (
            settings.DATA_PATH
            / "tmp"
            / "downloads"
            / f"{dir_id}"
            / "download.zip"
        )

        try:
            zip_file_path = cls.download_file_from_url(
                file_url=file_url,
                download_file_name=zip_download_path,
                verify=verify,
            )

            with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
                zip_ref.extractall(unzipped_file_path)
        finally:
            # Cleanup the temporary file and directory even when the download
            # or the extraction fails. ignore_errors keeps a cleanup problem
            # (e.g. the directory was never created) from masking the
            # original exception.
            shutil.rmtree(zip_download_path.parent, ignore_errors=True)