mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-09-11 13:18:19 -07:00
Changes to allow local runs
This commit is contained in:
parent
d298f7dedb
commit
ff9e7b9aa2
11 changed files with 3231 additions and 1867 deletions
7
data/data-pipeline/data_pipeline/constants.py
Normal file
7
data/data-pipeline/data_pipeline/constants.py
Normal file
|
@@ -0,0 +1,7 @@
|
|||
import logging
|
||||
|
||||
LOG_LEVEL = logging.DEBUG
|
||||
"""Log level for all loggers."""
|
||||
|
||||
NO_SSL_VERIFY = True
|
||||
"""Set to true to skip SSL verification when downloading files. Useful for local development."""
|
|
@@ -22,6 +22,7 @@ from pathlib import Path
|
|||
from typing import List
|
||||
from dataclasses import dataclass
|
||||
from abc import ABC, abstractmethod
|
||||
from data_pipeline.constants import NO_SSL_VERIFY
|
||||
|
||||
from data_pipeline.etl.downloader import Downloader
|
||||
from data_pipeline.etl.sources.census_acs.etl_utils import (
|
||||
|
@@ -65,7 +66,7 @@ class FileDataSource(DataSource):
|
|||
Downloader.download_file_from_url(
|
||||
file_url=self.source,
|
||||
download_file_name=self.destination,
|
||||
verify=True,
|
||||
verify=not NO_SSL_VERIFY,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
|
@@ -85,7 +86,7 @@ class ZIPDataSource(DataSource):
|
|||
Downloader.download_zip_file_from_url(
|
||||
file_url=self.source,
|
||||
unzipped_file_path=self.destination,
|
||||
verify=True,
|
||||
verify=not NO_SSL_VERIFY,
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
|
|
|
@@ -6,7 +6,9 @@ import shutil
|
|||
|
||||
from pathlib import Path
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.utils import get_module_logger
|
||||
|
||||
logger = get_module_logger(__name__)
|
||||
|
||||
class Downloader:
|
||||
"""A simple class to encapsulate the download capabilities of the application"""
|
||||
|
@@ -34,12 +36,13 @@ class Downloader:
|
|||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
download_file_name.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.debug(f"Downloading {file_url}")
|
||||
response = requests.get(
|
||||
file_url, verify=verify, timeout=settings.REQUESTS_DEFAULT_TIMOUT
|
||||
)
|
||||
if response.status_code == 200:
|
||||
file_contents = response.content
|
||||
logger.debug("Downloaded.")
|
||||
else:
|
||||
raise Exception(
|
||||
f"HTTP response {response.status_code} from url {file_url}. Info: {response.content}"
|
||||
|
|
|
@@ -215,6 +215,7 @@ class CensusETL(ExtractTransformLoad):
|
|||
state_gdf = gpd.read_file(file_name)
|
||||
usa_df = usa_df.append(state_gdf)
|
||||
|
||||
logger.debug("Converting to CRS")
|
||||
usa_df = usa_df.to_crs(
|
||||
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
|
||||
)
|
||||
|
|
|
@@ -13,6 +13,7 @@ import requests
|
|||
import urllib3
|
||||
import yaml
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.constants import LOG_LEVEL
|
||||
from data_pipeline.content.schemas.download_schemas import CodebookConfig
|
||||
from data_pipeline.content.schemas.download_schemas import CSVConfig
|
||||
from data_pipeline.content.schemas.download_schemas import ExcelConfig
|
||||
|
@@ -48,7 +49,7 @@ def get_module_logger(module_name: str) -> logging.Logger:
|
|||
)
|
||||
handler.setFormatter(formatter)
|
||||
logger.addHandler(handler)
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.setLevel(LOG_LEVEL)
|
||||
logger.propagate = False # don't send log messages to the parent logger (to avoid duplicate log messages)
|
||||
return logger
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue