mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-25 18:10:16 -07:00
Data directory should adopt standard Poetry-suggested python package structure (#457)
* Fixes #456 - Our data directory should adopt standard python package structure * a few missed references * updating readme * updating requirements * Running Black * Fixes for flake8 * updating pylint
This commit is contained in:
parent
4d7465c833
commit
c1568e87c0
61 changed files with 1273 additions and 1256 deletions
60
data/data-pipeline/data_pipeline/etl/base.py
Normal file
60
data/data-pipeline/data_pipeline/etl/base.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
from pathlib import Path
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.utils import unzip_file_from_url, remove_all_from_dir
|
||||
|
||||
|
||||
class ExtractTransformLoad:
|
||||
"""
|
||||
A class used to instantiate an ETL object to retrieve and process data from
|
||||
datasets.
|
||||
|
||||
Attributes:
|
||||
DATA_PATH (pathlib.Path): Local path where all data will be stored
|
||||
TMP_PATH (pathlib.Path): Local path where temporary data will be stored
|
||||
GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier
|
||||
GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier
|
||||
"""
|
||||
|
||||
DATA_PATH: Path = settings.APP_ROOT / "data"
|
||||
TMP_PATH: Path = DATA_PATH / "tmp"
|
||||
GEOID_FIELD_NAME: str = "GEOID10"
|
||||
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
|
||||
|
||||
def get_yaml_config(self) -> None:
|
||||
"""Reads the YAML configuration file for the dataset and stores
|
||||
the properies in the instance (upcoming feature)"""
|
||||
|
||||
pass
|
||||
|
||||
def check_ttl(self) -> None:
|
||||
"""Checks if the ETL process can be run based on a the TLL value on the
|
||||
YAML config (upcoming feature)"""
|
||||
|
||||
pass
|
||||
|
||||
def extract(self, source_url: str = None, extract_path: Path = None) -> None:
|
||||
"""Extract the data from
|
||||
a remote source. By default it provides code to get the file from a source url,
|
||||
unzips it and stores it on an extract_path."""
|
||||
|
||||
# this can be accessed via super().extract()
|
||||
if source_url and extract_path:
|
||||
unzip_file_from_url(source_url, self.TMP_PATH, extract_path)
|
||||
|
||||
def transform(self) -> None:
|
||||
"""Transform the data extracted into a format that can be consumed by the
|
||||
score generator"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def load(self) -> None:
|
||||
"""Saves the transformed data in the specified local data folder or remote AWS S3
|
||||
bucket"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def cleanup(self) -> None:
|
||||
"""Clears out any files stored in the TMP folder"""
|
||||
|
||||
remove_all_from_dir(self.TMP_PATH)
|
Loading…
Add table
Add a link
Reference in a new issue