j40-cejst-2/data/data-pipeline/etl/base.py
Jorge Escobar 543d147e61
Data folder restructuring in preparation for 361 (#376)
* initial checkin

* gitignore and docker-compose update

* readme update and error on hud

* encoding issue

* one more small README change

* data roadmap re-strcuture

* pyproject sort

* small update to score output folders

* checkpoint

* couple of last fixes
2021-07-20 14:55:39 -04:00

63 lines
2 KiB
Python

from pathlib import Path
import pathlib
from config import settings
from utils import unzip_file_from_url, remove_all_from_dir
class ExtractTransformLoad(object):
"""
A class used to instantiate an ETL object to retrieve and process data from
datasets.
Attributes:
DATA_PATH (pathlib.Path): Local path where all data will be stored
TMP_PATH (pathlib.Path): Local path where temporary data will be stored
GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier
GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier
"""
DATA_PATH: Path = settings.APP_ROOT / "data"
TMP_PATH: Path = DATA_PATH / "tmp"
GEOID_FIELD_NAME: str = "GEOID10"
GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
def get_yaml_config(self) -> None:
"""Reads the YAML configuration file for the dataset and stores
the properies in the instance (upcoming feature)"""
pass
def check_ttl(self) -> None:
"""Checks if the ETL process can be run based on a the TLL value on the
YAML config (upcoming feature)"""
pass
def extract(
self, source_url: str = None, extract_path: Path = None
) -> None:
"""Extract the data from
a remote source. By default it provides code to get the file from a source url,
unzips it and stores it on an extract_path."""
# this can be accessed via super().extract()
if source_url and extract_path:
unzip_file_from_url(source_url, self.TMP_PATH, extract_path)
def transform(self) -> None:
"""Transform the data extracted into a format that can be consumed by the
score generator"""
raise NotImplementedError
def load(self) -> None:
"""Saves the transformed data in the specified local data folder or remote AWS S3
bucket"""
raise NotImplementedError
def cleanup(self) -> None:
"""Clears out any files stored in the TMP folder"""
remove_all_from_dir(self.TMP_PATH)