Data directory should adopt standard Poetry-suggested Python package structure (#457)

* Fixes #456 - Our data directory should adopt standard Python package structure * a few missed references * updating README * updating requirements * Running Black * Fixes for flake8 * updating pylint
2025-07-26 09:51:16 -07:00 · 2021-08-05 15:35:54 -04:00 · 2021-08-05 15:35:54 -04:00 · c1568e87c0
commit c1568e87c0
parent 4d7465c833
61 changed files with 1273 additions and 1256 deletions
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+from data_pipeline.config import settings
+from data_pipeline.utils import unzip_file_from_url, remove_all_from_dir
+
+
+class ExtractTransformLoad:
+    """
+    Base class for ETL (extract, transform, load) objects that retrieve and
+    process data from datasets.
+
+    Attributes:
+        DATA_PATH (pathlib.Path): Local path where all data will be stored
+        TMP_PATH (pathlib.Path): Local path where temporary data will be stored
+        GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier
+        GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier
+    """
+
+    DATA_PATH: Path = settings.APP_ROOT / "data"
+    TMP_PATH: Path = DATA_PATH / "tmp"
+    GEOID_FIELD_NAME: str = "GEOID10"
+    GEOID_TRACT_FIELD_NAME: str = "GEOID10_TRACT"
+
+    def get_yaml_config(self) -> None:
+        """Reads the YAML configuration file for the dataset and stores
+        the properties in the instance (upcoming feature; currently a no-op)"""
+
+        pass
+
+    def check_ttl(self) -> None:
+        """Checks if the ETL process can be run based on the TTL value in the
+        YAML config (upcoming feature; currently a no-op)"""
+
+        pass
+
+    def extract(self, source_url: str = None, extract_path: Path = None) -> None:
+        """Extract the data from a remote source. When both source_url and
+        extract_path are given, the file is fetched from source_url, unzipped and
+        stored in extract_path; when either is missing, nothing is done."""
+
+        # subclasses can reuse this default behavior via super().extract()
+        if source_url and extract_path:
+            unzip_file_from_url(source_url, self.TMP_PATH, extract_path)
+
+    def transform(self) -> None:
+        """Transform the data extracted into a format that can be consumed by the
+        score generator. Not implemented here: raises NotImplementedError"""
+
+        raise NotImplementedError
+
+    def load(self) -> None:
+        """Saves the transformed data in the specified local data folder or remote AWS S3
+        bucket. Not implemented here: raises NotImplementedError"""
+
+        raise NotImplementedError
+
+    def cleanup(self) -> None:
+        """Clears out any files stored in the TMP folder (TMP_PATH)"""
+
+        remove_all_from_dir(self.TMP_PATH)