Run YAML load on all subclasses (#1518)

2025-02-23 01:54:18 -08:00 · 2022-08-02 16:24:38 -04:00 · 2022-08-02 16:24:38 -04:00 · e0a06997e6
commit e0a06997e6
parent bb06628921
5 changed files with 52 additions and 41 deletions
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -98,48 +98,51 @@ class ExtractTransformLoad:
    # It is used on the "load" base class method
    output_df: pd.DataFrame = None

+    def __init_subclass__(cls) -> None:
+        cls.DATASET_CONFIG = cls.yaml_config_load()
+
    @classmethod
    def yaml_config_load(cls) -> dict:
        """Generate config dictionary and set instance variables from YAML dataset."""
-
-        # check if the class instance has score YAML definitions
-        datasets_config = load_yaml_dict_from_file(
-            cls.DATASET_CONFIG / "datasets.yml",
-            DatasetsConfig,
-        )
-
-        # get the config for this dataset
-        try:
-            dataset_config = next(
-                item
-                for item in datasets_config.get("datasets")
-                if item["module_name"] == cls.NAME
+        if cls.NAME is not None:
+            # check if the class instance has score YAML definitions
+            datasets_config = load_yaml_dict_from_file(
+                cls.DATASET_CONFIG / "datasets.yml",
+                DatasetsConfig,
            )
-        except StopIteration:
-            # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
-            logger.error(
-                f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
-            )
-            sys.exit()

-        # set some of the basic fields
-        cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
-            "input_geoid_tract_field_name"
-        ]
+            # get the config for this dataset
+            try:
+                dataset_config = next(
+                    item
+                    for item in datasets_config.get("datasets")
+                    if item["module_name"] == cls.NAME
+                )
+            except StopIteration:
+                # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
+                logger.error(
+                    f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
+                )
+                sys.exit()

-        # get the columns to write on the CSV
-        # and set the constants
-        cls.COLUMNS_TO_KEEP = [
-            cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
-        ]
-        for field in dataset_config["load_fields"]:
-            cls.COLUMNS_TO_KEEP.append(field["long_name"])
+            # set some of the basic fields
+            cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
+                "input_geoid_tract_field_name"
+            ]

-            # set the constants for the class
-            setattr(cls, field["df_field_name"], field["long_name"])
+            # get the columns to write on the CSV
+            # and set the constants
+            cls.COLUMNS_TO_KEEP = [
+                cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+            ]
+            for field in dataset_config["load_fields"]:
+                cls.COLUMNS_TO_KEEP.append(field["long_name"])

-        # return the config dict
-        return dataset_config
+                # set the constants for the class
+                setattr(cls, field["df_field_name"], field["long_name"])
+
+            # return the config dict
+            return dataset_config

    # This is a classmethod so it can be used by `get_data_frame` without
    # needing to create an instance of the class. This is a use case in `etl_score`.
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@@ -99,4 +99,18 @@ datasets:
        include_in_csv: true
        include_in_excel: true
        column_position: 1
- 
+  - long_name: "Example ETL"
+    short_name: "Example"
+    module_name: "example_dataset"
+    description: "An example dataset for documentation"
+    input_geoid_tract_field_name: "GEOID10_TRACT"
+    load_fields:
+      - short_name: "EXAMPLE_FIELD"
+        df_field_name: "Input Field 1"
+        long_name: "Example Field 1" 
+        field_type: float
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
+        column_position: 1
+
--- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
@@ -38,7 +38,6 @@ class ChildOpportunityIndex(ExtractTransformLoad):
    READING_FIELD: str

    def __init__(self):
-        self.DATASET_CONFIG = super().yaml_config_load()
        self.SOURCE_URL = (
            "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
            "3a0ededa30a0?format=csv"
--- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
@@ -19,8 +19,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
    REVISED_ENERGY_BURDEN_FIELD_NAME: str

    def __init__(self):
-        self.DATASET_CONFIG = super().yaml_config_load()
-
        self.OUTPUT_PATH: Path = (
            self.DATA_PATH / "dataset" / "doe_energy_burden"
        )
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -26,9 +26,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
    AGRIVALUE_LOWER_BOUND = 408000

    def __init__(self):
-        # load YAML config
-        self.DATASET_CONFIG = super().yaml_config_load()
-
        # define the full path for the input CSV file
        self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"