diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py
index 21f008cc..d9372191 100644
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -98,48 +98,51 @@ class ExtractTransformLoad:
     # It is used on the "load" base class method
     output_df: pd.DataFrame = None
 
+    def __init_subclass__(cls) -> None:
+        cls.DATASET_CONFIG = cls.yaml_config_load()
+
     @classmethod
     def yaml_config_load(cls) -> dict:
         """Generate config dictionary and set instance variables from YAML dataset."""
-
-        # check if the class instance has score YAML definitions
-        datasets_config = load_yaml_dict_from_file(
-            cls.DATASET_CONFIG / "datasets.yml",
-            DatasetsConfig,
-        )
-
-        # get the config for this dataset
-        try:
-            dataset_config = next(
-                item
-                for item in datasets_config.get("datasets")
-                if item["module_name"] == cls.NAME
+        if cls.NAME is not None:
+            # check if the class instance has score YAML definitions
+            datasets_config = load_yaml_dict_from_file(
+                cls.DATASET_CONFIG / "datasets.yml",
+                DatasetsConfig,
             )
-        except StopIteration:
-            # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
-            logger.error(
-                f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
-            )
-            sys.exit()
 
-        # set some of the basic fields
-        cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
-            "input_geoid_tract_field_name"
-        ]
+            # get the config for this dataset
+            try:
+                dataset_config = next(
+                    item
+                    for item in datasets_config.get("datasets")
+                    if item["module_name"] == cls.NAME
+                )
+            except StopIteration:
+                # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
+                logger.error(
+                    f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
+                )
+                sys.exit()
 
-        # get the columns to write on the CSV
-        # and set the constants
-        cls.COLUMNS_TO_KEEP = [
-            cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
-        ]
-        for field in dataset_config["load_fields"]:
-            cls.COLUMNS_TO_KEEP.append(field["long_name"])
+            # set some of the basic fields
+            cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
+                "input_geoid_tract_field_name"
+            ]
 
-            # set the constants for the class
-            setattr(cls, field["df_field_name"], field["long_name"])
+            # get the columns to write on the CSV
+            # and set the constants
+            cls.COLUMNS_TO_KEEP = [
+                cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+            ]
+            for field in dataset_config["load_fields"]:
+                cls.COLUMNS_TO_KEEP.append(field["long_name"])
 
-        # return the config dict
-        return dataset_config
+                # set the constants for the class
+                setattr(cls, field["df_field_name"], field["long_name"])
+
+            # return the config dict
+            return dataset_config
 
     # This is a classmethod so it can be used by `get_data_frame` without
     # needing to create an instance of the class. This is a use case in `etl_score`.
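Note on the hunk above: `__init_subclass__` is a standard Python hook that runs once at class-definition time for every subclass, with `cls` bound to the new subclass rather than to `ExtractTransformLoad`. Moving the `yaml_config_load()` call there means each source ETL's YAML config is loaded the moment its class statement executes (i.e. at import time), which is why the method body is now wrapped in `if cls.NAME is not None:` — subclasses that define no `NAME` simply skip config loading. A minimal sketch of the mechanism, using illustrative names (`Base`, `Child`, `CONFIG`) that are not from this codebase:

    class Base:
        CONFIG: dict = None

        def __init_subclass__(cls, **kwargs) -> None:
            super().__init_subclass__(**kwargs)
            # Runs once per subclass, at class-definition time;
            # the attribute lands on the subclass, not on Base.
            cls.CONFIG = {"name": cls.__name__}

    class Child(Base):
        pass

    assert Child.CONFIG == {"name": "Child"}  # set without instantiating anything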
diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
index b348246f..ed609ab5 100644
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@@ -99,4 +99,18 @@ datasets:
         include_in_csv: true
         include_in_excel: true
         column_position: 1
-
\ No newline at end of file
+  - long_name: "Example ETL"
+    short_name: "Example"
+    module_name: "example_dataset"
+    description: "An example dataset for documentation"
+    input_geoid_tract_field_name: "GEOID10_TRACT"
+    load_fields:
+      - short_name: "EXAMPLE_FIELD"
+        df_field_name: "Input Field 1"
+        long_name: "Example Field 1"
+        field_type: float
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
+        column_position: 1
+
diff --git a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
index 6f55458c..beace420 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/child_opportunity_index/etl.py
@@ -38,7 +38,6 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     READING_FIELD: str
 
     def __init__(self):
-        self.DATASET_CONFIG = super().yaml_config_load()
         self.SOURCE_URL = (
             "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
             "3a0ededa30a0?format=csv"
diff --git a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
index 6250aaff..0f67c402 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/doe_energy_burden/etl.py
@@ -19,8 +19,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
     REVISED_ENERGY_BURDEN_FIELD_NAME: str
 
     def __init__(self):
-        self.DATASET_CONFIG = super().yaml_config_load()
-
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "doe_energy_burden"
         )
diff --git a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
index 1129f17c..b25989bf 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@@ -26,9 +26,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     AGRIVALUE_LOWER_BOUND = 408000
 
     def __init__(self):
-        # load YAML config
-        self.DATASET_CONFIG = super().yaml_config_load()
-
         # define the full path for the input CSV file
         self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
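Note on the source-ETL hunks above: the three deletions are the payoff of the base-class change. By the time any `__init__` runs, `__init_subclass__` has already populated `DATASET_CONFIG` and the per-field class constants, so `self.DATASET_CONFIG = super().yaml_config_load()` would now be a redundant second load. A sketch of what a source ETL looks like after this change; the class name and paths here are hypothetical, not part of this patch:

    class ExampleDatasetETL(ExtractTransformLoad):
        # must match a module_name entry in datasets.yml
        NAME = "example_dataset"

        def __init__(self):
            # no explicit yaml_config_load() call needed; the base-class
            # hook already ran when this class statement was executed
            self.OUTPUT_PATH = self.DATA_PATH / "dataset" / self.NAME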