Mirror of https://github.com/DOI-DO/j40-cejst-2.git
Run YAML load on all subclasses (#1518)
parent bb06628921
commit e0a06997e6
5 changed files with 52 additions and 41 deletions
@@ -98,48 +98,51 @@ class ExtractTransformLoad:
     # It is used on the "load" base class method
     output_df: pd.DataFrame = None

+    def __init_subclass__(cls) -> None:
+        cls.DATASET_CONFIG = cls.yaml_config_load()

     @classmethod
     def yaml_config_load(cls) -> dict:
         """Generate config dictionary and set instance variables from YAML dataset."""
-        # check if the class instance has score YAML definitions
-        datasets_config = load_yaml_dict_from_file(
-            cls.DATASET_CONFIG / "datasets.yml",
-            DatasetsConfig,
-        )
+        if cls.NAME is not None:
+            # check if the class instance has score YAML definitions
+            datasets_config = load_yaml_dict_from_file(
+                cls.DATASET_CONFIG / "datasets.yml",
+                DatasetsConfig,
+            )

-        # get the config for this dataset
-        try:
-            dataset_config = next(
-                item
-                for item in datasets_config.get("datasets")
-                if item["module_name"] == cls.NAME
-            )
-        except StopIteration:
-            # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
-            logger.error(
-                f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
-            )
-            sys.exit()
+            # get the config for this dataset
+            try:
+                dataset_config = next(
+                    item
+                    for item in datasets_config.get("datasets")
+                    if item["module_name"] == cls.NAME
+                )
+            except StopIteration:
+                # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
+                logger.error(
+                    f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
+                )
+                sys.exit()

-        # set some of the basic fields
-        cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
-            "input_geoid_tract_field_name"
-        ]
+            # set some of the basic fields
+            cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
+                "input_geoid_tract_field_name"
+            ]

-        # get the columns to write on the CSV
-        # and set the constants
-        cls.COLUMNS_TO_KEEP = [
-            cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
-        ]
-        for field in dataset_config["load_fields"]:
-            cls.COLUMNS_TO_KEEP.append(field["long_name"])
+            # get the columns to write on the CSV
+            # and set the constants
+            cls.COLUMNS_TO_KEEP = [
+                cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+            ]
+            for field in dataset_config["load_fields"]:
+                cls.COLUMNS_TO_KEEP.append(field["long_name"])

-            # set the constants for the class
-            setattr(cls, field["df_field_name"], field["long_name"])
+                # set the constants for the class
+                setattr(cls, field["df_field_name"], field["long_name"])

-        # return the config dict
-        return dataset_config
+            # return the config dict
+            return dataset_config

     # This is a classmethod so it can be used by `get_data_frame` without
     # needing to create an instance of the class. This is a use case in `etl_score`.
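The mechanism behind this hunk is Python's __init_subclass__ hook: it runs once for each subclass at class-definition time, before any instance exists, so DATASET_CONFIG can be populated for every ETL without its __init__ calling the loader. Below is a minimal, self-contained sketch of that behavior; Base, ExampleETL, config_load, and the dict literal are illustrative stand-ins, not code from this commit.

    class Base:
        NAME = None
        DATASET_CONFIG = None

        def __init_subclass__(cls) -> None:
            # Runs when each subclass body finishes executing, i.e. at import
            # time, not when an instance is constructed.
            super().__init_subclass__()
            cls.DATASET_CONFIG = cls.config_load()

        @classmethod
        def config_load(cls) -> dict:
            # Stand-in for yaml_config_load(); the NAME guard matters because
            # the hook now runs for every subclass, including any that have
            # no YAML entry.
            if cls.NAME is not None:
                return {"module_name": cls.NAME}
            return None


    class ExampleETL(Base):
        NAME = "example_dataset"


    print(ExampleETL.DATASET_CONFIG)  # {'module_name': 'example_dataset'}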
@@ -99,4 +99,18 @@ datasets:
         include_in_csv: true
         include_in_excel: true
         column_position: 1
+  - long_name: "Exaple ETL"
+    short_name: "Example"
+    module_name: "example_dataset"
+    description: "An example dataset for documentation"
+    input_geoid_tract_field_name: "GEOID10_TRACT"
+    load_fields:
+      - short_name: "EXAMPLE_FIELD"
+        df_field_name: "Input Field 1"
+        long_name: "Example Field 1"
+        field_type: float
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
+        column_position: 1
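For reference, the lookup that yaml_config_load performs against an entry like the one added above can be sketched in isolation. The snippet below assumes PyYAML is available and parses an inline string purely to stay self-contained; the project itself loads datasets.yml through load_yaml_dict_from_file with a DatasetsConfig schema, as the first hunk shows.

    import textwrap

    import yaml

    EXAMPLE_YAML = textwrap.dedent(
        """\
        datasets:
          - module_name: "example_dataset"
            input_geoid_tract_field_name: "GEOID10_TRACT"
            load_fields:
              - short_name: "EXAMPLE_FIELD"
                df_field_name: "Input Field 1"
                long_name: "Example Field 1"
                field_type: float
        """
    )

    datasets_config = yaml.safe_load(EXAMPLE_YAML)

    # Same selection the base class makes: the entry whose module_name
    # matches the ETL class's NAME.
    dataset_config = next(
        item
        for item in datasets_config.get("datasets")
        if item["module_name"] == "example_dataset"
    )

    # COLUMNS_TO_KEEP is then extended with each field's long_name.
    columns_to_keep = [field["long_name"] for field in dataset_config["load_fields"]]
    print(columns_to_keep)  # ['Example Field 1']

In the base class the same loop also calls setattr(cls, field["df_field_name"], field["long_name"]), so each configured field's long name is exposed as an attribute on the ETL class.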
@@ -38,7 +38,6 @@ class ChildOpportunityIndex(ExtractTransformLoad):
     READING_FIELD: str

     def __init__(self):
-        self.DATASET_CONFIG = super().yaml_config_load()
         self.SOURCE_URL = (
             "https://data.diversitydatakids.org/datastore/zip/f16fff12-b1e5-4f60-85d3-"
             "3a0ededa30a0?format=csv"
@@ -19,8 +19,6 @@ class DOEEnergyBurden(ExtractTransformLoad):
     REVISED_ENERGY_BURDEN_FIELD_NAME: str

     def __init__(self):
-        self.DATASET_CONFIG = super().yaml_config_load()
-
         self.OUTPUT_PATH: Path = (
             self.DATA_PATH / "dataset" / "doe_energy_burden"
         )
@@ -26,9 +26,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     AGRIVALUE_LOWER_BOUND = 408000

     def __init__(self):
-        # load YAML config
-        self.DATASET_CONFIG = super().yaml_config_load()
-
         # define the full path for the input CSV file
         self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"

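The three hunks above are the subclass side of the same change: ChildOpportunityIndex, DOEEnergyBurden, and NationalRiskIndexETL stop assigning self.DATASET_CONFIG in __init__, because the base-class hook has already populated it by the time each class is defined. A compact sketch of that ordering, again with stand-in names rather than the project's classes:

    class ConfiguredBase:
        NAME = None
        DATASET_CONFIG = None

        def __init_subclass__(cls) -> None:
            super().__init_subclass__()
            if cls.NAME is not None:
                # Stand-in for the YAML-backed yaml_config_load().
                cls.DATASET_CONFIG = {"module_name": cls.NAME}


    class ExampleETL(ConfiguredBase):
        NAME = "example_dataset"

        def __init__(self):
            # No `self.DATASET_CONFIG = super().yaml_config_load()` needed:
            # the class attribute was filled in when the class was created.
            self.output_path = f"dataset/{self.DATASET_CONFIG['module_name']}"


    print(ExampleETL().output_path)  # dataset/example_dataset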