mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-08-03 21:24:19 -07:00
Data folder restructuring in preparation for 361 (#376)
* initial checkin * gitignore and docker-compose update * readme update and error on hud * encoding issue * one more small README change * data roadmap re-strcuture * pyproject sort * small update to score output folders * checkpoint * couple of last fixes
This commit is contained in:
parent
3032a8305d
commit
543d147e61
66 changed files with 130 additions and 108 deletions
151
data/data-roadmap/utils/utils_data_set_description_schema.py
Normal file
151
data/data-roadmap/utils/utils_data_set_description_schema.py
Normal file
|
@ -0,0 +1,151 @@
|
|||
import importlib_resources
|
||||
import pathlib
|
||||
import yamale
|
||||
import yaml
|
||||
|
||||
# Set directories.
|
||||
DATA_ROADMAP_DIRECTORY = importlib_resources.files("data_roadmap")
|
||||
UTILS_DIRECTORY = DATA_ROADMAP_DIRECTORY / "utils"
|
||||
DATA_SET_DESCRIPTIONS_DIRECTORY = DATA_ROADMAP_DIRECTORY / "data_set_descriptions"
|
||||
|
||||
# Set file paths.
|
||||
DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH = (
|
||||
DATA_ROADMAP_DIRECTORY / "data_set_description_schema.yaml"
|
||||
)
|
||||
DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH = (
|
||||
DATA_ROADMAP_DIRECTORY / "data_set_description_field_descriptions.yaml"
|
||||
)
|
||||
DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH = (
|
||||
DATA_ROADMAP_DIRECTORY / "data_set_description_template.yaml"
|
||||
)
|
||||
|
||||
|
||||
def load_data_set_description_schema(
|
||||
file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH,
|
||||
) -> yamale.schema.schema.Schema:
|
||||
"""Load from file the data set description schema."""
|
||||
schema = yamale.make_schema(path=file_path)
|
||||
|
||||
return schema
|
||||
|
||||
|
||||
def load_data_set_description_field_descriptions(
|
||||
file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH,
|
||||
) -> dict:
|
||||
"""Load from file the descriptions of fields in the data set description."""
|
||||
# Load field descriptions.
|
||||
with open(file_path, "r") as stream:
|
||||
data_set_description_field_descriptions = yaml.safe_load(stream=stream)
|
||||
|
||||
return data_set_description_field_descriptions
|
||||
|
||||
|
||||
def validate_descriptions_for_schema(
|
||||
schema: yamale.schema.schema.Schema,
|
||||
field_descriptions: dict,
|
||||
) -> None:
|
||||
"""Validate descriptions for schema.
|
||||
|
||||
Checks that every field in the `yamale` schema also has a field
|
||||
description in the `field_descriptions` dict.
|
||||
"""
|
||||
for field_name in schema.dict.keys():
|
||||
if field_name not in field_descriptions:
|
||||
raise ValueError(
|
||||
f"Field `{field_name}` does not have a "
|
||||
f"description. Please add one to file `{DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH}`"
|
||||
)
|
||||
|
||||
for field_name in field_descriptions.keys():
|
||||
if field_name not in schema.dict.keys():
|
||||
raise ValueError(
|
||||
f"Field `{field_name}` has a description but is not in the " f"schema."
|
||||
)
|
||||
|
||||
|
||||
def validate_all_data_set_descriptions(
|
||||
data_set_description_schema: yamale.schema.schema.Schema,
|
||||
) -> None:
|
||||
"""Validate data set descriptions.
|
||||
|
||||
Validate each file in the `data_set_descriptions` directory the schema
|
||||
against the provided schema.
|
||||
|
||||
"""
|
||||
data_set_description_file_paths_generator = DATA_SET_DESCRIPTIONS_DIRECTORY.glob(
|
||||
"*.yaml"
|
||||
)
|
||||
|
||||
# Validate each file
|
||||
for file_path in data_set_description_file_paths_generator:
|
||||
print(f"Validating {file_path}...")
|
||||
|
||||
# Create a yamale Data object
|
||||
data_set_description = yamale.make_data(file_path)
|
||||
|
||||
# TODO: explore collecting all errors and raising them at once. - Lucas
|
||||
yamale.validate(schema=data_set_description_schema, data=data_set_description)
|
||||
|
||||
|
||||
def write_data_set_description_template_file(
|
||||
data_set_description_schema: yamale.schema.schema.Schema,
|
||||
data_set_description_field_descriptions: dict,
|
||||
template_file_path: str = DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH,
|
||||
) -> None:
|
||||
"""Write an example data set description with helpful comments."""
|
||||
template_file_lines = []
|
||||
|
||||
# Write comments at the top of the template
|
||||
template_file_lines.append(
|
||||
"# Note: This template is automatically generated by the function\n"
|
||||
"# `write_data_set_description_template_file` from the schema\n"
|
||||
"# and field descriptions files. Do not manually edit this file.\n\n"
|
||||
)
|
||||
|
||||
schema_dict = data_set_description_schema.dict
|
||||
for field_name, field_schema in schema_dict.items():
|
||||
template_file_lines.append(f"{field_name}: \n")
|
||||
template_file_lines.append(
|
||||
f"# Description: {data_set_description_field_descriptions[field_name]}\n"
|
||||
)
|
||||
template_file_lines.append(f"# Required field: {field_schema.is_required}\n")
|
||||
template_file_lines.append(f"# Field type: {field_schema.get_name()}\n")
|
||||
if type(field_schema) is yamale.validators.validators.Enum:
|
||||
template_file_lines.append(
|
||||
f"# Valid choices are one of the following: {field_schema.enums}\n"
|
||||
)
|
||||
|
||||
# Add an empty linebreak to separate fields.
|
||||
template_file_lines.append("\n")
|
||||
|
||||
with open(template_file_path, "w") as file:
|
||||
file.writelines(template_file_lines)
|
||||
|
||||
|
||||
def run_validations_and_write_template() -> None:
|
||||
"""Run validations of schema and descriptions, and write a template file."""
|
||||
# Load the schema and a separate dictionary
|
||||
data_set_description_schema = load_data_set_description_schema()
|
||||
data_set_description_field_descriptions = (
|
||||
load_data_set_description_field_descriptions()
|
||||
)
|
||||
|
||||
validate_descriptions_for_schema(
|
||||
schema=data_set_description_schema,
|
||||
field_descriptions=data_set_description_field_descriptions,
|
||||
)
|
||||
|
||||
# Validate all data set descriptions in the directory against schema.
|
||||
validate_all_data_set_descriptions(
|
||||
data_set_description_schema=data_set_description_schema
|
||||
)
|
||||
|
||||
# Write an example template for data set descriptions.
|
||||
write_data_set_description_template_file(
|
||||
data_set_description_schema=data_set_description_schema,
|
||||
data_set_description_field_descriptions=data_set_description_field_descriptions,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_validations_and_write_template()
|
Loading…
Add table
Add a link
Reference in a new issue