mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-31 11:41:17 -07:00
Data folder restructuring in preparation for 361 (#376)
* initial checkin * gitignore and docker-compose update * readme update and error on hud * encoding issue * one more small README change * data roadmap re-structure * pyproject sort * small update to score output folders * checkpoint * couple of last fixes
This commit is contained in:
parent
3032a8305d
commit
543d147e61
66 changed files with 130 additions and 108 deletions
151
data/data-roadmap/utils/utils_data_set_description_schema.py
Normal file
151
data/data-roadmap/utils/utils_data_set_description_schema.py
Normal file
|
@ -0,0 +1,151 @@
|
|||
import importlib_resources
|
||||
import pathlib
|
||||
import yamale
|
||||
import yaml
|
||||
|
||||
# Root of the `data_roadmap` package and the sub-directories used below.
DATA_ROADMAP_DIRECTORY = importlib_resources.files("data_roadmap")
UTILS_DIRECTORY = DATA_ROADMAP_DIRECTORY.joinpath("utils")
DATA_SET_DESCRIPTIONS_DIRECTORY = DATA_ROADMAP_DIRECTORY.joinpath(
    "data_set_descriptions"
)

# YAML files located directly under the package root.
DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH = DATA_ROADMAP_DIRECTORY.joinpath(
    "data_set_description_schema.yaml"
)
DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH = DATA_ROADMAP_DIRECTORY.joinpath(
    "data_set_description_field_descriptions.yaml"
)
DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH = DATA_ROADMAP_DIRECTORY.joinpath(
    "data_set_description_template.yaml"
)
|
||||
|
||||
|
||||
def load_data_set_description_schema(
    file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH,
) -> yamale.schema.schema.Schema:
    """Build the `yamale` schema for data set descriptions.

    Args:
        file_path: Location of the schema YAML file. Defaults to
            `DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH`.

    Returns:
        The object returned by `yamale.make_schema` for `file_path`.
    """
    return yamale.make_schema(path=file_path)
|
||||
|
||||
|
||||
def load_data_set_description_field_descriptions(
    file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH,
) -> dict:
    """Load from file the descriptions of fields in the data set description.

    Args:
        file_path: Location of the field descriptions YAML file. Defaults to
            `DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH`.

    Returns:
        The parsed content of the file, as produced by `yaml.safe_load`.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, "r", encoding="utf-8") as stream:
        return yaml.safe_load(stream=stream)
|
||||
|
||||
|
||||
def validate_descriptions_for_schema(
    schema: yamale.schema.schema.Schema,
    field_descriptions: dict,
) -> None:
    """Check that the schema and the field descriptions cover the same fields.

    Args:
        schema: The `yamale` schema whose `dict` attribute holds the fields.
        field_descriptions: Mapping of field name to its description.

    Raises:
        ValueError: If a schema field has no description, or if a description
            exists for a field that is not in the schema. Schema fields are
            checked first, so a missing description is reported before an
            extra one.
    """
    schema_fields = schema.dict

    # Every schema field needs a description.
    for schema_field in schema_fields:
        if schema_field in field_descriptions:
            continue
        raise ValueError(
            f"Field `{schema_field}` does not have a "
            f"description. Please add one to file `{DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH}`"
        )

    # Every description must belong to a schema field.
    for described_field in field_descriptions:
        if described_field in schema_fields:
            continue
        raise ValueError(
            f"Field `{described_field}` has a description but is not in the schema."
        )
|
||||
|
||||
|
||||
def validate_all_data_set_descriptions(
    data_set_description_schema: yamale.schema.schema.Schema,
) -> None:
    """Validate every data set description against the provided schema.

    Each `*.yaml` file found in `DATA_SET_DESCRIPTIONS_DIRECTORY` is loaded
    with `yamale.make_data` and passed to `yamale.validate`. Whatever
    `yamale.validate` raises for a file propagates immediately, so later
    files are not checked.

    Args:
        data_set_description_schema: The schema to validate each file against.
    """
    for description_path in DATA_SET_DESCRIPTIONS_DIRECTORY.glob("*.yaml"):
        print(f"Validating {description_path}...")

        # TODO: explore collecting all errors and raising them at once. - Lucas
        yamale.validate(
            schema=data_set_description_schema,
            data=yamale.make_data(description_path),
        )
|
||||
|
||||
|
||||
def write_data_set_description_template_file(
    data_set_description_schema: yamale.schema.schema.Schema,
    data_set_description_field_descriptions: dict,
    template_file_path: str = DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH,
) -> None:
    """Write an example data set description with helpful comments.

    For every field in the schema the template gets an empty `field: ` entry
    followed by comment lines stating its description, whether it is
    required, its type and, for enum fields, the valid choices.

    Args:
        data_set_description_schema: The schema whose fields are written.
        data_set_description_field_descriptions: Mapping of field name to the
            description written next to that field.
        template_file_path: Where to write the template. Defaults to
            `DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH`.

    Raises:
        KeyError: If a schema field has no entry in
            `data_set_description_field_descriptions`.
    """
    # The header is a single list element made of three comment lines.
    template_file_lines = [
        "# Note: This template is automatically generated by the function\n"
        "# `write_data_set_description_template_file` from the schema\n"
        "# and field descriptions files. Do not manually edit this file.\n\n"
    ]

    for field_name, field_schema in data_set_description_schema.dict.items():
        template_file_lines.append(f"{field_name}: \n")
        template_file_lines.append(
            f"# Description: {data_set_description_field_descriptions[field_name]}\n"
        )
        template_file_lines.append(f"# Required field: {field_schema.is_required}\n")
        template_file_lines.append(f"# Field type: {field_schema.get_name()}\n")

        # `isinstance` also covers validators derived from `Enum`.
        if isinstance(field_schema, yamale.validators.validators.Enum):
            template_file_lines.append(
                f"# Valid choices are one of the following: {field_schema.enums}\n"
            )

        # Add an empty linebreak to separate fields.
        template_file_lines.append("\n")

    # Write explicitly as UTF-8 so the output does not depend on the
    # platform's default text encoding.
    with open(template_file_path, "w", encoding="utf-8") as file:
        file.writelines(template_file_lines)
|
||||
|
||||
|
||||
def run_validations_and_write_template() -> None:
    """Validate the schema, descriptions and data sets, then write the template.

    Steps, in order: load the schema and the field descriptions from their
    default locations, check that both cover the same fields, validate all
    data set description files against the schema, and finally regenerate
    the template file.
    """
    schema = load_data_set_description_schema()
    field_descriptions = load_data_set_description_field_descriptions()

    # The schema and the field descriptions must agree on the set of fields.
    validate_descriptions_for_schema(
        schema=schema, field_descriptions=field_descriptions
    )

    # Validate all data set descriptions in the directory against schema.
    validate_all_data_set_descriptions(data_set_description_schema=schema)

    # Write an example template for data set descriptions.
    write_data_set_description_template_file(
        data_set_description_schema=schema,
        data_set_description_field_descriptions=field_descriptions,
    )
|
||||
|
||||
|
||||
# When executed as a script, run all validations and regenerate the template.
if __name__ == "__main__":
    run_validations_and_write_template()
|
|
@ -0,0 +1,248 @@
|
|||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
import yamale
|
||||
from data_roadmap.utils.utils_data_set_description_schema import (
|
||||
load_data_set_description_schema,
|
||||
load_data_set_description_field_descriptions,
|
||||
validate_descriptions_for_schema,
|
||||
validate_all_data_set_descriptions,
|
||||
write_data_set_description_template_file,
|
||||
)
|
||||
|
||||
|
||||
class UtilsDataSetDescriptionSchema(unittest.TestCase):
    """Unit tests for the data set description schema utilities."""

    # Schema shared by the tests below, in `yamale` schema syntax.
    _SCHEMA_CONTENT = (
        "name: str()\n"
        "age: int(max=200)\n"
        "height: num()\n"
        "awesome: bool()\n"
        "field: enum('option 1', 'option 2')\n"
    )

    # One description per field of `_SCHEMA_CONTENT`. Tests copy this dict
    # before changing it.
    _FIELD_DESCRIPTIONS = {
        "name": "The name of the thing.",
        "age": "The age of the thing.",
        "height": "The height of the thing.",
        "awesome": "The awesome of the thing.",
        "field": "The field of the thing.",
    }

    @mock.patch("yamale.make_schema")
    def test_load_data_set_description_schema(self, make_schema_mock):
        """The loader forwards the file path to `yamale.make_schema`."""
        load_data_set_description_schema(file_path="mock.yaml")

        make_schema_mock.assert_called_once_with(path="mock.yaml")

    @mock.patch("yaml.safe_load")
    def test_load_data_set_description_field_descriptions(self, yaml_safe_load_mock):
        """The loader returns whatever `yaml.safe_load` parsed."""
        # Note: this isn't a great test, we could mock the actual YAML to
        # make it better. - Lucas
        mock_dict = dict(self._FIELD_DESCRIPTIONS)
        yaml_safe_load_mock.return_value = mock_dict

        field_descriptions = load_data_set_description_field_descriptions()

        yaml_safe_load_mock.assert_called_once()
        self.assertDictEqual(field_descriptions, mock_dict)

    def test_validate_descriptions_for_schema(self):
        """Schema fields and field descriptions must match exactly."""
        schema = yamale.make_schema(content=self._SCHEMA_CONTENT)

        # Should pass: all descriptions are present.
        validate_descriptions_for_schema(
            schema=schema, field_descriptions=dict(self._FIELD_DESCRIPTIONS)
        )

        field_descriptions_missing_one = {
            name: description
            for name, description in self._FIELD_DESCRIPTIONS.items()
            if name != "field"
        }

        # Should fail because of the missing field description.
        with self.assertRaises(ValueError) as context_manager:
            validate_descriptions_for_schema(
                schema=schema, field_descriptions=field_descriptions_missing_one
            )

        # Using `assertIn` because the file path is returned in the error
        # message, and it varies based on environment.
        self.assertIn(
            "Field `field` does not have a description. Please add one to file",
            str(context_manager.exception),
        )

        field_descriptions_extra_one = {
            **self._FIELD_DESCRIPTIONS,
            "extra": "Extra description.",
        }

        # Should fail because of the extra field description.
        with self.assertRaises(ValueError) as context_manager:
            validate_descriptions_for_schema(
                schema=schema, field_descriptions=field_descriptions_extra_one
            )

        # This message contains no file path, so it can be compared exactly.
        # `assertEqual` is used because the `assertEquals` alias is
        # deprecated and no longer exists in Python 3.12.
        self.assertEqual(
            "Field `extra` has a description but is not in the schema.",
            str(context_manager.exception),
        )

    def test_validate_all_data_set_descriptions(self):
        """Valid data passes; wrong or missing values raise `YamaleError`."""
        # Setup a few examples of `yamale` data *before* we mock the `make_data`
        # function.
        valid_data = yamale.make_data(
            content=(
                "name: Bill\n"
                "age: 26\n"
                "height: 6.2\n"
                "awesome: True\n"
                "field: option 1\n"
            )
        )

        invalid_data_1 = yamale.make_data(
            content=(
                "name: Bill\n"
                "age: asdf\n"
                "height: 6.2\n"
                "awesome: asdf\n"
                "field: option 1\n"
            )
        )

        invalid_data_2 = yamale.make_data(
            content=(
                "age: 26\n"
                "height: 6.2\n"
                "awesome: True\n"
                "field: option 1\n"
            )
        )

        schema = yamale.make_schema(content=self._SCHEMA_CONTENT)

        # Mock `make_data`.
        with mock.patch.object(
            yamale, "make_data", return_value=None
        ) as yamale_make_data_mock:
            # Make the `make_data` method return valid data.
            yamale_make_data_mock.return_value = valid_data

            # Should pass.
            validate_all_data_set_descriptions(data_set_description_schema=schema)

            # Make some of the data invalid.
            yamale_make_data_mock.return_value = invalid_data_1

            # Should fail because of the invalid field values.
            with self.assertRaises(yamale.YamaleError) as context_manager:
                validate_all_data_set_descriptions(data_set_description_schema=schema)

            # Check each reported line separately so the test does not depend
            # on the whitespace `yamale` puts between them.
            message = str(context_manager.exception)
            self.assertIn("Error validating data", message)
            self.assertIn("age: 'asdf' is not a int.", message)
            self.assertIn("awesome: 'asdf' is not a bool.", message)

            # Make some of the data missing.
            yamale_make_data_mock.return_value = invalid_data_2

            # Should fail because of the missing fields.
            with self.assertRaises(yamale.YamaleError) as context_manager:
                validate_all_data_set_descriptions(data_set_description_schema=schema)

            message = str(context_manager.exception)
            self.assertIn("Error validating data", message)
            self.assertIn("name: Required field missing", message)

    @mock.patch("builtins.open", new_callable=mock.mock_open)
    def test_write_data_set_description_template_file(self, open_mock):
        """The template lists every field with its explanatory comments."""
        schema = yamale.make_schema(content=self._SCHEMA_CONTENT)

        write_data_set_description_template_file(
            data_set_description_schema=schema,
            data_set_description_field_descriptions=dict(self._FIELD_DESCRIPTIONS),
            template_file_path="mock_template.yaml",
        )

        # Read the lines from the mocked file handle's `writelines` call
        # instead of relying on a position in `mock_calls`.
        writelines_mock = open_mock.return_value.writelines
        writelines_mock.assert_called_once()
        call_to_writelines = writelines_mock.call_args[0][0]

        self.assertListEqual(
            call_to_writelines,
            [
                "# Note: This template is automatically generated by the function\n"
                "# `write_data_set_description_template_file` from the schema\n"
                "# and field descriptions files. Do not manually edit this file.\n\n",
                "name: \n",
                "# Description: The name of the thing.\n",
                "# Required field: True\n",
                "# Field type: str\n",
                "\n",
                "age: \n",
                "# Description: The age of the thing.\n",
                "# Required field: True\n",
                "# Field type: int\n",
                "\n",
                "height: \n",
                "# Description: The height of the thing.\n",
                "# Required field: True\n",
                "# Field type: num\n",
                "\n",
                "awesome: \n",
                "# Description: The awesome of the thing.\n",
                "# Required field: True\n",
                "# Field type: bool\n",
                "\n",
                "field: \n",
                "# Description: The field of the thing.\n",
                "# Required field: True\n",
                "# Field type: enum\n",
                "# Valid choices are one of the following: ('option 1', 'option 2')\n",
                "\n",
            ],
        )
|
Loading…
Add table
Add a link
Reference in a new issue