"""Utilities for validating data set descriptions against a `yamale` schema
and for generating a commented data set description template from that schema."""

import importlib_resources
import pathlib

import yamale
import yaml

# Set directories.
DATA_ROADMAP_DIRECTORY = importlib_resources.files("data_roadmap")
UTILS_DIRECTORY = DATA_ROADMAP_DIRECTORY / "utils"
DATA_SET_DESCRIPTIONS_DIRECTORY = DATA_ROADMAP_DIRECTORY / "data_set_descriptions"

# Set file paths.
DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH = (
    DATA_ROADMAP_DIRECTORY / "data_set_description_schema.yaml"
)
DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH = (
    DATA_ROADMAP_DIRECTORY / "data_set_description_field_descriptions.yaml"
)
DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH = (
    DATA_ROADMAP_DIRECTORY / "data_set_description_template.yaml"
)


def load_data_set_description_schema(
    file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH,
) -> yamale.schema.schema.Schema:
    """Load from file the data set description schema."""
    schema = yamale.make_schema(path=file_path)

    return schema


def load_data_set_description_field_descriptions(
    file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH,
) -> dict:
    """Load from file the descriptions of fields in the data set description."""
    # Load field descriptions.
    with open(file_path, "r") as stream:
        data_set_description_field_descriptions = yaml.safe_load(stream=stream)

    return data_set_description_field_descriptions


def validate_descriptions_for_schema(
    schema: yamale.schema.schema.Schema,
    field_descriptions: dict,
) -> None:
    """Validate descriptions for schema.

    Checks that every field in the `yamale` schema also has a field
    description in the `field_descriptions` dict, and vice versa.
    """
    for field_name in schema.dict.keys():
        if field_name not in field_descriptions:
            raise ValueError(
                f"Field `{field_name}` does not have a description. Please add one "
                f"to file `{DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH}`."
            )

    for field_name in field_descriptions.keys():
        if field_name not in schema.dict.keys():
            raise ValueError(
                f"Field `{field_name}` has a description but is not in the schema."
            )


def validate_all_data_set_descriptions(
    data_set_description_schema: yamale.schema.schema.Schema,
) -> None:
    """Validate data set descriptions.

    Validate each file in the `data_set_descriptions` directory against the
    provided schema.
    """
    data_set_description_file_paths_generator = DATA_SET_DESCRIPTIONS_DIRECTORY.glob(
        "*.yaml"
    )

    # Validate each file.
    for file_path in data_set_description_file_paths_generator:
        print(f"Validating {file_path}...")

        # Create a yamale Data object.
        data_set_description = yamale.make_data(file_path)

        # TODO: explore collecting all errors and raising them at once. - Lucas
        yamale.validate(schema=data_set_description_schema, data=data_set_description)


def write_data_set_description_template_file(
    data_set_description_schema: yamale.schema.schema.Schema,
    data_set_description_field_descriptions: dict,
    template_file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH,
) -> None:
    """Write an example data set description with helpful comments."""
    template_file_lines = []

    # Write comments at the top of the template.
    template_file_lines.append(
        "# Note: This template is automatically generated by the function\n"
        "# `write_data_set_description_template_file` from the schema\n"
        "# and field descriptions files. Do not manually edit this file.\n\n"
    )

    schema_dict = data_set_description_schema.dict

    for field_name, field_schema in schema_dict.items():
        template_file_lines.append(f"{field_name}: \n")
        template_file_lines.append(
            f"# Description: {data_set_description_field_descriptions[field_name]}\n"
        )
        template_file_lines.append(f"# Required field: {field_schema.is_required}\n")
        template_file_lines.append(f"# Field type: {field_schema.get_name()}\n")

        if type(field_schema) is yamale.validators.validators.Enum:
            template_file_lines.append(
                f"# Valid choices are one of the following: {field_schema.enums}\n"
            )

        # Add a blank line to separate fields.
        template_file_lines.append("\n")

    with open(template_file_path, "w") as file:
        file.writelines(template_file_lines)


def run_validations_and_write_template() -> None:
    """Run validations of schema and descriptions, and write a template file."""
    # Load the schema and the separate dictionary of field descriptions.
    data_set_description_schema = load_data_set_description_schema()
    data_set_description_field_descriptions = (
        load_data_set_description_field_descriptions()
    )

    # Check that every schema field has a description and vice versa.
    validate_descriptions_for_schema(
        schema=data_set_description_schema,
        field_descriptions=data_set_description_field_descriptions,
    )

    # Validate all data set descriptions in the directory against the schema.
    validate_all_data_set_descriptions(
        data_set_description_schema=data_set_description_schema
    )

    # Write an example template for data set descriptions.
    write_data_set_description_template_file(
        data_set_description_schema=data_set_description_schema,
        data_set_description_field_descriptions=data_set_description_field_descriptions,
    )


if __name__ == "__main__":
    run_validations_and_write_template()