From ad6dbf9709cf2bea3d40c346ebb515a9cc8d02ad Mon Sep 17 00:00:00 2001 From: Saran Ahluwalia <94847739+saran-ahluwalia@users.noreply.github.com> Date: Fri, 10 Dec 2021 13:54:46 -0500 Subject: [PATCH] remove data roadmap directory from repository (#1034) Removed data roadmap --- data/data-roadmap/README.md | 153 ----------- data/data-roadmap/__init__.py | 0 ...ta_set_description_field_descriptions.yaml | 39 --- .../data_set_description_schema.yaml | 24 -- .../data_set_description_template.yaml | 94 ------- .../data_set_descriptions/PM25.yaml | 35 --- .../data_set_descriptions/__init__.py | 0 data/data-roadmap/requirements.txt | 1 - data/data-roadmap/setup.py | 21 -- .../utils_data_set_description_schema.py | 151 ----------- .../utils_data_set_description_schema_test.py | 248 ------------------ 11 files changed, 766 deletions(-) delete mode 100644 data/data-roadmap/README.md delete mode 100644 data/data-roadmap/__init__.py delete mode 100644 data/data-roadmap/data_set_description_field_descriptions.yaml delete mode 100644 data/data-roadmap/data_set_description_schema.yaml delete mode 100644 data/data-roadmap/data_set_description_template.yaml delete mode 100644 data/data-roadmap/data_set_descriptions/PM25.yaml delete mode 100644 data/data-roadmap/data_set_descriptions/__init__.py delete mode 100644 data/data-roadmap/requirements.txt delete mode 100644 data/data-roadmap/setup.py delete mode 100644 data/data-roadmap/utils/utils_data_set_description_schema.py delete mode 100644 data/data-roadmap/utils/utils_data_set_description_schema_test.py diff --git a/data/data-roadmap/README.md b/data/data-roadmap/README.md deleted file mode 100644 index 9e8f1808..00000000 --- a/data/data-roadmap/README.md +++ /dev/null @@ -1,153 +0,0 @@ -# Overview - -This document describes our "data roadmap", which serves several purposes. - -# Data roadmap goals - -The goals of the data roadmap are as follows: - -- Tracking data sets being considered for inclusion in the Climate and Economic Justice Screening Tool (CEJST), either as a data set that is included in the cumulative impacts score or a reference data set that is not included in the score - -- Prioritizing data sets, so that it's obvious to developers working on the CEJST which data sets to incorporate next into the tool - -- Gathering important details about each data set, such as its geographic resolution and the year it was last updated, so that the CEJST team can make informed decisions about what data to prioritize - -- Tracking the problem areas that each data set relates to (e.g., a certain data set may relate to the problem of pesticide exposure amongst migrant farm workers) - -- Enabling members of the public to submit ideas for problem areas or data sets to be considered for inclusion in the CEJST, with easy-to-use and accessible tools - -- Enabling members of the public to submit revisions to the information about each problem area or data set, with easy-to-use and accessible tools - -- Enabling the CEJST development team to review suggestions before incorporating them officially into the data roadmap, to filter out potential noise and spam, or consider how requests may lead to changes in software features and documentation - -# User stories - -These goals can map onto several user stories for the data roadmap, such as: - -- As a community member, I want to suggest a new idea for a dataset. -- As a community member, I want to understand what happened with my suggestion for a new dataset. 
-- As a community member, I want to edit the details of a dataset proposal to add more information. -- As a WHEJAC board member, I want to vote on what data sources should be prioritized next. -- As a product manager, I want to filter based on characteristics of the data. -- As a developer, I want to know what to work on next. - -# Data set descriptions - -There are lots of details that are important to track for each data set. This -information helps us prepare to integrate a data set into the tool and prioritize -between different options for data in the data roadmap. - -In order to support a process of peer review on edits and updates, these details are -tracked in one `YAML` file per data set description in the directory -[data_roadmap/data_set_descriptions](data_roadmap/data_set_descriptions). - -Each data set description includes a number of fields, some of which are required. -The schema defining these fields is written in [Yamale](https://github.com/23andMe/Yamale) -and lives at [data_roadmap/data_set_description_schema.yaml](data_roadmap/data_set_description_schema.yaml). - -Because `Yamale` does not provide a method for describing fields, we've created an -additional file that includes written descriptions of the meaning of each field in -the schema. These live in [data_roadmap/data_set_description_field_descriptions.yaml](data_roadmap/data_set_description_field_descriptions.yaml). - -In order to provide a helpful starting point for people who are ready to contribute -ideas for a new data set for consideration, there is an auto-generated data set -description template that lives at [data_roadmap/data_set_description_template.yaml](data_roadmap/data_set_description_template.yaml). - -# Steps to add a new data set description: the "easy" way - -Soon we will create a Google Form that contributors can use to submit ideas for new -data sets. The Google Form will match the schema of the data set descriptions. Please -see [this ticket](https://app.zenhub.com/workspaces/justice40-60993f6e05473d0010ec44e3/issues/usds/justice40-tool/39) -for tracking this work. - -# Steps to add a new data set description: the git-savvy way - -For those who are comfortable using `git` and `Markdown`, these are the steps to -contribute a new data set description to the data roadmap: - -1. Research and learn about the data set you're proposing for consideration. - -2. Clone the repository and learn about the [contribution guidelines for this - project](../docs/CONTRIBUTING.md). - -3. In your local version of the repository, copy the template from - `data_roadmap/data_set_description_template.yaml` into a new file that lives in - `data_roadmap/data_set_descriptions` and has the name of the data set as the name of the file. - -4. Edit this file to ensure it has all of the appropriate details about the data set. - -5. If you'd like, you can run the validations in `run_validations_and_write_template` - to ensure your contribution is valid according to the schema. These checks will also - run automatically on each commit. - -6. Create a pull request with your new data set description and submit it for peer - review. - -Thank you for contributing! - -# Tooling proposal and milestones - -There is no single tool that supports all the goals and user stories described above. -Therefore we've proposed combining a number of tools in a way that can support them all. 
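To make step 5 of the git-savvy contribution steps above more concrete, here is a minimal sketch of the check that `run_validations_and_write_template` performs, using the same `yamale` calls as `utils/utils_data_set_description_schema.py`. The file paths are illustrative and assume the snippet is run from the `data/data-roadmap` directory; it is a sketch, not a drop-in replacement for the utility.

```python
# Minimal sketch: validate one data set description against the schema.
# The paths below are assumptions; adjust them to wherever the YAML files live.
import yamale

SCHEMA_PATH = "data_set_description_schema.yaml"
DESCRIPTION_PATH = "data_set_descriptions/PM25.yaml"

# Build the schema object from the yamale schema file.
schema = yamale.make_schema(path=SCHEMA_PATH)

# Load the data set description to be checked.
data = yamale.make_data(DESCRIPTION_PATH)

# `validate` raises `yamale.YamaleError` when a required field is missing
# or a value does not match its declared type or enum choices.
try:
    yamale.validate(schema, data)
    print(f"{DESCRIPTION_PATH} conforms to the schema.")
except yamale.YamaleError as error:
    print(f"Validation failed:\n{error}")
```

The CI check described below runs the same validation over every file in `data_set_descriptions`, so running this locally is optional but catches schema problems before a pull request is opened.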
- -We've also proposed various "milestones" that will allow us to iteratively and -sequentially build the data roadmap in a way that supports the entire vision but -starts with small and achievable steps. These milestones are proposed in order. - -This work is most accurately tracked in [this epic](https://app.zenhub.com/workspaces/justice40-60993f6e05473d0010ec44e3/issues/usds/justice40-tool/38). -We've also verbally described them below. - -## Milestone: YAML files for data sets and linter (Done) - -To start, we'll create a folder in this repository that can -house YAML files, one per data set. Each file will describe the characteristics of the data. - -The benefit of using a YAML file for this is that it's easy to subject changes to these files to peer review through the pull request process. This allows external collaborators from the open source community to submit suggested changes, which can be reviewed by the core CEJST team. - -We'll use a Python-based script to load all the files in the directory, and then run a schema validator to ensure all the files have valid entries. - -For schema validation, we propose using [Yamale](https://github.com/23andMe/Yamale). This provides a lightweight schema and validator, and [integrates nicely with GitHub actions](https://github.com/nrkno/yaml-schema-validator-github-action). - -If there's an improper format in any of the files, the schema validator will throw an error. - -As part of this milestone, we will also set this up to run automatically with each commit to any branch as part of CI/CD. - -## Milestone: Google forms integration - -To make it easy for non-engineer members of the public and advisory bodies such as the WHEJAC to submit suggestions for data sets, we will configure a Google Form that maps to the schema of the data set files. - -This will enable members of the public to fill out a simple form suggesting data without needing to understand Github or other engineering concepts. - -At first, these responses can just go into a resulting Google Sheet and be manually copied and converted into data set description files. Later, we can write a script that converts new entries in the Google Sheet automatically into data set files. This can be setup to run as a trigger on the addition of new rows to the Google Sheet. - -## Milestone: Post data in tabular format - -Add a script that runs the schema validator on all files and, if successful, posts the results in a tabular format. There are straightforward packages to post a Python dictionary / `pandas` dataframe to Google Sheets and/or Airtable. As part of this milestone, we will also set this up to run automatically with each commit to `main` as part of CI/CD. - -This will make it easier to filter the data to answer questions like, "which data sources are available at the census block group level". - -## Milestone: Tickets created for incorporating data sets - -For each data set that is being considered for inclusion soon in the tool, the project management team will create a ticket for "Incorporating \_\_\_ data set into the database", with a link to the data set detail document. This ticket will be created in the ticket tracking system used by the open source repository, which is ZenHub. This project management system will be public. 
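As a rough illustration of the "Post data in tabular format" milestone above, the sketch below loads every data set description into a `pandas` DataFrame so it can be filtered or pushed to a sheet. The directory path and the `spatial_resolution` value come from the files in this folder, but the snippet is an illustrative sketch under those assumptions rather than part of the planned tooling.

```python
# Illustrative sketch: collect all data set descriptions into one table.
# Assumes it is run from the `data/data-roadmap` directory.
import pathlib

import pandas as pd
import yaml

DESCRIPTIONS_DIRECTORY = pathlib.Path("data_set_descriptions")

records = []
for file_path in sorted(DESCRIPTIONS_DIRECTORY.glob("*.yaml")):
    # Each YAML file holds one data set description keyed by schema field names.
    with open(file_path, "r") as stream:
        record = yaml.safe_load(stream)
    record["source_file"] = file_path.name
    records.append(record)

data_sets = pd.DataFrame(records)

# Example question from the milestone: which data sources are available
# at the census block group level?
block_group_data_sets = data_sets[
    data_sets["spatial_resolution"] == "Census block group"
]
print(block_group_data_sets[["name", "source", "spatial_resolution"]])
```

Posting the resulting DataFrame to Google Sheets or Airtable would then be a separate step on top of this table, as described in the milestone.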
- -At the initial launch, we are not planning for members of the open source community to be able to create tickets, but we would like to consider a process for members of the open source community creating tickets that can go through review by the CEJST team. - -This will help developers know what to work on next, and open source community members can also pick up tickets and work to integrate the data sets. - -## Milestone: Add problem areas - -We'll need to somehow track "problem areas" that describe problems in climate, environmental, and economic justice, even without specific proposals of data sets. For instance, a problem area may be "food insecurity", and a number of data sets can have this as their problem area. - -We can change the linter to validate that every data set description maps to one or more known problem areas. - -The benefit of this is that some non-data-focused members of the public or the WHEJAC advisory body may want to suggest we prioritize certain problem areas, with or without ideas for specific data sets that may best address that problem area. - -It is not clear at this time the best path forward for implementing these problem area descriptions. One option is to create a folder for descriptions of problem areas, which contains YAML files that get validated according to a schema. Another option would be simply to add these as an array in the description of data sets, or add labels to the tickets once data sets are tracked in GitHub tickets. - -## Milestone: Add prioritzation voting for WHEJAC and members of the public - -This milestone is currently the least well-defined. It's important that members of advisory bodies like the WHEJAC and members of the public be able to "upvote" certain data sets for inclusion in the tool. - -One potential for this is to use the [Stanford Participatory Budgeting Platform](https://pbstanford.org/). Here's an [example of voting on proposals within a limited budget](https://pbstanford.org/nyc8/knapsack). - -For instance, going into a quarterly planning cycle, the CEJST development team could estimate the amount of time (in developer-weeks) that it would take to clean, analyze, and incorporate each potential data set. For instance, incorporating some already-cleaned census data may take 1 week of a developer's time, while incorporating new asthma data from CMS that's never been publicly released could take 5 weeks. Given a "budget" of the number of developer weeks available (e.g., 2 developers for 13 weeks, or 26 developer-weeks), advisors can vote on their top priorities for inclusion in the tool within the available "budget". diff --git a/data/data-roadmap/__init__.py b/data/data-roadmap/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/data/data-roadmap/data_set_description_field_descriptions.yaml b/data/data-roadmap/data_set_description_field_descriptions.yaml deleted file mode 100644 index f2f3a552..00000000 --- a/data/data-roadmap/data_set_description_field_descriptions.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# There is no method for adding field descriptions to `yamale` schemas. -# Therefore, we've created a dictionary here of fields and their descriptions. -name: A short name of the data set. -source: The URL pointing towards the data set itself or more information about the - data set. -relevance_to_environmental_justice: It's useful to spell out why this data is - relevant to EJ issues and/or can be used to identify EJ communities. 
-spatial_resolution: Dev team needs to know if the resolution is granular enough to be useful -public_status: Whether a dataset has already gone through public release process - (like Census data) or may need a lengthy review process (like Medicaid data). -sponsor: Whether there's a federal agency or non-governmental agency that is working - to provide and maintain this data. -subjective_rating_of_data_quality: Sometimes we don't have statistics on data - quality, but we know it is likely to be accurate or not. How much has it been - vetted by an agency; is this the de facto data set for the topic? -estimated_margin_of_error: Estimated margin of error on measurement, if known. Often - more narrow geographic measures have a higher margin of error due to a smaller sample - for each measurement. -known_data_quality_issues: It can be helpful to write out known problems. -geographic_coverage_percent: We want to think about data that is comprehensive across - America. -geographic_coverage_description: A verbal description of geographic coverage. -data_formats: Developers need to know what formats the data is available in -last_updated_date: When was the data last updated / refreshed? (In format YYYY-MM-DD. - If exact date is not known, use YYYY-01-01.) -frequency_of_updates: How often is this data updated? Is it updated on a reliable - cadence? -documentation: Link to docs. Also, is the documentation good enough? Can we get the - info we need? -data_can_go_in_cloud: Some datasets can not legally go in the cloud - -discussion: Review of other topics, such as - peer review (Overview or links out to peer review done on this dataset), - where and how data is available (e.g., Geoplatform.gov? Is it available from multiple - sources?), - risk assessment of the data (e.g. a vendor-processed version of the dataset might not - be open or good enough), - legal considerations (Legal disclaimers, assumption of risk, proprietary?), - accreditation (Is this source accredited?) diff --git a/data/data-roadmap/data_set_description_schema.yaml b/data/data-roadmap/data_set_description_schema.yaml deleted file mode 100644 index 3e12ad02..00000000 --- a/data/data-roadmap/data_set_description_schema.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# `yamale` schema for descriptions of data sets. 
-name: str(required=True) -source: str(required=True) -relevance_to_environmental_justice: str(required=False) -data_formats: enum('GeoJSON', 'Esri Shapefile (SHP, DBF, SHX)', 'GML', 'KML/KMZ', - 'GPX', 'CSV/XLSX', 'GDB', 'MBTILES', 'LAS', required=True) -spatial_resolution: enum('State/territory', 'County', 'Zip code', 'Census tract', - 'Census block group', 'Exact address or lat/long', 'Other', required=True) -public_status: enum('Not Released', 'Public', 'Public for certain audiences', 'Other', - required=True) -sponsor: str(required=True) -subjective_rating_of_data_quality: enum('Low Quality', 'Medium Quality', 'High - Quality', required=False) -estimated_margin_of_error: num(required=False) -known_data_quality_issues: str(required=False) -geographic_coverage_percent: num(required=False) -geographic_coverage_description: str(required=False) -last_updated_date: day(min='2001-01-01', max='2100-01-01', required=True) -frequency_of_updates: enum('Less than annually', 'Approximately annually', - 'Once very 1-6 months', - 'Daily or more frequently than daily', 'Unknown', required=True) -documentation: str(required=False) -data_can_go_in_cloud: bool(required=False) -discussion: str(required=False) diff --git a/data/data-roadmap/data_set_description_template.yaml b/data/data-roadmap/data_set_description_template.yaml deleted file mode 100644 index 7475ad04..00000000 --- a/data/data-roadmap/data_set_description_template.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# Note: This template is automatically generated by the function -# `write_data_set_description_template_file` from the schema -# and field descriptions files. Do not manually edit this file. - -name: -# Description: A short name of the data set. -# Required field: True -# Field type: str - -source: -# Description: The URL pointing towards the data set itself or more information about the data set. -# Required field: True -# Field type: str - -relevance_to_environmental_justice: -# Description: It's useful to spell out why this data is relevant to EJ issues and/or can be used to identify EJ communities. -# Required field: False -# Field type: str - -data_formats: -# Description: Developers need to know what formats the data is available in -# Required field: True -# Field type: enum -# Valid choices are one of the following: ('GeoJSON', 'Esri Shapefile (SHP, DBF, SHX)', 'GML', 'KML/KMZ', 'GPX', 'CSV/XLSX', 'GDB', 'MBTILES', 'LAS') - -spatial_resolution: -# Description: Dev team needs to know if the resolution is granular enough to be useful -# Required field: True -# Field type: enum -# Valid choices are one of the following: ('State/territory', 'County', 'Zip code', 'Census tract', 'Census block group', 'Exact address or lat/long', 'Other') - -public_status: -# Description: Whether a dataset has already gone through public release process (like Census data) or may need a lengthy review process (like Medicaid data). -# Required field: True -# Field type: enum -# Valid choices are one of the following: ('Not Released', 'Public', 'Public for certain audiences', 'Other') - -sponsor: -# Description: Whether there's a federal agency or non-governmental agency that is working to provide and maintain this data. -# Required field: True -# Field type: str - -subjective_rating_of_data_quality: -# Description: Sometimes we don't have statistics on data quality, but we know it is likely to be accurate or not. How much has it been vetted by an agency; is this the de facto data set for the topic? 
-# Required field: False -# Field type: enum -# Valid choices are one of the following: ('Low Quality', 'Medium Quality', 'High Quality') - -estimated_margin_of_error: -# Description: Estimated margin of error on measurement, if known. Often more narrow geographic measures have a higher margin of error due to a smaller sample for each measurement. -# Required field: False -# Field type: num - -known_data_quality_issues: -# Description: It can be helpful to write out known problems. -# Required field: False -# Field type: str - -geographic_coverage_percent: -# Description: We want to think about data that is comprehensive across America. -# Required field: False -# Field type: num - -geographic_coverage_description: -# Description: A verbal description of geographic coverage. -# Required field: False -# Field type: str - -last_updated_date: -# Description: When was the data last updated / refreshed? (In format YYYY-MM-DD. If exact date is not known, use YYYY-01-01.) -# Required field: True -# Field type: day - -frequency_of_updates: -# Description: How often is this data updated? Is it updated on a reliable cadence? -# Required field: True -# Field type: enum -# Valid choices are one of the following: ('Less than annually', 'Approximately annually', 'Once very 1-6 months', 'Daily or more frequently than daily', 'Unknown') - -documentation: -# Description: Link to docs. Also, is the documentation good enough? Can we get the info we need? -# Required field: False -# Field type: str - -data_can_go_in_cloud: -# Description: Some datasets can not legally go in the cloud -# Required field: False -# Field type: bool - -discussion: -# Description: Review of other topics, such as peer review (Overview or links out to peer review done on this dataset), where and how data is available (e.g., Geoplatform.gov? Is it available from multiple sources?), risk assessment of the data (e.g. a vendor-processed version of the dataset might not be open or good enough), legal considerations (Legal disclaimers, assumption of risk, proprietary?), accreditation (Is this source accredited?) -# Required field: False -# Field type: str - diff --git a/data/data-roadmap/data_set_descriptions/PM25.yaml b/data/data-roadmap/data_set_descriptions/PM25.yaml deleted file mode 100644 index 459e7ec0..00000000 --- a/data/data-roadmap/data_set_descriptions/PM25.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: Particulate Matter 2.5 - -source: https://gaftp.epa.gov/EJSCREEN/ - -relevance_to_environmental_justice: Particulate matter has a lot of adverse impacts - on health. - -data_formats: CSV/XLSX - -spatial_resolution: Census block group - -public_status: Public - -sponsor: EPA - -subjective_rating_of_data_quality: Medium Quality - -estimated_margin_of_error: - -known_data_quality_issues: Many PM 2.5 stations are known to be pretty far apart, so - averaging them can lead to data quality loss. 
- -geographic_coverage_percent: - -geographic_coverage_description: - -last_updated_date: 2017-01-01 - -frequency_of_updates: Less than annually - -documentation: https://www.epa.gov/sites/production/files/2015-05/documents/ejscreen_technical_document_20150505.pdf#page=13 - -data_can_go_in_cloud: True - -discussion: diff --git a/data/data-roadmap/data_set_descriptions/__init__.py b/data/data-roadmap/data_set_descriptions/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/data/data-roadmap/requirements.txt b/data/data-roadmap/requirements.txt deleted file mode 100644 index 11814a4e..00000000 --- a/data/data-roadmap/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -yamale==3.0.8 diff --git a/data/data-roadmap/setup.py b/data/data-roadmap/setup.py deleted file mode 100644 index 01c01fc2..00000000 --- a/data/data-roadmap/setup.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Setup script for `data_roadmap` package.""" -import os - -from setuptools import find_packages -from setuptools import setup - -# TODO: replace this with `poetry`. https://github.com/usds/justice40-tool/issues/57 -_PACKAGE_DIRECTORY = os.path.abspath(os.path.dirname(__file__)) - -with open(os.path.join(_PACKAGE_DIRECTORY, "requirements.txt")) as f: - requirements = f.readlines() - -setup( - name="data_roadmap", - description="Data roadmap package", - author="CEJST Development Team", - author_email="justice40open@usds.gov", - install_requires=requirements, - include_package_data=True, - packages=find_packages(), -) diff --git a/data/data-roadmap/utils/utils_data_set_description_schema.py b/data/data-roadmap/utils/utils_data_set_description_schema.py deleted file mode 100644 index f6e76068..00000000 --- a/data/data-roadmap/utils/utils_data_set_description_schema.py +++ /dev/null @@ -1,151 +0,0 @@ -import importlib_resources -import pathlib -import yamale -import yaml - -# Set directories. -DATA_ROADMAP_DIRECTORY = importlib_resources.files("data_roadmap") -UTILS_DIRECTORY = DATA_ROADMAP_DIRECTORY / "utils" -DATA_SET_DESCRIPTIONS_DIRECTORY = DATA_ROADMAP_DIRECTORY / "data_set_descriptions" - -# Set file paths. -DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH = ( - DATA_ROADMAP_DIRECTORY / "data_set_description_schema.yaml" -) -DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH = ( - DATA_ROADMAP_DIRECTORY / "data_set_description_field_descriptions.yaml" -) -DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH = ( - DATA_ROADMAP_DIRECTORY / "data_set_description_template.yaml" -) - - -def load_data_set_description_schema( - file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_SCHEMA_FILE_PATH, -) -> yamale.schema.schema.Schema: - """Load from file the data set description schema.""" - schema = yamale.make_schema(path=file_path) - - return schema - - -def load_data_set_description_field_descriptions( - file_path: pathlib.PosixPath = DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH, -) -> dict: - """Load from file the descriptions of fields in the data set description.""" - # Load field descriptions. - with open(file_path, "r") as stream: - data_set_description_field_descriptions = yaml.safe_load(stream=stream) - - return data_set_description_field_descriptions - - -def validate_descriptions_for_schema( - schema: yamale.schema.schema.Schema, - field_descriptions: dict, -) -> None: - """Validate descriptions for schema. - - Checks that every field in the `yamale` schema also has a field - description in the `field_descriptions` dict. 
- """ - for field_name in schema.dict.keys(): - if field_name not in field_descriptions: - raise ValueError( - f"Field `{field_name}` does not have a " - f"description. Please add one to file `{DATA_SET_DESCRIPTION_FIELD_DESCRIPTIONS_FILE_PATH}`" - ) - - for field_name in field_descriptions.keys(): - if field_name not in schema.dict.keys(): - raise ValueError( - f"Field `{field_name}` has a description but is not in the " f"schema." - ) - - -def validate_all_data_set_descriptions( - data_set_description_schema: yamale.schema.schema.Schema, -) -> None: - """Validate data set descriptions. - - Validate each file in the `data_set_descriptions` directory the schema - against the provided schema. - - """ - data_set_description_file_paths_generator = DATA_SET_DESCRIPTIONS_DIRECTORY.glob( - "*.yaml" - ) - - # Validate each file - for file_path in data_set_description_file_paths_generator: - print(f"Validating {file_path}...") - - # Create a yamale Data object - data_set_description = yamale.make_data(file_path) - - # TODO: explore collecting all errors and raising them at once. - Lucas - yamale.validate(schema=data_set_description_schema, data=data_set_description) - - -def write_data_set_description_template_file( - data_set_description_schema: yamale.schema.schema.Schema, - data_set_description_field_descriptions: dict, - template_file_path: str = DATA_SET_DESCRIPTION_TEMPLATE_FILE_PATH, -) -> None: - """Write an example data set description with helpful comments.""" - template_file_lines = [] - - # Write comments at the top of the template - template_file_lines.append( - "# Note: This template is automatically generated by the function\n" - "# `write_data_set_description_template_file` from the schema\n" - "# and field descriptions files. Do not manually edit this file.\n\n" - ) - - schema_dict = data_set_description_schema.dict - for field_name, field_schema in schema_dict.items(): - template_file_lines.append(f"{field_name}: \n") - template_file_lines.append( - f"# Description: {data_set_description_field_descriptions[field_name]}\n" - ) - template_file_lines.append(f"# Required field: {field_schema.is_required}\n") - template_file_lines.append(f"# Field type: {field_schema.get_name()}\n") - if type(field_schema) is yamale.validators.validators.Enum: - template_file_lines.append( - f"# Valid choices are one of the following: {field_schema.enums}\n" - ) - - # Add an empty linebreak to separate fields. - template_file_lines.append("\n") - - with open(template_file_path, "w") as file: - file.writelines(template_file_lines) - - -def run_validations_and_write_template() -> None: - """Run validations of schema and descriptions, and write a template file.""" - # Load the schema and a separate dictionary - data_set_description_schema = load_data_set_description_schema() - data_set_description_field_descriptions = ( - load_data_set_description_field_descriptions() - ) - - validate_descriptions_for_schema( - schema=data_set_description_schema, - field_descriptions=data_set_description_field_descriptions, - ) - - # Validate all data set descriptions in the directory against schema. - validate_all_data_set_descriptions( - data_set_description_schema=data_set_description_schema - ) - - # Write an example template for data set descriptions. 
- write_data_set_description_template_file( - data_set_description_schema=data_set_description_schema, - data_set_description_field_descriptions=data_set_description_field_descriptions, - ) - - -if __name__ == "__main__": - run_validations_and_write_template() diff --git a/data/data-roadmap/utils/utils_data_set_description_schema_test.py b/data/data-roadmap/utils/utils_data_set_description_schema_test.py deleted file mode 100644 index 7d738bc2..00000000 --- a/data/data-roadmap/utils/utils_data_set_description_schema_test.py +++ /dev/null @@ -1,248 +0,0 @@ -import unittest -from unittest import mock - -import yamale -from data_roadmap.utils.utils_data_set_description_schema import ( - load_data_set_description_schema, - load_data_set_description_field_descriptions, - validate_descriptions_for_schema, - validate_all_data_set_descriptions, - write_data_set_description_template_file, -) - - -class UtilsDataSetDescriptionSchema(unittest.TestCase): - @mock.patch("yamale.make_schema") - def test_load_data_set_description_schema(self, make_schema_mock): - load_data_set_description_schema(file_path="mock.yaml") - - make_schema_mock.assert_called_once_with(path="mock.yaml") - - @mock.patch("yaml.safe_load") - def test_load_data_set_description_field_descriptions(self, yaml_safe_load_mock): - # Note: this isn't a great test, we could mock the actual YAML to - # make it better. - Lucas - mock_dict = { - "name": "The name of the thing.", - "age": "The age of the thing.", - "height": "The height of the thing.", - "awesome": "The awesome of the thing.", - "field": "The field of the thing.", - } - - yaml_safe_load_mock.return_value = mock_dict - - field_descriptions = load_data_set_description_field_descriptions() - - yaml_safe_load_mock.assert_called_once() - - self.assertDictEqual(field_descriptions, mock_dict) - - def test_validate_descriptions_for_schema(self): - # Test when all descriptions are present. - field_descriptions = { - "name": "The name of the thing.", - "age": "The age of the thing.", - "height": "The height of the thing.", - "awesome": "The awesome of the thing.", - "field": "The field of the thing.", - } - - schema = yamale.make_schema( - content=""" -name: str() -age: int(max=200) -height: num() -awesome: bool() -field: enum('option 1', 'option 2') -""" - ) - - # Should pass. - validate_descriptions_for_schema( - schema=schema, field_descriptions=field_descriptions - ) - - field_descriptions_missing_one = { - "name": "The name of the thing.", - "age": "The age of the thing.", - "height": "The height of the thing.", - "awesome": "The awesome of the thing.", - } - - # Should fail because of the missing field description. - with self.assertRaises(ValueError) as context_manager: - validate_descriptions_for_schema( - schema=schema, field_descriptions=field_descriptions_missing_one - ) - - # Using `assertIn` because the file path is returned in the error - # message, and it varies based on environment. - self.assertIn( - "Field `field` does not have a description. Please add one to file", - str(context_manager.exception), - ) - - field_descriptions_extra_one = { - "name": "The name of the thing.", - "age": "The age of the thing.", - "height": "The height of the thing.", - "awesome": "The awesome of the thing.", - "field": "The field of the thing.", - "extra": "Extra description.", - } - - # Should fail because of the extra field description. 
- with self.assertRaises(ValueError) as context_manager: - validate_descriptions_for_schema( - schema=schema, field_descriptions=field_descriptions_extra_one - ) - - # Using `assertIn` because the file path is returned in the error - # message, and it varies based on environment. - self.assertEquals( - "Field `extra` has a description but is not in the schema.", - str(context_manager.exception), - ) - - def test_validate_all_data_set_descriptions(self): - # Setup a few examples of `yamale` data *before* we mock the `make_data` - # function. - valid_data = yamale.make_data( - content=""" - name: Bill - age: 26 - height: 6.2 - awesome: True - field: option 1 - """ - ) - - invalid_data_1 = yamale.make_data( - content=""" - name: Bill - age: asdf - height: 6.2 - awesome: asdf - field: option 1 - """ - ) - - invalid_data_2 = yamale.make_data( - content=""" - age: 26 - height: 6.2 - awesome: True - field: option 1 - """ - ) - - # Mock `make_data`. - with mock.patch.object( - yamale, "make_data", return_value=None - ) as yamale_make_data_mock: - schema = yamale.make_schema( - content=""" - name: str() - age: int(max=200) - height: num() - awesome: bool() - field: enum('option 1', 'option 2') - """ - ) - - # Make the `make_data` method return valid data. - yamale_make_data_mock.return_value = valid_data - - # Should pass. - validate_all_data_set_descriptions(data_set_description_schema=schema) - - # Make some of the data invalid. - yamale_make_data_mock.return_value = invalid_data_1 - - # Should fail because of the invalid field values. - with self.assertRaises(yamale.YamaleError) as context_manager: - validate_all_data_set_descriptions(data_set_description_schema=schema) - - self.assertEqual( - str(context_manager.exception), - """Error validating data - age: 'asdf' is not a int. - awesome: 'asdf' is not a bool.""", - ) - - # Make some of the data missing. - yamale_make_data_mock.return_value = invalid_data_2 - - # Should fail because of the missing fields. - with self.assertRaises(yamale.YamaleError) as context_manager: - validate_all_data_set_descriptions(data_set_description_schema=schema) - - self.assertEqual( - str(context_manager.exception), - """Error validating data - name: Required field missing""", - ) - - @mock.patch("builtins.open", new_callable=mock.mock_open) - def test_write_data_set_description_template_file(self, builtins_writelines_mock): - schema = yamale.make_schema( - content=""" - name: str() - age: int(max=200) - height: num() - awesome: bool() - field: enum('option 1', 'option 2') - """ - ) - - data_set_description_field_descriptions = { - "name": "The name of the thing.", - "age": "The age of the thing.", - "height": "The height of the thing.", - "awesome": "The awesome of the thing.", - "field": "The field of the thing.", - } - - write_data_set_description_template_file( - data_set_description_schema=schema, - data_set_description_field_descriptions=data_set_description_field_descriptions, - template_file_path="mock_template.yaml", - ) - - call_to_writelines = builtins_writelines_mock.mock_calls[2][1][0] - - self.assertListEqual( - call_to_writelines, - [ - "# Note: This template is automatically generated by the function\n" - "# `write_data_set_description_template_file` from the schema\n" - "# and field descriptions files. 
Do not manually edit this file.\n\n", - "name: \n", - "# Description: The name of the thing.\n", - "# Required field: True\n", - "# Field type: str\n", - "\n", - "age: \n", - "# Description: The age of the thing.\n", - "# Required field: True\n", - "# Field type: int\n", - "\n", - "height: \n", - "# Description: The height of the thing.\n", - "# Required field: True\n", - "# Field type: num\n", - "\n", - "awesome: \n", - "# Description: The awesome of the thing.\n", - "# Required field: True\n", - "# Field type: bool\n", - "\n", - "field: \n", - "# Description: The field of the thing.\n", - "# Required field: True\n", - "# Field type: enum\n", - "# Valid choices are one of the following: ('option 1', 'option 2')\n", - "\n", - ], - )