diff --git a/data/data-pipeline/data_pipeline/content/config/excel.yml b/data/data-pipeline/data_pipeline/content/config/excel.yml index e9f77ba1..81f7ae61 100644 --- a/data/data-pipeline/data_pipeline/content/config/excel.yml +++ b/data/data-pipeline/data_pipeline/content/config/excel.yml @@ -7,8 +7,7 @@ global_config: excel_config: default_column_width: 30 sheets: - - main: - label: "Data" + - label: "Data" fields: - score_name: GEOID10_TRACT label: Census tract ID diff --git a/data/data-pipeline/data_pipeline/content/schemas/csv.py b/data/data-pipeline/data_pipeline/content/schemas/csv.py deleted file mode 100644 index a37150db..00000000 --- a/data/data-pipeline/data_pipeline/content/schemas/csv.py +++ /dev/null @@ -1,34 +0,0 @@ -from dataclasses import dataclass, field -from enum import Enum -from typing import List - - -class FieldType(Enum): - STRING = "string" - INT64 = "int64" - BOOL = "bool" - FLOAT = "float" - PERCENTAGE = "percentage" - LOSS_RATE_PERCENTAGE = "loss_rate_percentage" - - -@dataclass -class CSVConfig: - @dataclass - class GlobalConfig: - @dataclass - class RoundingNum: - float: int - loss_rate_percentage: int - - sort_by_label: str - rounding_num: RoundingNum - - @dataclass - class Field: - score_name: str - label: str - format: FieldType = field(metadata={"by_value": True}) - - global_config: GlobalConfig - fields: List[Field] diff --git a/data/data-pipeline/data_pipeline/content/schemas/download_schemas.py b/data/data-pipeline/data_pipeline/content/schemas/download_schemas.py new file mode 100644 index 00000000..820ab2d9 --- /dev/null +++ b/data/data-pipeline/data_pipeline/content/schemas/download_schemas.py @@ -0,0 +1,57 @@ +from dataclasses import dataclass, field +from enum import Enum +from typing import List + + +class FieldType(Enum): + STRING = "string" + INT64 = "int64" + BOOL = "bool" + FLOAT = "float" + PERCENTAGE = "percentage" + LOSS_RATE_PERCENTAGE = "loss_rate_percentage" + + +@dataclass +class RoundingNum: + float: int + loss_rate_percentage: int + + +@dataclass +class Field: + score_name: str + label: str + format: FieldType = field(metadata={"by_value": True}) + + +@dataclass +class CSVConfig: + @dataclass + class GlobalConfig: + sort_by_label: str + rounding_num: RoundingNum + + global_config: GlobalConfig + fields: List[Field] + + +@dataclass +class ExcelConfig: + @dataclass + class GlobalConfig: + @dataclass + class ExcelGlobalConfig: + default_column_width: int + + sort_by_label: str + rounding_num: RoundingNum + excel_config: ExcelGlobalConfig + + @dataclass + class SheetItem: + label: str + fields: List[Field] + + global_config: GlobalConfig + sheets: List[SheetItem] diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 2ddfbb37..aa45dc78 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -3,6 +3,10 @@ import json from numpy import float64 import numpy as np import pandas as pd +from data_pipeline.content.schemas.download_schemas import ( + CSVConfig, + ExcelConfig, +) from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.score.etl_utils import floor_series @@ -452,7 +456,7 @@ class PostScoreETL(ExtractTransformLoad): # open excel yaml config excel_csv_config = load_yaml_dict_from_file( - self.CONTENT_CONFIG / "excel.yml" + self.CONTENT_CONFIG / "excel.yml", ExcelConfig ) # Define Excel Columns Column Width @@ -525,7 +529,7 @@ class PostScoreETL(ExtractTransformLoad): logger.info("Writing downloadable csv") # open yaml config downloadable_csv_config = load_yaml_dict_from_file( - self.CONTENT_CONFIG / "csv.yml" + self.CONTENT_CONFIG / "csv.yml", CSVConfig ) downloadable_df = self._create_downloadable_data( score_df=self.output_score_county_state_merged_df, diff --git a/data/data-pipeline/data_pipeline/utils.py b/data/data-pipeline/data_pipeline/utils.py index 0e7bcd02..a25665e0 100644 --- a/data/data-pipeline/data_pipeline/utils.py +++ b/data/data-pipeline/data_pipeline/utils.py @@ -1,4 +1,4 @@ -from typing import List, Type +from typing import List, Union import datetime import json import logging @@ -14,7 +14,10 @@ import yaml from marshmallow_dataclass import class_schema from data_pipeline.config import settings -from data_pipeline.content.schemas.csv import CSVConfig +from data_pipeline.content.schemas.download_schemas import ( + CSVConfig, + ExcelConfig, +) ## zlib is not available on all systems @@ -330,7 +333,7 @@ def zip_directory( def load_yaml_dict_from_file( - yaml_file_path: Path, yaml_schema: Type[CSVConfig] + yaml_file_path: Path, schema_class: Union[CSVConfig, ExcelConfig] ) -> dict: """Load a YAML file specified in path into a Python dictionary. @@ -343,7 +346,10 @@ def load_yaml_dict_from_file( with open(yaml_file_path, encoding="UTF-8") as file: yaml_dict = yaml.load(file, Loader=yaml.FullLoader) - pass + # validate YAML + yaml_config_schema = class_schema(schema_class) + yaml_config_schema().load(yaml_dict) + return yaml_dict diff --git a/data/data-pipeline/poetry.lock b/data/data-pipeline/poetry.lock index 38799b10..0917b134 100644 --- a/data/data-pipeline/poetry.lock +++ b/data/data-pipeline/poetry.lock @@ -870,6 +870,17 @@ lint = ["pre-commit (>=1.18,<2.0)"] tests = ["pytest (>=5.4)", "pytest-mypy-plugins (>=1.2.0)", "typing-extensions (>=3.7.2,<3.8.0)"] union = ["typeguard"] +[[package]] +name = "marshmallow-enum" +version = "1.5.1" +description = "Enum field for Marshmallow" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +marshmallow = ">=2.0.0" + [[package]] name = "matplotlib" version = "3.5.1" @@ -1847,7 +1858,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "4278ecd8aab0cf352d62961687c33f92cda5e6d246309b046484dd797edf4986" +content-hash = "2dda480b8f50a2414ff01d3c73663c28d64bc6df01f950367763c241a84b02f6" [metadata.files] appnope = [ @@ -2371,6 +2382,10 @@ marshmallow-dataclass = [ {file = "marshmallow_dataclass-8.5.3-py3-none-any.whl", hash = "sha256:eefeff62ee975c64d293d2db9370e7e748a2ff83dcb5109416b75e087a2ac02e"}, {file = "marshmallow_dataclass-8.5.3.tar.gz", hash = "sha256:c0c5e1ea8d0e557b6fa00343799a9a9e60757b948fb096076beb6aa76bd68d30"}, ] +marshmallow-enum = [ + {file = "marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58"}, + {file = "marshmallow_enum-1.5.1-py2.py3-none-any.whl", hash = "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072"}, +] matplotlib = [ {file = "matplotlib-3.5.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:456cc8334f6d1124e8ff856b42d2cc1c84335375a16448189999496549f7182b"}, {file = "matplotlib-3.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8a77906dc2ef9b67407cec0bdbf08e3971141e535db888974a915be5e1e3efc6"}, diff --git a/data/data-pipeline/pyproject.toml b/data/data-pipeline/pyproject.toml index ec5d9a00..7140e02a 100644 --- a/data/data-pipeline/pyproject.toml +++ b/data/data-pipeline/pyproject.toml @@ -25,6 +25,7 @@ ipython = "^7.31.1" jupyter = "^1.0.0" jupyter-contrib-nbextensions = "^0.5.1" marshmallow-dataclass = "^8.5.3" +marshmallow-enum = "^1.5.1" matplotlib = "^3.4.2" numpy = "^1.22.1" pandas = "^1.2.5"