Marshmallow Schemas for YAML files (#1497)

* Marshmallow Schemas for YAML files
* completed ticket
* passing tests
* lint
* click dep
* staging BE map
* PR review
2025-09-21 06:51:13 -07:00 · 2022-03-31 13:56:10 -04:00 · 2022-03-31 13:56:10 -04:00 · 859177a877
commit 859177a877
parent 27311b11e2
11 changed files with 387 additions and 183 deletions
--- a/data/data-pipeline/data_pipeline/content/config/excel.yml
+++ b/data/data-pipeline/data_pipeline/content/config/excel.yml
@@ -7,8 +7,7 @@ global_config:
  excel_config:
    default_column_width: 30
 sheets:
-  - main:
-    label: "Data"
+  - label: "Data"
    fields:
      - score_name: GEOID10_TRACT
        label: Census tract ID
--- a/data/data-pipeline/data_pipeline/content/schemas/__init__.py
+++ b/data/data-pipeline/data_pipeline/content/schemas/__init__.py
--- a/data/data-pipeline/data_pipeline/content/schemas/download_schemas.py
+++ b/data/data-pipeline/data_pipeline/content/schemas/download_schemas.py
@@ -0,0 +1,68 @@
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import List, Optional
+
+
+class FieldType(Enum):
+    STRING = "string"
+    INT64 = "int64"
+    BOOL = "bool"
+    FLOAT = "float"
+    PERCENTAGE = "percentage"
+    LOSS_RATE_PERCENTAGE = "loss_rate_percentage"
+
+
+@dataclass
+class RoundingNum:
+    float: int
+    loss_rate_percentage: int
+
+
+@dataclass
+class Field:
+    score_name: str
+    label: str
+    format: FieldType = field(metadata={"by_value": True})
+
+
+@dataclass
+class CSVConfig:
+    @dataclass
+    class GlobalConfig:
+        sort_by_label: str
+        rounding_num: RoundingNum
+
+    global_config: GlobalConfig
+    fields: List[Field]
+
+
+@dataclass
+class ExcelConfig:
+    @dataclass
+    class GlobalConfig:
+        @dataclass
+        class ExcelGlobalConfig:
+            default_column_width: int
+
+        sort_by_label: str
+        rounding_num: RoundingNum
+        excel_config: ExcelGlobalConfig
+
+    @dataclass
+    class SheetItem:
+        label: str
+        fields: List[Field]
+
+    global_config: GlobalConfig
+    sheets: List[SheetItem]
+
+
+@dataclass
+class CodebookConfig:
+    @dataclass
+    class Field:
+        score_name: str
+        notes: Optional[str]
+        category: Optional[str]
+
+    fields: List[Field]
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -3,6 +3,11 @@ import json
 from numpy import float64
 import numpy as np
 import pandas as pd
+from data_pipeline.content.schemas.download_schemas import (
+    CSVConfig,
+    CodebookConfig,
+    ExcelConfig,
+)

 from data_pipeline.etl.base import ExtractTransformLoad
 from data_pipeline.etl.score.etl_utils import floor_series, create_codebook
@@ -460,7 +465,7 @@ class PostScoreETL(ExtractTransformLoad):

        # open excel yaml config
        excel_csv_config = load_yaml_dict_from_file(
-            self.CONTENT_CONFIG / "excel.yml"
+            self.CONTENT_CONFIG / "excel.yml", ExcelConfig
        )

        # Define Excel Columns Column Width
@@ -535,7 +540,7 @@ class PostScoreETL(ExtractTransformLoad):
        logger.info("Writing downloadable csv")
        # open yaml config
        downloadable_csv_config = load_yaml_dict_from_file(
-            self.CONTENT_CONFIG / "csv.yml"
+            self.CONTENT_CONFIG / "csv.yml", CSVConfig
        )
        downloadable_df = self._create_downloadable_data(
            score_df=self.output_score_county_state_merged_df,
@@ -557,7 +562,8 @@ class PostScoreETL(ExtractTransformLoad):

        # load supplemental codebook yml
        field_descriptions_for_codebook_config = load_yaml_dict_from_file(
-            self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml"
+            self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml",
+            CodebookConfig,
        )

        # create codebook
--- a/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py
@@ -5,6 +5,9 @@ from importlib import reload
 from pathlib import Path
 import pandas.api.types as ptypes
 import pandas.testing as pdt
+from data_pipeline.content.schemas.download_schemas import (
+    CSVConfig,
+)

 from data_pipeline.etl.score import constants
 from data_pipeline.utils import load_yaml_dict_from_file
@@ -94,7 +97,7 @@ def test_create_downloadable_data(
    etl, score_data_expected, downloadable_data_expected
 ):
    downloadable_csv_config = load_yaml_dict_from_file(
-        etl.CONTENT_CONFIG / "csv.yml"
+        etl.CONTENT_CONFIG / "csv.yml", CSVConfig
    )
    output_downloadable_df_actual = etl._create_downloadable_data(
        score_data_expected,
--- a/data/data-pipeline/data_pipeline/score/schemas/__init__.py
+++ b/data/data-pipeline/data_pipeline/score/schemas/__init__.py
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@@ -1,17 +1,24 @@
-from typing import List
+from typing import List, Union
 import datetime
 import json
 import logging
 import os
 import sys
 import shutil
+import uuid
 import zipfile
 from pathlib import Path
 import urllib3
 import requests
 import yaml
+from marshmallow_dataclass import class_schema

 from data_pipeline.config import settings
+from data_pipeline.content.schemas.download_schemas import (
+    CSVConfig,
+    CodebookConfig,
+    ExcelConfig,
+)


 ## zlib is not available on all systems
@@ -175,9 +182,12 @@ def unzip_file_from_url(
        None

    """
+    # file_id allows us to evade race conditions on parallel ETLs
+    file_id = uuid.uuid4()
+
    zip_file_path = download_file_from_url(
        file_url=file_url,
-        download_file_name=download_path / "downloaded.zip",
+        download_file_name=download_path / f"downloaded-{file_id}.zip",
        verify=verify,
    )

@@ -323,7 +333,10 @@ def zip_directory(
    )


-def load_yaml_dict_from_file(yaml_file_path: Path) -> dict:
+def load_yaml_dict_from_file(
+    yaml_file_path: Path,
+    schema_class: Union[CSVConfig, ExcelConfig, CodebookConfig],
+) -> dict:
    """Load a YAML file specified in path into a Python dictionary.

    Args:
@@ -334,6 +347,11 @@ def load_yaml_dict_from_file(yaml_file_path: Path) -> dict:
    """
    with open(yaml_file_path, encoding="UTF-8") as file:
        yaml_dict = yaml.load(file, Loader=yaml.FullLoader)
+
+        # validate YAML
+        yaml_config_schema = class_schema(schema_class)
+        yaml_config_schema().load(yaml_dict)
+
    return yaml_dict