Marshmallow Schemas for YAML files (#1497)

* Marshmallow Schemas for YAML files

* completed ticket

* passing tests

* lint

* click dep

* staging BE map

* PR review
This commit is contained in:
Jorge Escobar 2022-03-31 13:56:10 -04:00 committed by GitHub
commit 859177a877
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 387 additions and 183 deletions

View file

@ -7,8 +7,7 @@ global_config:
excel_config:
default_column_width: 30
sheets:
- main:
label: "Data"
- label: "Data"
fields:
- score_name: GEOID10_TRACT
label: Census tract ID

View file

@ -0,0 +1,68 @@
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional
class FieldType(Enum):
    """Closed set of values accepted for a field's ``format`` entry.

    Each member's value is the literal string expected in the YAML config.
    """

    STRING = "string"
    INT64 = "int64"
    BOOL = "bool"
    FLOAT = "float"
    PERCENTAGE = "percentage"
    LOSS_RATE_PERCENTAGE = "loss_rate_percentage"
@dataclass
class RoundingNum:
    """Integer settings keyed by the format they apply to.

    The attribute names mirror the YAML keys, which is why one of them is
    called ``float`` despite matching the builtin's name.
    """

    # NOTE(review): presumably the number of decimal places used when
    # rounding values of each format -- confirm against the consumers.
    float: int
    loss_rate_percentage: int
@dataclass
class Field:
    """A single field entry shared by the CSV and Excel configs."""

    # Identifier of the field, e.g. ``GEOID10_TRACT``.
    score_name: str
    # Human-readable label for the field, e.g. ``Census tract ID``.
    label: str
    # ``by_value`` asks the generated marshmallow schema to match the enum by
    # its value ("string", "int64", ...) rather than by its member name.
    format: FieldType = field(metadata={"by_value": True})
@dataclass
class CSVConfig:
    """Expected structure of the downloadable-CSV YAML config (``csv.yml``)."""

    @dataclass
    class GlobalConfig:
        """Settings under the top-level ``global_config`` key."""

        sort_by_label: str
        rounding_num: RoundingNum

    global_config: GlobalConfig
    fields: List[Field]
@dataclass
class ExcelConfig:
    """Expected structure of the downloadable-Excel YAML config (``excel.yml``)."""

    @dataclass
    class GlobalConfig:
        """Settings under the top-level ``global_config`` key."""

        @dataclass
        class ExcelGlobalConfig:
            """Excel-only settings under ``global_config.excel_config``."""

            default_column_width: int

        sort_by_label: str
        rounding_num: RoundingNum
        excel_config: ExcelGlobalConfig

    @dataclass
    class SheetItem:
        """One entry of the ``sheets`` list: a sheet label and its fields."""

        label: str
        fields: List[Field]

    global_config: GlobalConfig
    sheets: List[SheetItem]
@dataclass
class CodebookConfig:
    """Expected structure of ``field_descriptions_for_codebook.yml``."""

    @dataclass
    class Field:
        """A codebook entry; deliberately distinct from the module-level ``Field``."""

        score_name: str
        # Both of the following may be null in the YAML file.
        notes: Optional[str]
        category: Optional[str]

    # Inside this class body ``Field`` resolves to the nested class above,
    # not to the module-level one.
    fields: List[Field]

View file

@ -3,6 +3,11 @@ import json
from numpy import float64
import numpy as np
import pandas as pd
from data_pipeline.content.schemas.download_schemas import (
CSVConfig,
CodebookConfig,
ExcelConfig,
)
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.etl_utils import floor_series, create_codebook
@ -460,7 +465,7 @@ class PostScoreETL(ExtractTransformLoad):
# open excel yaml config
excel_csv_config = load_yaml_dict_from_file(
self.CONTENT_CONFIG / "excel.yml"
self.CONTENT_CONFIG / "excel.yml", ExcelConfig
)
# Define Excel Columns Column Width
@ -535,7 +540,7 @@ class PostScoreETL(ExtractTransformLoad):
logger.info("Writing downloadable csv")
# open yaml config
downloadable_csv_config = load_yaml_dict_from_file(
self.CONTENT_CONFIG / "csv.yml"
self.CONTENT_CONFIG / "csv.yml", CSVConfig
)
downloadable_df = self._create_downloadable_data(
score_df=self.output_score_county_state_merged_df,
@ -557,7 +562,8 @@ class PostScoreETL(ExtractTransformLoad):
# load supplemental codebook yml
field_descriptions_for_codebook_config = load_yaml_dict_from_file(
self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml"
self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml",
CodebookConfig,
)
# create codebook

View file

@ -5,6 +5,9 @@ from importlib import reload
from pathlib import Path
import pandas.api.types as ptypes
import pandas.testing as pdt
from data_pipeline.content.schemas.download_schemas import (
CSVConfig,
)
from data_pipeline.etl.score import constants
from data_pipeline.utils import load_yaml_dict_from_file
@ -94,7 +97,7 @@ def test_create_downloadable_data(
etl, score_data_expected, downloadable_data_expected
):
downloadable_csv_config = load_yaml_dict_from_file(
etl.CONTENT_CONFIG / "csv.yml"
etl.CONTENT_CONFIG / "csv.yml", CSVConfig
)
output_downloadable_df_actual = etl._create_downloadable_data(
score_data_expected,

View file

@ -1,17 +1,24 @@
from typing import List
from typing import List, Union
import datetime
import json
import logging
import os
import sys
import shutil
import uuid
import zipfile
from pathlib import Path
import urllib3
import requests
import yaml
from marshmallow_dataclass import class_schema
from data_pipeline.config import settings
from data_pipeline.content.schemas.download_schemas import (
CSVConfig,
CodebookConfig,
ExcelConfig,
)
## zlib is not available on all systems
@ -175,9 +182,12 @@ def unzip_file_from_url(
None
"""
# file_id allows us to evade race conditions on parallel ETLs
file_id = uuid.uuid4()
zip_file_path = download_file_from_url(
file_url=file_url,
download_file_name=download_path / "downloaded.zip",
download_file_name=download_path / f"downloaded-{file_id}.zip",
verify=verify,
)
@ -323,7 +333,10 @@ def zip_directory(
)
def load_yaml_dict_from_file(yaml_file_path: Path) -> dict:
def load_yaml_dict_from_file(
yaml_file_path: Path,
schema_class: Union[CSVConfig, ExcelConfig, CodebookConfig],
) -> dict:
"""Load a YAML file specified in path into a Python dictionary.
Args:
@ -334,6 +347,11 @@ def load_yaml_dict_from_file(yaml_file_path: Path) -> dict:
"""
with open(yaml_file_path, encoding="UTF-8") as file:
yaml_dict = yaml.load(file, Loader=yaml.FullLoader)
# validate YAML
yaml_config_schema = class_schema(schema_class)
yaml_config_schema().load(yaml_dict)
return yaml_dict