mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-29 10:01:17 -07:00
Marshmallow Schemas for YAML files (#1497)
* Marshmallow Schemas for YAML files * completed ticket * passing tests * lint * click dep * staging BE map * PR review
This commit is contained in:
parent
27311b11e2
commit
859177a877
11 changed files with 387 additions and 183 deletions
|
@ -7,8 +7,7 @@ global_config:
|
|||
excel_config:
|
||||
default_column_width: 30
|
||||
sheets:
|
||||
- main:
|
||||
label: "Data"
|
||||
- label: "Data"
|
||||
fields:
|
||||
- score_name: GEOID10_TRACT
|
||||
label: Census tract ID
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class FieldType(Enum):
    """Closed set of formats a configured field may declare.

    Each member's value is the lowercase string form of its name; this
    enum is used as the type of the ``format`` attribute on ``Field``.
    """

    # Plain data types.
    STRING = "string"
    INT64 = "int64"
    BOOL = "bool"
    FLOAT = "float"

    # Percentage-style formats.
    PERCENTAGE = "percentage"
    LOSS_RATE_PERCENTAGE = "loss_rate_percentage"
|
||||
|
||||
|
||||
@dataclass
class RoundingNum:
    """Integer rounding settings, one per roundable field format.

    The attribute names are identical to the ``"float"`` and
    ``"loss_rate_percentage"`` format strings, so they must not be
    renamed even though ``float`` reads like the builtin type.
    """

    # NOTE(review): presumably the number of decimal places to keep for
    # each format -- confirm against the code that consumes this config.
    float: int
    loss_rate_percentage: int
|
||||
|
||||
|
||||
@dataclass
class Field:
    """One configured output field: source column, label and format."""

    # Name of the score column this entry refers to.
    score_name: str

    # Human-readable label for the column.
    label: str

    # The ``by_value`` metadata is handed to the schema generator.
    # NOTE(review): presumably it makes the enum load from its value
    # (e.g. "string") rather than its member name -- confirm in the
    # marshmallow-dataclass documentation.
    format: FieldType = field(metadata={"by_value": True})
|
||||
|
||||
|
||||
@dataclass
class CSVConfig:
    """Schema for the downloadable CSV configuration (``csv.yml``)."""

    @dataclass
    class GlobalConfig:
        """Settings that apply to the whole CSV rather than one field."""

        sort_by_label: str
        rounding_num: RoundingNum

    # File-wide settings, followed by the list of field entries.
    global_config: GlobalConfig
    fields: List[Field]
|
||||
|
||||
|
||||
@dataclass
class ExcelConfig:
    """Schema for the downloadable Excel configuration (``excel.yml``)."""

    @dataclass
    class GlobalConfig:
        """Settings that apply to the whole workbook."""

        @dataclass
        class ExcelGlobalConfig:
            """Excel-only settings nested under ``excel_config``."""

            default_column_width: int

        sort_by_label: str
        rounding_num: RoundingNum
        excel_config: ExcelGlobalConfig

    @dataclass
    class SheetItem:
        """A single sheet: its label and the fields it contains."""

        label: str
        fields: List[Field]

    # Workbook-wide settings, followed by the list of sheets.
    global_config: GlobalConfig
    sheets: List[SheetItem]
|
||||
|
||||
|
||||
@dataclass
class CodebookConfig:
    """Schema for ``field_descriptions_for_codebook.yml``."""

    @dataclass
    class Field:
        """Supplemental codebook details for one score column.

        Only ``score_name`` is mandatory. ``notes`` and ``category`` are
        optional and default to ``None``, so an entry can be built
        without spelling out the fields it does not use.
        """

        score_name: str
        # Explicit ``None`` defaults keep the constructor consistent with
        # the ``Optional`` annotations: omitted means "not provided".
        notes: Optional[str] = None
        category: Optional[str] = None

    # Inside this class body ``Field`` resolves to the nested class above.
    fields: List[Field]
|
|
@ -3,6 +3,11 @@ import json
|
|||
from numpy import float64
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from data_pipeline.content.schemas.download_schemas import (
|
||||
CSVConfig,
|
||||
CodebookConfig,
|
||||
ExcelConfig,
|
||||
)
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.score.etl_utils import floor_series, create_codebook
|
||||
|
@ -460,7 +465,7 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
|
||||
# open excel yaml config
|
||||
excel_csv_config = load_yaml_dict_from_file(
|
||||
self.CONTENT_CONFIG / "excel.yml"
|
||||
self.CONTENT_CONFIG / "excel.yml", ExcelConfig
|
||||
)
|
||||
|
||||
# Define Excel Columns Column Width
|
||||
|
@ -535,7 +540,7 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
logger.info("Writing downloadable csv")
|
||||
# open yaml config
|
||||
downloadable_csv_config = load_yaml_dict_from_file(
|
||||
self.CONTENT_CONFIG / "csv.yml"
|
||||
self.CONTENT_CONFIG / "csv.yml", CSVConfig
|
||||
)
|
||||
downloadable_df = self._create_downloadable_data(
|
||||
score_df=self.output_score_county_state_merged_df,
|
||||
|
@ -557,7 +562,8 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
|
||||
# load supplemental codebook yml
|
||||
field_descriptions_for_codebook_config = load_yaml_dict_from_file(
|
||||
self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml"
|
||||
self.CONTENT_CONFIG / "field_descriptions_for_codebook.yml",
|
||||
CodebookConfig,
|
||||
)
|
||||
|
||||
# create codebook
|
||||
|
|
|
@ -5,6 +5,9 @@ from importlib import reload
|
|||
from pathlib import Path
|
||||
import pandas.api.types as ptypes
|
||||
import pandas.testing as pdt
|
||||
from data_pipeline.content.schemas.download_schemas import (
|
||||
CSVConfig,
|
||||
)
|
||||
|
||||
from data_pipeline.etl.score import constants
|
||||
from data_pipeline.utils import load_yaml_dict_from_file
|
||||
|
@ -94,7 +97,7 @@ def test_create_downloadable_data(
|
|||
etl, score_data_expected, downloadable_data_expected
|
||||
):
|
||||
downloadable_csv_config = load_yaml_dict_from_file(
|
||||
etl.CONTENT_CONFIG / "csv.yml"
|
||||
etl.CONTENT_CONFIG / "csv.yml", CSVConfig
|
||||
)
|
||||
output_downloadable_df_actual = etl._create_downloadable_data(
|
||||
score_data_expected,
|
||||
|
|
|
@ -1,17 +1,24 @@
|
|||
from typing import List
|
||||
from typing import List, Union
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import uuid
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
import urllib3
|
||||
import requests
|
||||
import yaml
|
||||
from marshmallow_dataclass import class_schema
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.content.schemas.download_schemas import (
|
||||
CSVConfig,
|
||||
CodebookConfig,
|
||||
ExcelConfig,
|
||||
)
|
||||
|
||||
|
||||
## zlib is not available on all systems
|
||||
|
@ -175,9 +182,12 @@ def unzip_file_from_url(
|
|||
None
|
||||
|
||||
"""
|
||||
# file_id allows us to evade race conditions on parallel ETLs
|
||||
file_id = uuid.uuid4()
|
||||
|
||||
zip_file_path = download_file_from_url(
|
||||
file_url=file_url,
|
||||
download_file_name=download_path / "downloaded.zip",
|
||||
download_file_name=download_path / f"downloaded-{file_id}.zip",
|
||||
verify=verify,
|
||||
)
|
||||
|
||||
|
@ -323,7 +333,10 @@ def zip_directory(
|
|||
)
|
||||
|
||||
|
||||
def load_yaml_dict_from_file(yaml_file_path: Path) -> dict:
|
||||
def load_yaml_dict_from_file(
|
||||
yaml_file_path: Path,
|
||||
schema_class: Union[CSVConfig, ExcelConfig, CodebookConfig],
|
||||
) -> dict:
|
||||
"""Load a YAML file specified in path into a Python dictionary.
|
||||
|
||||
Args:
|
||||
|
@ -334,6 +347,11 @@ def load_yaml_dict_from_file(yaml_file_path: Path) -> dict:
|
|||
"""
|
||||
with open(yaml_file_path, encoding="UTF-8") as file:
|
||||
yaml_dict = yaml.load(file, Loader=yaml.FullLoader)
|
||||
|
||||
# validate YAML
|
||||
yaml_config_schema = class_schema(schema_class)
|
||||
yaml_config_schema().load(yaml_dict)
|
||||
|
||||
return yaml_dict
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue