checkpoint

2025-02-23 10:04:18 -08:00 · 2022-04-07 18:55:44 -04:00 · 2022-04-07 18:55:44 -04:00 · 0e1e15eeaa
commit 0e1e15eeaa
parent d3a54e4820
3 changed files with 41 additions and 4 deletions
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@ -1,12 +1,15 @@
 import enum
 import pathlib
 import sys
 import typing
 from typing import Optional
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import (
    load_yaml_dict_from_file,
    unzip_file_from_url,
    remove_all_from_dir,
    get_module_logger,
@ -79,6 +82,34 @@ class ExtractTransformLoad:
    output_df: pd.DataFrame = None
    def yaml_config_load(self):
        """Populate this instance's dataset settings from ``datasets.yml``.

        Loads ``APP_ROOT/etl/score/config/datasets.yml`` through
        ``load_yaml_dict_from_file`` (passing ``DatasetsConfig`` as the
        schema), selects the entry under ``datasets`` whose ``module_name``
        equals ``self.NAME``, and sets ``LAST_UPDATED_YEAR``, ``SOURCE_URL``
        and ``INPUT_CSV`` on the instance from that entry.

        Raises:
            SystemExit: if no entry in the config matches ``self.NAME``
                (an error is logged first).
        """
        datasets_config = load_yaml_dict_from_file(
            self.APP_ROOT / "etl" / "score" / "config" / "datasets.yml",
            DatasetsConfig,
        )

        # Look up the entry for this dataset. The comparison must use the
        # *value* of self.NAME: comparing against the string literal
        # "self.NAME" can never match a real module name, which made every
        # call fall through to the error branch below.
        try:
            dataset_config = next(
                item
                for item in datasets_config.get("datasets")
                if item["module_name"] == self.NAME
            )
        except StopIteration:
            logger.error(
                f"Exception encountered while extracting dataset config for dataset {self.NAME}"
            )
            # NOTE(review): sys.exit() with no argument reports exit status 0;
            # consider a non-zero status so callers can detect the failure.
            sys.exit()

        # Copy the selected entry's values onto the instance.
        self.LAST_UPDATED_YEAR = dataset_config["last_updated_year"]
        self.SOURCE_URL = dataset_config["source_url"]
        # The extracted file is expected inside this ETL's temporary directory.
        self.INPUT_CSV = (
            self.get_tmp_path() / dataset_config["extracted_file_name"]
        )
    # This is a classmethod so it can be used by `get_data_frame` without
    # needing to create an instance of the class. This is a use case in `etl_score`.
    @classmethod
--- a/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/national_risk_index/etl.py
@ -15,8 +15,6 @@ class NationalRiskIndexETL(ExtractTransformLoad):
    """ETL class for the FEMA National Risk Index dataset"""
    NAME = "national_risk_index"
    LAST_UPDATED_YEAR = 2020
    SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
    ## TEMPORARILY HERE
@ -27,7 +25,8 @@ class NationalRiskIndexETL(ExtractTransformLoad):
    AGRIVALUE_LOWER_BOUND = 408000
    def __init__(self):
-        self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
+        # load YAML config
        super().yaml_config_load()
        self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
            "EAL_SCORE"
--- a/data/data-pipeline/data_pipeline/utils.py
+++ b/data/data-pipeline/data_pipeline/utils.py
@ -6,8 +6,10 @@ import os
 import sys
 import shutil
 import uuid
 from xml.dom import ValidationErr
 import zipfile
 from pathlib import Path
 from marshmallow import ValidationError
 import urllib3
 import requests
 import yaml
@ -350,7 +352,12 @@ def load_yaml_dict_from_file(
        # validate YAML
        yaml_config_schema = class_schema(schema_class)
-        yaml_config_schema().load(yaml_dict)
+
        try:
            yaml_config_schema().load(yaml_dict)
        except ValidationError as e:
            logger.error(f"Invalid YAML config file {yaml_file_path}")
            logger.error(e.normalized_messages())
    return yaml_dict