Mirror of https://github.com/DOI-DO/j40-cejst-2.git (synced 2025-02-21 09:11:26 -08:00)
NRI dataset and initial score YAML configuration (#1534)
* update be staging gha
* NRI dataset and initial score YAML configuration
* checkpoint
* adding data checks for release branch
* passing tests
* adding INPUT_EXTRACTED_FILE_NAME to base class
* lint
* columns to keep and tests
* update be staging gha
* checkpoint
* update be staging gha
* NRI dataset and initial score YAML configuration
* checkpoint
* adding data checks for release branch
* passing tests
* adding INPUT_EXTRACTED_FILE_NAME to base class
* lint
* columns to keep and tests
* checkpoint
* PR Review
* removing source url
* tests
* stop execution of ETL if there's a YAML schema issue
* update be staging gha
* adding source url as class var again
* clean up
* force cache bust
* gha cache bust
* dynamically set score vars from YAML
* docstrings
* removing last updated year - optional reverse percentile
* passing tests
* sort order
* column ordering
* PR review
* class level vars
* Updating DatasetsConfig
* fix pylint errors
* moving metadata hint back to code

Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
parent 1833e3e794
commit 1c448a77f9

15 changed files with 272 additions and 3485 deletions
.github/workflows/data-checks.yml (vendored): 6 changed lines
@@ -2,7 +2,9 @@
 name: Data Checks
 on:
   pull_request:
-    branches: [main] # runs on any PR against main
+    branches:
+      - main
+      - "**/release/**"
     paths:
       - "data/**"
 jobs:
@@ -16,7 +18,7 @@ jobs:
         # checks all of the versions allowed in pyproject.toml
         python-version: [3.8, 3.9]
     steps:
-      # installs python
+      # installs Python
       # one execution of the tests per version listed above
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
.github/workflows/deploy_be_staging.yml (vendored): 6 changed lines
@@ -1,7 +1,9 @@
 name: Deploy Backend Staging
 on:
   pull_request:
-    branches: [main]
+    branches:
+      - main
+      - "**/release/**"
     paths:
      - "data/**"
 env:
@@ -60,7 +62,7 @@ jobs:
      - name: Update PR with deployed Score URLs
        uses: mshick/add-pr-comment@v1
        with:
-         # Deploy to S3 for the staging URL
+         # Deploy to S3 for the Staging URL
          message: |
            ** Score Deployed! **
            Find it here:
@@ -1,12 +1,15 @@
 import enum
 import pathlib
+import sys
 import typing
 from typing import Optional

 import pandas as pd

 from data_pipeline.config import settings
+from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import (
+    load_yaml_dict_from_file,
     unzip_file_from_url,
     remove_all_from_dir,
     get_module_logger,
@@ -30,6 +33,9 @@ class ExtractTransformLoad:
     Attributes:
         DATA_PATH (pathlib.Path): Local path where all data will be stored
         TMP_PATH (pathlib.Path): Local path where temporary data will be stored
+
+    TODO: Fill missing attrs here
+
         GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier
         GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier
     """
@@ -40,6 +46,7 @@ class ExtractTransformLoad:
     DATA_PATH: pathlib.Path = APP_ROOT / "data"
     TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
     CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
+    DATASET_CONFIG: pathlib.Path = APP_ROOT / "etl" / "score" / "config"

     # Parameters
     GEOID_FIELD_NAME: str = "GEOID10"
@@ -55,6 +62,9 @@ class ExtractTransformLoad:
     # SOURCE_URL is used to extract source data in extract().
     SOURCE_URL: str = None

+    # INPUT_EXTRACTED_FILE_NAME is the name of the file after extract().
+    INPUT_EXTRACTED_FILE_NAME: str = None
+
     # GEO_LEVEL is used to identify whether output data is at the unit of the tract or
     # census block group.
     # TODO: add tests that enforce seeing the expected geographic identifier field
@@ -64,6 +74,13 @@ class ExtractTransformLoad:
     # COLUMNS_TO_KEEP is used to identify which columns to keep in the output df.
     COLUMNS_TO_KEEP: typing.List[str] = None

+    # INPUT_GEOID_TRACT_FIELD_NAME is the field name that identifies the Census Tract ID
+    # on the input file
+    INPUT_GEOID_TRACT_FIELD_NAME: str = None
+
+    # NULL_REPRESENTATION is how nulls are represented on the input field
+    NULL_REPRESENTATION: str = None
+
     # Thirteen digits in a census block group ID.
     EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
     # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@@ -77,8 +94,53 @@ class ExtractTransformLoad:
     # periods. https://github.com/usds/justice40-tool/issues/964
     EXPECTED_MAX_CENSUS_TRACTS: int = 74160

+    # We use output_df as the final dataframe to use to write to the CSV
+    # It is used on the "load" base class method
     output_df: pd.DataFrame = None

+    @classmethod
+    def yaml_config_load(cls) -> dict:
+        """Generate config dictionary and set instance variables from YAML dataset."""
+
+        # check if the class instance has score YAML definitions
+        datasets_config = load_yaml_dict_from_file(
+            cls.DATASET_CONFIG / "datasets.yml",
+            DatasetsConfig,
+        )
+
+        # get the config for this dataset
+        try:
+            dataset_config = next(
+                item
+                for item in datasets_config.get("datasets")
+                if item["module_name"] == cls.NAME
+            )
+        except StopIteration:
+            # Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
+            logger.error(
+                f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
+            )
+            sys.exit()
+
+        # set some of the basic fields
+        cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
+            "input_geoid_tract_field_name"
+        ]
+
+        # get the columns to write on the CSV
+        # and set the constants
+        cls.COLUMNS_TO_KEEP = [
+            cls.GEOID_TRACT_FIELD_NAME,  # always index with geoid tract id
+        ]
+        for field in dataset_config["load_fields"]:
+            cls.COLUMNS_TO_KEEP.append(field["long_name"])

+            # set the constants for the class
+            setattr(cls, field["df_field_name"], field["long_name"])
+
+        # return the config dict
+        return dataset_config
+
     # This is a classmethod so it can be used by `get_data_frame` without
     # needing to create an instance of the class. This is a use case in `etl_score`.
     @classmethod
@@ -87,16 +149,10 @@ class ExtractTransformLoad:
         if cls.NAME is None:
             raise NotImplementedError(
                 f"Child ETL class needs to specify `cls.NAME` (currently "
-                f"{cls.NAME}) and `cls.LAST_UPDATED_YEAR` (currently "
-                f"{cls.LAST_UPDATED_YEAR})."
+                f"{cls.NAME})."
             )

-        output_file_path = (
-            cls.DATA_PATH
-            / "dataset"
-            / f"{cls.NAME}_{cls.LAST_UPDATED_YEAR}"
-            / "usa.csv"
-        )
+        output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
         return output_file_path

     def get_tmp_path(self) -> pathlib.Path:
@@ -229,8 +285,7 @@ class ExtractTransformLoad:

         Data is written in the specified local data folder or remote AWS S3 bucket.

-        Uses the directory from `self.OUTPUT_DIR` and the file name from
-        `self._get_output_file_path`.
+        Uses the directory and the file name from `self._get_output_file_path`.
         """
         logger.info(f"Saving `{self.NAME}` CSV")
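To make the new mechanism concrete, here is a minimal, self-contained sketch (not code from this commit; DemoETL and the abridged config dict are hypothetical stand-ins) of what yaml_config_load does with each load_fields entry: the YAML df_field_name becomes a class constant whose value is the human-readable long_name, and that long_name is appended to COLUMNS_TO_KEEP.

# Hypothetical, abridged stand-ins for a datasets.yml entry and an ETL class.
dataset_config = {
    "module_name": "national_risk_index",
    "input_geoid_tract_field_name": "TRACTFIPS",
    "load_fields": [
        {
            "short_name": "ex_loss",
            "df_field_name": "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME",
            "long_name": "FEMA Risk Index Expected Annual Loss Score",
        },
    ],
}


class DemoETL:
    NAME = "national_risk_index"
    GEOID_TRACT_FIELD_NAME = "GEOID10_TRACT"


# Mirrors the loop at the end of yaml_config_load(): keep the tract ID column,
# then register each YAML field as both an output column and a class constant.
DemoETL.COLUMNS_TO_KEEP = [DemoETL.GEOID_TRACT_FIELD_NAME]
for load_field in dataset_config["load_fields"]:
    DemoETL.COLUMNS_TO_KEEP.append(load_field["long_name"])
    setattr(DemoETL, load_field["df_field_name"], load_field["long_name"])

# Downstream code can now reference the output column through the constant:
assert (
    DemoETL.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME
    == "FEMA Risk Index Expected Annual Loss Score"
)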
@@ -0,0 +1,79 @@ (new file)
---
datasets:
  - long_name: "FEMA National Risk Index"
    short_name: "nri"
    module_name: national_risk_index
    input_geoid_tract_field_name: "TRACTFIPS"
    load_fields:
      - short_name: "ex_loss"
        df_field_name: "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME"
        long_name: "FEMA Risk Index Expected Annual Loss Score"
        field_type: float
        number_of_decimals_in_output: 6

      - short_name: "ex_pop_loss"
        df_field_name: "EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME"
        long_name: "Expected population loss rate (Natural Hazards Risk Index)"
        description_short:
          "Rate of fatalities and injuries resulting from natural hazards each year"
        description_long:
          "Rate relative to the population of fatalities and injuries due to fourteen
          types of natural hazards each year that have some link to climate change:
          avalanche, coastal flooding, cold wave, drought, hail, heat wave, hurricane,
          ice storm, landslide, riverine flooding, strong wind, tornado, wildfire, and
          winter weather. Population loss is defined as the Spatial Hazard Events and
          Losses and National Centers for Environmental Information’s (NCEI) reported
          number of fatalities and injuries caused by the hazard occurrence. To combine
          fatalities and injuries for the computation of population loss value, an
          injury is counted as one-tenth (1/10) of a fatality. The NCEI Storm Events
          Database classifies injuries and fatalities as direct or indirect. Both direct
          and indirect injuries and fatalities are counted as population loss. This
          total number of injuries and fatalities is then divided by the population in
          the census tract to get a per-capita rate of population risk."
        field_type: float
        number_of_decimals_in_output: 6
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true

      - short_name: "ex_ag_loss"
        df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME"
        long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)"
        description_short:
          "Economic loss rate to agricultural value resulting from natural hazards each
          year"
        description_long:
          "Percent of agricultural value at risk from losses due to fourteen types of
          natural hazards that have some link to climate change: avalanche, coastal
          flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
          landslide, riverine flooding, strong wind, tornado, wildfire, and winter
          weather. Rate calculated by dividing the agricultural value at risk in a
          census tract by the total agricultural value in that census tract."
        field_type: float
        number_of_decimals_in_output: 6
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true

      - short_name: "ex_bldg_loss"
        df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME"
        long_name: "Expected building loss rate (Natural Hazards Risk Index)"
        description_short:
          "Economic loss rate to building value resulting from natural hazards each year"
        description_long:
          "Percent of building value at risk from losses due to fourteen types of
          natural hazards that have some link to climate change: avalanche, coastal
          flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
          landslide, riverine flooding, strong wind, tornado, wildfire, and winter
          weather. Rate calculated by dividing the building value at risk in a census
          tract by the total building value in that census tract."
        field_type: float
        number_of_decimals_in_output: 6
        include_in_tiles: true
        include_in_downloadable_files: true
        create_percentile: true

      - short_name: "has_ag_val"
        df_field_name: "CONTAINS_AGRIVALUE"
        long_name: "Contains agricultural value"
        field_type: bool
@@ -480,6 +480,7 @@ class ScoreETL(ExtractTransformLoad):
             # for instance, 3rd grade reading level : Low 3rd grade reading level.
             # This low field will not exist yet, it is only calculated for the
             # percentile.
+            # TODO: This will come from the YAML dataset config
             ReversePercentile(
                 field_name=field_names.READING_FIELD,
                 low_field_name=field_names.LOW_READING_FIELD,
@@ -0,0 +1,83 @@ (new file)
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional


class FieldType(Enum):
    STRING = "string"
    INT64 = "int64"
    BOOL = "bool"
    FLOAT = "float"
    PERCENTAGE = "percentage"


@dataclass
class DatasetsConfig:
    @dataclass
    class Dataset:
        """A class that defines a dataset and its load variables.

        Attributes:
            long_name (str): A human readable title for the dataset.
            short_name (str): Used to compose the short variable names for tiles/arcgis. All short variable
                names will be prepended with the short name of the data set it comes from, i.e. `nri__ex_loss`.
            module_name (str): A string that matches both the Python module name for the dataset and the
                `NAME` property on the ETL class.
            load_fields (LoadField): A list of type LoadField that will drive the score ETL and side effects
                (tiles, downloadables).
        """

        @dataclass
        class LoadField:
            """A class to define the fields to be saved on the dataset's output.

            These fields will be then imported by the score generation ETL.

            Attributes:
                short_name (str): Used in conjunction with the dataset's `short_name` for files where short names are needed.
                df_field_name (str): Name for the field in the etl class.
                long_name (str): Column name for the dataset's output csv.
                field_type (FieldType): An enum that dictates what type of field this is.
                description_short (Optional str): Description used if the field appears in the side panel.
                description_long (Optional str): Description used if the field appears in the Methodology page.
                number_of_decimals_in_output (Optional int): Used to represent number of decimals in side effects,
                    like Excel. Defaults to 2 decimals.
                include_in_tiles (Optional bool): Include this field on the tile export. Defaults to False.
                include_in_downloadable_files (Optional bool): Include this field on the CSV and Excel exports.
                    Defaults to False.
                create_percentile (Optional bool): Whether or not the backend processing should create a percentile
                    field (ranked in ascending order) from the values in this field. Defaults to False.
                create_reverse_percentile (Optional bool): Whether or not the backend processing should create a
                    "reverse percentile" field (ranked in descending order) from the values in this field.
                    Defaults to False.
                include_in_comparison_tool_as_index (Optional bool): Whether or not to include this field in the
                    comparison tool as an index used as comparison (e.g., this field might be a state or national
                    index that identifies priority communities). The field itself must be a boolean for the
                    comparison tool to work appropriately. Defaults to False.
                include_in_comparison_tool_as_statistical_descriptor (Optional bool): Whether or not to include
                    this field in the comparison tool as a statistical descriptor of census tracts (e.g., this
                    field might be income levels, life expectancy, etc.). This will be used to generate reports
                    that produce information such as: tracts identified by Index A but not Index B have higher
                    income levels but lower life expectancy. Defaults to False.
            """

            short_name: str
            df_field_name: str
            long_name: str
            field_type: FieldType = field(
                metadata={"by_value": True}
            )  # This will be used on the `etl_score_post` for the data manipulation. The `by_value` metadata
            # prop will load the field type's Enum value instead of the index, i.e. "string" and not STRING
            description_short: Optional[str] = None
            description_long: Optional[str] = None
            number_of_decimals_in_output: Optional[int] = 2
            include_in_tiles: Optional[bool] = False
            include_in_downloadable_files: Optional[bool] = False
            create_percentile: Optional[bool] = False
            create_reverse_percentile: Optional[bool] = False
            include_in_comparison_tool_as_index: Optional[bool] = False
            include_in_comparison_tool_as_statistical_descriptor: Optional[
                bool
            ] = False

        long_name: str
        short_name: str
        module_name: str
        input_geoid_tract_field_name: str
        load_fields: List[LoadField]

    datasets: List[Dataset]
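To see how this schema gates a run, here is a sketch, assuming the marshmallow_dataclass package (which load_yaml_dict_from_file uses via class_schema); the bad_config dict is invented for illustration. An invalid field_type value fails validation, which the hardened loader later in this commit turns into a logged error and an exit.

from marshmallow import ValidationError
from marshmallow_dataclass import class_schema

from data_pipeline.etl.score.schemas.datasets import DatasetsConfig

schema = class_schema(DatasetsConfig)()

# Invented config with a deliberate typo in field_type ("floatt" is not a
# valid FieldType value), so schema validation should fail.
bad_config = {
    "datasets": [
        {
            "long_name": "FEMA National Risk Index",
            "short_name": "nri",
            "module_name": "national_risk_index",
            "input_geoid_tract_field_name": "TRACTFIPS",
            "load_fields": [
                {
                    "short_name": "ex_loss",
                    "df_field_name": "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME",
                    "long_name": "FEMA Risk Index Expected Annual Loss Score",
                    "field_type": "floatt",
                }
            ],
        }
    ]
}

try:
    schema.load(bad_config)
except ValidationError as e:
    # This is the same error object whose normalized_messages() the utils
    # change below logs before calling sys.exit().
    print(e.normalized_messages())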
@@ -15,10 +15,16 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     """ETL class for the FEMA National Risk Index dataset"""

     NAME = "national_risk_index"
-    LAST_UPDATED_YEAR = 2020
     SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT

+    # Output score variables (values set on datasets.yml) for linting purposes
+    RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME: str
+    EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME: str
+    EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME: str
+    EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME: str
+    CONTAINS_AGRIVALUE: str
+
     ## TEMPORARILY HERE
     ## To get this value up in time for launch, we've hard coded it. We would like
     ## to, in the future, have this pull the 10th percentile (or nth percentile)
@@ -27,54 +33,34 @@ class NationalRiskIndexETL(ExtractTransformLoad):
     AGRIVALUE_LOWER_BOUND = 408000

     def __init__(self):
+        # load YAML config
+        self.DATASET_CONFIG = super().yaml_config_load()
+
+        # define the full path for the input CSV file
         self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
+
+        # this is the main dataframe
+        self.df: pd.DataFrame
+
+        # Start dataset-specific vars here
         self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
             "EAL_SCORE"
         )

-        self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME = (
-            "FEMA Risk Index Expected Annual Loss Score"
-        )
-
         self.EXPECTED_ANNUAL_LOSS_BUILDING_VALUE_INPUT_FIELD_NAME = "EAL_VALB"
         self.EXPECTED_ANNUAL_LOSS_AGRICULTURAL_VALUE_INPUT_FIELD_NAME = (
             "EAL_VALA"
         )
         self.EXPECTED_ANNUAL_LOSS_POPULATION_VALUE_INPUT_FIELD_NAME = "EAL_VALP"

         self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME = "AGRIVALUE"
         self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
         self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"

-        self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
-            "Expected building loss rate (Natural Hazards Risk Index)"
-        )
-        self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = (
-            "Expected agricultural loss rate (Natural Hazards Risk Index)"
-        )
-        self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = (
-            "Expected population loss rate (Natural Hazards Risk Index)"
-        )
-        self.CONTAINS_AGRIVALUE = "Contains agricultural value"
-
-        self.COLUMNS_TO_KEEP = [
-            self.GEOID_TRACT_FIELD_NAME,
-            self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
-            self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
-            self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
-            self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
-            self.CONTAINS_AGRIVALUE,
-        ]
-
-        self.df: pd.DataFrame
-
     def extract(self) -> None:
         """Unzips NRI dataset from the FEMA data source and writes the files
         to the temporary data folder for use in the transform() method
         """
         logger.info("Downloading 405MB National Risk Index Data")

         super().extract(
             source_url=self.SOURCE_URL,
             extract_path=self.get_tmp_path(),
@@ -90,19 +76,18 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         """
         logger.info("Transforming National Risk Index Data")

-        NRI_TRACT_COL = "TRACTFIPS"  # Census Tract Column in NRI data
-
         # read in the unzipped csv from NRI data source then rename the
         # Census Tract column for merging
         df_nri: pd.DataFrame = pd.read_csv(
             self.INPUT_CSV,
-            dtype={NRI_TRACT_COL: "string"},
+            dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
             na_values=["None"],
             low_memory=False,
         )

         df_nri.rename(
             columns={
-                NRI_TRACT_COL: self.GEOID_TRACT_FIELD_NAME,
+                self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
                 self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
             },
             inplace=True,
@@ -170,6 +155,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         ].clip(
             lower=self.AGRIVALUE_LOWER_BOUND
         )
+
         # This produces a boolean that is True in the case of non-zero agricultural value
         df_nri[self.CONTAINS_AGRIVALUE] = (
             df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
@@ -185,6 +171,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
         # Note: `round` is smart enough to only apply to float columns.
         df_nri = df_nri.round(10)

+        # Assign the final df to the class' output_df for the load method
         self.output_df = df_nri

     def load(self) -> None:
@@ -119,6 +119,7 @@ class TestETL:
         """
         # Setup
         etl = self._get_instance_of_etl_class()
+        etl.__init__()
         data_path, tmp_path = mock_paths

         assert etl.DATA_PATH == data_path
@@ -126,8 +127,6 @@ class TestETL:

         # Also make sure all parameters that need to be non-null are non-null
         assert etl.NAME is not None
-        assert etl.LAST_UPDATED_YEAR is not None
-        assert etl.SOURCE_URL is not None
         assert etl.GEO_LEVEL is not None
         assert etl.COLUMNS_TO_KEEP is not None
         assert len(etl.COLUMNS_TO_KEEP) > 0
@@ -148,14 +147,10 @@ class TestETL:
         etl = self._get_instance_of_etl_class()
         data_path, tmp_path = mock_paths

+        etl.__init__()
         actual_file_path = etl._get_output_file_path()

-        expected_file_path = (
-            data_path
-            / "dataset"
-            / f"{etl.NAME}_{etl.LAST_UPDATED_YEAR}"
-            / "usa.csv"
-        )
+        expected_file_path = data_path / "dataset" / etl.NAME / "usa.csv"

         logger.info(f"Expected: {expected_file_path}")

@@ -255,6 +250,7 @@ class TestETL:
         etl = self._setup_etl_instance_and_run_extract(
             mock_etl=mock_etl, mock_paths=mock_paths
         )
+        etl.__init__()
         etl.transform()

         assert etl.output_df is not None
@@ -272,6 +268,7 @@ class TestETL:
         """
         # setup - input variables
         etl = self._get_instance_of_etl_class()
+        etl.__init__()

         # setup - mock transform step
         df_transform = pd.read_csv(
@@ -87,11 +87,6 @@ class TestNationalRiskIndexETL(TestETL):
         assert etl.GEOID_FIELD_NAME == "GEOID10"
         assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
         assert etl.NAME == "national_risk_index"
-        assert etl.LAST_UPDATED_YEAR == 2020
-        assert (
-            etl.SOURCE_URL
-            == "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
-        )
         assert etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT
         assert etl.COLUMNS_TO_KEEP == [
             etl.GEOID_TRACT_FIELD_NAME,
@@ -109,6 +104,6 @@ class TestNationalRiskIndexETL(TestETL):

         output_file_path = etl._get_output_file_path()
         expected_output_file_path = (
-            data_path / "dataset" / "national_risk_index_2020" / "usa.csv"
+            data_path / "dataset" / "national_risk_index" / "usa.csv"
         )
         assert output_file_path == expected_output_file_path
@@ -8,6 +8,7 @@ import shutil
 import uuid
 import zipfile
 from pathlib import Path
+from marshmallow import ValidationError
 import urllib3
 import requests
 import yaml
@@ -350,7 +351,13 @@ def load_yaml_dict_from_file(

     # validate YAML
     yaml_config_schema = class_schema(schema_class)
-    yaml_config_schema().load(yaml_dict)
+
+    try:
+        yaml_config_schema().load(yaml_dict)
+    except ValidationError as e:
+        logger.error(f"Invalid YAML config file {yaml_file_path}")
+        logger.error(e.normalized_messages())
+        sys.exit()

     return yaml_dict
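A call-site sketch tying the pieces together (mirroring the yaml_config_load call in the ETL base class above; the final print is illustrative only):

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import load_yaml_dict_from_file

# Validates datasets.yml against DatasetsConfig; on a schema error the loader
# now logs the normalized messages and exits instead of raising mid-run.
datasets_config = load_yaml_dict_from_file(
    ExtractTransformLoad.DATASET_CONFIG / "datasets.yml",
    DatasetsConfig,
)
print([dataset["module_name"] for dataset in datasets_config["datasets"]])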
@@ -1,4 +1,5 @@
 {
+  "_comment": "Markdown Link Checker configuration, see https://github.com/gaurav-nelson/github-action-markdown-link-check and https://github.com/tcort/markdown-link-check",
   "ignorePatterns": [
     {
       "pattern": "^http://localhost"
package-lock.json (generated): 3415 changed lines. File diff suppressed because it is too large.
@@ -1,7 +0,0 @@
-{
-  "dependencies": {
-    "@turf/turf": "^6.5.0",
-    "@types/d3-ease": "^3.0.0",
-    "d3-ease": "^3.0.1"
-  }
-}