NRI dataset and initial score YAML configuration (#1534)

* update be staging gha

* NRI dataset and initial score YAML configuration

* checkpoint

* adding data checks for release branch

* passing tests

* adding INPUT_EXTRACTED_FILE_NAME to base class

* lint

* columns to keep and tests

* update be staging gha

* checkpoint

* checkpoint

* PR Review

* removing source url

* tests

* stop execution of ETL if there's a YAML schema issue

* update be staging gha

* adding source url as class var again

* clean up

* force cache bust

* gha cache bust

* dynamically set score vars from YAML

* docstrings

* removing last updated year - optional reverse percentile

* passing tests

* sort order

* column ordering

* PR review

* class level vars

* Updating DatasetsConfig

* fix pylint errors

* moving metadata hint back to code

Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Jorge Escobar 2022-08-09 16:37:10 -04:00 committed by GitHub
parent 1833e3e794
commit 1c448a77f9
15 changed files with 272 additions and 3485 deletions

View file

@@ -2,7 +2,9 @@
name: Data Checks
on:
pull_request:
branches: [main] # runs on any PR against main
branches:
- main
- "**/release/**"
paths:
- "data/**"
jobs:
@@ -16,7 +18,7 @@ jobs:
# checks all of the versions allowed in pyproject.toml
python-version: [3.8, 3.9]
steps:
# installs python
# installs Python
# one execution of the tests per version listed above
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}

View file

@@ -1,7 +1,9 @@
name: Deploy Backend Staging
on:
pull_request:
branches: [main]
branches:
- main
- "**/release/**"
paths:
- "data/**"
env:
@@ -60,7 +62,7 @@ jobs:
- name: Update PR with deployed Score URLs
uses: mshick/add-pr-comment@v1
with:
# Deploy to S3 for the staging URL
# Deploy to S3 for the Staging URL
message: |
** Score Deployed! **
Find it here:

View file

@@ -1,12 +1,15 @@
import enum
import pathlib
import sys
import typing
from typing import Optional
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import (
load_yaml_dict_from_file,
unzip_file_from_url,
remove_all_from_dir,
get_module_logger,
@@ -30,6 +33,9 @@ class ExtractTransformLoad:
Attributes:
DATA_PATH (pathlib.Path): Local path where all data will be stored
TMP_PATH (pathlib.Path): Local path where temporary data will be stored
TODO: Fill missing attrs here
GEOID_FIELD_NAME (str): The common column name for a Census Block Group identifier
GEOID_TRACT_FIELD_NAME (str): The common column name for a Census Tract identifier
"""
@@ -40,6 +46,7 @@
DATA_PATH: pathlib.Path = APP_ROOT / "data"
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
DATASET_CONFIG: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
# Parameters
GEOID_FIELD_NAME: str = "GEOID10"
@@ -55,6 +62,9 @@
# SOURCE_URL is used to extract source data in extract().
SOURCE_URL: str = None
# INPUT_EXTRACTED_FILE_NAME is the name of the file after extract().
INPUT_EXTRACTED_FILE_NAME: str = None
# GEO_LEVEL is used to identify whether output data is at the unit of the tract or
# census block group.
# TODO: add tests that enforce seeing the expected geographic identifier field
@@ -64,6 +74,13 @@
# COLUMNS_TO_KEEP is used to identify which columns to keep in the output df.
COLUMNS_TO_KEEP: typing.List[str] = None
# INPUT_GEOID_TRACT_FIELD_NAME is the field name that identifies the Census Tract ID
# on the input file
INPUT_GEOID_TRACT_FIELD_NAME: str = None
# NULL_REPRESENTATION is how nulls are represented on the input field
NULL_REPRESENTATION: str = None
# Thirteen digits in a census block group ID.
EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@@ -77,8 +94,53 @@
# periods. https://github.com/usds/justice40-tool/issues/964
EXPECTED_MAX_CENSUS_TRACTS: int = 74160
# We use output_df as the final dataframe to use to write to the CSV
# It is used on the "load" base class method
output_df: pd.DataFrame = None
@classmethod
def yaml_config_load(cls) -> dict:
"""Generate config dictionary and set instance variables from YAML dataset."""
# check if the class instance has score YAML definitions
datasets_config = load_yaml_dict_from_file(
cls.DATASET_CONFIG / "datasets.yml",
DatasetsConfig,
)
# get the config for this dataset
try:
dataset_config = next(
item
for item in datasets_config.get("datasets")
if item["module_name"] == cls.NAME
)
except StopIteration:
# Note: it'd be nice to log the name of the dataframe, but that's not accessible in this scope.
logger.error(
f"Exception encountered while extracting dataset config for dataset {cls.NAME}"
)
sys.exit()
# set some of the basic fields
cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
"input_geoid_tract_field_name"
]
# get the columns to write on the CSV
# and set the constants
cls.COLUMNS_TO_KEEP = [
cls.GEOID_TRACT_FIELD_NAME, # always index with geoid tract id
]
for field in dataset_config["load_fields"]:
cls.COLUMNS_TO_KEEP.append(field["long_name"])
# set the constants for the class
setattr(cls, field["df_field_name"], field["long_name"])
# return the config dict
return dataset_config
# This is a classmethod so it can be used by `get_data_frame` without
# needing to create an instance of the class. This is a use case in `etl_score`.
@classmethod
@@ -87,16 +149,10 @@
if cls.NAME is None:
raise NotImplementedError(
f"Child ETL class needs to specify `cls.NAME` (currently "
f"{cls.NAME}) and `cls.LAST_UPDATED_YEAR` (currently "
f"{cls.LAST_UPDATED_YEAR})."
f"{cls.NAME})."
)
output_file_path = (
cls.DATA_PATH
/ "dataset"
/ f"{cls.NAME}_{cls.LAST_UPDATED_YEAR}"
/ "usa.csv"
)
output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
return output_file_path
def get_tmp_path(self) -> pathlib.Path:
@@ -229,8 +285,7 @@
Data is written in the specified local data folder or remote AWS S3 bucket.
Uses the directory from `self.OUTPUT_DIR` and the file name from
`self._get_output_file_path`.
Uses the directory and the file name from `self._get_output_file_path`.
"""
logger.info(f"Saving `{self.NAME}` CSV")

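For orientation, here is a minimal sketch of how a child ETL class might opt into the new YAML-driven configuration added above. `ExampleETL`, its `NAME`, `SOURCE_URL`, and `EXAMPLE_FIELD_NAME` are hypothetical; a real class would need a matching `module_name` entry in datasets.yml, and the import paths follow this diff.

import pandas as pd

from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel


class ExampleETL(ExtractTransformLoad):
    # NAME must match a `module_name` entry in datasets.yml,
    # otherwise yaml_config_load() logs an error and exits.
    NAME = "example_dataset"
    GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
    SOURCE_URL = "https://example.com/example_dataset.zip"  # hypothetical

    # Declared only so linters know the attribute exists;
    # yaml_config_load() assigns the YAML long_name string to it.
    EXAMPLE_FIELD_NAME: str

    def __init__(self):
        # Reads and validates datasets.yml, sets INPUT_GEOID_TRACT_FIELD_NAME,
        # builds COLUMNS_TO_KEEP, assigns the df_field_name attributes, and
        # returns the raw config dict for this dataset.
        self.DATASET_CONFIG = super().yaml_config_load()
        self.df: pd.DataFrame
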
View file

@@ -0,0 +1,79 @@
---
datasets:
- long_name: "FEMA National Risk Index"
short_name: "nri"
module_name: national_risk_index
input_geoid_tract_field_name: "TRACTFIPS"
load_fields:
- short_name: "ex_loss"
df_field_name: "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME"
long_name: "FEMA Risk Index Expected Annual Loss Score"
field_type: float
number_of_decimals_in_output: 6
- short_name: "ex_pop_loss"
df_field_name: "EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME"
long_name: "Expected population loss rate (Natural Hazards Risk Index)"
description_short:
"Rate of fatalities and injuries resulting from natural hazards each year"
description_long:
"Rate relative to the population of fatalities and injuries due to fourteen
types of natural hazards each year that have some link to climate change:
avalanche, coastal flooding, cold wave, drought, hail, heat wave, hurricane,
ice storm, landslide, riverine flooding, strong wind, tornado, wildfire, and
winter weather. Population loss is defined as the Spatial Hazard Events and
Losses and National Centers for Environmental Information (NCEI) reported
number of fatalities and injuries caused by the hazard occurrence. To combine
fatalities and injuries for the computation of population loss value, an
injury is counted as one-tenth (1/10) of a fatality. The NCEI Storm Events
Database classifies injuries and fatalities as direct or indirect. Both direct
and indirect injuries and fatalities are counted as population loss. This
total number of injuries and fatalities is then divided by the population in
the census tract to get a per-capita rate of population risk."
field_type: float
number_of_decimals_in_output: 6
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: true
- short_name: "ex_ag_loss"
df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME"
long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)"
description_short:
"Economic loss rate to agricultural value resulting from natural hazards each
year"
description_long:
"Percent of agricultural value at risk from losses due to fourteen types of
natural hazards that have some link to climate change: avalanche, coastal
flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
landslide, riverine flooding, strong wind, tornado, wildfire, and winter
weather. Rate calculated by dividing the agricultural value at risk in a
census tract by the total agricultural value in that census tract."
field_type: float
number_of_decimals_in_output: 6
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: true
- short_name: "ex_bldg_loss"
df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME"
long_name: "Expected building loss rate (Natural Hazards Risk Index)"
description_short:
"Economic loss rate to building value resulting from natural hazards each year"
description_long:
"Percent of building value at risk from losses due to fourteen types of
natural hazards that have some link to climate change: avalanche, coastal
flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
landslide, riverine flooding, strong wind, tornado, wildfire, and winter
weather. Rate calculated by dividing the building value at risk in a census
tract by the total building value in that census tract."
field_type: float
number_of_decimals_in_output: 6
include_in_tiles: true
include_in_downloadable_files: true
create_percentile: true
- short_name: "has_ag_val"
df_field_name: "CONTAINS_AGRIVALUE"
long_name: "Contains agricultural value"
field_type: bool

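To make the mapping concrete, here is a rough illustration (not part of this commit) of what yaml_config_load() derives from the national_risk_index entry above; the module path is assumed from the repository layout.

from data_pipeline.etl.sources.national_risk_index.etl import NationalRiskIndexETL

etl = NationalRiskIndexETL()

# Each load_fields df_field_name becomes a class attribute holding its long_name.
assert etl.CONTAINS_AGRIVALUE == "Contains agricultural value"

# COLUMNS_TO_KEEP is the tract ID column plus every long_name, in YAML order, e.g.
# ["GEOID10_TRACT", "FEMA Risk Index Expected Annual Loss Score", ...]
print(etl.COLUMNS_TO_KEEP)
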
View file

@@ -480,6 +480,7 @@ class ScoreETL(ExtractTransformLoad):
# for instance, 3rd grade reading level : Low 3rd grade reading level.
# This low field will not exist yet, it is only calculated for the
# percentile.
# TODO: This will come from the YAML dataset config
ReversePercentile(
field_name=field_names.READING_FIELD,
low_field_name=field_names.LOW_READING_FIELD,

View file

@@ -0,0 +1,83 @@
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional
class FieldType(Enum):
STRING = "string"
INT64 = "int64"
BOOL = "bool"
FLOAT = "float"
PERCENTAGE = "percentage"
@dataclass
class DatasetsConfig:
@dataclass
class Dataset:
"""A class that defines a dataset and its load variables.
Attributes:
long_name (str): A human readable title for the dataset.
short_name (str): Used to compose the short variable names for tiles/arcgis. All short variable names will be prepended
with the short name of the dataset it comes from, e.g. `nri__ex_loss`.
module_name (str): A string that matches both the Python module name for the dataset and the `NAME` property on the ETL class.
load_fields (LoadField): A list of type LoadField that will drive the score ETL and side effects (tiles, downloadables).
"""
@dataclass
class LoadField:
"""A class to define the fields to be saved on the dataset's output.
These fields will be then imported by the score generation ETL.
Attributes:
short_name (str): Used in conjunction with the dataset's `short_name` for files where short names are needed.
df_field_name (str): Name for the field in the etl class.
long_name (str): Column name for the dataset's output csv.
field_type (FieldType): An enum that dictates what type of field this is.
description_short (Optional str): Description used if the field appears in the side panel.
description_long (Optional str): Description used if the field appears in the Methodology page.
number_of_decimals_in_output (Optional int): Used to represent number of decimals in side effects, like Excel. Defaults to 2 decimals.
include_in_tiles (Optional bool): Include this field on the tile export. Defaults to False.
include_in_downloadable_files (Optional bool): Include this field on the CSV and Excel exports. Defaults to False.
create_percentile (Optional bool): Whether or not the backend processing should create a percentile field (ranked in ascending order)
from the values in this field. Defaults to False.
create_reverse_percentile (Optional bool): Whether or not the backend processing should create a "reverse percentile" field (ranked in
descending order) from the values in this field. Defaults to False.
include_in_comparison_tool_as_index (Optional bool): Whether or not to include this field in the comparison tool
as an index used as comparison (e.g., this field might be a state or national index that identifies priority communities).
The field itself must be a boolean for the comparison tool to work appropriately. Defaults to False.
include_in_comparison_tool_as_statistical_descriptor (Optional bool): Whether or not to include this field in the comparison tool as a
statistical descriptor of census tracts (e.g., this field might be income levels, life expectancy, etc.). This will be
used to generate reports that produce information such as, tracts identified by Index A but not Index B have higher
income levels but lower life expectancy. Defaults to False.
"""
short_name: str
df_field_name: str
long_name: str
field_type: FieldType = field(
metadata={"by_value": True}
) # This will be used on the `etl_score_post` for the
# data manipulation. The `by_value` metadata prop will load the field type's Enum value instead of the index, i.e. "string"
# and not STRING
description_short: Optional[str] = None
description_long: Optional[str] = None
number_of_decimals_in_output: Optional[int] = 2
include_in_tiles: Optional[bool] = False
include_in_downloadable_files: Optional[bool] = False
create_percentile: Optional[bool] = False
create_reverse_percentile: Optional[bool] = False
include_in_comparison_tool_as_index: Optional[bool] = False
include_in_comparison_tool_as_statistical_descriptor: Optional[
bool
] = False
long_name: str
short_name: str
module_name: str
input_geoid_tract_field_name: str
load_fields: List[LoadField]
datasets: List[Dataset]

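As a rough sketch of how this schema is exercised, mirroring load_yaml_dict_from_file() in utils.py: class_schema comes from the marshmallow_dataclass package used there, and the YAML path below is an assumption (relative to the data_pipeline package root).

import yaml
from marshmallow import ValidationError
from marshmallow_dataclass import class_schema

from data_pipeline.etl.score.schemas.datasets import DatasetsConfig

# Path relative to the data_pipeline package root (assumed).
with open("etl/score/config/datasets.yml", encoding="utf-8") as yaml_file:
    yaml_dict = yaml.safe_load(yaml_file)

try:
    # Raises ValidationError if, say, module_name is missing or field_type
    # is not one of the FieldType enum values.
    class_schema(DatasetsConfig)().load(yaml_dict)
except ValidationError as error:
    print(error.normalized_messages())
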
View file

@@ -15,10 +15,16 @@ class NationalRiskIndexETL(ExtractTransformLoad):
"""ETL class for the FEMA National Risk Index dataset"""
NAME = "national_risk_index"
LAST_UPDATED_YEAR = 2020
SOURCE_URL = "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
# Output score variables (values set on datasets.yml) for linting purposes
RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME: str
EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME: str
EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME: str
EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME: str
CONTAINS_AGRIVALUE: str
## TEMPORARILY HERE
## To get this value up in time for launch, we've hard coded it. We would like
## to, in the future, have this pull the 10th percentile (or nth percentile)
@@ -27,54 +33,34 @@ class NationalRiskIndexETL(ExtractTransformLoad):
AGRIVALUE_LOWER_BOUND = 408000
def __init__(self):
# load YAML config
self.DATASET_CONFIG = super().yaml_config_load()
# define the full path for the input CSV file
self.INPUT_CSV = self.get_tmp_path() / "NRI_Table_CensusTracts.csv"
# this is the main dataframe
self.df: pd.DataFrame
# Start dataset-specific vars here
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME = (
"EAL_SCORE"
)
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME = (
"FEMA Risk Index Expected Annual Loss Score"
)
self.EXPECTED_ANNUAL_LOSS_BUILDING_VALUE_INPUT_FIELD_NAME = "EAL_VALB"
self.EXPECTED_ANNUAL_LOSS_AGRICULTURAL_VALUE_INPUT_FIELD_NAME = (
"EAL_VALA"
)
self.EXPECTED_ANNUAL_LOSS_POPULATION_VALUE_INPUT_FIELD_NAME = "EAL_VALP"
self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME = "AGRIVALUE"
self.POPULATION_INPUT_FIELD_NAME = "POPULATION"
self.BUILDING_VALUE_INPUT_FIELD_NAME = "BUILDVALUE"
self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME = (
"Expected building loss rate (Natural Hazards Risk Index)"
)
self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME = (
"Expected agricultural loss rate (Natural Hazards Risk Index)"
)
self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME = (
"Expected population loss rate (Natural Hazards Risk Index)"
)
self.CONTAINS_AGRIVALUE = "Contains agricultural value"
self.COLUMNS_TO_KEEP = [
self.GEOID_TRACT_FIELD_NAME,
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
self.EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME,
self.EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME,
self.EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME,
self.CONTAINS_AGRIVALUE,
]
self.df: pd.DataFrame
def extract(self) -> None:
"""Unzips NRI dataset from the FEMA data source and writes the files
to the temporary data folder for use in the transform() method
"""
logger.info("Downloading 405MB National Risk Index Data")
super().extract(
source_url=self.SOURCE_URL,
extract_path=self.get_tmp_path(),
@@ -90,19 +76,18 @@ class NationalRiskIndexETL(ExtractTransformLoad):
"""
logger.info("Transforming National Risk Index Data")
NRI_TRACT_COL = "TRACTFIPS" # Census Tract Column in NRI data
# read in the unzipped csv from NRI data source then rename the
# Census Tract column for merging
df_nri: pd.DataFrame = pd.read_csv(
self.INPUT_CSV,
dtype={NRI_TRACT_COL: "string"},
dtype={self.INPUT_GEOID_TRACT_FIELD_NAME: "string"},
na_values=["None"],
low_memory=False,
)
df_nri.rename(
columns={
NRI_TRACT_COL: self.GEOID_TRACT_FIELD_NAME,
self.INPUT_GEOID_TRACT_FIELD_NAME: self.GEOID_TRACT_FIELD_NAME,
self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_INPUT_FIELD_NAME: self.RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME,
},
inplace=True,
@@ -170,6 +155,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
].clip(
lower=self.AGRIVALUE_LOWER_BOUND
)
# This produces a boolean that is True in the case of non-zero agricultural value
df_nri[self.CONTAINS_AGRIVALUE] = (
df_nri[self.AGRICULTURAL_VALUE_INPUT_FIELD_NAME] > 0
@@ -185,6 +171,7 @@ class NationalRiskIndexETL(ExtractTransformLoad):
# Note: `round` is smart enough to only apply to float columns.
df_nri = df_nri.round(10)
# Assign the final df to the class' output_df for the load method
self.output_df = df_nri
def load(self) -> None:

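A hedged sketch of a full run with the refactored class; the repository's ETL runner may invoke these steps differently.

from data_pipeline.etl.sources.national_risk_index.etl import NationalRiskIndexETL

etl = NationalRiskIndexETL()  # __init__ loads and applies datasets.yml
etl.extract()    # downloads and unzips the FEMA NRI census tract table
etl.transform()  # renames/derives columns and sets self.output_df
etl.load()       # writes COLUMNS_TO_KEEP to data/dataset/national_risk_index/usa.csv
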
View file

@@ -119,6 +119,7 @@ class TestETL:
"""
# Setup
etl = self._get_instance_of_etl_class()
etl.__init__()
data_path, tmp_path = mock_paths
assert etl.DATA_PATH == data_path
@@ -126,8 +127,6 @@
# Also make sure all parameters that need to be non-null are non-null
assert etl.NAME is not None
assert etl.LAST_UPDATED_YEAR is not None
assert etl.SOURCE_URL is not None
assert etl.GEO_LEVEL is not None
assert etl.COLUMNS_TO_KEEP is not None
assert len(etl.COLUMNS_TO_KEEP) > 0
@@ -148,14 +147,10 @@
etl = self._get_instance_of_etl_class()
data_path, tmp_path = mock_paths
etl.__init__()
actual_file_path = etl._get_output_file_path()
expected_file_path = (
data_path
/ "dataset"
/ f"{etl.NAME}_{etl.LAST_UPDATED_YEAR}"
/ "usa.csv"
)
expected_file_path = data_path / "dataset" / etl.NAME / "usa.csv"
logger.info(f"Expected: {expected_file_path}")
@@ -255,6 +250,7 @@ class TestETL:
etl = self._setup_etl_instance_and_run_extract(
mock_etl=mock_etl, mock_paths=mock_paths
)
etl.__init__()
etl.transform()
assert etl.output_df is not None
@@ -272,6 +268,7 @@ class TestETL:
"""
# setup - input variables
etl = self._get_instance_of_etl_class()
etl.__init__()
# setup - mock transform step
df_transform = pd.read_csv(

View file

@@ -87,11 +87,6 @@ class TestNationalRiskIndexETL(TestETL):
assert etl.GEOID_FIELD_NAME == "GEOID10"
assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
assert etl.NAME == "national_risk_index"
assert etl.LAST_UPDATED_YEAR == 2020
assert (
etl.SOURCE_URL
== "https://hazards.fema.gov/nri/Content/StaticDocuments/DataDownload//NRI_Table_CensusTracts/NRI_Table_CensusTracts.zip"
)
assert etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT
assert etl.COLUMNS_TO_KEEP == [
etl.GEOID_TRACT_FIELD_NAME,
@@ -109,6 +104,6 @@ class TestNationalRiskIndexETL(TestETL):
output_file_path = etl._get_output_file_path()
expected_output_file_path = (
data_path / "dataset" / "national_risk_index_2020" / "usa.csv"
data_path / "dataset" / "national_risk_index" / "usa.csv"
)
assert output_file_path == expected_output_file_path

View file

@@ -8,6 +8,7 @@ import shutil
import uuid
import zipfile
from pathlib import Path
from marshmallow import ValidationError
import urllib3
import requests
import yaml
@@ -350,7 +351,13 @@ def load_yaml_dict_from_file(
# validate YAML
yaml_config_schema = class_schema(schema_class)
yaml_config_schema().load(yaml_dict)
try:
yaml_config_schema().load(yaml_dict)
except ValidationError as e:
logger.error(f"Invalid YAML config file {yaml_file_path}")
logger.error(e.normalized_messages())
sys.exit()
return yaml_dict

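For reference, the calling convention this change assumes, as used by ExtractTransformLoad.yaml_config_load elsewhere in this commit, looks roughly like this; a schema failure now logs the marshmallow messages and exits rather than raising.

from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import load_yaml_dict_from_file

datasets_config = load_yaml_dict_from_file(
    ExtractTransformLoad.DATASET_CONFIG / "datasets.yml",
    DatasetsConfig,
)  # logs and sys.exit()s if validation fails; otherwise returns the parsed dict
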
View file

@@ -1,4 +1,5 @@
{
"_comment": "Markdown Link Checker configuration, see https://github.com/gaurav-nelson/github-action-markdown-link-check and https://github.com/tcort/markdown-link-check",
"ignorePatterns": [
{
"pattern": "^http://localhost"

package-lock.json (generated, 3415 changed lines)

File diff suppressed because it is too large

View file

@@ -1,7 +0,0 @@
{
"dependencies": {
"@turf/turf": "^6.5.0",
"@types/d3-ease": "^3.0.0",
"d3-ease": "^3.0.1"
}
}