NRI dataset and initial score YAML configuration (#1534)

* update be staging gha * NRI dataset and initial score YAML configuration * checkpoint * adding data checks for release branch * passing tests * adding INPUT_EXTRACTED_FILE_NAME to base class * lint * columns to keep and tests * update be staging gha * checkpoint * update be staging gha * NRI dataset and initial score YAML configuration * checkpoint * adding data checks for release branch * passing tests * adding INPUT_EXTRACTED_FILE_NAME to base class * lint * columns to keep and tests * checkpoint * PR Review * removing source url * tests * stop execution of ETL if there's a YAML schema issue * update be staging gha * adding source url as class var again * clean up * force cache bust * gha cache bust * dynamically set score vars from YAML * docstrings * removing last updated year - optional reverse percentile * passing tests * sort order * column ordering * PR review * class level vars * Updating DatasetsConfig * fix pylint errors * moving metadata hint back to code Co-authored-by: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
2025-09-30 02:43:18 -07:00 · 2022-08-09 16:37:10 -04:00 · 2022-08-09 16:37:10 -04:00 · 1c448a77f9
commit 1c448a77f9
parent 1833e3e794
15 changed files with 272 additions and 3485 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/config/init.py
+++ b/data/data-pipeline/data_pipeline/etl/score/config/init.py
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@ -0,0 +1,79 @@
+---
+datasets:
+  - long_name: "FEMA National Risk Index"
+    short_name: "nri"
+    module_name: national_risk_index
+    input_geoid_tract_field_name: "TRACTFIPS"
+    load_fields:
+      - short_name: "ex_loss"
+        df_field_name: "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME"
+        long_name: "FEMA Risk Index Expected Annual Loss Score"
+        field_type: float
+        number_of_decimals_in_output: 6
+
+      - short_name: "ex_pop_loss"
+        df_field_name: "EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME"
+        long_name: "Expected population loss rate (Natural Hazards Risk Index)"
+        description_short:
+          "Rate of fatalities and injuries resulting from natural hazards each year"
+        description_long:
+          "Rate relative to the population of fatalities and injuries due to fourteen
+          types of natural hazards each year that have some link to climate change:
+          avalanche, coastal flooding, cold wave, drought, hail, heat wave, hurricane,
+          ice storm, landslide, riverine flooding, strong wind, tornado, wildfire, and
+          winter weather. Population loss is defined as the Spatial Hazard Events and
+          Losses and National Centers for Environmental Information’s (NCEI) reported
+          number of fatalities and injuries caused by the hazard occurrence. To combine
+          fatalities and injuries for the computation of population loss value, an
+          injury is counted as one-tenth (1/10) of a fatality. The NCEI Storm Events
+          Database classifies injuries and fatalities as direct or indirect. Both direct
+          and indirect injuries and fatalities are counted as population loss. This
+          total number of injuries and fatalities is then divided by the population in
+          the census tract to get a per-capita rate of population risk."
+        field_type: float
+        number_of_decimals_in_output: 6
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
+
+      - short_name: "ex_ag_loss"
+        df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME"
+        long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)"
+        description_short:
+          "Economic loss rate to agricultural value resulting from natural hazards each
+          year"
+        description_long:
+          "Percent of agricultural value at risk from losses due to fourteen types of
+          natural hazards that have some link to climate change: avalanche, coastal
+          flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
+          landslide, riverine flooding, strong wind, tornado, wildfire, and winter
+          weather. Rate calculated by dividing the agricultural value at risk in a
+          census tract by the total agricultural value in that census tract."
+        field_type: float
+        number_of_decimals_in_output: 6
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
+
+      - short_name: "ex_bldg_loss"
+        df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME"
+        long_name: "Expected building loss rate (Natural Hazards Risk Index)"
+        description_short:
+          "Economic loss rate to building value resulting from natural hazards each year"
+        description_long:
+          "Percent of building value at risk from losses due to fourteen types of
+          natural hazards that have some link to climate change: avalanche, coastal
+          flooding, cold wave, drought, hail, heat wave, hurricane, ice storm,
+          landslide, riverine flooding, strong wind, tornado, wildfire, and winter
+          weather. Rate calculated by dividing the building value at risk in a census
+          tract by the total building value in that census tract."
+        field_type: float
+        number_of_decimals_in_output: 6
+        include_in_tiles: true
+        include_in_downloadable_files: true
+        create_percentile: true
+
+      - short_name: "has_ag_val"
+        df_field_name: "CONTAINS_AGRIVALUE"
+        long_name: "Contains agricultural value"
+        field_type: bool
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@ -480,6 +480,7 @@ class ScoreETL(ExtractTransformLoad):
            # for instance, 3rd grade reading level : Low 3rd grade reading level.
            # This low field will not exist yet, it is only calculated for the
            # percentile.
+            # TODO: This will come from the YAML dataset config
            ReversePercentile(
                field_name=field_names.READING_FIELD,
                low_field_name=field_names.LOW_READING_FIELD,
--- a/data/data-pipeline/data_pipeline/etl/score/schemas/init.py
+++ b/data/data-pipeline/data_pipeline/etl/score/schemas/init.py
--- a/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py
+++ b/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py
@ -0,0 +1,83 @@
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import List, Optional
+
+
+class FieldType(Enum):
+    """Types that a load field can declare through its `field_type` attribute."""
+
+    STRING = "string"
+    INT64 = "int64"
+    BOOL = "bool"
+    FLOAT = "float"
+    PERCENTAGE = "percentage"
+
+
+@dataclass
+class DatasetsConfig:
+    """Root of the dataset configuration schema.
+
+    Attributes:
+        datasets (List[Dataset]): The datasets described by the configuration.
+    """
+
+    @dataclass
+    class Dataset:
+        """A class that defines a dataset and its load variables.
+
+        Attributes:
+            long_name (str): A human readable title for the dataset.
+            short_name (str): Used to compose the short variable names for tiles/arcgis. All short variable names will be
+                prepended with the short name of the data set they come from, i.e. `nri__ex_loss`.
+            module_name (str): A string that matches both the Python module name for the dataset and the `NAME` property on the ETL class.
+            input_geoid_tract_field_name (str): Presumably the name of the field in the dataset's input that holds the
+                census tract identifier (e.g. `TRACTFIPS`) -- TODO confirm against the ETL that consumes it.
+            load_fields (List[LoadField]): A list of type LoadField that will drive the score ETL and side effects (tiles, downloadables).
+        """
+
+        @dataclass
+        class LoadField:
+            """A class to define the fields to be saved on the dataset's output.
+
+            These fields will be then imported by the score generation ETL.
+
+            Attributes:
+                short_name (str): Used in conjunction with the dataset's `short_name` for files where short names are needed.
+                df_field_name (str): Name for the field in the etl class.
+                long_name (str): Column name for the dataset's output csv.
+                field_type (FieldType): An enum that dictates what type of field this is.
+                description_short (Optional str): Description used if the field appears in the side panel.
+                description_long (Optional str): Description used if the field appears in the Methodology page.
+                number_of_decimals_in_output (Optional int): Used to represent number of decimals in side effects, like Excel. Defaults to 2 decimals.
+                include_in_tiles (Optional bool): Include this field on the tile export. Defaults to False.
+                include_in_downloadable_files (Optional bool): Include this field on the CSV and Excel exports. Defaults to False.
+                create_percentile (Optional bool): Whether or not the backend processing should create a percentile field (ranked in ascending order)
+                    from the values in this field. Defaults to False.
+                create_reverse_percentile (Optional bool): Whether or not the backend processing should create a "reverse percentile" field (ranked in
+                    descending order) from the values in this field. Defaults to False.
+                include_in_comparison_tool_as_index (Optional bool): Whether or not to include this field in the comparison tool
+                    as an index used as comparison (e.g., this field might be a state or national index that identifies priority communities).
+                    The field itself must be a boolean for the comparison tool to work appropriately. Defaults to False.
+                include_in_comparison_tool_as_statistical_descriptor (Optional bool): Whether or not to include this field in the comparison tool as a
+                    statistical descriptor of census tracts (e.g., this field might be income levels, life expectancy, etc). This will be
+                    used to generate reports that produce information such as, tracts identified by Index A but not Index B have higher
+                    income levels but lower life expectancy. Defaults to False.
+            """
+
+            # Required fields: none of these declare a default value.
+            short_name: str
+            df_field_name: str
+            long_name: str
+            # `field_type` will be used on the `etl_score_post` for the data manipulation.
+            # The `by_value` metadata prop will load the field type's Enum value instead
+            # of its name, i.e. "string" and not STRING.
+            field_type: FieldType = field(
+                metadata={"by_value": True}
+            )
+            # Optional fields, each with the default documented in the docstring above.
+            description_short: Optional[str] = None
+            description_long: Optional[str] = None
+            number_of_decimals_in_output: Optional[int] = 2
+            include_in_tiles: Optional[bool] = False
+            include_in_downloadable_files: Optional[bool] = False
+            create_percentile: Optional[bool] = False
+            create_reverse_percentile: Optional[bool] = False
+            include_in_comparison_tool_as_index: Optional[bool] = False
+            include_in_comparison_tool_as_statistical_descriptor: Optional[
+                bool
+            ] = False
+
+        long_name: str
+        short_name: str
+        module_name: str
+        input_geoid_tract_field_name: str
+        load_fields: List[LoadField]
+
+    datasets: List[Dataset]