From e5b84dc28d3a5d5b67b43da400731d50bfdae1e8 Mon Sep 17 00:00:00 2001
From: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
Date: Mon, 27 Jun 2022 12:59:17 -0400
Subject: [PATCH] removing last updated year - optional reverse percentile

---
 data/data-pipeline/data_pipeline/etl/base.py  | 11 ++-----
 .../etl/score/config/datasets.yml             | 31 +++++++++----------
 .../data_pipeline/etl/score/etl_score.py      |  1 +
 .../etl/score/schemas/datasets.py             |  5 ++-
 .../tests/sources/example/test_etl.py         |  1 -
 .../sources/national_risk_index/test_etl.py   |  3 +-
 6 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py
index b1aed335..873aa74c 100644
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -123,7 +123,6 @@ class ExtractTransformLoad:
             sys.exit()
 
         # set some of the basic fields
-        cls.LAST_UPDATED_YEAR = dataset_config["last_updated_year"]
         cls.INPUT_GEOID_TRACT_FIELD_NAME = dataset_config[
             "input_geoid_tract_field_name"
         ]
@@ -150,16 +149,10 @@ class ExtractTransformLoad:
         if cls.NAME is None:
             raise NotImplementedError(
                 f"Child ETL class needs to specify `cls.NAME` (currently "
-                f"{cls.NAME}) and `cls.LAST_UPDATED_YEAR` (currently "
-                f"{cls.LAST_UPDATED_YEAR})."
+                f"{cls.NAME})."
             )
 
-        output_file_path = (
-            cls.DATA_PATH
-            / "dataset"
-            / f"{cls.NAME}_{cls.LAST_UPDATED_YEAR}"
-            / "usa.csv"
-        )
+        output_file_path = cls.DATA_PATH / "dataset" / f"{cls.NAME}" / "usa.csv"
         return output_file_path
 
     def get_tmp_path(self) -> pathlib.Path:
diff --git a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
index 3986ad34..8d18ae38 100644
--- a/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
+++ b/data/data-pipeline/data_pipeline/etl/score/config/datasets.yml
@@ -3,7 +3,6 @@ datasets:
   - long_name: "FEMA National Risk Index"
     short_name: "nri"
     module_name: national_risk_index
-    last_updated_year: 2020
     description: "Dataset from FEMA that identifies communities most at risk to 18 natural hazards."
     input_geoid_tract_field_name: "TRACTFIPS"
     load_fields:
@@ -11,34 +10,34 @@ datasets:
         df_field_name: "RISK_INDEX_EXPECTED_ANNUAL_LOSS_SCORE_FIELD_NAME"
         long_name: "FEMA Risk Index Expected Annual Loss Score"
         field_type: float
-        tile_include: true
-        csv_download: true
-        excel_download: true
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
       - short_name: "ex_pop_loss"
         df_field_name: "EXPECTED_POPULATION_LOSS_RATE_FIELD_NAME"
         long_name: "Expected population loss rate (Natural Hazards Risk Index)"
         field_type: float
-        tile_include: true
-        csv_download: true
-        excel_download: true
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
       - short_name: "ex_ag_loss"
         df_field_name: "EXPECTED_AGRICULTURE_LOSS_RATE_FIELD_NAME"
         long_name: "Expected agricultural loss rate (Natural Hazards Risk Index)"
         field_type: float
-        tile_include: true
-        csv_download: true
-        excel_download: true
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
       - short_name: "ex_bldg_loss"
         df_field_name: "EXPECTED_BUILDING_LOSS_RATE_FIELD_NAME"
         long_name: "Expected building loss rate (Natural Hazards Risk Index)"
         field_type: float
-        tile_include: true
-        csv_download: true
-        excel_download: true
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
       - short_name: "has_ag_val"
         df_field_name: "CONTAINS_AGRIVALUE"
         long_name: "Contains agricultural value"
         field_type: bool
-        tile_include: true
-        csv_download: true
-        excel_download: true
+        include_in_tiles: true
+        include_in_csv: true
+        include_in_excel: true
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score.py b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
index fbef9dc2..6d952d0d 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score.py
@@ -480,6 +480,7 @@ class ScoreETL(ExtractTransformLoad):
             # for instance, 3rd grade reading level : Low 3rd grade reading level.
             # This low field will not exist yet, it is only calculated for the
             # percentile.
+            # TODO: Build this list from the YAML dataset config (`reverse_percentile`) instead of hard-coding it here.
             ReversePercentile(
                 field_name=field_names.READING_FIELD,
                 low_field_name=field_names.LOW_READING_FIELD,
diff --git a/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py b/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py
index 0d1e6f84..369e8c15 100644
--- a/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py
+++ b/data/data-pipeline/data_pipeline/etl/score/schemas/datasets.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import List
+from typing import List, Optional
 
 
 class FieldType(Enum):
@@ -40,6 +40,8 @@ class DatasetsConfig:
                 field_type (FieldType): An enum that dictates what type of field this is. This will be used on the `etl_score_post`
                 for the data manipulation.
                 The `by_value` metadata prop will load the field type's Enum value instead of the index, i.e. "string" and not STRING
+                reverse_percentile (Optional bool): Optional flag that marks this field as a reverse-percentile field.
+                TODO: Use this flag in place of the hard-coded `ReversePercentile` entries in `ScoreETL` (data/data-pipeline/data_pipeline/etl/score/etl_score.py).
                 include_in_tiles (bool): Include this field on the tile export.
                 include_in_csv (bool): Include this field on the CSV export.
                 include_in_excel (bool): Include this field on the Excel export.
@@ -49,6 +51,7 @@ class DatasetsConfig:
             df_field_name: str
             long_name: str
             field_type: FieldType = field(metadata={"by_value": True})
+            reverse_percentile: Optional[bool]
             include_in_tiles: bool
             include_in_csv: bool
             include_in_excel: bool
diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
index 82ab807d..6bbf2672 100644
--- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
@@ -127,7 +127,6 @@ class TestETL:
 
         # Also make sure all parameters that need to be non-null are non-null
         assert etl.NAME is not None
-        assert etl.LAST_UPDATED_YEAR is not None
         assert etl.GEO_LEVEL is not None
         assert etl.COLUMNS_TO_KEEP is not None
         assert len(etl.COLUMNS_TO_KEEP) > 0
diff --git a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py
index 0798fcce..f428565f 100644
--- a/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/national_risk_index/test_etl.py
@@ -87,7 +87,6 @@ class TestNationalRiskIndexETL(TestETL):
         assert etl.GEOID_FIELD_NAME == "GEOID10"
         assert etl.GEOID_TRACT_FIELD_NAME == "GEOID10_TRACT"
         assert etl.NAME == "national_risk_index"
-        assert etl.LAST_UPDATED_YEAR == 2020
         assert etl.GEO_LEVEL == ValidGeoLevel.CENSUS_TRACT
         assert etl.COLUMNS_TO_KEEP == [
             etl.GEOID_TRACT_FIELD_NAME,
@@ -105,6 +104,6 @@ class TestNationalRiskIndexETL(TestETL):
 
         output_file_path = etl._get_output_file_path()
         expected_output_file_path = (
-            data_path / "dataset" / "national_risk_index_2020" / "usa.csv"
+            data_path / "dataset" / "national_risk_index" / "usa.csv"
         )
         assert output_file_path == expected_output_file_path