From 3f98206e6b4379eb1ed6469bb4eaf63f0bf1f969 Mon Sep 17 00:00:00 2001
From: lucasmbrown-usds <lucas.m.brown@omb.eop.gov>
Date: Fri, 9 Sep 2022 12:20:03 -0400
Subject: [PATCH] fixing tests

---
 data/data-pipeline/data_pipeline/etl/base.py  | 18 ++++++-----
 .../data_pipeline/etl/score/constants.py      |  5 ++--
 .../data_pipeline/etl/score/etl_utils.py      | 30 +++++++++++++------
 .../etl/score/tests/test_etl_utils.py         |  2 +-
 .../etl/sources/cdc_life_expectancy/etl.py    |  4 +--
 .../etl/sources/fsf_wildfire_risk/etl.py      |  4 +--
 .../etl/sources/nlcd_nature_deprived/etl.py   |  1 +
 .../tests/sources/example/test_etl.py         |  9 ++++--
 8 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py
index 02f066e7..65580f9a 100644
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@@ -85,8 +85,12 @@ class ExtractTransformLoad:
     # NULL_REPRESENTATION is how nulls are represented on the input field
     NULL_REPRESENTATION: str = None
 
-    # Whether this ETL contains data for the nation (the US states)
-    NATION_EXPECTED_IN_DATA: bool = True
+    # Whether this ETL contains data for the continental US (DC & the US states
+    # except for Alaska and Hawaii)
+    CONTINENTAL_US_EXPECTED_IN_DATA: bool = True
+
+    # Whether this ETL contains data for Alaska and Hawaii
+    ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = True
 
     # Whether this ETL contains data for Puerto Rico
     PUERTO_RICO_EXPECTED_IN_DATA: bool = True
@@ -223,8 +227,6 @@ class ExtractTransformLoad:
         """
         # TODO: remove this once all ETL classes are converted to using the new
         #  base class parameters and patterns.
-        # TODO: determine how to use this currently in the partially refactored world.
-        #   https://github.com/usds/justice40-tool/issues/1891
         if self.GEO_LEVEL is None:
             logger.info(
                 "Skipping validation step for this class because it does not "
@@ -308,15 +310,17 @@ class ExtractTransformLoad:
                     )
 
         # Check whether data contains expected states
-        states_in_output_df = list(
+        states_in_output_df = (
             self.output_df[self.GEOID_TRACT_FIELD_NAME]
-            .astype(str)
             .str[0:2]
             .unique()
+            .tolist()
         )
+
         compare_to_list_of_expected_state_fips_codes(
             actual_state_fips_codes=states_in_output_df,
-            nation_expected=self.NATION_EXPECTED_IN_DATA,
+            continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA,
+            alaska_and_hawaii_expected=self.ALASKA_AND_HAWAII_EXPECTED_IN_DATA,
             puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
             island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
             additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py
index 9e34b096..c112eec0 100644
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@@ -131,9 +131,9 @@ TILES_NATION_THRESHOLD_COUNT = 21
 # 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
 TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
 TILES_PUERTO_RICO_FIPS_CODE = ["72"]
-TILES_NATION_FIPS_CODE = [
+TILES_ALASKA_AND_HAWAII_FIPS_CODE = ["02", "15"]
+TILES_CONTINENTAL_US_FIPS_CODE = [
     "01",
-    "02",
     "04",
     "05",
     "06",
@@ -143,7 +143,6 @@ TILES_NATION_FIPS_CODE = [
     "11",
     "12",
     "13",
-    "15",
     "16",
     "17",
     "18",
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
index 15770ad3..13f2dc70 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
@@ -10,7 +10,8 @@ from data_pipeline.config import settings
 from data_pipeline.etl.score.constants import (
     TILES_ISLAND_AREA_FIPS_CODES,
     TILES_PUERTO_RICO_FIPS_CODE,
-    TILES_NATION_FIPS_CODE,
+    TILES_CONTINENTAL_US_FIPS_CODE,
+    TILES_ALASKA_AND_HAWAII_FIPS_CODE,
 )
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import (
@@ -317,7 +318,8 @@ def create_codebook(
 # pylint: disable=too-many-arguments
 def compare_to_list_of_expected_state_fips_codes(
     actual_state_fips_codes: typing.List[str],
-    nation_expected: bool = True,
+    continental_us_expected: bool = True,
+    alaska_and_hawaii_expected: bool = True,
     puerto_rico_expected: bool = True,
     island_areas_expected: bool = True,
     additional_fips_codes_not_expected: typing.List[str] = None,
@@ -327,8 +329,10 @@ def compare_to_list_of_expected_state_fips_codes(
 
     Args:
         actual_state_fips_codes (List of str): Actual state codes observed in data
-        nation_expected (bool, optional): Do you expect the nation (DC & states) to be
-            represented in data?
+        continental_us_expected (bool, optional): Do you expect the continental US
+            (DC & states except for Alaska and Hawaii) to be represented in data?
+        alaska_and_hawaii_expected (bool, optional): Do you expect Alaska and Hawaii
+            to be represented in the data?
         puerto_rico_expected (bool, optional): Do you expect PR to be represented in data?
         island_areas_expected (bool, optional): Do you expect Island Areas to be represented in
             data?
@@ -354,11 +358,19 @@ def compare_to_list_of_expected_state_fips_codes(
     # Start with the list of all FIPS codes for all states and territories.
     expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))
 
-    # If nation (states and DC) are not expected to be included, remove it from the
-    # expected
-    # states set.
-    if not nation_expected:
-        expected_states_set = expected_states_set - set(TILES_NATION_FIPS_CODE)
+    # If continental US is not expected to be included, remove it from the
+    # expected states set.
+    if not continental_us_expected:
+        expected_states_set = expected_states_set - set(
+            TILES_CONTINENTAL_US_FIPS_CODE
+        )
+
+    # If Alaska and Hawaii are not expected to be included, remove them from the
+    # expected states set.
+    if not alaska_and_hawaii_expected:
+        expected_states_set = expected_states_set - set(
+            TILES_ALASKA_AND_HAWAII_FIPS_CODE
+        )
 
     # If Puerto Rico is not expected to be included, remove it from the expected
     # states set.
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
index 22e7df73..44a3157f 100644
--- a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
@@ -225,5 +225,5 @@ def test_compare_to_list_of_expected_state_fips_codes():
 
     # Should not raise error because Nation is not to be missing
     compare_to_list_of_expected_state_fips_codes(
-        actual_state_fips_codes=fips_codes_test_4, nation_expected=False
+        actual_state_fips_codes=fips_codes_test_4, continental_us_expected=False
     )
diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
index 1f4f01da..d75ca85b 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
@@ -81,7 +81,7 @@ class CDCLifeExpectancy(ExtractTransformLoad):
         # Expect that PR, Island Areas, and Maine/Wisconsin are missing
         compare_to_list_of_expected_state_fips_codes(
             actual_state_fips_codes=states_in_life_expectancy_usa_file,
-            nation_expected=self.NATION_EXPECTED_IN_DATA,
+            continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA,
             puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
             island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
             additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
@@ -117,7 +117,7 @@ class CDCLifeExpectancy(ExtractTransformLoad):
         # Expect that PR and Island Areas are the only things now missing
         compare_to_list_of_expected_state_fips_codes(
             actual_state_fips_codes=states_in_combined_df,
-            nation_expected=self.NATION_EXPECTED_IN_DATA,
+            continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA,
             puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
             island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
             additional_fips_codes_not_expected=[],
diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py
index 5e9f6105..b623206c 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py
@@ -17,9 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad):
     SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip"
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
-
-    # Alaska and Hawaii are missing
-    EXPECTED_MISSING_STATES = ["02", "15"]
+    ALASKA_AND_HAWAII_EXPECTED_IN_DATA = False
 
     # Output score variables (values set on datasets.yml) for linting purposes
     COUNT_PROPERTIES: str
diff --git a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py
index a2d67147..651d7f68 100644
--- a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py
@@ -20,6 +20,7 @@ class NatureDeprivedETL(ExtractTransformLoad):
     )
     GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
     PUERTO_RICO_EXPECTED_IN_DATA = False
+    ALASKA_AND_HAWAII_EXPECTED_IN_DATA = False
 
     # Alaska and Hawaii are missing
     EXPECTED_MISSING_STATES = ["02", "15"]
diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
index 72b1c4c0..8855baad 100644
--- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
+++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py
@@ -11,7 +11,10 @@ import numpy as np
 import pandas as pd
 
 from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
-from data_pipeline.etl.score.constants import TILES_NATION_FIPS_CODE
+from data_pipeline.etl.score.constants import (
+    TILES_CONTINENTAL_US_FIPS_CODE,
+    TILES_ALASKA_AND_HAWAII_FIPS_CODE,
+)
 from data_pipeline.tests.sources.example.etl import ExampleETL
 from data_pipeline.utils import get_module_logger
 
@@ -97,11 +100,13 @@ class TestETL:
         # Set values to match test fixtures
         etl_class.EXPECTED_MISSING_STATES = [
             x
-            for x in TILES_NATION_FIPS_CODE
+            for x in TILES_CONTINENTAL_US_FIPS_CODE
+            + TILES_ALASKA_AND_HAWAII_FIPS_CODE
             if x not in states_expected_from_fixtures
         ]
         etl_class.PUERTO_RICO_EXPECTED_IN_DATA = False
         etl_class.ISLAND_AREAS_EXPECTED_IN_DATA = False
+        etl_class.ALASKA_AND_HAWAII_EXPECTED_IN_DATA = True
 
         return etl_class