From 3f98206e6b4379eb1ed6469bb4eaf63f0bf1f969 Mon Sep 17 00:00:00 2001 From: lucasmbrown-usds Date: Fri, 9 Sep 2022 12:20:03 -0400 Subject: [PATCH] fixing tests --- data/data-pipeline/data_pipeline/etl/base.py | 18 ++++++----- .../data_pipeline/etl/score/constants.py | 5 ++-- .../data_pipeline/etl/score/etl_utils.py | 30 +++++++++++++------ .../etl/score/tests/test_etl_utils.py | 2 +- .../etl/sources/cdc_life_expectancy/etl.py | 4 +-- .../etl/sources/fsf_wildfire_risk/etl.py | 4 +-- .../etl/sources/nlcd_nature_deprived/etl.py | 1 + .../tests/sources/example/test_etl.py | 9 ++++-- 8 files changed, 46 insertions(+), 27 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 02f066e7..65580f9a 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -85,8 +85,12 @@ class ExtractTransformLoad: # NULL_REPRESENTATION is how nulls are represented on the input field NULL_REPRESENTATION: str = None - # Whether this ETL contains data for the nation (the US states) - NATION_EXPECTED_IN_DATA: bool = True + # Whether this ETL contains data for the continental nation (DC & the US states + # except for Alaska and Hawaii) + CONTINENTAL_US_EXPECTED_IN_DATA: bool = True + + # Whether this ETL contains data for Alaska and Hawaii + ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = True # Whether this ETL contains data for Puerto Rico PUERTO_RICO_EXPECTED_IN_DATA: bool = True @@ -223,8 +227,6 @@ class ExtractTransformLoad: """ # TODO: remove this once all ETL classes are converted to using the new # base class parameters and patterns. - # TODO: determine how to use this currently in the partially refactored world. 
- # https://github.com/usds/justice40-tool/issues/1891 if self.GEO_LEVEL is None: logger.info( "Skipping validation step for this class because it does not " @@ -308,15 +310,17 @@ class ExtractTransformLoad: ) # Check whether data contains expected states - states_in_output_df = list( + states_in_output_df = ( self.output_df[self.GEOID_TRACT_FIELD_NAME] - .astype(str) .str[0:2] .unique() + .tolist() ) + compare_to_list_of_expected_state_fips_codes( actual_state_fips_codes=states_in_output_df, - nation_expected=self.NATION_EXPECTED_IN_DATA, + continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA, + alaska_and_hawaii_expected=self.ALASKA_AND_HAWAII_EXPECTED_IN_DATA, puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES, diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 9e34b096..c112eec0 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -131,9 +131,9 @@ TILES_NATION_THRESHOLD_COUNT = 21 # 60: American Samoa, 66: Guam, 69: N. 
Mariana Islands, 78: US Virgin Islands TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"] TILES_PUERTO_RICO_FIPS_CODE = ["72"] -TILES_NATION_FIPS_CODE = [ +TILES_ALASKA_AND_HAWAII_FIPS_CODE = ["02", "15"] +TILES_CONTINENTAL_US_FIPS_CODE = [ "01", - "02", "04", "05", "06", @@ -143,7 +143,6 @@ TILES_NATION_FIPS_CODE = [ "11", "12", "13", - "15", "16", "17", "18", diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py index 15770ad3..13f2dc70 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py @@ -10,7 +10,8 @@ from data_pipeline.config import settings from data_pipeline.etl.score.constants import ( TILES_ISLAND_AREA_FIPS_CODES, TILES_PUERTO_RICO_FIPS_CODE, - TILES_NATION_FIPS_CODE, + TILES_CONTINENTAL_US_FIPS_CODE, + TILES_ALASKA_AND_HAWAII_FIPS_CODE, ) from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import ( @@ -317,7 +318,8 @@ def create_codebook( # pylint: disable=too-many-arguments def compare_to_list_of_expected_state_fips_codes( actual_state_fips_codes: typing.List[str], - nation_expected: bool = True, + continental_us_expected: bool = True, + alaska_and_hawaii_expected: bool = True, puerto_rico_expected: bool = True, island_areas_expected: bool = True, additional_fips_codes_not_expected: typing.List[str] = None, @@ -327,8 +329,10 @@ def compare_to_list_of_expected_state_fips_codes( Args: actual_state_fips_codes (List of str): Actual state codes observed in data - nation_expected (bool, optional): Do you expect the nation (DC & states) to be - represented in data? + continental_us_expected (bool, optional): Do you expect the continental nation + (DC & states except for Alaska and Hawaii) to be represented in data? + alaska_and_hawaii_expected (bool, optional): Do you expect Alaska and Hawaii + to be represented in the data? 
puerto_rico_expected (bool, optional): Do you expect PR to be represented in data? island_areas_expected (bool, optional): Do you expect Island Areas to be represented in data? @@ -354,11 +358,19 @@ def compare_to_list_of_expected_state_fips_codes( # Start with the list of all FIPS codes for all states and territories. expected_states_set = set(get_state_fips_codes(settings.DATA_PATH)) - # If nation (states and DC) are not expected to be included, remove it from the - # expected - # states set. - if not nation_expected: - expected_states_set = expected_states_set - set(TILES_NATION_FIPS_CODE) + # If continental US is not expected to be included, remove it from the + # expected states set. + if not continental_us_expected: + expected_states_set = expected_states_set - set( + TILES_CONTINENTAL_US_FIPS_CODE + ) + + # If Alaska and Hawaii are not expected to be included, remove them from the + # expected states set. + if not alaska_and_hawaii_expected: + expected_states_set = expected_states_set - set( + TILES_ALASKA_AND_HAWAII_FIPS_CODE + ) # If Puerto Rico is not expected to be included, remove it from the expected # states set. 
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py index 22e7df73..44a3157f 100644 --- a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py @@ -225,5 +225,7 @@ def test_compare_to_list_of_expected_state_fips_codes(): # Should not raise error because Nation is not to be missing compare_to_list_of_expected_state_fips_codes( - actual_state_fips_codes=fips_codes_test_4, nation_expected=False + actual_state_fips_codes=fips_codes_test_4, + continental_us_expected=False, + alaska_and_hawaii_expected=False, ) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 1f4f01da..d75ca85b 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -81,7 +81,7 @@ class CDCLifeExpectancy(ExtractTransformLoad): # Expect that PR, Island Areas, and Maine/Wisconsin are missing compare_to_list_of_expected_state_fips_codes( actual_state_fips_codes=states_in_life_expectancy_usa_file, - nation_expected=self.NATION_EXPECTED_IN_DATA, + continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA, puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE, @@ -117,7 +117,7 @@ class CDCLifeExpectancy(ExtractTransformLoad): # Expect that PR and Island Areas are the only things now missing compare_to_list_of_expected_state_fips_codes( actual_state_fips_codes=states_in_combined_df, - nation_expected=self.NATION_EXPECTED_IN_DATA, + continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA, puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, 
additional_fips_codes_not_expected=[], diff --git a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py index 5e9f6105..b623206c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/fsf_wildfire_risk/etl.py @@ -17,9 +17,7 @@ class WildfireRiskETL(ExtractTransformLoad): SOURCE_URL = settings.AWS_JUSTICE40_DATASOURCES_URL + "/fsf_fire.zip" GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False - - # Alaska and Hawaii are missing - EXPECTED_MISSING_STATES = ["02", "15"] + ALASKA_AND_HAWAII_EXPECTED_IN_DATA = False # Output score variables (values set on datasets.yml) for linting purposes COUNT_PROPERTIES: str diff --git a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py index a2d67147..651d7f68 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/nlcd_nature_deprived/etl.py @@ -20,6 +20,7 @@ class NatureDeprivedETL(ExtractTransformLoad): ) GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT PUERTO_RICO_EXPECTED_IN_DATA = False + ALASKA_AND_HAWAII_EXPECTED_IN_DATA = False # Alaska and Hawaii are missing EXPECTED_MISSING_STATES = ["02", "15"] diff --git a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py index 72b1c4c0..8855baad 100644 --- a/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py +++ b/data/data-pipeline/data_pipeline/tests/sources/example/test_etl.py @@ -11,7 +11,10 @@ import numpy as np import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel -from data_pipeline.etl.score.constants import TILES_NATION_FIPS_CODE +from data_pipeline.etl.score.constants import ( + 
TILES_CONTINENTAL_US_FIPS_CODE, + TILES_ALASKA_AND_HAWAII_FIPS_CODE, +) from data_pipeline.tests.sources.example.etl import ExampleETL from data_pipeline.utils import get_module_logger @@ -97,11 +100,13 @@ class TestETL: # Set values to match test fixtures etl_class.EXPECTED_MISSING_STATES = [ x - for x in TILES_NATION_FIPS_CODE + for x in TILES_CONTINENTAL_US_FIPS_CODE + + TILES_ALASKA_AND_HAWAII_FIPS_CODE if x not in states_expected_from_fixtures ] etl_class.PUERTO_RICO_EXPECTED_IN_DATA = False etl_class.ISLAND_AREAS_EXPECTED_IN_DATA = False + etl_class.ALASKA_AND_HAWAII_EXPECTED_IN_DATA = True return etl_class