Issue 1831: missing life expectancy data from Maine and Wisconsin (#1887)

* Fixing missing states and adding tests for states to all classes
This commit is contained in:
Lucas Merrill Brown 2022-09-09 20:35:01 -04:00 committed by GitHub
commit 6e9c44ea72
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
21 changed files with 522 additions and 187 deletions

View file

@ -131,6 +131,58 @@ TILES_NATION_THRESHOLD_COUNT = 21
# Two-digit state/territory FIPS codes, grouped by geography.
# Island Areas -- 60: American Samoa, 66: Guam, 69: N. Mariana Islands,
# 78: US Virgin Islands
TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
TILES_PUERTO_RICO_FIPS_CODE = ["72"]
# 02: Alaska, 15: Hawaii
TILES_ALASKA_AND_HAWAII_FIPS_CODE = ["02", "15"]
# The 49 remaining "nation" codes: DC ("11") and every state except Alaska
# and Hawaii. Rows are grouped by tens digit so gaps in the numbering are
# easy to spot.
# fmt: off
TILES_CONTINENTAL_US_FIPS_CODE = [
    "01", "04", "05", "06", "08", "09",
    "10", "11", "12", "13", "16", "17", "18", "19",
    "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
    "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
    "40", "41", "42", "44", "45", "46", "47", "48", "49",
    "50", "51", "53", "54", "55", "56",
]
# fmt: on
# Constant to reflect UI Experience version
# "Nation" referring to 50 states and DC is from Census
@ -399,5 +451,5 @@ TILES_SCORE_FLOAT_COLUMNS = [
# that use null to signify missing information in a boolean field.
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
field_names.AML_BOOLEAN,
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
]

View file

@ -1,11 +1,19 @@
import os
import sys
import typing
from pathlib import Path
from collections import namedtuple
import numpy as np
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.score.constants import (
TILES_ISLAND_AREA_FIPS_CODES,
TILES_PUERTO_RICO_FIPS_CODE,
TILES_CONTINENTAL_US_FIPS_CODE,
TILES_ALASKA_AND_HAWAII_FIPS_CODE,
)
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import (
download_file_from_url,
get_module_logger,
@ -305,3 +313,106 @@ def create_codebook(
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
)
# pylint: disable=too-many-arguments
def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: typing.List[str],
    continental_us_expected: bool = True,
    alaska_and_hawaii_expected: bool = True,
    puerto_rico_expected: bool = True,
    island_areas_expected: bool = True,
    additional_fips_codes_not_expected: typing.Optional[typing.List[str]] = None,
    dataset_name: typing.Optional[str] = None,
) -> None:
    """Check whether a list of state/territory FIPS codes matches expectations.

    The expected set starts as every code returned by
    `get_state_fips_codes(settings.DATA_PATH)`. Each geographic group whose
    `*_expected` flag is False is then removed, followed by any codes listed in
    `additional_fips_codes_not_expected`. The result must equal the set of
    observed codes exactly: both missing and unexpected codes are errors.

    Args:
        actual_state_fips_codes (List of str): Actual state codes observed in
            the data. Duplicates are ignored.
        continental_us_expected (bool, optional): Do you expect the continental
            nation (DC & states except for Alaska and Hawaii) to be represented
            in the data?
        alaska_and_hawaii_expected (bool, optional): Do you expect Alaska and
            Hawaii to be represented in the data? Note: if only one of the two
            is expected to be missing, leave this as True and list that one
            state in `additional_fips_codes_not_expected` instead.
        puerto_rico_expected (bool, optional): Do you expect PR to be
            represented in the data?
        island_areas_expected (bool, optional): Do you expect Island Areas to
            be represented in the data?
        additional_fips_codes_not_expected (List of str, optional): Additional
            state codes not expected in the data. For example, the data may be
            known to be missing data from Maine and Wisconsin.
        dataset_name (str, optional): The name of the data set, used only in
            the error and log messages. (This is helpful for debugging during
            parallel etl runs.)

    Returns:
        None: Does not return any values.

    Raises:
        ValueError: if the observed codes do not match the expected codes.
    """
    # Default to an empty list here (not in the signature) to avoid a shared
    # mutable default argument.
    if additional_fips_codes_not_expected is None:
        additional_fips_codes_not_expected = []

    actual_codes = set(actual_state_fips_codes)

    # Start with the FIPS codes of all states and territories, then remove
    # every group the caller does not expect to see.
    expected_codes = set(get_state_fips_codes(settings.DATA_PATH))

    optional_groups = (
        (continental_us_expected, TILES_CONTINENTAL_US_FIPS_CODE),
        (alaska_and_hawaii_expected, TILES_ALASKA_AND_HAWAII_FIPS_CODE),
        (puerto_rico_expected, TILES_PUERTO_RICO_FIPS_CODE),
        (island_areas_expected, TILES_ISLAND_AREA_FIPS_CODES),
    )
    for group_expected, group_codes in optional_groups:
        if not group_expected:
            expected_codes -= set(group_codes)

    expected_codes -= set(additional_fips_codes_not_expected)

    dataset_name_phrase = (
        f" for dataset `{dataset_name}`" if dataset_name is not None else ""
    )

    if expected_codes != actual_codes:
        # Report both directions of the mismatch, sorted so the message is
        # deterministic and easy to compare.
        raise ValueError(
            f"The states and territories in the data{dataset_name_phrase} are not "
            f"as expected.\n"
            "FIPS state codes expected that are not present in the data:\n"
            f"{sorted(expected_codes - actual_codes)}\n"
            "FIPS state codes in the data that were not expected:\n"
            f"{sorted(actual_codes - expected_codes)}\n"
        )

    logger.info(
        "Data matches expected state and territory representation"
        f"{dataset_name_phrase}."
    )

View file

@ -2,7 +2,10 @@ import pandas as pd
import numpy as np
import pytest
from data_pipeline.etl.score.etl_utils import floor_series
from data_pipeline.etl.score.etl_utils import (
floor_series,
compare_to_list_of_expected_state_fips_codes,
)
def test_floor_series():
@ -70,3 +73,159 @@ def test_floor_series():
match="Argument series must be of type pandas series, not of type list.",
):
floor_series(invalid_type, number_of_decimals=3)
def test_compare_to_list_of_expected_state_fips_codes():
    """Check the FIPS validator on complete, partial and territory-only data."""
    # Every state, DC and territory code.
    # fmt: off
    every_fips_code = [
        "01", "02", "04", "05", "06", "08", "09",
        "10", "11", "12", "13", "15", "16", "17", "18", "19",
        "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
        "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
        "40", "41", "42", "44", "45", "46", "47", "48", "49",
        "50", "51", "53", "54", "55", "56",
        "60", "66", "69", "72", "78",
    ]
    # fmt: on

    # A complete list passes under the default expectations.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=every_fips_code
    )

    # Puerto Rico is in the data but flagged as not expected -> error.
    with pytest.raises(ValueError) as puerto_rico_error:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=every_fips_code,
            puerto_rico_expected=False,
        )
    assert (
        "FIPS state codes in the data that were not expected:\n['72']\n"
        in str(puerto_rico_error.value)
    )

    # Island Areas are in the data but flagged as not expected -> error.
    with pytest.raises(ValueError) as island_areas_error:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=every_fips_code,
            island_areas_expected=False,
        )
    assert (
        "FIPS state codes in the data that were not expected:\n"
        "['60', '66', '69', '78']\n"
    ) in str(island_areas_error.value)

    # Drop Guam and Puerto Rico while still expecting every territory -> error.
    without_guam_and_pr = [
        code for code in every_fips_code if code not in ("66", "72")
    ]
    with pytest.raises(ValueError) as territories_error:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=without_guam_and_pr,
        )
    assert (
        "FIPS state codes expected that are not present in the data:\n"
        "['66', '72']\n"
    ) in str(territories_error.value)

    # Drop Maine and Wisconsin while still expecting them -> error.
    without_maine_and_wisconsin = [
        code for code in every_fips_code if code not in ("23", "55")
    ]
    with pytest.raises(ValueError) as states_error:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=without_maine_and_wisconsin,
        )
    assert (
        "FIPS state codes expected that are not present in the data:\n"
        "['23', '55']\n"
    ) in str(states_error.value)

    # The same data passes once Maine and Wisconsin are declared as expected
    # to be missing.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=without_maine_and_wisconsin,
        additional_fips_codes_not_expected=["23", "55"],
    )

    # Territories only: the continental US, Alaska and Hawaii are all absent.
    territories_only = ["60", "66", "69", "72", "78"]

    # With default expectations the whole nation is reported as missing.
    with pytest.raises(ValueError) as nation_error:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=territories_only,
        )
    assert (
        "FIPS state codes expected that are not present in the data:\n"
        "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', "
        "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', "
        "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', "
        "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']"
    ) in str(nation_error.value)

    # No error once the continental US and AK/HI are both declared as expected
    # to be missing.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=territories_only,
        continental_us_expected=False,
        alaska_and_hawaii_expected=False,
    )