mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-29 07:31:17 -07:00
Issue 1831: missing life expectancy data from Maine and Wisconsin (#1887)
* Fixing missing states and adding tests for states to all classes
This commit is contained in:
parent
fb4c484e5c
commit
6e9c44ea72
21 changed files with 522 additions and 187 deletions
|
@ -131,6 +131,58 @@ TILES_NATION_THRESHOLD_COUNT = 21
|
|||
# State/territory FIPS code groups used to describe which parts of the
# country a dataset is expected to cover.
# 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
# 72: Puerto Rico
TILES_PUERTO_RICO_FIPS_CODE = ["72"]
# 02: Alaska, 15: Hawaii
TILES_ALASKA_AND_HAWAII_FIPS_CODE = ["02", "15"]
# Every state code except Alaska ("02") and Hawaii ("15"), plus DC ("11").
# Written as whitespace-separated codes and split into a list of strings so
# that the leading zeros are preserved.
TILES_CONTINENTAL_US_FIPS_CODE = (
    "01 04 05 06 08 09 10 11 12 13 16 17 18 19 20 21 22 23 24 25 "
    "26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 46 "
    "47 48 49 50 51 53 54 55 56"
).split()
|
||||
|
||||
# Constant to reflect UI Experience version
|
||||
# "Nation" referring to 50 states and DC is from Census
|
||||
|
@ -399,5 +451,5 @@ TILES_SCORE_FLOAT_COLUMNS = [
|
|||
# that use null to signify missing information in a boolean field.
|
||||
field_names.ELIGIBLE_FUDS_BINARY_FIELD_NAME,
|
||||
field_names.AML_BOOLEAN,
|
||||
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED
|
||||
field_names.HISTORIC_REDLINING_SCORE_EXCEEDED,
|
||||
]
|
||||
|
|
|
@ -1,11 +1,19 @@
|
|||
import os
|
||||
import sys
|
||||
import typing
|
||||
from pathlib import Path
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.etl.score.constants import (
|
||||
TILES_ISLAND_AREA_FIPS_CODES,
|
||||
TILES_PUERTO_RICO_FIPS_CODE,
|
||||
TILES_CONTINENTAL_US_FIPS_CODE,
|
||||
TILES_ALASKA_AND_HAWAII_FIPS_CODE,
|
||||
)
|
||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||
from data_pipeline.utils import (
|
||||
download_file_from_url,
|
||||
get_module_logger,
|
||||
|
@ -305,3 +313,106 @@ def create_codebook(
|
|||
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
|
||||
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
|
||||
)
|
||||
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: typing.List[str],
    continental_us_expected: bool = True,
    alaska_and_hawaii_expected: bool = True,
    puerto_rico_expected: bool = True,
    island_areas_expected: bool = True,
    additional_fips_codes_not_expected: typing.Optional[
        typing.List[str]
    ] = None,
    dataset_name: typing.Optional[str] = None,
) -> None:
    """Check whether a list of state/territory FIPS codes matches expectations.

    The expected set starts as every code returned by
    `get_state_fips_codes(settings.DATA_PATH)`. Each group flagged as not
    expected, plus any `additional_fips_codes_not_expected`, is removed from
    it. The remaining set must equal the set of observed codes exactly:
    both missing and surplus codes are reported.

    Args:
        actual_state_fips_codes (List of str): State codes observed in the
            data. Duplicates are ignored (the input is cast to a set).
        continental_us_expected (bool, optional): Whether the codes in
            `TILES_CONTINENTAL_US_FIPS_CODE` (DC and the states other than
            Alaska and Hawaii) should be present in the data.
        alaska_and_hawaii_expected (bool, optional): Whether Alaska and
            Hawaii should both be present in the data. If only *one* of the
            two is expected to be missing, leave this as True and list that
            one state in `additional_fips_codes_not_expected` instead.
        puerto_rico_expected (bool, optional): Whether Puerto Rico should be
            present in the data.
        island_areas_expected (bool, optional): Whether the Island Areas
            (`TILES_ISLAND_AREA_FIPS_CODES`) should be present in the data.
        additional_fips_codes_not_expected (List of str, optional):
            Additional state codes not expected in the data. For example,
            the data may be known to be missing Maine and Wisconsin.
            `None` (the default) means no additional codes.
        dataset_name (str, optional): Name of the dataset, used only in the
            error/log message. (This is helpful for debugging during
            parallel etl runs.)

    Returns:
        None: Does not return any values.

    Raises:
        ValueError: if the observed codes do not match the expected codes.
    """
    actual_state_fips_codes_set = set(actual_state_fips_codes)

    # Start with the list of all FIPS codes for all states and territories.
    expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))

    # Drop every group the caller told us not to expect.
    groups = (
        (continental_us_expected, TILES_CONTINENTAL_US_FIPS_CODE),
        (alaska_and_hawaii_expected, TILES_ALASKA_AND_HAWAII_FIPS_CODE),
        (puerto_rico_expected, TILES_PUERTO_RICO_FIPS_CODE),
        (island_areas_expected, TILES_ISLAND_AREA_FIPS_CODES),
    )
    for group_is_expected, group_fips_codes in groups:
        if not group_is_expected:
            expected_states_set.difference_update(group_fips_codes)

    # `None` is the default (instead of `[]`) to avoid a shared mutable
    # default argument; treat it as "no additional codes".
    expected_states_set.difference_update(
        additional_fips_codes_not_expected or ()
    )

    dataset_name_phrase = (
        f" for dataset `{dataset_name}`" if dataset_name is not None else ""
    )

    if expected_states_set != actual_state_fips_codes_set:
        missing_codes = sorted(
            expected_states_set - actual_state_fips_codes_set
        )
        unexpected_codes = sorted(
            actual_state_fips_codes_set - expected_states_set
        )
        raise ValueError(
            f"The states and territories in the data{dataset_name_phrase} are not "
            "as expected.\n"
            "FIPS state codes expected that are not present in the data:\n"
            f"{missing_codes}\n"
            "FIPS state codes in the data that were not expected:\n"
            f"{unexpected_codes}\n"
        )

    logger.info(
        "Data matches expected state and territory representation"
        f"{dataset_name_phrase}."
    )
|
||||
|
|
|
@ -2,7 +2,10 @@ import pandas as pd
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from data_pipeline.etl.score.etl_utils import floor_series
|
||||
from data_pipeline.etl.score.etl_utils import (
|
||||
floor_series,
|
||||
compare_to_list_of_expected_state_fips_codes,
|
||||
)
|
||||
|
||||
|
||||
def test_floor_series():
|
||||
|
@ -70,3 +73,159 @@ def test_floor_series():
|
|||
match="Argument series must be of type pandas series, not of type list.",
|
||||
):
|
||||
floor_series(invalid_type, number_of_decimals=3)
|
||||
|
||||
|
||||
def test_compare_to_list_of_expected_state_fips_codes():
    """Exercise the matching and mismatching paths of the FIPS code check."""
    # DC and the 50 states (including Alaska "02" and Hawaii "15").
    nation_codes = (
        "01 02 04 05 06 08 09 10 11 12 13 15 16 17 18 19 20 21 22 23 24 25 "
        "26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 46 47 48 "
        "49 50 51 53 54 55 56"
    ).split()
    # Island Areas plus Puerto Rico ("72").
    territory_codes = ["60", "66", "69", "72", "78"]

    missing_header = (
        "FIPS state codes expected that are not present in the data:\n"
    )
    unexpected_header = (
        "FIPS state codes in the data that were not expected:\n"
    )

    def error_text(**call_kwargs):
        # Run the check, require a ValueError, and hand back its message.
        with pytest.raises(ValueError) as exception_info:
            compare_to_list_of_expected_state_fips_codes(**call_kwargs)
        return str(exception_info.value)

    # Case 1: every state/territory/DC code is present.
    every_code = nation_codes + territory_codes

    # Nothing is missing and nothing is surplus, so no error.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=every_code
    )

    # Puerto Rico is in the data but flagged as not expected.
    assert unexpected_header + "['72']\n" in error_text(
        actual_state_fips_codes=every_code,
        puerto_rico_expected=False,
    )

    # Island Areas are in the data but flagged as not expected.
    assert unexpected_header + "['60', '66', '69', '78']\n" in error_text(
        actual_state_fips_codes=every_code,
        island_areas_expected=False,
    )

    # Case 2: Guam and Puerto Rico are absent although all territories are
    # expected by default.
    without_guam_and_pr = [
        code for code in every_code if code not in ["66", "72"]
    ]
    assert missing_header + "['66', '72']\n" in error_text(
        actual_state_fips_codes=without_guam_and_pr,
    )

    # Case 3: Maine and Wisconsin are absent.
    without_maine_and_wisconsin = [
        code for code in every_code if code not in ["23", "55"]
    ]

    # By default both states are expected, so their absence is an error.
    assert missing_header + "['23', '55']\n" in error_text(
        actual_state_fips_codes=without_maine_and_wisconsin,
    )

    # Declaring both states as known-missing makes the same data acceptable.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=without_maine_and_wisconsin,
        additional_fips_codes_not_expected=["23", "55"],
    )

    # Case 4: only the territories are present; the continental US and
    # Alaska/Hawaii are absent.
    territories_only = list(territory_codes)

    # By default the whole nation is expected, so every nation code is
    # reported as missing (the list renders exactly as `str(list)` does).
    assert missing_header + str(nation_codes) in error_text(
        actual_state_fips_codes=territories_only,
    )

    # No error once the continental US and Alaska/Hawaii are both declared
    # as not expected.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=territories_only,
        continental_us_expected=False,
        alaska_and_hawaii_expected=False,
    )
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue