refactoring

This commit is contained in:
lucasmbrown-usds 2022-09-07 16:02:17 -04:00
parent 56a24b9bd1
commit 70606440fb
6 changed files with 457 additions and 46 deletions

View file

@ -12,6 +12,7 @@ settings = Dynaconf(
# set root dir
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
settings.DATA_PATH = settings.APP_ROOT / "data"
settings.REQUESTS_DEFAULT_TIMOUT = 3600
# To set an environment use:
# Linux/OSX: export ENV_FOR_DYNACONF=staging

View file

@ -7,6 +7,9 @@ from typing import Optional
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.score.etl_utils import (
compare_to_list_of_expected_state_fips_codes,
)
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import (
load_yaml_dict_from_file,
@ -43,7 +46,7 @@ class ExtractTransformLoad:
APP_ROOT: pathlib.Path = settings.APP_ROOT
# Directories
DATA_PATH: pathlib.Path = APP_ROOT / "data"
DATA_PATH: pathlib.Path = settings.DATA_PATH
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
@ -82,6 +85,19 @@ class ExtractTransformLoad:
# NULL_REPRESENTATION is how nulls are represented on the input field
NULL_REPRESENTATION: str = None
# Whether this ETL contains data for the nation (the US states)
NATION_EXPECTED_IN_DATA: bool = True
# Whether this ETL contains data for Puerto Rico
PUERTO_RICO_EXPECTED_IN_DATA: bool = True
# Whether this ETL contains data for the island areas
ISLAND_AREAS_EXPECTED_IN_DATA: bool = False
# Whether this ETL contains known missing data for any additional
# states/territories
EXPECTED_MISSING_STATES: typing.List[str] = []
# Thirteen digits in a census block group ID.
EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@ -289,6 +305,21 @@ class ExtractTransformLoad:
f"`{geo_field}`."
)
# Check whether data contains expected states
states_in_output_df = list(
self.output_df[self.GEOID_TRACT_FIELD_NAME]
.astype(str)
.str[0:2]
.unique()
)
compare_to_list_of_expected_state_fips_codes(
actual_state_fips_codes=states_in_output_df,
nation_expected=self.NATION_EXPECTED_IN_DATA,
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
)
def load(self, float_format=None) -> None:
"""Saves the transformed data.

View file

@ -131,6 +131,59 @@ TILES_NATION_THRESHOLD_COUNT = 21
# Island areas -- 60: American Samoa, 66: Guam, 69: N. Mariana Islands,
# 78: US Virgin Islands
TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
TILES_PUERTO_RICO_FIPS_CODE = ["72"]
# Two-digit FIPS codes that make up the "nation". Puerto Rico and the island
# areas are kept in their own constants above and are not repeated here.
# fmt: off
TILES_NATION_FIPS_CODE = [
    "01", "02", "04", "05", "06", "08", "09", "10", "11", "12",
    "13", "15", "16", "17", "18", "19", "20", "21", "22", "23",
    "24", "25", "26", "27", "28", "29", "30", "31", "32", "33",
    "34", "35", "36", "37", "38", "39", "40", "41", "42", "44",
    "45", "46", "47", "48", "49", "50", "51", "53", "54", "55",
    "56",
]
# fmt: on
# Constant to reflect UI Experience version
# "Nation" referring to 50 states and DC is from Census

View file

@ -1,11 +1,18 @@
import os
import sys
import typing
from pathlib import Path
from collections import namedtuple
import numpy as np
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.etl.score.constants import (
TILES_ISLAND_AREA_FIPS_CODES,
TILES_PUERTO_RICO_FIPS_CODE,
TILES_NATION_FIPS_CODE,
)
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import (
download_file_from_url,
get_module_logger,
@ -305,3 +312,73 @@ def create_codebook(
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
)
def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: typing.List[str],
    nation_expected: bool = True,
    puerto_rico_expected: bool = True,
    island_areas_expected: bool = True,
    additional_fips_codes_not_expected: typing.Optional[
        typing.List[str]
    ] = None,
) -> None:
    """Check whether a list of state/territory FIPS codes match expectations.

    The expected set starts as every code returned by
    `get_state_fips_codes(settings.DATA_PATH)`. Groups flagged as "not
    expected" are subtracted from it, and the result must equal the set of
    codes actually observed.

    Args:
        actual_state_fips_codes (List of str): Actual state codes observed in data
        nation_expected (bool): Do you expect the nation (DC & states) to be
            represented in data?
        puerto_rico_expected (bool): Do you expect PR to be represented in data?
        island_areas_expected (bool): Do you expect Island Areas to be represented in
            data?
        additional_fips_codes_not_expected (List of str, optional): Additional
            state codes not expected in the data. For example, the data may be
            known to be missing data from Maine and Wisconsin. `None` (the
            default) and an empty list both mean "no additional exclusions".

    Returns:
        None: Does not return any values.

    Raises:
        ValueError: if lists do not match expectations.
    """
    # Duplicates and ordering in the input are irrelevant to the comparison.
    actual_state_fips_codes_set = set(actual_state_fips_codes)

    # Start with the FIPS codes for all states and territories. This is a
    # fresh local set, so the in-place subtractions below cannot leak out.
    expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))

    # If the nation (states and DC) is not expected to be included, remove it
    # from the expected states set.
    if not nation_expected:
        expected_states_set -= set(TILES_NATION_FIPS_CODE)

    # If Puerto Rico is not expected to be included, remove it from the
    # expected states set.
    if not puerto_rico_expected:
        expected_states_set -= set(TILES_PUERTO_RICO_FIPS_CODE)

    # If island areas are not expected to be included, remove them from the
    # expected states set.
    if not island_areas_expected:
        expected_states_set -= set(TILES_ISLAND_AREA_FIPS_CODES)

    # Remove any additional FIPS codes that are known to be missing.
    # The default is None rather than a shared mutable list.
    if additional_fips_codes_not_expected:
        expected_states_set -= set(additional_fips_codes_not_expected)

    if expected_states_set != actual_state_fips_codes_set:
        raise ValueError(
            "The states and territories in the data are not as expected.\n"
            "FIPS state codes expected that are not present in the data:\n"
            f"{sorted(expected_states_set - actual_state_fips_codes_set)}\n"
            "FIPS state codes in the data that were not expected:\n"
            f"{sorted(actual_state_fips_codes_set - expected_states_set)}\n"
        )

    logger.info("Data matches expected state and territory representation.")

View file

@ -2,7 +2,10 @@ import pandas as pd
import numpy as np
import pytest
from data_pipeline.etl.score.etl_utils import floor_series
from data_pipeline.etl.score.etl_utils import (
floor_series,
compare_to_list_of_expected_state_fips_codes,
)
def test_floor_series():
@ -70,3 +73,265 @@ def test_floor_series():
match="Argument series must be of type pandas series, not of type list.",
):
floor_series(invalid_type, number_of_decimals=3)
def test_compare_to_list_of_expected_state_fips_codes():
    """Exercise the FIPS comparison helper with matching and mismatching inputs."""
    # Codes "01" through "56", skipping the numbers that do not appear among
    # the state/DC codes used by this test.
    nation_codes = [
        f"{number:02d}"
        for number in range(1, 57)
        if number not in (3, 7, 14, 43, 52)
    ]
    # Island areas plus Puerto Rico ("72"), in ascending order.
    territory_codes = ["60", "66", "69", "72", "78"]

    def assert_mismatch_reported(expected_fragment, **call_kwargs):
        # The call must raise ValueError and mention the expected fragment.
        with pytest.raises(ValueError) as exception_info:
            compare_to_list_of_expected_state_fips_codes(**call_kwargs)
        assert expected_fragment in str(exception_info.value)

    # Full list: nation, island areas and Puerto Rico.
    fips_codes_test_1 = nation_codes + territory_codes

    # Should not raise any errors
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=fips_codes_test_1
    )

    # Should raise error because Puerto Rico is not expected
    assert_mismatch_reported(
        "FIPS state codes in the data that were not expected:\n['72']\n",
        actual_state_fips_codes=fips_codes_test_1,
        puerto_rico_expected=False,
    )

    # Should raise error because Island Areas are not expected
    assert_mismatch_reported(
        "FIPS state codes in the data that were not expected:\n"
        "['60', '66', '69', '78']\n",
        actual_state_fips_codes=fips_codes_test_1,
        island_areas_expected=False,
    )

    # List missing PR and Guam
    fips_codes_test_2 = [
        code for code in fips_codes_test_1 if code not in ("66", "72")
    ]

    # Should raise error because all Island Areas and PR are expected
    assert_mismatch_reported(
        "FIPS state codes expected that are not present in the data:\n"
        "['66', '72']\n",
        actual_state_fips_codes=fips_codes_test_2,
    )

    # Missing Maine and Wisconsin
    fips_codes_test_3 = [
        code for code in fips_codes_test_1 if code not in ("23", "55")
    ]

    # Should raise error because Maine and Wisconsin are expected
    assert_mismatch_reported(
        "FIPS state codes expected that are not present in the data:\n"
        "['23', '55']\n",
        actual_state_fips_codes=fips_codes_test_3,
    )

    # Should not raise error because Maine and Wisconsin are expected to be
    # missing
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=fips_codes_test_3,
        additional_fips_codes_not_expected=["23", "55"],
    )

    # Missing the nation
    fips_codes_test_4 = list(territory_codes)

    # Should raise error because the nation is expected
    assert_mismatch_reported(
        "FIPS state codes expected that are not present in the data:\n"
        "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', "
        "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', "
        "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', "
        "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']",
        actual_state_fips_codes=fips_codes_test_4,
    )

    # Should not raise error because the nation is declared as not expected
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=fips_codes_test_4, nation_expected=False
    )

View file

@ -1,10 +1,9 @@
from pathlib import Path
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.constants import (
TILES_ISLAND_AREA_FIPS_CODES,
TILES_PUERTO_RICO_FIPS_CODE,
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.etl.score.etl_utils import (
compare_to_list_of_expected_state_fips_codes,
)
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger, download_file_from_url
@ -14,8 +13,13 @@ logger = get_module_logger(__name__)
class CDCLifeExpectancy(ExtractTransformLoad):
def __init__(self):
self.GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
self.PUERTO_RICO_EXPECTED_IN_DATA = False
self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
self.STATES_MISSING_FROM_USA_FILE = ["23", "55"]
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
# USA" file. Load these separately.
self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
@ -35,19 +39,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
self.LIFE_EXPECTANCY_FIELD_NAME,
]
# Set some constants that will be helpful for debugging the source data later.
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
self.EXPECTED_STATES_SET = (
set(self.STATE_FIPS_CODES)
# We don't expect LEEP to have data for island areas or Puerto Rico.
- set(TILES_ISLAND_AREA_FIPS_CODES)
- set(TILES_PUERTO_RICO_FIPS_CODE)
)
# These states are currently missing from LEEP's whole USA file.
self.EXPECTED_MISSING_STATES = ["23", "55"]
self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame
@ -76,22 +67,17 @@ class CDCLifeExpectancy(ExtractTransformLoad):
)
# Check which states are missing
states_in_life_expectancy_usa_file = all_usa_raw_df[
self.STATE_INPUT_COLUMN_NAME
].unique()
# Find which states are missing from the expected set.
states_missing = sorted(
list(
self.EXPECTED_STATES_SET
- set(states_in_life_expectancy_usa_file)
)
states_in_life_expectancy_usa_file = list(
all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique()
)
if states_missing != self.EXPECTED_MISSING_STATES:
raise ValueError(
"LEEP data has changed. The states missing from the data are "
"no longer the same."
# Expect that PR, Island Areas, and Maine/Wisconsin are missing
compare_to_list_of_expected_state_fips_codes(
actual_state_fips_codes=states_in_life_expectancy_usa_file,
nation_expected=self.NATION_EXPECTED_IN_DATA,
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
)
logger.info("Downloading data for Maine")
@ -131,19 +117,17 @@ class CDCLifeExpectancy(ExtractTransformLoad):
axis=0,
)
states_in_combined_df = combined_df[
self.STATE_INPUT_COLUMN_NAME
].unique()
# Find which states are missing from the combined df.
states_missing = sorted(
list(self.EXPECTED_STATES_SET - set(states_in_combined_df))
states_in_combined_df = list(
combined_df[self.STATE_INPUT_COLUMN_NAME].unique()
)
if len(states_missing) != 0:
raise ValueError(
"The states missing from combined dataframe are "
"no longer as expected."
# Expect that PR and Island Areas are the only things now missing
compare_to_list_of_expected_state_fips_codes(
actual_state_fips_codes=states_in_combined_df,
nation_expected=self.NATION_EXPECTED_IN_DATA,
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
additional_fips_codes_not_expected=[],
)
# Save the updated version