refactoring

This commit is contained in:
lucasmbrown-usds 2022-09-07 16:02:17 -04:00
parent 56a24b9bd1
commit 70606440fb
6 changed files with 457 additions and 46 deletions

View file

@ -12,6 +12,7 @@ settings = Dynaconf(
# set root dir # set root dir
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
settings.DATA_PATH = settings.APP_ROOT / "data"
settings.REQUESTS_DEFAULT_TIMOUT = 3600 settings.REQUESTS_DEFAULT_TIMOUT = 3600
# To set an environment use: # To set an environment use:
# Linux/OSX: export ENV_FOR_DYNACONF=staging # Linux/OSX: export ENV_FOR_DYNACONF=staging

View file

@ -7,6 +7,9 @@ from typing import Optional
import pandas as pd import pandas as pd
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.score.etl_utils import (
compare_to_list_of_expected_state_fips_codes,
)
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
from data_pipeline.utils import ( from data_pipeline.utils import (
load_yaml_dict_from_file, load_yaml_dict_from_file,
@ -43,7 +46,7 @@ class ExtractTransformLoad:
APP_ROOT: pathlib.Path = settings.APP_ROOT APP_ROOT: pathlib.Path = settings.APP_ROOT
# Directories # Directories
DATA_PATH: pathlib.Path = APP_ROOT / "data" DATA_PATH: pathlib.Path = settings.DATA_PATH
TMP_PATH: pathlib.Path = DATA_PATH / "tmp" TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config" CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config" DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
@ -82,6 +85,19 @@ class ExtractTransformLoad:
# NULL_REPRESENTATION is how nulls are represented on the input field # NULL_REPRESENTATION is how nulls are represented on the input field
NULL_REPRESENTATION: str = None NULL_REPRESENTATION: str = None
# Whether this ETL contains data for the nation (the 50 US states and DC)
NATION_EXPECTED_IN_DATA: bool = True
# Whether this ETL contains data for Puerto Rico
PUERTO_RICO_EXPECTED_IN_DATA: bool = True
# Whether this ETL contains data for the island areas
ISLAND_AREAS_EXPECTED_IN_DATA: bool = False
# FIPS codes of any additional states/territories whose data is known to be
# missing from this ETL
EXPECTED_MISSING_STATES: typing.List[str] = []
# Thirteen digits in a census block group ID. # Thirteen digits in a census block group ID.
EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13 EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@ -289,6 +305,21 @@ class ExtractTransformLoad:
f"`{geo_field}`." f"`{geo_field}`."
) )
# Check whether data contains expected states
states_in_output_df = list(
self.output_df[self.GEOID_TRACT_FIELD_NAME]
.astype(str)
.str[0:2]
.unique()
)
compare_to_list_of_expected_state_fips_codes(
actual_state_fips_codes=states_in_output_df,
nation_expected=self.NATION_EXPECTED_IN_DATA,
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
)
def load(self, float_format=None) -> None: def load(self, float_format=None) -> None:
"""Saves the transformed data. """Saves the transformed data.

View file

@ -131,6 +131,59 @@ TILES_NATION_THRESHOLD_COUNT = 21
# 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands # 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"] TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
TILES_PUERTO_RICO_FIPS_CODE = ["72"] TILES_PUERTO_RICO_FIPS_CODE = ["72"]
# FIPS codes that make up the "nation" (the 50 states and DC), in ascending
# order, ten codes per row.
TILES_NATION_FIPS_CODE = [
    "01", "02", "04", "05", "06", "08", "09", "10", "11", "12",
    "13", "15", "16", "17", "18", "19", "20", "21", "22", "23",
    "24", "25", "26", "27", "28", "29", "30", "31", "32", "33",
    "34", "35", "36", "37", "38", "39", "40", "41", "42", "44",
    "45", "46", "47", "48", "49", "50", "51", "53", "54", "55",
    "56",
]
# Constant to reflect UI Experience version # Constant to reflect UI Experience version
# "Nation" referring to 50 states and DC is from Census # "Nation" referring to 50 states and DC is from Census

View file

@ -1,11 +1,18 @@
import os import os
import sys import sys
import typing
from pathlib import Path from pathlib import Path
from collections import namedtuple from collections import namedtuple
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from data_pipeline.config import settings from data_pipeline.config import settings
from data_pipeline.etl.score.constants import (
TILES_ISLAND_AREA_FIPS_CODES,
TILES_PUERTO_RICO_FIPS_CODE,
TILES_NATION_FIPS_CODE,
)
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import ( from data_pipeline.utils import (
download_file_from_url, download_file_from_url,
get_module_logger, get_module_logger,
@ -305,3 +312,73 @@ def create_codebook(
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename( return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"} columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
) )
def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: typing.List[str],
    nation_expected: bool = True,
    puerto_rico_expected: bool = True,
    island_areas_expected: bool = True,
    additional_fips_codes_not_expected: typing.Optional[
        typing.List[str]
    ] = None,
) -> None:
    """Check whether a list of state/territory FIPS codes matches expectations.

    The expected set starts as every FIPS code returned by
    `get_state_fips_codes` and is then narrowed by the arguments below. The
    comparison is exact: codes that are expected but absent and codes that
    are present but unexpected both cause a failure.

    Args:
        actual_state_fips_codes (List of str): Actual state codes observed in
            data. Duplicates and ordering are ignored.
        nation_expected (bool): Do you expect the nation (DC & states) to be
            represented in data?
        puerto_rico_expected (bool): Do you expect PR to be represented in
            data?
        island_areas_expected (bool): Do you expect Island Areas to be
            represented in data?
        additional_fips_codes_not_expected (List of str, optional): Additional
            state codes not expected in the data. For example, the data may be
            known to be missing data from Maine and Wisconsin. `None` (the
            default) means there are no additional exclusions.

    Returns:
        None: Does not return any values.

    Raises:
        ValueError: if lists do not match expectations.
    """
    # `None` replaces a mutable `[]` default, which would be a single list
    # object shared by every call of this function.
    if additional_fips_codes_not_expected is None:
        additional_fips_codes_not_expected = []

    # Cast input to a set.
    actual_state_fips_codes_set = set(actual_state_fips_codes)

    # Start with the list of all FIPS codes for all states and territories.
    expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))

    # If the nation (states and DC) is not expected to be included, remove it
    # from the expected states set.
    if not nation_expected:
        expected_states_set -= set(TILES_NATION_FIPS_CODE)

    # If Puerto Rico is not expected to be included, remove it from the
    # expected states set.
    if not puerto_rico_expected:
        expected_states_set -= set(TILES_PUERTO_RICO_FIPS_CODE)

    # If island areas are not expected to be included, remove them from the
    # expected states set.
    if not island_areas_expected:
        expected_states_set -= set(TILES_ISLAND_AREA_FIPS_CODES)

    # Remove any additional FIPS codes that are known to be missing.
    expected_states_set -= set(additional_fips_codes_not_expected)

    if expected_states_set != actual_state_fips_codes_set:
        # Report both directions of the mismatch, sorted so the message is
        # deterministic.
        raise ValueError(
            "The states and territories in the data are not as expected.\n"
            "FIPS state codes expected that are not present in the data:\n"
            f"{sorted(expected_states_set - actual_state_fips_codes_set)}\n"
            "FIPS state codes in the data that were not expected:\n"
            f"{sorted(actual_state_fips_codes_set - expected_states_set)}\n"
        )

    logger.info("Data matches expected state and territory representation.")

View file

@ -2,7 +2,10 @@ import pandas as pd
import numpy as np import numpy as np
import pytest import pytest
from data_pipeline.etl.score.etl_utils import floor_series from data_pipeline.etl.score.etl_utils import (
floor_series,
compare_to_list_of_expected_state_fips_codes,
)
def test_floor_series(): def test_floor_series():
@ -70,3 +73,265 @@ def test_floor_series():
match="Argument series must be of type pandas series, not of type list.", match="Argument series must be of type pandas series, not of type list.",
): ):
floor_series(invalid_type, number_of_decimals=3) floor_series(invalid_type, number_of_decimals=3)
def test_compare_to_list_of_expected_state_fips_codes():
    """Run the FIPS comparison helper on complete and partial code lists."""
    # FIPS codes for the nation (the states and DC).
    nation_codes = [
        "01", "02", "04", "05", "06", "08", "09", "10", "11", "12",
        "13", "15", "16", "17", "18", "19", "20", "21", "22", "23",
        "24", "25", "26", "27", "28", "29", "30", "31", "32", "33",
        "34", "35", "36", "37", "38", "39", "40", "41", "42", "44",
        "45", "46", "47", "48", "49", "50", "51", "53", "54", "55",
        "56",
    ]
    # Island areas (60, 66, 69, 78) and Puerto Rico (72).
    territory_codes = ["60", "66", "69", "72", "78"]
    all_codes = nation_codes + territory_codes

    def codes_without(*dropped):
        # Every state/territory code except the ones listed in `dropped`.
        return [code for code in all_codes if code not in dropped]

    def expect_failure(expected_fragment, **kwargs):
        # The call must raise ValueError and mention `expected_fragment`.
        with pytest.raises(ValueError) as raised:
            compare_to_list_of_expected_state_fips_codes(**kwargs)
        assert expected_fragment in str(raised.value)

    # Every state and territory present: should not raise any errors.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=all_codes
    )

    # Should raise error because Puerto Rico is not expected.
    expect_failure(
        "FIPS state codes in the data that were not expected:\n['72']\n",
        actual_state_fips_codes=all_codes,
        puerto_rico_expected=False,
    )

    # Should raise error because Island Areas are not expected.
    expect_failure(
        "FIPS state codes in the data that were not expected:\n"
        "['60', '66', '69', '78']\n",
        actual_state_fips_codes=all_codes,
        island_areas_expected=False,
    )

    # Data missing PR and Guam: should raise error because all Island Areas
    # and PR are expected.
    missing_pr_and_guam = codes_without("66", "72")
    expect_failure(
        "FIPS state codes expected that are not present in the data:\n"
        "['66', '72']\n",
        actual_state_fips_codes=missing_pr_and_guam,
    )

    # Data missing Maine and Wisconsin: should raise error because both are
    # expected.
    missing_maine_and_wisconsin = codes_without("23", "55")
    expect_failure(
        "FIPS state codes expected that are not present in the data:\n"
        "['23', '55']\n",
        actual_state_fips_codes=missing_maine_and_wisconsin,
    )

    # Should not raise error because Maine and Wisconsin are expected to be
    # missing.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=missing_maine_and_wisconsin,
        additional_fips_codes_not_expected=["23", "55"],
    )

    # Data missing the nation: should raise error because the nation is
    # expected.
    territories_only = list(territory_codes)
    expect_failure(
        "FIPS state codes expected that are not present in the data:\n"
        "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', "
        "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', "
        "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', "
        "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']",
        actual_state_fips_codes=territories_only,
    )

    # Should not raise error because the nation is not expected in the data.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=territories_only, nation_expected=False
    )

View file

@ -1,10 +1,9 @@
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
from data_pipeline.etl.score.constants import ( from data_pipeline.etl.score.etl_utils import (
TILES_ISLAND_AREA_FIPS_CODES, compare_to_list_of_expected_state_fips_codes,
TILES_PUERTO_RICO_FIPS_CODE,
) )
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
from data_pipeline.utils import get_module_logger, download_file_from_url from data_pipeline.utils import get_module_logger, download_file_from_url
@ -14,8 +13,13 @@ logger = get_module_logger(__name__)
class CDCLifeExpectancy(ExtractTransformLoad): class CDCLifeExpectancy(ExtractTransformLoad):
def __init__(self): def __init__(self):
self.GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
self.PUERTO_RICO_EXPECTED_IN_DATA = False
self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
self.STATES_MISSING_FROM_USA_FILE = ["23", "55"]
# For some reason, LEEP does not include Maine or Wisconsin in its "All of # For some reason, LEEP does not include Maine or Wisconsin in its "All of
# USA" file. Load these separately. # USA" file. Load these separately.
self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV" self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
@ -35,19 +39,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
self.LIFE_EXPECTANCY_FIELD_NAME, self.LIFE_EXPECTANCY_FIELD_NAME,
] ]
# Set some constants that will be helpful for debugging the source data later.
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
self.EXPECTED_STATES_SET = (
set(self.STATE_FIPS_CODES)
# We don't expect LEEP to have data for island areas or Puerto Rico.
- set(TILES_ISLAND_AREA_FIPS_CODES)
- set(TILES_PUERTO_RICO_FIPS_CODE)
)
# These states are currently missing from LEEP's whole USA file.
self.EXPECTED_MISSING_STATES = ["23", "55"]
self.raw_df: pd.DataFrame self.raw_df: pd.DataFrame
self.output_df: pd.DataFrame self.output_df: pd.DataFrame
@ -76,23 +67,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
) )
# Check which states are missing # Check which states are missing
states_in_life_expectancy_usa_file = all_usa_raw_df[ states_in_life_expectancy_usa_file = list(
self.STATE_INPUT_COLUMN_NAME all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique()
].unique()
# Find which states are missing from the expected set.
states_missing = sorted(
list(
self.EXPECTED_STATES_SET
- set(states_in_life_expectancy_usa_file)
)
) )
if states_missing != self.EXPECTED_MISSING_STATES: # Expect that PR, Island Areas, and Maine/Wisconsin are missing
raise ValueError( compare_to_list_of_expected_state_fips_codes(
"LEEP data has changed. The states missing from the data are " actual_state_fips_codes=states_in_life_expectancy_usa_file,
"no longer the same." nation_expected=self.NATION_EXPECTED_IN_DATA,
) puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
)
logger.info("Downloading data for Maine") logger.info("Downloading data for Maine")
maine_download_file_name = ( maine_download_file_name = (
@ -131,20 +117,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
axis=0, axis=0,
) )
states_in_combined_df = combined_df[ states_in_combined_df = list(
self.STATE_INPUT_COLUMN_NAME combined_df[self.STATE_INPUT_COLUMN_NAME].unique()
].unique()
# Find which states are missing from the combined df.
states_missing = sorted(
list(self.EXPECTED_STATES_SET - set(states_in_combined_df))
) )
if len(states_missing) != 0: # Expect that PR and Island Areas are the only things now missing
raise ValueError( compare_to_list_of_expected_state_fips_codes(
"The states missing from combined dataframe are " actual_state_fips_codes=states_in_combined_df,
"no longer as expected." nation_expected=self.NATION_EXPECTED_IN_DATA,
) puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
additional_fips_codes_not_expected=[],
)
# Save the updated version # Save the updated version
self.raw_df = combined_df self.raw_df = combined_df