diff --git a/data/data-pipeline/data_pipeline/config.py b/data/data-pipeline/data_pipeline/config.py index 23e550a8..5dc336c8 100644 --- a/data/data-pipeline/data_pipeline/config.py +++ b/data/data-pipeline/data_pipeline/config.py @@ -12,6 +12,7 @@ settings = Dynaconf( # set root dir settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent +settings.DATA_PATH = settings.APP_ROOT / "data" settings.REQUESTS_DEFAULT_TIMOUT = 3600 # To set an environment use: # Linux/OSX: export ENV_FOR_DYNACONF=staging diff --git a/data/data-pipeline/data_pipeline/etl/base.py b/data/data-pipeline/data_pipeline/etl/base.py index 211fbc31..aef222c2 100644 --- a/data/data-pipeline/data_pipeline/etl/base.py +++ b/data/data-pipeline/data_pipeline/etl/base.py @@ -7,6 +7,9 @@ from typing import Optional import pandas as pd from data_pipeline.config import settings +from data_pipeline.etl.score.etl_utils import ( + compare_to_list_of_expected_state_fips_codes, +) from data_pipeline.etl.score.schemas.datasets import DatasetsConfig from data_pipeline.utils import ( load_yaml_dict_from_file, @@ -43,7 +46,7 @@ class ExtractTransformLoad: APP_ROOT: pathlib.Path = settings.APP_ROOT # Directories - DATA_PATH: pathlib.Path = APP_ROOT / "data" + DATA_PATH: pathlib.Path = settings.DATA_PATH TMP_PATH: pathlib.Path = DATA_PATH / "tmp" CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config" DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config" @@ -82,6 +85,19 @@ class ExtractTransformLoad: # NULL_REPRESENTATION is how nulls are represented on the input field NULL_REPRESENTATION: str = None + # Whether this ETL contains data for the nation (the US states) + NATION_EXPECTED_IN_DATA: bool = True + + # Whether this ETL contains data for Puerto Rico + PUERTO_RICO_EXPECTED_IN_DATA: bool = True + + # Whether this ETL contains data for the island areas + ISLAND_AREAS_EXPECTED_IN_DATA: bool = False + + # Whether this ETL contains known missing data for any additional + # 
states/territories + EXPECTED_MISSING_STATES: typing.List[str] = [] + # Thirteen digits in a census block group ID. EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13 # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might @@ -289,6 +305,21 @@ class ExtractTransformLoad: f"`{geo_field}`." ) + # Check whether data contains expected states + states_in_output_df = list( + self.output_df[self.GEOID_TRACT_FIELD_NAME] + .astype(str) + .str[0:2] + .unique() + ) + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=states_in_output_df, + nation_expected=self.NATION_EXPECTED_IN_DATA, + puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, + island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, + additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES, + ) + def load(self, float_format=None) -> None: """Saves the transformed data. diff --git a/data/data-pipeline/data_pipeline/etl/score/constants.py b/data/data-pipeline/data_pipeline/etl/score/constants.py index 71c36612..9e34b096 100644 --- a/data/data-pipeline/data_pipeline/etl/score/constants.py +++ b/data/data-pipeline/data_pipeline/etl/score/constants.py @@ -131,6 +131,59 @@ TILES_NATION_THRESHOLD_COUNT = 21 # 60: American Samoa, 66: Guam, 69: N. 
Mariana Islands, 78: US Virgin Islands TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"] TILES_PUERTO_RICO_FIPS_CODE = ["72"] +TILES_NATION_FIPS_CODE = [ + "01", + "02", + "04", + "05", + "06", + "08", + "09", + "10", + "11", + "12", + "13", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "53", + "54", + "55", + "56", +] # Constant to reflect UI Experience version # "Nation" referring to 50 states and DC is from Census diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py index f5222620..1d3283f9 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py @@ -1,11 +1,18 @@ import os import sys +import typing from pathlib import Path from collections import namedtuple import numpy as np import pandas as pd from data_pipeline.config import settings +from data_pipeline.etl.score.constants import ( + TILES_ISLAND_AREA_FIPS_CODES, + TILES_PUERTO_RICO_FIPS_CODE, + TILES_NATION_FIPS_CODE, +) +from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import ( download_file_from_url, get_module_logger, @@ -305,3 +312,73 @@ def create_codebook( return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename( columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"} ) + + +def compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes: typing.List[str], + nation_expected: bool = True, + puerto_rico_expected: bool = True, + island_areas_expected: bool = True, + additional_fips_codes_not_expected: typing.List[str] = [], +) -> None: + """Check whether a list of state/territory FIPS codes match expectations. 
+ + Args: + actual_state_fips_codes (List of str): Actual state codes observed in data + nation_expected (bool): Do you expect the nation (DC & states) to be + represented in data? + puerto_rico_expected (bool): Do you expect PR to be represented in data? + island_areas_expected (bool): Do you expect Island Areas to be represented in + data? + additional_fips_codes_not_expected (List of str): Additional state codes + not expected in the data. For example, the data may be known to be missing + data from Maine and Wisconsin. + + Returns: + None: Does not return any values. + + Raises: + ValueError: if lists do not match expectations. + """ + # Cast input to a set. + actual_state_fips_codes_set = set(actual_state_fips_codes) + + # Start with the list of all FIPS codes for all states and territories. + expected_states_set = set(get_state_fips_codes(settings.DATA_PATH)) + + # If the nation (states and DC) is not expected to be included, remove it from the + # expected + # states set. + if not nation_expected: + expected_states_set = expected_states_set - set(TILES_NATION_FIPS_CODE) + + # If Puerto Rico is not expected to be included, remove it from the expected + # states set. + if not puerto_rico_expected: + expected_states_set = expected_states_set - set( + TILES_PUERTO_RICO_FIPS_CODE + ) + + # If island areas are not expected to be included, remove them from the expected + # states set. + if not island_areas_expected: + expected_states_set = expected_states_set - set( + TILES_ISLAND_AREA_FIPS_CODES + ) + + # If additional FIPS codes are not expected to be included, remove them from the + # expected states set. 
+ expected_states_set = expected_states_set - set( + additional_fips_codes_not_expected + ) + + if expected_states_set != actual_state_fips_codes_set: + raise ValueError( + "The states and territories in the data are not as expected.\n" + "FIPS state codes expected that are not present in the data:\n" + f"{sorted(list(expected_states_set - actual_state_fips_codes_set))}\n" + "FIPS state codes in the data that were not expected:\n" + f"{sorted(list(actual_state_fips_codes_set - expected_states_set))}\n" + ) + else: + logger.info("Data matches expected state and territory representation.") diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py index 594f4856..425ee7ad 100644 --- a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py @@ -2,7 +2,10 @@ import pandas as pd import numpy as np import pytest -from data_pipeline.etl.score.etl_utils import floor_series +from data_pipeline.etl.score.etl_utils import ( + floor_series, + compare_to_list_of_expected_state_fips_codes, +) def test_floor_series(): @@ -70,3 +73,265 @@ def test_floor_series(): match="Argument series must be of type pandas series, not of type list.", ): floor_series(invalid_type, number_of_decimals=3) + + +def test_compare_to_list_of_expected_state_fips_codes(): + fips_codes_test_1 = [ + "01", + "02", + "04", + "05", + "06", + "08", + "09", + "10", + "11", + "12", + "13", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "53", + "54", + "55", + "56", + "60", + "66", + "69", + "72", + "78", + ] + + # Should not raise any errors + compare_to_list_of_expected_state_fips_codes( + 
actual_state_fips_codes=fips_codes_test_1 + ) + + # Should raise error because Puerto Rico is not expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_1, + puerto_rico_expected=False, + ) + partial_expected_error_message = ( + "FIPS state codes in the data that were not expected:\n['72']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # Should raise error because Island Areas are not expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_1, + island_areas_expected=False, + ) + partial_expected_error_message = ( + "FIPS state codes in the data that were not expected:\n" + "['60', '66', '69', '78']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # List missing PR and Guam + fips_codes_test_2 = [ + "01", + "02", + "04", + "05", + "06", + "08", + "09", + "10", + "11", + "12", + "13", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "53", + "54", + "55", + "56", + "60", + "69", + "78", + ] + # Should raise error because all Island Areas and PR are expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_2, + ) + partial_expected_error_message = ( + "FIPS state codes expected that are not present in the data:\n" + "['66', '72']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # Missing Maine and Wisconsin + fips_codes_test_3 = [ + "01", + "02", + "04", + "05", + "06", + "08", + "09", + "10", + "11", + "12", + "13", + "15", + "16", + "17", + "18", + "19", + 
"20", + "21", + "22", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "53", + "54", + "56", + "60", + "66", + "69", + "72", + "78", + ] + + # Should raise error because Maine and Wisconsin are expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_3, + ) + partial_expected_error_message = ( + "FIPS state codes expected that are not present in the data:\n" + "['23', '55']\n" + ) + assert partial_expected_error_message in str(exception_info.value) + + # Should not raise error because Maine and Wisconsin are expected to be missing + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_3, + additional_fips_codes_not_expected=["23", "55"], + ) + + # Missing the nation + fips_codes_test_4 = [ + "60", + "66", + "69", + "72", + "78", + ] + + # Should raise error because the nation is expected + with pytest.raises(ValueError) as exception_info: + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_4, + ) + + partial_expected_error_message = ( + "FIPS state codes expected that are not present in the data:\n" + "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', " + "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', " + "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', " + "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']" + ) + + assert partial_expected_error_message in str(exception_info.value) + + # Should not raise error because Nation is not to be missing + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=fips_codes_test_4, nation_expected=False + ) diff --git 
a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index a0eb9102..f6792e9c 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -1,10 +1,9 @@ from pathlib import Path import pandas as pd -from data_pipeline.etl.base import ExtractTransformLoad -from data_pipeline.etl.score.constants import ( - TILES_ISLAND_AREA_FIPS_CODES, - TILES_PUERTO_RICO_FIPS_CODE, +from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel +from data_pipeline.etl.score.etl_utils import ( + compare_to_list_of_expected_state_fips_codes, ) from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes from data_pipeline.utils import get_module_logger, download_file_from_url @@ -14,8 +13,13 @@ logger = get_module_logger(__name__) class CDCLifeExpectancy(ExtractTransformLoad): def __init__(self): + self.GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT + self.PUERTO_RICO_EXPECTED_IN_DATA = False + self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV" + self.STATES_MISSING_FROM_USA_FILE = ["23", "55"] + # For some reason, LEEP does not include Maine or Wisconsin in its "All of # USA" file. Load these separately. self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV" @@ -35,19 +39,6 @@ class CDCLifeExpectancy(ExtractTransformLoad): self.LIFE_EXPECTANCY_FIELD_NAME, ] - # Set some constants that will be helpful for debugging the source data later. - self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH) - - self.EXPECTED_STATES_SET = ( - set(self.STATE_FIPS_CODES) - # We don't expect LEEP to have data for island areas or Puerto Rico. 
- - set(TILES_ISLAND_AREA_FIPS_CODES) - - set(TILES_PUERTO_RICO_FIPS_CODE) - ) - - # These states are currently missing from LEEP's whole USA file. - self.EXPECTED_MISSING_STATES = ["23", "55"] - self.raw_df: pd.DataFrame self.output_df: pd.DataFrame @@ -76,23 +67,18 @@ class CDCLifeExpectancy(ExtractTransformLoad): ) # Check which states are missing - states_in_life_expectancy_usa_file = all_usa_raw_df[ - self.STATE_INPUT_COLUMN_NAME - ].unique() - - # Find which states are missing from the expected set. - states_missing = sorted( - list( - self.EXPECTED_STATES_SET - - set(states_in_life_expectancy_usa_file) - ) + states_in_life_expectancy_usa_file = list( + all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique() ) - if states_missing != self.EXPECTED_MISSING_STATES: - raise ValueError( - "LEEP data has changed. The states missing from the data are " - "no longer the same." - ) + # Expect that PR, Island Areas, and Maine/Wisconsin are missing + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=states_in_life_expectancy_usa_file, + nation_expected=self.NATION_EXPECTED_IN_DATA, + puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, + island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, + additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE, + ) logger.info("Downloading data for Maine") maine_download_file_name = ( @@ -131,20 +117,18 @@ class CDCLifeExpectancy(ExtractTransformLoad): axis=0, ) - states_in_combined_df = combined_df[ - self.STATE_INPUT_COLUMN_NAME - ].unique() - - # Find which states are missing from the combined df. - states_missing = sorted( - list(self.EXPECTED_STATES_SET - set(states_in_combined_df)) + states_in_combined_df = list( + combined_df[self.STATE_INPUT_COLUMN_NAME].unique() ) - if len(states_missing) != 0: - raise ValueError( - "The states missing from combined dataframe are " - "no longer as expected." 
- ) + # Expect that PR and Island Areas are the only things now missing + compare_to_list_of_expected_state_fips_codes( + actual_state_fips_codes=states_in_combined_df, + nation_expected=self.NATION_EXPECTED_IN_DATA, + puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA, + island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA, + additional_fips_codes_not_expected=[], + ) # Save the updated version self.raw_df = combined_df