refactoring

2025-09-22 19:24:30 -07:00 · 2022-09-07 16:02:17 -04:00 · 2022-09-07 16:02:17 -04:00 · 70606440fb
commit 70606440fb
parent 56a24b9bd1
6 changed files with 457 additions and 46 deletions
--- a/data/data-pipeline/data_pipeline/config.py
+++ b/data/data-pipeline/data_pipeline/config.py
@ -12,6 +12,7 @@ settings = Dynaconf(

 # set root dir
 settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
+settings.DATA_PATH = settings.APP_ROOT / "data"
 settings.REQUESTS_DEFAULT_TIMOUT = 3600
 # To set an environment use:
 # Linux/OSX: export ENV_FOR_DYNACONF=staging
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@ -7,6 +7,9 @@ from typing import Optional
 import pandas as pd

 from data_pipeline.config import settings
+from data_pipeline.etl.score.etl_utils import (
+    compare_to_list_of_expected_state_fips_codes,
+)
 from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import (
    load_yaml_dict_from_file,
@ -43,7 +46,7 @@ class ExtractTransformLoad:
    APP_ROOT: pathlib.Path = settings.APP_ROOT

    # Directories
-    DATA_PATH: pathlib.Path = APP_ROOT / "data"
+    DATA_PATH: pathlib.Path = settings.DATA_PATH
    TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
    CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
    DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
@ -82,6 +85,19 @@ class ExtractTransformLoad:
    # NULL_REPRESENTATION is how nulls are represented on the input field
    NULL_REPRESENTATION: str = None

+    # Whether this ETL contains data for the nation (the US states)
+    NATION_EXPECTED_IN_DATA: bool = True
+
+    # Whether this ETL contains data for Puerto Rico
+    PUERTO_RICO_EXPECTED_IN_DATA: bool = True
+
+    # Whether this ETL contains data for the island areas
+    ISLAND_AREAS_EXPECTED_IN_DATA: bool = False
+
+    # FIPS codes of any additional states/territories that are known to be
+    # missing from this ETL's data
+    EXPECTED_MISSING_STATES: typing.List[str] = []
+
    # Thirteen digits in a census block group ID.
    EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
    # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@ -289,6 +305,21 @@ class ExtractTransformLoad:
                        f"`{geo_field}`."
                    )

+        # Check whether data contains expected states
+        states_in_output_df = list(
+            self.output_df[self.GEOID_TRACT_FIELD_NAME]
+            .astype(str)
+            .str[0:2]
+            .unique()
+        )
+        compare_to_list_of_expected_state_fips_codes(
+            actual_state_fips_codes=states_in_output_df,
+            nation_expected=self.NATION_EXPECTED_IN_DATA,
+            puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
+            island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
+            additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
+        )
+
    def load(self, float_format=None) -> None:
        """Saves the transformed data.

--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -131,6 +131,59 @@ TILES_NATION_THRESHOLD_COUNT = 21
 # 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
 TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
 TILES_PUERTO_RICO_FIPS_CODE = ["72"]
+TILES_NATION_FIPS_CODE = [
+    "01",
+    "02",
+    "04",
+    "05",
+    "06",
+    "08",
+    "09",
+    "10",
+    "11",
+    "12",
+    "13",
+    "15",
+    "16",
+    "17",
+    "18",
+    "19",
+    "20",
+    "21",
+    "22",
+    "23",
+    "24",
+    "25",
+    "26",
+    "27",
+    "28",
+    "29",
+    "30",
+    "31",
+    "32",
+    "33",
+    "34",
+    "35",
+    "36",
+    "37",
+    "38",
+    "39",
+    "40",
+    "41",
+    "42",
+    "44",
+    "45",
+    "46",
+    "47",
+    "48",
+    "49",
+    "50",
+    "51",
+    "53",
+    "54",
+    "55",
+    "56",
+]

 # Constant to reflect UI Experience version
 # "Nation" referring to 50 states and DC is from Census
--- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
@ -1,11 +1,18 @@
 import os
 import sys
+import typing
 from pathlib import Path
 from collections import namedtuple
 import numpy as np
 import pandas as pd

 from data_pipeline.config import settings
+from data_pipeline.etl.score.constants import (
+    TILES_ISLAND_AREA_FIPS_CODES,
+    TILES_PUERTO_RICO_FIPS_CODE,
+    TILES_NATION_FIPS_CODE,
+)
+from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import (
    download_file_from_url,
    get_module_logger,
@ -305,3 +312,73 @@ def create_codebook(
    return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
        columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
    )
+
+
+def compare_to_list_of_expected_state_fips_codes(
+    actual_state_fips_codes: typing.List[str],
+    nation_expected: bool = True,
+    puerto_rico_expected: bool = True,
+    island_areas_expected: bool = True,
+    additional_fips_codes_not_expected: typing.Optional[typing.List[str]] = None,
+) -> None:
+    """Check whether a list of state/territory FIPS codes matches expectations.
+
+    The expected set starts as every code returned by `get_state_fips_codes`
+    and is narrowed by each flag set to False and by the additional codes.
+    The observed codes must then equal that expected set exactly.
+
+    Args:
+        actual_state_fips_codes (List of str): Actual state codes observed in data
+        nation_expected (bool): Do you expect the nation (DC & states) to be
+            represented in data?
+        puerto_rico_expected (bool): Do you expect PR to be represented in data?
+        island_areas_expected (bool): Do you expect Island Areas to be represented in
+            data?
+        additional_fips_codes_not_expected (List of str, optional): Additional
+            state codes not expected in the data. For example, the data may be
+            known to be missing data from Maine and Wisconsin. Defaults to None,
+            which is treated as an empty list.
+
+    Returns:
+        None: Does not return any values.
+
+    Raises:
+        ValueError: if the observed codes differ from the expected codes, in
+            either direction (missing or unexpected codes).
+    """
+    # Compare as sets so that ordering and duplicates do not matter.
+    actual_state_fips_codes_set = set(actual_state_fips_codes)
+
+    # Start with the set of all FIPS codes for all states and territories.
+    expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))
+
+    # Remove every group of codes that is not expected to be in the data.
+    if not nation_expected:
+        expected_states_set -= set(TILES_NATION_FIPS_CODE)
+    if not puerto_rico_expected:
+        expected_states_set -= set(TILES_PUERTO_RICO_FIPS_CODE)
+    if not island_areas_expected:
+        expected_states_set -= set(TILES_ISLAND_AREA_FIPS_CODES)
+    # `None` stands in for "no additional exclusions"; it avoids a mutable
+    # default argument shared between calls.
+    expected_states_set -= set(additional_fips_codes_not_expected or [])
+
+    missing_codes = sorted(expected_states_set - actual_state_fips_codes_set)
+    unexpected_codes = sorted(actual_state_fips_codes_set - expected_states_set)
+
+    if missing_codes or unexpected_codes:
+        raise ValueError(
+            "The states and territories in the data are not as expected.\n"
+            "FIPS state codes expected that are not present in the data:\n"
+            f"{missing_codes}\n"
+            "FIPS state codes in the data that were not expected:\n"
+            f"{unexpected_codes}\n"
+        )
+
+    logger.info("Data matches expected state and territory representation.")
--- a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
@ -2,7 +2,10 @@ import pandas as pd
 import numpy as np
 import pytest

-from data_pipeline.etl.score.etl_utils import floor_series
+from data_pipeline.etl.score.etl_utils import (
+    floor_series,
+    compare_to_list_of_expected_state_fips_codes,
+)


 def test_floor_series():
@ -70,3 +73,265 @@ def test_floor_series():
        match="Argument series must be of type pandas series, not of type list.",
    ):
        floor_series(invalid_type, number_of_decimals=3)
+
+
+def test_compare_to_list_of_expected_state_fips_codes():
+    # The nation (states and DC) uses every two-digit code from 01 to 56 except
+    # the five numbers skipped below.
+    nation_codes = [
+        f"{number:02d}"
+        for number in range(1, 57)
+        if number not in (3, 7, 14, 43, 52)
+    ]
+    # 60, 66, 69 and 78 are the island areas; 72 is Puerto Rico.
+    territory_codes = ["60", "66", "69", "72", "78"]
+    every_code = nation_codes + territory_codes
+
+    def error_message_for(**arguments):
+        # Run the comparison, require a ValueError, and hand back its message.
+        with pytest.raises(ValueError) as exception_info:
+            compare_to_list_of_expected_state_fips_codes(**arguments)
+        return str(exception_info.value)
+
+    # Complete data with every group expected: no error.
+    compare_to_list_of_expected_state_fips_codes(
+        actual_state_fips_codes=every_code
+    )
+
+    # Puerto Rico is present in the data but not expected.
+    assert (
+        "FIPS state codes in the data that were not expected:\n['72']\n"
+        in error_message_for(
+            actual_state_fips_codes=every_code,
+            puerto_rico_expected=False,
+        )
+    )
+
+    # The island areas are present in the data but not expected.
+    assert (
+        "FIPS state codes in the data that were not expected:\n"
+        "['60', '66', '69', '78']\n"
+    ) in error_message_for(
+        actual_state_fips_codes=every_code,
+        island_areas_expected=False,
+    )
+
+    # Data lacking Puerto Rico (72) and Guam (66) while both are expected.
+    without_pr_and_guam = nation_codes + ["60", "69", "78"]
+    assert (
+        "FIPS state codes expected that are not present in the data:\n"
+        "['66', '72']\n"
+    ) in error_message_for(actual_state_fips_codes=without_pr_and_guam)
+
+    # Data lacking Maine (23) and Wisconsin (55).
+    without_maine_and_wisconsin = [
+        code for code in every_code if code not in ("23", "55")
+    ]
+
+    # Both states are expected by default, so their absence is an error.
+    assert (
+        "FIPS state codes expected that are not present in the data:\n"
+        "['23', '55']\n"
+    ) in error_message_for(actual_state_fips_codes=without_maine_and_wisconsin)
+
+    # No error once Maine and Wisconsin are declared as expected to be missing.
+    compare_to_list_of_expected_state_fips_codes(
+        actual_state_fips_codes=without_maine_and_wisconsin,
+        additional_fips_codes_not_expected=["23", "55"],
+    )
+
+    # Data holding only the territories while the nation is expected.
+    message_without_nation = error_message_for(
+        actual_state_fips_codes=territory_codes,
+    )
+    expected_fragment = (
+        "FIPS state codes expected that are not present in the data:\n"
+        "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', "
+        "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', "
+        "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', "
+        "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']"
+    )
+    assert expected_fragment in message_without_nation
+
+    # No error once the nation is declared as not expected in the data.
+    compare_to_list_of_expected_state_fips_codes(
+        actual_state_fips_codes=territory_codes, nation_expected=False
+    )
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
@ -1,10 +1,9 @@
 from pathlib import Path
 import pandas as pd

-from data_pipeline.etl.base import ExtractTransformLoad
-from data_pipeline.etl.score.constants import (
-    TILES_ISLAND_AREA_FIPS_CODES,
-    TILES_PUERTO_RICO_FIPS_CODE,
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
+from data_pipeline.etl.score.etl_utils import (
+    compare_to_list_of_expected_state_fips_codes,
 )
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger, download_file_from_url
@ -14,8 +13,13 @@ logger = get_module_logger(__name__)

 class CDCLifeExpectancy(ExtractTransformLoad):
    def __init__(self):
+        self.GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
+        self.PUERTO_RICO_EXPECTED_IN_DATA = False
+
        self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"

+        self.STATES_MISSING_FROM_USA_FILE = ["23", "55"]
+
        # For some reason, LEEP does not include Maine or Wisconsin in its "All of
        # USA" file. Load these separately.
        self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
@ -35,19 +39,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
            self.LIFE_EXPECTANCY_FIELD_NAME,
        ]

-        # Set some constants that will be helpful for debugging the source data later.
-        self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
-
-        self.EXPECTED_STATES_SET = (
-            set(self.STATE_FIPS_CODES)
-            # We don't expect LEEP to have data for island areas or Puerto Rico.
-            - set(TILES_ISLAND_AREA_FIPS_CODES)
-            - set(TILES_PUERTO_RICO_FIPS_CODE)
-        )
-
-        # These states are currently missing from LEEP's whole USA file.
-        self.EXPECTED_MISSING_STATES = ["23", "55"]
-
        self.raw_df: pd.DataFrame
        self.output_df: pd.DataFrame

@ -76,23 +67,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
        )

        # Check which states are missing
-        states_in_life_expectancy_usa_file = all_usa_raw_df[
-            self.STATE_INPUT_COLUMN_NAME
-        ].unique()
-
-        # Find which states are missing from the expected set.
-        states_missing = sorted(
-            list(
-                self.EXPECTED_STATES_SET
-                - set(states_in_life_expectancy_usa_file)
-            )
+        states_in_life_expectancy_usa_file = list(
+            all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique()
        )

-        if states_missing != self.EXPECTED_MISSING_STATES:
-            raise ValueError(
-                "LEEP data has changed. The states missing from the data are "
-                "no longer the same."
-            )
+        # Expect that PR, Island Areas, and Maine/Wisconsin are missing
+        compare_to_list_of_expected_state_fips_codes(
+            actual_state_fips_codes=states_in_life_expectancy_usa_file,
+            nation_expected=self.NATION_EXPECTED_IN_DATA,
+            puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
+            island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
+            additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
+        )

        logger.info("Downloading data for Maine")
        maine_download_file_name = (
@ -131,20 +117,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
            axis=0,
        )

-        states_in_combined_df = combined_df[
-            self.STATE_INPUT_COLUMN_NAME
-        ].unique()
-
-        # Find which states are missing from the combined df.
-        states_missing = sorted(
-            list(self.EXPECTED_STATES_SET - set(states_in_combined_df))
+        states_in_combined_df = list(
+            combined_df[self.STATE_INPUT_COLUMN_NAME].unique()
        )

-        if len(states_missing) != 0:
-            raise ValueError(
-                "The states missing from combined dataframe are "
-                "no longer as expected."
-            )
+        # Expect that PR and Island Areas are the only things now missing
+        compare_to_list_of_expected_state_fips_codes(
+            actual_state_fips_codes=states_in_combined_df,
+            nation_expected=self.NATION_EXPECTED_IN_DATA,
+            puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
+            island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
+            additional_fips_codes_not_expected=[],
+        )

        # Save the updated version
        self.raw_df = combined_df