refactoring

2025-09-21 12:11:14 -07:00 · 2022-09-07 16:02:17 -04:00 · 2022-09-07 16:02:17 -04:00 · 70606440fb
commit 70606440fb
parent 56a24b9bd1
6 changed files with 457 additions and 46 deletions
--- a/data/data-pipeline/data_pipeline/config.py
+++ b/data/data-pipeline/data_pipeline/config.py
@ -12,6 +12,7 @@ settings = Dynaconf(
 # set root dir
 settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
 settings.DATA_PATH = settings.APP_ROOT / "data"
 settings.REQUESTS_DEFAULT_TIMOUT = 3600
 # To set an environment use:
 # Linux/OSX: export ENV_FOR_DYNACONF=staging
--- a/data/data-pipeline/data_pipeline/etl/base.py
+++ b/data/data-pipeline/data_pipeline/etl/base.py
@ -7,6 +7,9 @@ from typing import Optional
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.score.etl_utils import (
    compare_to_list_of_expected_state_fips_codes,
 )
 from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
 from data_pipeline.utils import (
    load_yaml_dict_from_file,
@ -43,7 +46,7 @@ class ExtractTransformLoad:
    APP_ROOT: pathlib.Path = settings.APP_ROOT
    # Directories
-    DATA_PATH: pathlib.Path = APP_ROOT / "data"
+    DATA_PATH: pathlib.Path = settings.DATA_PATH
    TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
    CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
    DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
@ -82,6 +85,19 @@ class ExtractTransformLoad:
    # NULL_REPRESENTATION is how nulls are represented on the input field
    NULL_REPRESENTATION: str = None
    # Whether this ETL contains data for the nation (the US states)
    NATION_EXPECTED_IN_DATA: bool = True
    # Whether this ETL contains data for Puerto Rico
    PUERTO_RICO_EXPECTED_IN_DATA: bool = True
    # Whether this ETL contains data for the island areas
    ISLAND_AREAS_EXPECTED_IN_DATA: bool = False
    # FIPS codes of any additional states/territories whose data is known to be
    # missing from this ETL
    EXPECTED_MISSING_STATES: typing.List[str] = []
    # Thirteen digits in a census block group ID.
    EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
    # TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
@ -289,6 +305,21 @@ class ExtractTransformLoad:
                        f"`{geo_field}`."
                    )
        # Check whether data contains expected states
        states_in_output_df = list(
            self.output_df[self.GEOID_TRACT_FIELD_NAME]
            .astype(str)
            .str[0:2]
            .unique()
        )
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=states_in_output_df,
            nation_expected=self.NATION_EXPECTED_IN_DATA,
            puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
            island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
            additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
        )
    def load(self, float_format=None) -> None:
        """Saves the transformed data.
--- a/data/data-pipeline/data_pipeline/etl/score/constants.py
+++ b/data/data-pipeline/data_pipeline/etl/score/constants.py
@ -131,6 +131,59 @@ TILES_NATION_THRESHOLD_COUNT = 21
 # 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
 TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
 TILES_PUERTO_RICO_FIPS_CODE = ["72"]
 # FIPS codes for the "nation" (states and DC): every zero-padded two-digit
 # number from 01 through 56, skipping 03, 07, 14, 43 and 52.
 # NOTE(review): the skipped numbers are presumably unassigned FIPS codes --
 # confirm against the Census reference before extending this list.
 TILES_NATION_FIPS_CODE = [
    f"{fips_number:02d}"
    for fips_number in range(1, 57)
    if fips_number not in (3, 7, 14, 43, 52)
 ]
 # Constant to reflect UI Experience version
 # "Nation" referring to 50 states and DC is from Census
--- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
@ -1,11 +1,18 @@
 import os
 import sys
 import typing
 from pathlib import Path
 from collections import namedtuple
 import numpy as np
 import pandas as pd
 from data_pipeline.config import settings
 from data_pipeline.etl.score.constants import (
    TILES_ISLAND_AREA_FIPS_CODES,
    TILES_PUERTO_RICO_FIPS_CODE,
    TILES_NATION_FIPS_CODE,
 )
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import (
    download_file_from_url,
    get_module_logger,
@ -305,3 +312,73 @@ def create_codebook(
    return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
        columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
    )
 def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: typing.List[str],
    nation_expected: bool = True,
    puerto_rico_expected: bool = True,
    island_areas_expected: bool = True,
    additional_fips_codes_not_expected: typing.Optional[
        typing.List[str]
    ] = None,
 ) -> None:
    """Check whether a list of state/territory FIPS codes matches expectations.

    The expected set starts as every code returned by
    `get_state_fips_codes(settings.DATA_PATH)` and is then narrowed according
    to the flags below. The observed codes must equal the expected set
    exactly: both missing codes and unexpected extra codes are errors.

    Args:
        actual_state_fips_codes (List of str): Actual state codes observed in data
        nation_expected (bool): Do you expect the nation (DC & states) to be
            represented in data?
        puerto_rico_expected (bool): Do you expect PR to be represented in data?
        island_areas_expected (bool): Do you expect Island Areas to be represented in
            data?
        additional_fips_codes_not_expected (List of str, optional): Additional
            state codes not expected in the data. For example, the data may be
            known to be missing data from Maine and Wisconsin. `None` (the
            default) is treated the same as an empty list.

    Returns:
        None: Does not return any values.

    Raises:
        ValueError: if lists do not match expectations.
    """
    # Duplicates and ordering in the input are irrelevant to the comparison.
    actual_state_fips_codes_set = set(actual_state_fips_codes)

    # Start with the list of all FIPS codes for all states and territories.
    expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))

    # If the nation (states and DC) is not expected to be included, remove it
    # from the expected states set.
    if not nation_expected:
        expected_states_set -= set(TILES_NATION_FIPS_CODE)

    # If Puerto Rico is not expected to be included, remove it from the expected
    # states set.
    if not puerto_rico_expected:
        expected_states_set -= set(TILES_PUERTO_RICO_FIPS_CODE)

    # If island areas are not expected to be included, remove them from the
    # expected states set.
    if not island_areas_expected:
        expected_states_set -= set(TILES_ISLAND_AREA_FIPS_CODES)

    # Remove any additional codes the caller knows to be absent. A `None`
    # default is used instead of `[]` so no list object is shared across calls.
    expected_states_set -= set(additional_fips_codes_not_expected or [])

    if expected_states_set != actual_state_fips_codes_set:
        raise ValueError(
            "The states and territories in the data are not as expected.\n"
            "FIPS state codes expected that are not present in the data:\n"
            f"{sorted(expected_states_set - actual_state_fips_codes_set)}\n"
            "FIPS state codes in the data that were not expected:\n"
            f"{sorted(actual_state_fips_codes_set - expected_states_set)}\n"
        )

    logger.info("Data matches expected state and territory representation.")
--- a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
@ -2,7 +2,10 @@ import pandas as pd
 import numpy as np
 import pytest
-from data_pipeline.etl.score.etl_utils import floor_series
+from data_pipeline.etl.score.etl_utils import (
    floor_series,
    compare_to_list_of_expected_state_fips_codes,
 )
 def test_floor_series():
@ -70,3 +73,265 @@ def test_floor_series():
        match="Argument series must be of type pandas series, not of type list.",
    ):
        floor_series(invalid_type, number_of_decimals=3)
 def test_compare_to_list_of_expected_state_fips_codes():
    """Exercise the FIPS comparison helper with complete and partial code lists.

    Each scenario either expects a silent pass or a `ValueError` whose message
    contains the sorted list of missing / unexpected codes.
    """
    # Nation codes (states and DC) used by every fixture below.
    nation_codes = [
        f"{number:02d}"
        for number in range(1, 57)
        if number not in (3, 7, 14, 43, 52)
    ]
    # Island areas plus Puerto Rico ("72"), in ascending order.
    territory_codes = ["60", "66", "69", "72", "78"]
    every_code = nation_codes + territory_codes

    # Scenario 1: complete data with default expectations passes silently.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=every_code
    )

    # Puerto Rico is present but flagged as not expected.
    with pytest.raises(ValueError) as raised:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=every_code,
            puerto_rico_expected=False,
        )
    unexpected_pr_message = (
        "FIPS state codes in the data that were not expected:\n['72']\n"
    )
    assert unexpected_pr_message in str(raised.value)

    # Island Areas are present but flagged as not expected.
    with pytest.raises(ValueError) as raised:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=every_code,
            island_areas_expected=False,
        )
    unexpected_islands_message = (
        "FIPS state codes in the data that were not expected:\n"
        "['60', '66', '69', '78']\n"
    )
    assert unexpected_islands_message in str(raised.value)

    # Scenario 2: data lacks Guam ("66") and Puerto Rico ("72") although all
    # Island Areas and PR are expected by default.
    without_guam_and_pr = [
        code for code in every_code if code not in ("66", "72")
    ]
    with pytest.raises(ValueError) as raised:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=without_guam_and_pr,
        )
    missing_guam_and_pr_message = (
        "FIPS state codes expected that are not present in the data:\n"
        "['66', '72']\n"
    )
    assert missing_guam_and_pr_message in str(raised.value)

    # Scenario 3: data lacks Maine ("23") and Wisconsin ("55").
    without_maine_and_wisconsin = [
        code for code in every_code if code not in ("23", "55")
    ]
    # Fails while both states are still expected...
    with pytest.raises(ValueError) as raised:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=without_maine_and_wisconsin,
        )
    missing_maine_and_wisconsin_message = (
        "FIPS state codes expected that are not present in the data:\n"
        "['23', '55']\n"
    )
    assert missing_maine_and_wisconsin_message in str(raised.value)

    # ...and passes once they are declared as known-missing.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=without_maine_and_wisconsin,
        additional_fips_codes_not_expected=["23", "55"],
    )

    # Scenario 4: data contains only the territories, i.e. the nation is absent.
    territories_only = list(territory_codes)
    # Fails while the nation is expected...
    with pytest.raises(ValueError) as raised:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=territories_only,
        )
    missing_nation_message = (
        "FIPS state codes expected that are not present in the data:\n"
        "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', "
        "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', "
        "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', "
        "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']"
    )
    assert missing_nation_message in str(raised.value)

    # ...and passes once the nation is declared as not expected in the data.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=territories_only, nation_expected=False
    )
--- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
+++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py
@ -1,10 +1,9 @@
 from pathlib import Path
 import pandas as pd
-from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
-from data_pipeline.etl.score.constants import (
+from data_pipeline.etl.score.etl_utils import (
-    TILES_ISLAND_AREA_FIPS_CODES,
+    compare_to_list_of_expected_state_fips_codes,
    TILES_PUERTO_RICO_FIPS_CODE,
 )
 from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
 from data_pipeline.utils import get_module_logger, download_file_from_url
@ -14,8 +13,13 @@ logger = get_module_logger(__name__)
 class CDCLifeExpectancy(ExtractTransformLoad):
    def __init__(self):
        self.GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
        self.PUERTO_RICO_EXPECTED_IN_DATA = False
        self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
        self.STATES_MISSING_FROM_USA_FILE = ["23", "55"]
        # For some reason, LEEP does not include Maine or Wisconsin in its "All of
        # USA" file. Load these separately.
        self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
@ -35,19 +39,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
            self.LIFE_EXPECTANCY_FIELD_NAME,
        ]
        # Set some constants that will be helpful for debugging the source data later.
        self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
        self.EXPECTED_STATES_SET = (
            set(self.STATE_FIPS_CODES)
            # We don't expect LEEP to have data for island areas or Puerto Rico.
            - set(TILES_ISLAND_AREA_FIPS_CODES)
            - set(TILES_PUERTO_RICO_FIPS_CODE)
        )
        # These states are currently missing from LEEP's whole USA file.
        self.EXPECTED_MISSING_STATES = ["23", "55"]
        self.raw_df: pd.DataFrame
        self.output_df: pd.DataFrame
@ -76,23 +67,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
        )
        # Check which states are missing
-        states_in_life_expectancy_usa_file = all_usa_raw_df[
+        states_in_life_expectancy_usa_file = list(
-            self.STATE_INPUT_COLUMN_NAME
+            all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique()
        ].unique()
        # Find which states are missing from the expected set.
        states_missing = sorted(
            list(
                self.EXPECTED_STATES_SET
                - set(states_in_life_expectancy_usa_file)
            )
        )
-        if states_missing != self.EXPECTED_MISSING_STATES:
+        # Expect that PR, Island Areas, and Maine/Wisconsin are missing
-            raise ValueError(
+        compare_to_list_of_expected_state_fips_codes(
-                "LEEP data has changed. The states missing from the data are "
+            actual_state_fips_codes=states_in_life_expectancy_usa_file,
-                "no longer the same."
+            nation_expected=self.NATION_EXPECTED_IN_DATA,
-            )
+            puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
            island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
            additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
        )
        logger.info("Downloading data for Maine")
        maine_download_file_name = (
@ -131,20 +117,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
            axis=0,
        )
-        states_in_combined_df = combined_df[
+        states_in_combined_df = list(
-            self.STATE_INPUT_COLUMN_NAME
+            combined_df[self.STATE_INPUT_COLUMN_NAME].unique()
        ].unique()
        # Find which states are missing from the combined df.
        states_missing = sorted(
            list(self.EXPECTED_STATES_SET - set(states_in_combined_df))
        )
-        if len(states_missing) != 0:
+        # Expect that PR and Island Areas are the only things now missing
-            raise ValueError(
+        compare_to_list_of_expected_state_fips_codes(
-                "The states missing from combined dataframe are "
+            actual_state_fips_codes=states_in_combined_df,
-                "no longer as expected."
+            nation_expected=self.NATION_EXPECTED_IN_DATA,
-            )
+            puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
            island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
            additional_fips_codes_not_expected=[],
        )
        # Save the updated version
        self.raw_df = combined_df