mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 10:04:18 -08:00
refactoring
This commit is contained in:
parent
56a24b9bd1
commit
70606440fb
6 changed files with 457 additions and 46 deletions
|
@ -12,6 +12,7 @@ settings = Dynaconf(
|
|||
|
||||
# set root dir
|
||||
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
|
||||
settings.DATA_PATH = settings.APP_ROOT / "data"
|
||||
settings.REQUESTS_DEFAULT_TIMOUT = 3600
|
||||
# To set an environment use:
|
||||
# Linux/OSX: export ENV_FOR_DYNACONF=staging
|
||||
|
|
|
@ -7,6 +7,9 @@ from typing import Optional
|
|||
import pandas as pd
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.etl.score.etl_utils import (
|
||||
compare_to_list_of_expected_state_fips_codes,
|
||||
)
|
||||
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
|
||||
from data_pipeline.utils import (
|
||||
load_yaml_dict_from_file,
|
||||
|
@ -43,7 +46,7 @@ class ExtractTransformLoad:
|
|||
APP_ROOT: pathlib.Path = settings.APP_ROOT
|
||||
|
||||
# Directories
|
||||
DATA_PATH: pathlib.Path = APP_ROOT / "data"
|
||||
DATA_PATH: pathlib.Path = settings.DATA_PATH
|
||||
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
|
||||
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
|
||||
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
|
||||
|
@ -82,6 +85,19 @@ class ExtractTransformLoad:
|
|||
# NULL_REPRESENTATION is how nulls are represented on the input field
|
||||
NULL_REPRESENTATION: str = None
|
||||
|
||||
# Whether this ETL contains data for the nation (the US states)
|
||||
NATION_EXPECTED_IN_DATA: bool = True
|
||||
|
||||
# Whether this ETL contains data for Puerto Rico
|
||||
PUERTO_RICO_EXPECTED_IN_DATA: bool = True
|
||||
|
||||
# Whether this ETL contains data for the island areas
|
||||
ISLAND_AREAS_EXPECTED_IN_DATA: bool = False
|
||||
|
||||
# Whether this ETL contains known missing data for any additional
|
||||
# states/territories
|
||||
EXPECTED_MISSING_STATES: typing.List[str] = []
|
||||
|
||||
# Thirteen digits in a census block group ID.
|
||||
EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
|
||||
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
|
||||
|
@ -289,6 +305,21 @@ class ExtractTransformLoad:
|
|||
f"`{geo_field}`."
|
||||
)
|
||||
|
||||
# Check whether data contains expected states
|
||||
states_in_output_df = list(
|
||||
self.output_df[self.GEOID_TRACT_FIELD_NAME]
|
||||
.astype(str)
|
||||
.str[0:2]
|
||||
.unique()
|
||||
)
|
||||
compare_to_list_of_expected_state_fips_codes(
|
||||
actual_state_fips_codes=states_in_output_df,
|
||||
nation_expected=self.NATION_EXPECTED_IN_DATA,
|
||||
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
|
||||
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
|
||||
additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
|
||||
)
|
||||
|
||||
def load(self, float_format=None) -> None:
|
||||
"""Saves the transformed data.
|
||||
|
||||
|
|
|
@ -131,6 +131,59 @@ TILES_NATION_THRESHOLD_COUNT = 21
|
|||
# 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
TILES_PUERTO_RICO_FIPS_CODE = ["72"]
# Two-digit, zero-padded codes from "01" through "56", leaving out the five
# numbers in that range (03, 07, 14, 43, 52) that are not part of this list.
TILES_NATION_FIPS_CODE = [
    f"{code:02d}" for code in range(1, 57) if code not in (3, 7, 14, 43, 52)
]
|
||||
|
||||
# Constant to reflect UI Experience version
|
||||
# "Nation" referring to 50 states and DC is from Census
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
import os
|
||||
import sys
|
||||
import typing
|
||||
from pathlib import Path
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.etl.score.constants import (
|
||||
TILES_ISLAND_AREA_FIPS_CODES,
|
||||
TILES_PUERTO_RICO_FIPS_CODE,
|
||||
TILES_NATION_FIPS_CODE,
|
||||
)
|
||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||
from data_pipeline.utils import (
|
||||
download_file_from_url,
|
||||
get_module_logger,
|
||||
|
@ -305,3 +312,73 @@ def create_codebook(
|
|||
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
|
||||
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
|
||||
)
|
||||
|
||||
|
||||
def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: typing.List[str],
    nation_expected: bool = True,
    puerto_rico_expected: bool = True,
    island_areas_expected: bool = True,
    additional_fips_codes_not_expected: typing.Optional[
        typing.List[str]
    ] = None,
) -> None:
    """Check whether a list of state/territory FIPS codes matches expectations.

    The expected set starts as every code returned by
    `get_state_fips_codes(settings.DATA_PATH)` and is then narrowed by the
    flags below. The comparison is an exact set equality: codes that are
    expected but absent AND codes that are present but unexpected both cause
    a failure.

    Args:
        actual_state_fips_codes (List of str): Actual state codes observed in
            data. Duplicates and ordering are ignored.
        nation_expected (bool): Do you expect the nation (DC & states) to be
            represented in data?
        puerto_rico_expected (bool): Do you expect PR to be represented in
            data?
        island_areas_expected (bool): Do you expect Island Areas to be
            represented in data?
        additional_fips_codes_not_expected (List of str, optional): Additional
            state codes not expected in the data. For example, the data may be
            known to be missing data from Maine and Wisconsin. `None` (the
            default) is treated the same as an empty list.

    Returns:
        None: Does not return any values.

    Raises:
        ValueError: if lists do not match expectations. The message lists,
            sorted, the expected-but-missing codes and the
            present-but-unexpected codes.
    """
    # Cast input to a set so duplicates and ordering do not matter.
    actual_state_fips_codes_set = set(actual_state_fips_codes)

    # Start with the list of all FIPS codes for all states and territories.
    expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))

    # Remove each group of codes that the caller does not expect to see.
    if not nation_expected:
        expected_states_set -= set(TILES_NATION_FIPS_CODE)

    if not puerto_rico_expected:
        expected_states_set -= set(TILES_PUERTO_RICO_FIPS_CODE)

    if not island_areas_expected:
        expected_states_set -= set(TILES_ISLAND_AREA_FIPS_CODES)

    # A `None` default (rather than a shared mutable `[]`) is normalised to
    # "nothing extra to exclude" here.
    expected_states_set -= set(additional_fips_codes_not_expected or [])

    if expected_states_set != actual_state_fips_codes_set:
        raise ValueError(
            "The states and territories in the data are not as expected.\n"
            "FIPS state codes expected that are not present in the data:\n"
            f"{sorted(expected_states_set - actual_state_fips_codes_set)}\n"
            "FIPS state codes in the data that were not expected:\n"
            f"{sorted(actual_state_fips_codes_set - expected_states_set)}\n"
        )

    logger.info("Data matches expected state and territory representation.")
|
||||
|
|
|
@ -2,7 +2,10 @@ import pandas as pd
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from data_pipeline.etl.score.etl_utils import floor_series
|
||||
from data_pipeline.etl.score.etl_utils import (
|
||||
floor_series,
|
||||
compare_to_list_of_expected_state_fips_codes,
|
||||
)
|
||||
|
||||
|
||||
def test_floor_series():
|
||||
|
@ -70,3 +73,265 @@ def test_floor_series():
|
|||
match="Argument series must be of type pandas series, not of type list.",
|
||||
):
|
||||
floor_series(invalid_type, number_of_decimals=3)
|
||||
|
||||
|
||||
def test_compare_to_list_of_expected_state_fips_codes():
    """Check the FIPS validator against complete and incomplete code lists.

    Covers: a complete list, unexpected Puerto Rico, unexpected Island Areas,
    missing PR and Guam, missing Maine and Wisconsin (both flagged and
    explicitly allowed), and a territories-only list (both flagged and
    allowed via `nation_expected=False`).
    """
    # The 50 states plus DC.
    nation_codes = (
        "01 02 04 05 06 08 09 10 11 12 13 15 16 17 18 19 20 "
        "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 "
        "38 39 40 41 42 44 45 46 47 48 49 50 51 53 54 55 56"
    ).split()
    # 60, 66, 69 and 78 are the Island Areas; 72 is Puerto Rico.
    territory_codes = ["60", "66", "69", "72", "78"]

    def error_text_for(**call_kwargs) -> str:
        """Run the validator, require a ValueError, and return its message."""
        with pytest.raises(ValueError) as exception_info:
            compare_to_list_of_expected_state_fips_codes(**call_kwargs)
        return str(exception_info.value)

    # Every state and territory is present.
    fips_codes_test_1 = nation_codes + territory_codes

    # Should not raise any errors
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=fips_codes_test_1
    )

    # Should raise error because Puerto Rico is not expected
    assert (
        "FIPS state codes in the data that were not expected:\n['72']\n"
    ) in error_text_for(
        actual_state_fips_codes=fips_codes_test_1,
        puerto_rico_expected=False,
    )

    # Should raise error because Island Areas are not expected
    assert (
        "FIPS state codes in the data that were not expected:\n"
        "['60', '66', '69', '78']\n"
    ) in error_text_for(
        actual_state_fips_codes=fips_codes_test_1,
        island_areas_expected=False,
    )

    # List missing PR and Guam
    fips_codes_test_2 = [
        code for code in fips_codes_test_1 if code not in ("66", "72")
    ]

    # Should raise error because all Island Areas and PR are expected
    assert (
        "FIPS state codes expected that are not present in the data:\n"
        "['66', '72']\n"
    ) in error_text_for(actual_state_fips_codes=fips_codes_test_2)

    # Missing Maine and Wisconsin
    fips_codes_test_3 = [
        code for code in fips_codes_test_1 if code not in ("23", "55")
    ]

    # Should raise error because Maine and Wisconsin are expected
    assert (
        "FIPS state codes expected that are not present in the data:\n"
        "['23', '55']\n"
    ) in error_text_for(actual_state_fips_codes=fips_codes_test_3)

    # Should not raise error because Maine and Wisconsin are expected to be missing
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=fips_codes_test_3,
        additional_fips_codes_not_expected=["23", "55"],
    )

    # Missing the nation: only Puerto Rico and the Island Areas are present.
    fips_codes_test_4 = list(territory_codes)

    # Should raise error because the nation is expected
    assert (
        "FIPS state codes expected that are not present in the data:\n"
        "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', "
        "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', "
        "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', "
        "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']"
    ) in error_text_for(actual_state_fips_codes=fips_codes_test_4)

    # Should not raise error because the nation is declared as not expected
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=fips_codes_test_4, nation_expected=False
    )
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.score.constants import (
|
||||
TILES_ISLAND_AREA_FIPS_CODES,
|
||||
TILES_PUERTO_RICO_FIPS_CODE,
|
||||
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
|
||||
from data_pipeline.etl.score.etl_utils import (
|
||||
compare_to_list_of_expected_state_fips_codes,
|
||||
)
|
||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||
from data_pipeline.utils import get_module_logger, download_file_from_url
|
||||
|
@ -14,8 +13,13 @@ logger = get_module_logger(__name__)
|
|||
|
||||
class CDCLifeExpectancy(ExtractTransformLoad):
|
||||
def __init__(self):
|
||||
self.GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
|
||||
self.PUERTO_RICO_EXPECTED_IN_DATA = False
|
||||
|
||||
self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
|
||||
|
||||
self.STATES_MISSING_FROM_USA_FILE = ["23", "55"]
|
||||
|
||||
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
|
||||
# USA" file. Load these separately.
|
||||
self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
|
||||
|
@ -35,19 +39,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
|||
self.LIFE_EXPECTANCY_FIELD_NAME,
|
||||
]
|
||||
|
||||
# Set some constants that will be helpful for debugging the source data later.
|
||||
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
|
||||
|
||||
self.EXPECTED_STATES_SET = (
|
||||
set(self.STATE_FIPS_CODES)
|
||||
# We don't expect LEEP to have data for island areas or Puerto Rico.
|
||||
- set(TILES_ISLAND_AREA_FIPS_CODES)
|
||||
- set(TILES_PUERTO_RICO_FIPS_CODE)
|
||||
)
|
||||
|
||||
# These states are currently missing from LEEP's whole USA file.
|
||||
self.EXPECTED_MISSING_STATES = ["23", "55"]
|
||||
|
||||
self.raw_df: pd.DataFrame
|
||||
self.output_df: pd.DataFrame
|
||||
|
||||
|
@ -76,23 +67,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
|||
)
|
||||
|
||||
# Check which states are missing
|
||||
states_in_life_expectancy_usa_file = all_usa_raw_df[
|
||||
self.STATE_INPUT_COLUMN_NAME
|
||||
].unique()
|
||||
|
||||
# Find which states are missing from the expected set.
|
||||
states_missing = sorted(
|
||||
list(
|
||||
self.EXPECTED_STATES_SET
|
||||
- set(states_in_life_expectancy_usa_file)
|
||||
)
|
||||
states_in_life_expectancy_usa_file = list(
|
||||
all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique()
|
||||
)
|
||||
|
||||
if states_missing != self.EXPECTED_MISSING_STATES:
|
||||
raise ValueError(
|
||||
"LEEP data has changed. The states missing from the data are "
|
||||
"no longer the same."
|
||||
)
|
||||
# Expect that PR, Island Areas, and Maine/Wisconsin are missing
|
||||
compare_to_list_of_expected_state_fips_codes(
|
||||
actual_state_fips_codes=states_in_life_expectancy_usa_file,
|
||||
nation_expected=self.NATION_EXPECTED_IN_DATA,
|
||||
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
|
||||
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
|
||||
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
|
||||
)
|
||||
|
||||
logger.info("Downloading data for Maine")
|
||||
maine_download_file_name = (
|
||||
|
@ -131,20 +117,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
|||
axis=0,
|
||||
)
|
||||
|
||||
states_in_combined_df = combined_df[
|
||||
self.STATE_INPUT_COLUMN_NAME
|
||||
].unique()
|
||||
|
||||
# Find which states are missing from the combined df.
|
||||
states_missing = sorted(
|
||||
list(self.EXPECTED_STATES_SET - set(states_in_combined_df))
|
||||
states_in_combined_df = list(
|
||||
combined_df[self.STATE_INPUT_COLUMN_NAME].unique()
|
||||
)
|
||||
|
||||
if len(states_missing) != 0:
|
||||
raise ValueError(
|
||||
"The states missing from combined dataframe are "
|
||||
"no longer as expected."
|
||||
)
|
||||
# Expect that PR and Island Areas are the only things now missing
|
||||
compare_to_list_of_expected_state_fips_codes(
|
||||
actual_state_fips_codes=states_in_combined_df,
|
||||
nation_expected=self.NATION_EXPECTED_IN_DATA,
|
||||
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
|
||||
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
|
||||
additional_fips_codes_not_expected=[],
|
||||
)
|
||||
|
||||
# Save the updated version
|
||||
self.raw_df = combined_df
|
||||
|
|
Loading…
Add table
Reference in a new issue