mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-02-23 10:04:18 -08:00
refactoring
This commit is contained in:
parent
56a24b9bd1
commit
70606440fb
6 changed files with 457 additions and 46 deletions
|
@ -12,6 +12,7 @@ settings = Dynaconf(
|
||||||
|
|
||||||
# set root dir
|
# set root dir
|
||||||
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
|
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent
|
||||||
|
settings.DATA_PATH = settings.APP_ROOT / "data"
|
||||||
settings.REQUESTS_DEFAULT_TIMOUT = 3600
|
settings.REQUESTS_DEFAULT_TIMOUT = 3600
|
||||||
# To set an environment use:
|
# To set an environment use:
|
||||||
# Linux/OSX: export ENV_FOR_DYNACONF=staging
|
# Linux/OSX: export ENV_FOR_DYNACONF=staging
|
||||||
|
|
|
@ -7,6 +7,9 @@ from typing import Optional
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from data_pipeline.config import settings
|
from data_pipeline.config import settings
|
||||||
|
from data_pipeline.etl.score.etl_utils import (
|
||||||
|
compare_to_list_of_expected_state_fips_codes,
|
||||||
|
)
|
||||||
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
|
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
|
||||||
from data_pipeline.utils import (
|
from data_pipeline.utils import (
|
||||||
load_yaml_dict_from_file,
|
load_yaml_dict_from_file,
|
||||||
|
@ -43,7 +46,7 @@ class ExtractTransformLoad:
|
||||||
APP_ROOT: pathlib.Path = settings.APP_ROOT
|
APP_ROOT: pathlib.Path = settings.APP_ROOT
|
||||||
|
|
||||||
# Directories
|
# Directories
|
||||||
DATA_PATH: pathlib.Path = APP_ROOT / "data"
|
DATA_PATH: pathlib.Path = settings.DATA_PATH
|
||||||
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
|
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
|
||||||
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
|
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
|
||||||
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
|
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
|
||||||
|
@ -82,6 +85,19 @@ class ExtractTransformLoad:
|
||||||
# NULL_REPRESENTATION is how nulls are represented on the input field
|
# NULL_REPRESENTATION is how nulls are represented on the input field
|
||||||
NULL_REPRESENTATION: str = None
|
NULL_REPRESENTATION: str = None
|
||||||
|
|
||||||
|
# Whether this ETL contains data for the nation (the US states)
|
||||||
|
NATION_EXPECTED_IN_DATA: bool = True
|
||||||
|
|
||||||
|
# Whether this ETL contains data for Puerto Rico
|
||||||
|
PUERTO_RICO_EXPECTED_IN_DATA: bool = True
|
||||||
|
|
||||||
|
# Whether this ETL contains data for the island areas
|
||||||
|
ISLAND_AREAS_EXPECTED_IN_DATA: bool = False
|
||||||
|
|
||||||
|
# Whether this ETL contains known missing data for any additional
|
||||||
|
# states/territories
|
||||||
|
EXPECTED_MISSING_STATES: typing.List[str] = []
|
||||||
|
|
||||||
# Thirteen digits in a census block group ID.
|
# Thirteen digits in a census block group ID.
|
||||||
EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
|
EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
|
||||||
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
|
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
|
||||||
|
@ -289,6 +305,21 @@ class ExtractTransformLoad:
|
||||||
f"`{geo_field}`."
|
f"`{geo_field}`."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Check whether data contains expected states
|
||||||
|
states_in_output_df = list(
|
||||||
|
self.output_df[self.GEOID_TRACT_FIELD_NAME]
|
||||||
|
.astype(str)
|
||||||
|
.str[0:2]
|
||||||
|
.unique()
|
||||||
|
)
|
||||||
|
compare_to_list_of_expected_state_fips_codes(
|
||||||
|
actual_state_fips_codes=states_in_output_df,
|
||||||
|
nation_expected=self.NATION_EXPECTED_IN_DATA,
|
||||||
|
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
|
||||||
|
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
|
||||||
|
additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
|
||||||
|
)
|
||||||
|
|
||||||
def load(self, float_format=None) -> None:
|
def load(self, float_format=None) -> None:
|
||||||
"""Saves the transformed data.
|
"""Saves the transformed data.
|
||||||
|
|
||||||
|
|
|
@ -131,6 +131,59 @@ TILES_NATION_THRESHOLD_COUNT = 21
|
||||||
# 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
|
# 60: American Samoa, 66: Guam, 69: N. Mariana Islands, 78: US Virgin Islands
|
||||||
TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
|
TILES_ISLAND_AREA_FIPS_CODES = ["60", "66", "69", "78"]
|
||||||
TILES_PUERTO_RICO_FIPS_CODE = ["72"]
|
TILES_PUERTO_RICO_FIPS_CODE = ["72"]
|
||||||
|
# State-level FIPS codes that make up the "nation" (the states and DC).
# Territories are listed separately in TILES_PUERTO_RICO_FIPS_CODE and
# TILES_ISLAND_AREA_FIPS_CODES.
# fmt: off
TILES_NATION_FIPS_CODE = [
    "01", "02", "04", "05", "06", "08", "09", "10", "11", "12",
    "13", "15", "16", "17", "18", "19", "20", "21", "22", "23",
    "24", "25", "26", "27", "28", "29", "30", "31", "32", "33",
    "34", "35", "36", "37", "38", "39", "40", "41", "42", "44",
    "45", "46", "47", "48", "49", "50", "51", "53", "54", "55",
    "56",
]
# fmt: on
|
||||||
|
|
||||||
# Constant to reflect UI Experience version
|
# Constant to reflect UI Experience version
|
||||||
# "Nation" referring to 50 states and DC is from Census
|
# "Nation" referring to 50 states and DC is from Census
|
||||||
|
|
|
@ -1,11 +1,18 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import typing
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from data_pipeline.config import settings
|
from data_pipeline.config import settings
|
||||||
|
from data_pipeline.etl.score.constants import (
|
||||||
|
TILES_ISLAND_AREA_FIPS_CODES,
|
||||||
|
TILES_PUERTO_RICO_FIPS_CODE,
|
||||||
|
TILES_NATION_FIPS_CODE,
|
||||||
|
)
|
||||||
|
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||||
from data_pipeline.utils import (
|
from data_pipeline.utils import (
|
||||||
download_file_from_url,
|
download_file_from_url,
|
||||||
get_module_logger,
|
get_module_logger,
|
||||||
|
@ -305,3 +312,73 @@ def create_codebook(
|
||||||
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
|
return merged_codebook_df[constants.CODEBOOK_COLUMNS].rename(
|
||||||
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
|
columns={constants.CEJST_SCORE_COLUMN_NAME: "Description"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def compare_to_list_of_expected_state_fips_codes(
    actual_state_fips_codes: typing.List[str],
    nation_expected: bool = True,
    puerto_rico_expected: bool = True,
    island_areas_expected: bool = True,
    additional_fips_codes_not_expected: typing.Optional[
        typing.List[str]
    ] = None,
) -> None:
    """Check whether a list of state/territory FIPS codes matches expectations.

    The expected set starts from every code returned by
    `get_state_fips_codes(settings.DATA_PATH)` and is then narrowed by removing
    each group the caller says should be absent. The comparison is exact:
    both missing codes and unexpected extra codes are treated as errors.

    Args:
        actual_state_fips_codes (List of str): Actual state codes observed in
            data. Duplicates and ordering are ignored.
        nation_expected (bool): Do you expect the nation (DC & states) to be
            represented in data?
        puerto_rico_expected (bool): Do you expect PR to be represented in
            data?
        island_areas_expected (bool): Do you expect Island Areas to be
            represented in data?
        additional_fips_codes_not_expected (List of str, optional): Additional
            state codes not expected in the data. For example, the data may be
            known to be missing data from Maine and Wisconsin. `None` (the
            default) means there are no additional exclusions.

    Returns:
        None: Does not return any values.

    Raises:
        ValueError: if the observed codes do not match the expected codes.
    """
    actual_state_fips_codes_set = set(actual_state_fips_codes)

    # Start with the FIPS codes for all states and territories.
    expected_states_set = set(get_state_fips_codes(settings.DATA_PATH))

    # Remove every group of codes the caller does not expect to see.
    if not nation_expected:
        expected_states_set -= set(TILES_NATION_FIPS_CODE)

    if not puerto_rico_expected:
        expected_states_set -= set(TILES_PUERTO_RICO_FIPS_CODE)

    if not island_areas_expected:
        expected_states_set -= set(TILES_ISLAND_AREA_FIPS_CODES)

    # `None` is used as the default instead of a shared mutable `[]`.
    if additional_fips_codes_not_expected:
        expected_states_set -= set(additional_fips_codes_not_expected)

    if expected_states_set != actual_state_fips_codes_set:
        # Sort both differences so the message is deterministic.
        raise ValueError(
            "The states and territories in the data are not as expected.\n"
            "FIPS state codes expected that are not present in the data:\n"
            f"{sorted(expected_states_set - actual_state_fips_codes_set)}\n"
            "FIPS state codes in the data that were not expected:\n"
            f"{sorted(actual_state_fips_codes_set - expected_states_set)}\n"
        )

    logger.info("Data matches expected state and territory representation.")
|
||||||
|
|
|
@ -2,7 +2,10 @@ import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from data_pipeline.etl.score.etl_utils import floor_series
|
from data_pipeline.etl.score.etl_utils import (
|
||||||
|
floor_series,
|
||||||
|
compare_to_list_of_expected_state_fips_codes,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_floor_series():
|
def test_floor_series():
|
||||||
|
@ -70,3 +73,265 @@ def test_floor_series():
|
||||||
match="Argument series must be of type pandas series, not of type list.",
|
match="Argument series must be of type pandas series, not of type list.",
|
||||||
):
|
):
|
||||||
floor_series(invalid_type, number_of_decimals=3)
|
floor_series(invalid_type, number_of_decimals=3)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compare_to_list_of_expected_state_fips_codes():
    """Verify the FIPS comparison passes on matching data and reports mismatches."""
    missing_header = (
        "FIPS state codes expected that are not present in the data:\n"
    )
    unexpected_header = "FIPS state codes in the data that were not expected:\n"

    # fmt: off
    nation_codes = [
        "01", "02", "04", "05", "06", "08", "09", "10", "11", "12",
        "13", "15", "16", "17", "18", "19", "20", "21", "22", "23",
        "24", "25", "26", "27", "28", "29", "30", "31", "32", "33",
        "34", "35", "36", "37", "38", "39", "40", "41", "42", "44",
        "45", "46", "47", "48", "49", "50", "51", "53", "54", "55",
        "56",
    ]
    # fmt: on
    territory_codes = ["60", "66", "69", "72", "78"]

    # Fixture 1: the nation plus Puerto Rico and every island area.
    everything = nation_codes + territory_codes

    # With default expectations the full list is accepted.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=everything
    )

    # Puerto Rico is present but flagged as not expected.
    with pytest.raises(ValueError) as excinfo:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=everything,
            puerto_rico_expected=False,
        )
    assert unexpected_header + "['72']\n" in str(excinfo.value)

    # Island areas are present but flagged as not expected.
    with pytest.raises(ValueError) as excinfo:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=everything,
            island_areas_expected=False,
        )
    assert unexpected_header + "['60', '66', '69', '78']\n" in str(
        excinfo.value
    )

    # Fixture 2: same as fixture 1 but without Guam and Puerto Rico.
    without_guam_and_pr = [
        code for code in everything if code not in ("66", "72")
    ]

    # All island areas and PR are expected by default, so this must fail.
    with pytest.raises(ValueError) as excinfo:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=without_guam_and_pr,
        )
    assert missing_header + "['66', '72']\n" in str(excinfo.value)

    # Fixture 3: same as fixture 1 but without Maine and Wisconsin.
    without_maine_and_wisconsin = [
        code for code in everything if code not in ("23", "55")
    ]

    # Maine and Wisconsin are expected by default, so this must fail.
    with pytest.raises(ValueError) as excinfo:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=without_maine_and_wisconsin,
        )
    assert missing_header + "['23', '55']\n" in str(excinfo.value)

    # Declaring Maine and Wisconsin as known-missing makes the same data valid.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=without_maine_and_wisconsin,
        additional_fips_codes_not_expected=["23", "55"],
    )

    # Fixture 4: territories only, i.e. the whole nation is absent.
    territories_only = list(territory_codes)

    # The nation is expected by default, so every nation code is reported.
    with pytest.raises(ValueError) as excinfo:
        compare_to_list_of_expected_state_fips_codes(
            actual_state_fips_codes=territories_only,
        )
    all_nation_codes_missing = (
        "FIPS state codes expected that are not present in the data:\n"
        "['01', '02', '04', '05', '06', '08', '09', '10', '11', '12', '13', '15', '16', "
        "'17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', "
        "'30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', "
        "'44', '45', '46', '47', '48', '49', '50', '51', '53', '54', '55', '56']"
    )
    assert all_nation_codes_missing in str(excinfo.value)

    # When the nation is not expected, territories-only data is valid.
    compare_to_list_of_expected_state_fips_codes(
        actual_state_fips_codes=territories_only, nation_expected=False
    )
|
||||||
|
|
|
@ -1,10 +1,9 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from data_pipeline.etl.base import ExtractTransformLoad
|
from data_pipeline.etl.base import ExtractTransformLoad, ValidGeoLevel
|
||||||
from data_pipeline.etl.score.constants import (
|
from data_pipeline.etl.score.etl_utils import (
|
||||||
TILES_ISLAND_AREA_FIPS_CODES,
|
compare_to_list_of_expected_state_fips_codes,
|
||||||
TILES_PUERTO_RICO_FIPS_CODE,
|
|
||||||
)
|
)
|
||||||
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
from data_pipeline.etl.sources.census.etl_utils import get_state_fips_codes
|
||||||
from data_pipeline.utils import get_module_logger, download_file_from_url
|
from data_pipeline.utils import get_module_logger, download_file_from_url
|
||||||
|
@ -14,8 +13,13 @@ logger = get_module_logger(__name__)
|
||||||
|
|
||||||
class CDCLifeExpectancy(ExtractTransformLoad):
|
class CDCLifeExpectancy(ExtractTransformLoad):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
self.GEO_LEVEL = ValidGeoLevel.CENSUS_TRACT
|
||||||
|
self.PUERTO_RICO_EXPECTED_IN_DATA = False
|
||||||
|
|
||||||
self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
|
self.USA_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/US_A.CSV"
|
||||||
|
|
||||||
|
self.STATES_MISSING_FROM_USA_FILE = ["23", "55"]
|
||||||
|
|
||||||
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
|
# For some reason, LEEP does not include Maine or Wisconsin in its "All of
|
||||||
# USA" file. Load these separately.
|
# USA" file. Load these separately.
|
||||||
self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
|
self.WISCONSIN_FILE_URL: str = "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NVSS/USALEEP/CSV/WI_A.CSV"
|
||||||
|
@ -35,19 +39,6 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
||||||
self.LIFE_EXPECTANCY_FIELD_NAME,
|
self.LIFE_EXPECTANCY_FIELD_NAME,
|
||||||
]
|
]
|
||||||
|
|
||||||
# Set some constants that will be helpful for debugging the source data later.
|
|
||||||
self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH)
|
|
||||||
|
|
||||||
self.EXPECTED_STATES_SET = (
|
|
||||||
set(self.STATE_FIPS_CODES)
|
|
||||||
# We don't expect LEEP to have data for island areas or Puerto Rico.
|
|
||||||
- set(TILES_ISLAND_AREA_FIPS_CODES)
|
|
||||||
- set(TILES_PUERTO_RICO_FIPS_CODE)
|
|
||||||
)
|
|
||||||
|
|
||||||
# These states are currently missing from LEEP's whole USA file.
|
|
||||||
self.EXPECTED_MISSING_STATES = ["23", "55"]
|
|
||||||
|
|
||||||
self.raw_df: pd.DataFrame
|
self.raw_df: pd.DataFrame
|
||||||
self.output_df: pd.DataFrame
|
self.output_df: pd.DataFrame
|
||||||
|
|
||||||
|
@ -76,23 +67,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check which states are missing
|
# Check which states are missing
|
||||||
states_in_life_expectancy_usa_file = all_usa_raw_df[
|
states_in_life_expectancy_usa_file = list(
|
||||||
self.STATE_INPUT_COLUMN_NAME
|
all_usa_raw_df[self.STATE_INPUT_COLUMN_NAME].unique()
|
||||||
].unique()
|
|
||||||
|
|
||||||
# Find which states are missing from the expected set.
|
|
||||||
states_missing = sorted(
|
|
||||||
list(
|
|
||||||
self.EXPECTED_STATES_SET
|
|
||||||
- set(states_in_life_expectancy_usa_file)
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if states_missing != self.EXPECTED_MISSING_STATES:
|
# Expect that PR, Island Areas, and Maine/Wisconsin are missing
|
||||||
raise ValueError(
|
compare_to_list_of_expected_state_fips_codes(
|
||||||
"LEEP data has changed. The states missing from the data are "
|
actual_state_fips_codes=states_in_life_expectancy_usa_file,
|
||||||
"no longer the same."
|
nation_expected=self.NATION_EXPECTED_IN_DATA,
|
||||||
)
|
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
|
||||||
|
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
|
||||||
|
additional_fips_codes_not_expected=self.STATES_MISSING_FROM_USA_FILE,
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("Downloading data for Maine")
|
logger.info("Downloading data for Maine")
|
||||||
maine_download_file_name = (
|
maine_download_file_name = (
|
||||||
|
@ -131,20 +117,18 @@ class CDCLifeExpectancy(ExtractTransformLoad):
|
||||||
axis=0,
|
axis=0,
|
||||||
)
|
)
|
||||||
|
|
||||||
states_in_combined_df = combined_df[
|
states_in_combined_df = list(
|
||||||
self.STATE_INPUT_COLUMN_NAME
|
combined_df[self.STATE_INPUT_COLUMN_NAME].unique()
|
||||||
].unique()
|
|
||||||
|
|
||||||
# Find which states are missing from the combined df.
|
|
||||||
states_missing = sorted(
|
|
||||||
list(self.EXPECTED_STATES_SET - set(states_in_combined_df))
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(states_missing) != 0:
|
# Expect that PR and Island Areas are the only things now missing
|
||||||
raise ValueError(
|
compare_to_list_of_expected_state_fips_codes(
|
||||||
"The states missing from combined dataframe are "
|
actual_state_fips_codes=states_in_combined_df,
|
||||||
"no longer as expected."
|
nation_expected=self.NATION_EXPECTED_IN_DATA,
|
||||||
)
|
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
|
||||||
|
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
|
||||||
|
additional_fips_codes_not_expected=[],
|
||||||
|
)
|
||||||
|
|
||||||
# Save the updated version
|
# Save the updated version
|
||||||
self.raw_df = combined_df
|
self.raw_df = combined_df
|
||||||
|
|
Loading…
Add table
Reference in a new issue