mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-23 10:10:17 -07:00
Issue 1831: missing life expectancy data from Maine and Wisconsin (#1887)
* Fixing missing states and adding tests for states to all classes
This commit is contained in:
parent
fb4c484e5c
commit
6e9c44ea72
21 changed files with 522 additions and 187 deletions
|
@ -7,6 +7,9 @@ from typing import Optional
|
|||
import pandas as pd
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.etl.score.etl_utils import (
|
||||
compare_to_list_of_expected_state_fips_codes,
|
||||
)
|
||||
from data_pipeline.etl.score.schemas.datasets import DatasetsConfig
|
||||
from data_pipeline.utils import (
|
||||
load_yaml_dict_from_file,
|
||||
|
@ -43,7 +46,7 @@ class ExtractTransformLoad:
|
|||
APP_ROOT: pathlib.Path = settings.APP_ROOT
|
||||
|
||||
# Directories
|
||||
DATA_PATH: pathlib.Path = APP_ROOT / "data"
|
||||
DATA_PATH: pathlib.Path = settings.DATA_PATH
|
||||
TMP_PATH: pathlib.Path = DATA_PATH / "tmp"
|
||||
CONTENT_CONFIG: pathlib.Path = APP_ROOT / "content" / "config"
|
||||
DATASET_CONFIG_PATH: pathlib.Path = APP_ROOT / "etl" / "score" / "config"
|
||||
|
@ -82,6 +85,23 @@ class ExtractTransformLoad:
|
|||
# NULL_REPRESENTATION is how nulls are represented on the input field
|
||||
NULL_REPRESENTATION: str = None
|
||||
|
||||
# Whether this ETL contains data for the continental US (DC & the US states
|
||||
# except for Alaska and Hawaii)
|
||||
CONTINENTAL_US_EXPECTED_IN_DATA: bool = True
|
||||
|
||||
# Whether this ETL contains data for Alaska and Hawaii
|
||||
ALASKA_AND_HAWAII_EXPECTED_IN_DATA: bool = True
|
||||
|
||||
# Whether this ETL contains data for Puerto Rico
|
||||
PUERTO_RICO_EXPECTED_IN_DATA: bool = True
|
||||
|
||||
# Whether this ETL contains data for the island areas
|
||||
ISLAND_AREAS_EXPECTED_IN_DATA: bool = False
|
||||
|
||||
# FIPS codes with known missing data in this ETL, for any additional
|
||||
# states/territories
|
||||
EXPECTED_MISSING_STATES: typing.List[str] = []
|
||||
|
||||
# Thirteen digits in a census block group ID.
|
||||
EXPECTED_CENSUS_BLOCK_GROUPS_CHARACTER_LENGTH: int = 13
|
||||
# TODO: investigate. Census says there are only 217,740 CBGs in the US. This might
|
||||
|
@ -289,6 +309,24 @@ class ExtractTransformLoad:
|
|||
f"`{geo_field}`."
|
||||
)
|
||||
|
||||
# Check whether data contains expected states
|
||||
states_in_output_df = (
|
||||
self.output_df[self.GEOID_TRACT_FIELD_NAME]
|
||||
.str[0:2]
|
||||
.unique()
|
||||
.tolist()
|
||||
)
|
||||
|
||||
compare_to_list_of_expected_state_fips_codes(
|
||||
actual_state_fips_codes=states_in_output_df,
|
||||
continental_us_expected=self.CONTINENTAL_US_EXPECTED_IN_DATA,
|
||||
alaska_and_hawaii_expected=self.ALASKA_AND_HAWAII_EXPECTED_IN_DATA,
|
||||
puerto_rico_expected=self.PUERTO_RICO_EXPECTED_IN_DATA,
|
||||
island_areas_expected=self.ISLAND_AREAS_EXPECTED_IN_DATA,
|
||||
additional_fips_codes_not_expected=self.EXPECTED_MISSING_STATES,
|
||||
dataset_name=self.NAME,
|
||||
)
|
||||
|
||||
def load(self, float_format=None) -> None:
|
||||
"""Saves the transformed data.
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue