From 56a24b9bd18f720ec1e2773155be4d5baa0ec5c0 Mon Sep 17 00:00:00 2001 From: lucasmbrown-usds Date: Wed, 7 Sep 2022 14:00:11 -0400 Subject: [PATCH] refactoring for clarity --- .../etl/sources/cdc_life_expectancy/etl.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py index 981e40d9..a0eb9102 100644 --- a/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py +++ b/data/data-pipeline/data_pipeline/etl/sources/cdc_life_expectancy/etl.py @@ -35,6 +35,19 @@ class CDCLifeExpectancy(ExtractTransformLoad): self.LIFE_EXPECTANCY_FIELD_NAME, ] + # Set some constants that will be helpful for debugging the source data later. + self.STATE_FIPS_CODES = get_state_fips_codes(self.DATA_PATH) + + self.EXPECTED_STATES_SET = ( + set(self.STATE_FIPS_CODES) + # We don't expect LEEP to have data for island areas or Puerto Rico. + - set(TILES_ISLAND_AREA_FIPS_CODES) + - set(TILES_PUERTO_RICO_FIPS_CODE) + ) + + # These states are currently missing from LEEP's whole USA file. + self.EXPECTED_MISSING_STATES = ["23", "55"] + self.raw_df: pd.DataFrame self.output_df: pd.DataFrame @@ -63,24 +76,19 @@ class CDCLifeExpectancy(ExtractTransformLoad): ) # Check which states are missing - state_fips_codes = get_state_fips_codes(self.DATA_PATH) states_in_life_expectancy_usa_file = all_usa_raw_df[ self.STATE_INPUT_COLUMN_NAME ].unique() - expected_states_set = ( - set(state_fips_codes) - # We don't expect LEEP to have data for island areas or Puerto Rico. - - set(TILES_ISLAND_AREA_FIPS_CODES) - - set(TILES_PUERTO_RICO_FIPS_CODE) - ) - # Find which states are missing from the expected set. states_missing = sorted( - list(expected_states_set - set(states_in_life_expectancy_usa_file)) + list( + self.EXPECTED_STATES_SET + - set(states_in_life_expectancy_usa_file) + ) ) - if states_missing != ["23", "55"]: + if states_missing != self.EXPECTED_MISSING_STATES: raise ValueError( "LEEP data has changed. The states missing from the data are " "no longer the same." @@ -129,7 +137,7 @@ class CDCLifeExpectancy(ExtractTransformLoad): # Find which states are missing from the combined df. states_missing = sorted( - list(expected_states_set - set(states_in_combined_df)) + list(self.EXPECTED_STATES_SET - set(states_in_combined_df)) ) if len(states_missing) != 0: