import numpy as np
import pandas as pd


def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
    """Floor all non-null numerical values to a fixed number of decimal points.

    Non-finite values (+/-inf), the literal string "None", and NaN are treated
    as missing and become None in the output; every other value is floored
    (rounded toward negative infinity) to `number_of_decimals` decimal places.

    Args:
        series (pd.Series): Input pandas series
        number_of_decimals (int): Number of decimal points to floor all numerical values to

    Returns:
        pd.Series: Floored values (object dtype, None for missing entries),
            carrying the same index as the input. The input series is never
            mutated.

    Raises:
        TypeError: If `series` is not a pandas Series.
        ValueError: If `series` is empty.
    """
    # Validate input type first so callers get a clear error instead of an
    # AttributeError further down.
    if not isinstance(series, pd.Series):
        raise TypeError(
            f"Argument series must be of type pandas series, not of type {type(series).__name__}."
        )

    # Explicitly reject empty input rather than silently returning nothing.
    if series.empty:
        raise ValueError("Empty series provided.")

    # Upstream division and elementwise multiplication can introduce the
    # values below through numerical instability, which makes type inference
    # for numpy float types unsafe (exacerbated by pandas' inference engine).
    # Normalize all such offending values to None.
    # Reference on nullable integer types:
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#nullable-integer-data-type
    unacceptable_values = [-np.inf, np.inf, "None", np.nan]
    mapping = {
        unacceptable_value: None for unacceptable_value in unacceptable_values
    }

    # Work on a copy so the caller's series is not mutated (the previous
    # implementation used replace(..., inplace=True), which clobbered the
    # caller's data).
    cleaned = series.copy()
    if cleaned.isin(unacceptable_values).any():
        cleaned = cleaned.replace(mapping, regex=False)

    multiplication_factor = 10 ** number_of_decimals

    # Coerce to float first so None/NaN survive the arithmetic safely.
    product_for_numerator = np.floor(
        cleaned.astype(float) * multiplication_factor
    )

    floored_values = np.where(
        cleaned.isnull(),
        # For all null values default to null
        None,
        # The other default condition - floor non-null values
        product_for_numerator / multiplication_factor,
    )

    # Wrap the raw ndarray back into a Series (matching the annotated return
    # type) and preserve the original index for positional alignment.
    return pd.Series(floored_values, index=series.index)
def test_floor_series():
    """Unit tests for floor_series: flooring at several precisions, NaN/inf
    handling, and input validation (empty series, non-Series argument)."""
    # test examples
    series = pd.Series(data=[None, 1, 0.324534, 1.2341], dtype="float64")
    series_exponentiated = pd.Series(
        data=[
            -np.inf,
            np.inf,
            "None",
            -0.131321313123123,
            5.62322441e-15,
            1.2341123131313131312e12,
        ]
    )
    series_of_nan_values = pd.Series(data=[None, None, None, None, None])
    series_empty = pd.Series(data=[], dtype="float64")
    # list of randomly generated values
    invalid_type = list(np.random.uniform(1, 1000000, size=15))

    floored_series_1 = floor_series(series, number_of_decimals=2)
    floored_series_2 = floor_series(series, number_of_decimals=3)
    floored_series_3 = floor_series(series, number_of_decimals=1)
    floored_series_4 = floor_series(series_of_nan_values, number_of_decimals=10)
    floored_series_5 = floor_series(series_exponentiated, number_of_decimals=1)
    # expected fixtures
    expected_1 = np.array([None, 1.0, 0.32, 1.23])
    expected_2 = np.array([None, 1.00, 0.324, 1.234])
    expected_3 = np.array([None, 1.0, 0.3, 1.2])
    expected_4 = np.array([None, None, None, None, None])
    expected_5 = np.array([None, None, None, -0.2, 0.0, 1234112313131.3])

    # Test for expected value with 2 decimal places
    # Elementwise comparison to ensure all values are equal
    all_elements_are_equal_one = np.equal(expected_1, floored_series_1)
    assert all_elements_are_equal_one.all()

    # Test for expected value with 3 decimal places
    # Elementwise comparison to ensure all values are equal
    all_elements_are_equal_two = np.equal(expected_2, floored_series_2)
    assert all_elements_are_equal_two.all()

    # Test for expected value with 1 decimal place
    # Elementwise comparison to ensure all values are equal
    all_elements_are_equal_three = np.equal(expected_3, floored_series_3)
    assert all_elements_are_equal_three.all()

    # Test for expected value for some arbitrary decimal place
    # Elementwise comparison to ensure all values are equal for NaN
    all_elements_are_equal_four = np.equal(expected_4, floored_series_4)
    assert all_elements_are_equal_four.all()
    # Test for expected value for some arbitrary decimal place
    # Elementwise comparison to ensure all floating point imprecision
    # is clamped to a certain number of decimal points
    all_elements_are_equal_five = np.equal(expected_5, floored_series_5)
    assert all_elements_are_equal_five.all()

    # Test for empty series - should raise a ValueError exception
    with pytest.raises(ValueError, match="Empty series provided."):
        floor_series(series_empty, number_of_decimals=2)

    # Test for invalid type - should raise a TypeError exception
    with pytest.raises(
        TypeError,
        match="Argument series must be of type pandas series, not of type list.",
    ):
        floor_series(invalid_type, number_of_decimals=3)