Address rounding issue in Pandas series to floor numerically unstable values (#1085)

* wip - added tests - 1 failing

* added check for empty series + added test

* passing tests

* parallelism in variable assignment choice

* resolve merge conflicts

* variable name changes

* cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf

* cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf

* revisions to handle type ambiguity

* fixing tests

* fix pytest

* fix linting

* fix pytest

* reword comments

* cleanup comments

* cleanup comments - fix typo

* added type check and corresponding test

* added type check and corresponding test

* language cleanup

* revert

* update pickle fixture

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
This commit is contained in:
Saran Ahluwalia 2022-01-05 17:03:37 -05:00 committed by GitHub
commit 56644698ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 141 additions and 6 deletions

View file

@ -3,6 +3,7 @@ import json
import pandas as pd
from data_pipeline.etl.base import ExtractTransformLoad
from data_pipeline.etl.score.etl_utils import floor_series
from data_pipeline.utils import get_module_logger, zip_files
from data_pipeline.score import field_names
@ -207,13 +208,15 @@ class PostScoreETL(ExtractTransformLoad):
# filter the columns on full score
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
# round decimals
decimals = pd.Series(
[constants.TILES_ROUND_NUM_DECIMALS]
* len(constants.TILES_SCORE_FLOAT_COLUMNS),
index=constants.TILES_SCORE_FLOAT_COLUMNS,
score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
constants.TILES_SCORE_FLOAT_COLUMNS
].apply(
func=lambda series: floor_series(
series=series,
number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
),
axis=0,
)
score_tiles = score_tiles.round(decimals)
# create indexes
score_tiles = score_tiles.rename(

View file

@ -1,6 +1,9 @@
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from data_pipeline.config import settings
from data_pipeline.utils import (
@ -48,3 +51,60 @@ def check_score_data_source(
"No local score tiles data found. Please use '-d aws` to fetch from AWS"
)
sys.exit()
def floor_series(series: pd.Series, number_of_decimals: int) -> np.ndarray:
    """Floor every usable numerical value to a fixed number of decimal places.

    Entries that cannot be floored meaningfully -- NaN/None, positive or
    negative infinity, and the literal string "None" -- are returned as None.
    The caller's series is never modified.

    NOTE: flooring operates on binary floats, so a value whose scaled product
    lands just below an integer is floored down (for example, 0.29 * 100
    evaluates to 28.999999999999996 and therefore yields 0.28).

    Args:
        series (pd.Series): Input pandas series
        number_of_decimals (int): Number of decimal places to floor all
            numerical values to

    Returns:
        np.ndarray: Object-dtype array, positionally aligned with ``series``,
            holding the floored floats and None for every unusable entry.
            (``np.where`` produces an array, not a pandas Series.)

    Raises:
        TypeError: If ``series`` is not a pandas Series.
        ValueError: If ``series`` is empty.
    """
    if not isinstance(series, pd.Series):
        raise TypeError(
            f"Argument series must be of type pandas series, not of type {type(series).__name__}."
        )

    if series.empty:
        raise ValueError("Empty series provided.")

    # Division and elementwise multiplication upstream can leave behind the
    # textual placeholder "None" next to real numbers. Blank it out on a copy
    # (mask returns a new object) so the float cast below cannot fail on it
    # and the caller's data stays untouched.
    is_none_text = series.isin(["None"])
    cleaned = series.mask(is_none_text) if is_none_text.any() else series

    # Casting to float turns None into NaN, which is what allows a single
    # finite-ness check to cover None, NaN and +/- infinity. See:
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#nullable-integer-data-type
    as_float = cleaned.astype(float)

    multiplication_factor = 10 ** number_of_decimals
    floored = np.floor(as_float * multiplication_factor) / multiplication_factor

    # Keep the floored value wherever the input was a finite number and
    # default everything else to None.
    return np.where(
        np.isfinite(as_float.to_numpy()),
        floored.to_numpy(),
        None,
    )

View file

@ -0,0 +1,72 @@
import pandas as pd
import numpy as np
import pytest
from data_pipeline.etl.score.etl_utils import floor_series
def test_floor_series():
    """Check floor_series against ordinary, degenerate and invalid inputs."""
    # Inputs: a plain float series, one with extreme / placeholder entries,
    # one made up solely of nulls, an empty one, and a non-series argument.
    plain_values = pd.Series(
        data=[None, 1, 0.324534, 1.2341], dtype="float64"
    )
    extreme_values = pd.Series(
        data=[
            -np.inf,
            np.inf,
            "None",
            -0.131321313123123,
            5.62322441e-15,
            1.2341123131313131312e12,
        ]
    )
    only_nulls = pd.Series(data=[None, None, None, None, None])
    no_values = pd.Series(data=[], dtype="float64")
    # list of randomly generated values (deliberately not a pandas series)
    not_a_series = list(np.random.uniform(1, 1000000, size=15))

    # Each entry pairs the expected array with the actual floored result:
    # 2, 3 and 1 decimal places on the plain series, then the all-null
    # series, then the series with infinities and floating point noise.
    cases = [
        (
            np.array([None, 1.0, 0.32, 1.23]),
            floor_series(plain_values, number_of_decimals=2),
        ),
        (
            np.array([None, 1.00, 0.324, 1.234]),
            floor_series(plain_values, number_of_decimals=3),
        ),
        (
            np.array([None, 1.0, 0.3, 1.2]),
            floor_series(plain_values, number_of_decimals=1),
        ),
        (
            np.array([None, None, None, None, None]),
            floor_series(only_nulls, number_of_decimals=10),
        ),
        (
            np.array([None, None, None, -0.2, 0.0, 1234112313131.3]),
            floor_series(extreme_values, number_of_decimals=1),
        ),
    ]

    # Elementwise comparison: every position must match its expected value
    for expected, actual in cases:
        elementwise_match = np.equal(expected, actual)
        assert elementwise_match.all()

    # An empty series must be rejected with a ValueError
    with pytest.raises(ValueError, match="Empty series provided."):
        floor_series(no_values, number_of_decimals=2)

    # Anything that is not a pandas series must be rejected with a TypeError
    with pytest.raises(
        TypeError,
        match="Argument series must be of type pandas series, not of type list.",
    ):
        floor_series(not_a_series, number_of_decimals=3)