mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-29 20:21:17 -07:00
Address rounding issue in Pandas series to floor numerically unstable values (#1085)
* wip - added tests - 1 failing * added check for empty series + added test * passing tests * parallelism in variable assignment choice * resolve merge conflicts * variable name changes * cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf * cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf * revisions to handle type ambiguity * fixing tests * fix pytest * fix linting * fix pytest * reword comments * cleanup comments * cleanup comments - fix typo * added type check and corresponding test * added type check and corresponding test * language cleanup * revert * update pickle fixture Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
This commit is contained in:
parent
93595b7bb4
commit
56644698ff
4 changed files with 141 additions and 6 deletions
|
@ -3,6 +3,7 @@ import json
|
|||
import pandas as pd
|
||||
|
||||
from data_pipeline.etl.base import ExtractTransformLoad
|
||||
from data_pipeline.etl.score.etl_utils import floor_series
|
||||
from data_pipeline.utils import get_module_logger, zip_files
|
||||
from data_pipeline.score import field_names
|
||||
|
||||
|
@ -207,13 +208,15 @@ class PostScoreETL(ExtractTransformLoad):
|
|||
# filter the columns on full score
|
||||
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
|
||||
|
||||
# round decimals
|
||||
decimals = pd.Series(
|
||||
[constants.TILES_ROUND_NUM_DECIMALS]
|
||||
* len(constants.TILES_SCORE_FLOAT_COLUMNS),
|
||||
index=constants.TILES_SCORE_FLOAT_COLUMNS,
|
||||
score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
|
||||
constants.TILES_SCORE_FLOAT_COLUMNS
|
||||
].apply(
|
||||
func=lambda series: floor_series(
|
||||
series=series,
|
||||
number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
|
||||
),
|
||||
axis=0,
|
||||
)
|
||||
score_tiles = score_tiles.round(decimals)
|
||||
|
||||
# create indexes
|
||||
score_tiles = score_tiles.rename(
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
from data_pipeline.config import settings
|
||||
from data_pipeline.utils import (
|
||||
|
@ -48,3 +51,60 @@ def check_score_data_source(
|
|||
"No local score tiles data found. Please use '-d aws` to fetch from AWS"
|
||||
)
|
||||
sys.exit()
|
||||
|
||||
|
||||
def floor_series(series: pd.Series, number_of_decimals: int) -> np.ndarray:
    """Floor every usable numeric value of a series to a fixed number of decimals.

    Entries that cannot be floored -- nulls, positive or negative infinity and
    the literal string "None" -- are returned as ``None``. The input series is
    never modified.

    Args:
        series (pd.Series): Input pandas series
        number_of_decimals (int): Number of decimal places to floor all
            numerical values to

    Returns:
        np.ndarray: Object-dtype array with one entry per element of
            ``series`` (same order), holding either a floored float or None

    Raises:
        TypeError: If ``series`` is not a pandas Series
        ValueError: If ``series`` is empty
    """

    # ensure we are working with a pandas series before touching its API
    if not isinstance(series, pd.Series):
        raise TypeError(
            f"Argument series must be of type pandas series, not of type {type(series).__name__}."
        )

    # raise exception for handling empty series
    if series.empty:
        raise ValueError("Empty series provided.")

    # Upstream division and elementwise multiplication can leave behind
    # infinities, NaNs or a stringified "None" because of numerical
    # instability. None of these can be floored, so they are mapped to None.
    unacceptable_values = [-np.inf, np.inf, "None", np.nan]
    unusable = (series.isin(unacceptable_values) | series.isnull()).to_numpy()

    # Work on a private object-dtype copy so the caller's series keeps its
    # values and dtype; the previous in-place replace() altered both.
    values = series.to_numpy(dtype=object, copy=True)
    values[unusable] = np.nan

    # Coerce to float so NaN placeholders and numbers share one dtype.
    # Any remaining entry that is not convertible to float raises here.
    numeric = values.astype(float)

    multiplication_factor = 10 ** number_of_decimals
    floored = np.floor(numeric * multiplication_factor) / multiplication_factor

    # Unusable positions default to None; everything else keeps its floored value
    return np.where(unusable, None, floored)
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,72 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from data_pipeline.etl.score.etl_utils import floor_series
|
||||
|
||||
|
||||
def test_floor_series():
    """Check floor_series on regular, extreme, all-null, empty and non-series input."""
    # input fixtures
    regular = pd.Series(data=[None, 1, 0.324534, 1.2341], dtype="float64")
    extreme = pd.Series(
        data=[
            -np.inf,
            np.inf,
            "None",
            -0.131321313123123,
            5.62322441e-15,
            1.2341123131313131312e12,
        ]
    )
    all_missing = pd.Series(data=[None, None, None, None, None])
    empty = pd.Series(data=[], dtype="float64")
    # a plain list of randomly generated values is not a valid argument
    not_a_series = list(np.random.uniform(1, 1000000, size=15))

    # (input, number of decimals, expected output), in evaluation order:
    # 2, 3 and 1 decimal places on regular data, then an all-null series,
    # then values whose floating point noise must be clamped away
    cases = [
        (regular, 2, np.array([None, 1.0, 0.32, 1.23])),
        (regular, 3, np.array([None, 1.00, 0.324, 1.234])),
        (regular, 1, np.array([None, 1.0, 0.3, 1.2])),
        (all_missing, 10, np.array([None, None, None, None, None])),
        (extreme, 1, np.array([None, None, None, -0.2, 0.0, 1234112313131.3])),
    ]

    # run every call first, then compare each result elementwise
    results = [
        floor_series(given, number_of_decimals=decimals)
        for given, decimals, _ in cases
    ]
    for (_, _, wanted), actual in zip(cases, results):
        matches = np.equal(wanted, actual)
        assert matches.all()

    # An empty series must be rejected with a ValueError
    with pytest.raises(ValueError, match="Empty series provided."):
        floor_series(empty, number_of_decimals=2)

    # Anything that is not a pandas series must be rejected with a TypeError
    with pytest.raises(
        TypeError,
        match="Argument series must be of type pandas series, not of type list.",
    ):
        floor_series(not_a_series, number_of_decimals=3)
|
Loading…
Add table
Add a link
Reference in a new issue