mirror of
https://github.com/DOI-DO/j40-cejst-2.git
synced 2025-07-31 09:11:19 -07:00
Address rounding issue in Pandas series to floor numerically unstable values (#1085)
* wip - added tests - 1 failing * added check for empty series + added test * passing tests * parallelism in variable assignment choice * resolve merge conflicts * variable name changes * cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf * cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf * revisions to handle type ambiguity * fixing tests * fix pytest * fix linting * fix pytest * reword comments * cleanup comments * cleanup comments - fix typo * added type check and corresponding test * added type check and corresponding test * language cleanup * revert * update pickle fixture Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
This commit is contained in:
parent
93595b7bb4
commit
56644698ff
4 changed files with 141 additions and 6 deletions
|
@ -3,6 +3,7 @@ import json
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from data_pipeline.etl.base import ExtractTransformLoad
|
from data_pipeline.etl.base import ExtractTransformLoad
|
||||||
|
from data_pipeline.etl.score.etl_utils import floor_series
|
||||||
from data_pipeline.utils import get_module_logger, zip_files
|
from data_pipeline.utils import get_module_logger, zip_files
|
||||||
from data_pipeline.score import field_names
|
from data_pipeline.score import field_names
|
||||||
|
|
||||||
|
@ -207,13 +208,15 @@ class PostScoreETL(ExtractTransformLoad):
|
||||||
# filter the columns on full score
|
# filter the columns on full score
|
||||||
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
|
score_tiles = score_county_state_merged_df[tiles_score_column_titles]
|
||||||
|
|
||||||
# round decimals
|
score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
|
||||||
decimals = pd.Series(
|
constants.TILES_SCORE_FLOAT_COLUMNS
|
||||||
[constants.TILES_ROUND_NUM_DECIMALS]
|
].apply(
|
||||||
* len(constants.TILES_SCORE_FLOAT_COLUMNS),
|
func=lambda series: floor_series(
|
||||||
index=constants.TILES_SCORE_FLOAT_COLUMNS,
|
series=series,
|
||||||
|
number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
|
||||||
|
),
|
||||||
|
axis=0,
|
||||||
)
|
)
|
||||||
score_tiles = score_tiles.round(decimals)
|
|
||||||
|
|
||||||
# create indexes
|
# create indexes
|
||||||
score_tiles = score_tiles.rename(
|
score_tiles = score_tiles.rename(
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
from data_pipeline.config import settings
|
from data_pipeline.config import settings
|
||||||
from data_pipeline.utils import (
|
from data_pipeline.utils import (
|
||||||
|
@ -48,3 +51,60 @@ def check_score_data_source(
|
||||||
"No local score tiles data found. Please use '-d aws` to fetch from AWS"
|
"No local score tiles data found. Please use '-d aws` to fetch from AWS"
|
||||||
)
|
)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
|
|
||||||
|
def floor_series(series: pd.Series, number_of_decimals: int) -> np.ndarray:
    """Floor all non-null numerical values to a specific number of decimal points.

    Values that cannot be floored meaningfully (``-np.inf``, ``np.inf``,
    ``np.nan`` and the literal string ``"None"``) are treated as missing and
    come back as ``None``. The input series is never modified.

    Args:
        series (pd.Series): Input pandas series
        number_of_decimals (int): Number of decimal points to floor all
            numerical values to

    Returns:
        floored_series (np.ndarray): Array with one entry per element of
            ``series``; each entry is either the floored value or ``None``
            where the input was missing or unacceptable.

    Raises:
        TypeError: If ``series`` is not a ``pd.Series``.
        ValueError: If ``series`` is empty.
    """
    # Fail fast on anything that is not a pandas Series; the Series API
    # (isin / where / astype) is relied on below.
    if not isinstance(series, pd.Series):
        raise TypeError(
            f"Argument series must be of type pandas series, not of type {type(series).__name__}."
        )

    # An empty series has nothing to floor; surface it explicitly.
    if series.empty:
        raise ValueError("Empty series provided.")

    # Upstream division and elementwise multiplication can introduce these
    # values through numerical instability, which makes type inference for
    # numpy float types unsafe. They are all treated as missing.
    unacceptable_values = [-np.inf, np.inf, "None", np.nan]

    # Series.where returns a new object, so the caller's data is left intact.
    # Coercing to float turns every missing marker (None included) into NaN,
    # which is the safe representation for the arithmetic below. See:
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#nullable-integer-data-type
    numeric_values = series.where(~series.isin(unacceptable_values)).astype(
        float
    )

    multiplication_factor = 10 ** number_of_decimals
    floored_values = (
        np.floor(numeric_values * multiplication_factor)
        / multiplication_factor
    )

    # Missing entries default to None; everything else keeps its floored value.
    floored_series = np.where(numeric_values.isnull(), None, floored_values)

    return floored_series
|
||||||
|
|
Binary file not shown.
|
@ -0,0 +1,72 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from data_pipeline.etl.score.etl_utils import floor_series
|
||||||
|
|
||||||
|
|
||||||
|
def test_floor_series():
    """Check floor_series on typical, degenerate and invalid inputs."""
    # input fixtures
    mixed_values = pd.Series(
        data=[None, 1, 0.324534, 1.2341], dtype="float64"
    )
    extreme_values = pd.Series(
        data=[
            -np.inf,
            np.inf,
            "None",
            -0.131321313123123,
            5.62322441e-15,
            1.2341123131313131312e12,
        ]
    )
    all_missing = pd.Series(data=[None, None, None, None, None])
    no_values = pd.Series(data=[], dtype="float64")
    # a plain list of randomly generated values (deliberately not a Series)
    not_a_series = list(np.random.uniform(1, 1000000, size=15))

    # (input series, number of decimals, expected result)
    cases = [
        # 2, 3 and 1 decimal places on the same input
        (mixed_values, 2, [None, 1.0, 0.32, 1.23]),
        (mixed_values, 3, [None, 1.00, 0.324, 1.234]),
        (mixed_values, 1, [None, 1.0, 0.3, 1.2]),
        # an arbitrary decimal place on nothing but missing values
        (all_missing, 10, [None, None, None, None, None]),
        # floating point imprecision is clamped to a fixed number of
        # decimal points; infinities and "None" become missing
        (extreme_values, 1, [None, None, None, -0.2, 0.0, 1234112313131.3]),
    ]

    # run every case first, then compare
    results = [
        floor_series(values, number_of_decimals=decimals)
        for values, decimals, _ in cases
    ]

    # elementwise comparison to ensure all values are equal
    for (_, _, expected), actual in zip(cases, results):
        assert np.equal(np.array(expected), actual).all()

    # an empty series should raise a ValueError exception
    with pytest.raises(ValueError, match="Empty series provided."):
        floor_series(no_values, number_of_decimals=2)

    # an invalid type should raise a TypeError exception
    with pytest.raises(
        TypeError,
        match="Argument series must be of type pandas series, not of type list.",
    ):
        floor_series(not_a_series, number_of_decimals=3)
|
Loading…
Add table
Add a link
Reference in a new issue