Address rounding issue in Pandas series to floor numerically unstable values (#1085)

* wip - added tests - 1 failing * added check for empty series + added test * passing tests * parallelism in variable assignment choice * resolve merge conflicts * variable name changes * cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf * cleanup logic and move comments out of main code execution + add one more test for an extreme example with -np.inf * revisions to handle type ambiguity * fixing tests * fix pytest * fix linting * fix pytest * reword comments * cleanup comments * cleanup comments - fix typo * added type check and corresponding test * added type check and corresponding test * language cleanup * revert * update pickle fixture Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
2025-07-29 20:21:17 -07:00 · 2022-01-05 17:03:37 -05:00 · 2022-01-05 17:03:37 -05:00 · 56644698ff
commit 56644698ff
parent 93595b7bb4
4 changed files with 141 additions and 6 deletions
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@ -3,6 +3,7 @@ import json
 import pandas as pd

 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.score.etl_utils import floor_series
 from data_pipeline.utils import get_module_logger, zip_files
 from data_pipeline.score import field_names

@ -207,13 +208,15 @@ class PostScoreETL(ExtractTransformLoad):
        # filter the columns on full score
        score_tiles = score_county_state_merged_df[tiles_score_column_titles]

-        # round decimals
-        decimals = pd.Series(
-            [constants.TILES_ROUND_NUM_DECIMALS]
-            * len(constants.TILES_SCORE_FLOAT_COLUMNS),
-            index=constants.TILES_SCORE_FLOAT_COLUMNS,
+        score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
+            constants.TILES_SCORE_FLOAT_COLUMNS
+        ].apply(
+            func=lambda series: floor_series(
+                series=series,
+                number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
+            ),
+            axis=0,
        )
-        score_tiles = score_tiles.round(decimals)

        # create indexes
        score_tiles = score_tiles.rename(
--- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
@ -1,6 +1,9 @@
 import os
 import sys
 from pathlib import Path
+import numpy as np
+import pandas as pd
+

 from data_pipeline.config import settings
 from data_pipeline.utils import (
@ -48,3 +51,60 @@ def check_score_data_source(
                "No local score tiles data found. Please use '-d aws` to fetch from AWS"
            )
            sys.exit()
+
+
+def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
+    """Floor every non-null numerical value to a fixed number of decimal places.
+
+    Infinite values, NaN and the literal string "None" are treated as
+    missing and come back as None. Flooring uses np.floor, i.e. it rounds
+    towards negative infinity rather than towards zero.
+
+    Args:
+        series (pd.Series): Input pandas series. Offending values are replaced
+            with ``inplace=True``, so the caller's series may be modified.
+        number_of_decimals (int): Number of decimal places to floor all numerical values to
+    Returns:
+        floored_series: The floored values, with None wherever the input is null.
+            NOTE(review): this is the direct result of np.where, which looks
+            like a NumPy array rather than the annotated pd.Series - confirm.
+    Raises:
+        TypeError: If series is not a pandas Series.
+        ValueError: If series is empty.
+    """
+
+    # We perform many operations using the division operator
+    # as well as elementwise multiplication. The result of such
+    # operations can introduce the values listed below, due to numerical
+    # instability. This results in unsafe type inference for numpy
+    # float types - exacerbated by pandas' type inference engine.
+    # Hence, to handle such offending values we default to None.
+    # Please see the reference, below, on nullable integer types for more details
+    unacceptable_values = [-np.inf, np.inf, "None", np.nan]
+    # Lookup table handed to Series.replace: every offending value -> None
+    mapping = {
+        unacceptable_value: None for unacceptable_value in unacceptable_values
+    }
+
+    # Reject anything that is not a pandas Series (e.g. a plain list),
+    # since the Series API (.empty, .isin, .replace, ...) is used below
+    if not isinstance(series, pd.Series):
+        raise TypeError(
+            f"Argument series must be of type pandas series, not of type {type(series).__name__}."
+        )
+
+    # An empty series is treated as a caller error rather than returned as-is
+    if series.empty:
+        raise ValueError("Empty series provided.")
+
+    # If any offending values are present, replace them with None.
+    # NOTE(review): inplace=True modifies the series object passed in by the caller
+    if series.isin(unacceptable_values).any():
+        series.replace(mapping, regex=False, inplace=True)
+
+    # Scale factor that moves the decimals to keep in front of the decimal point
+    multiplication_factor = 10 ** number_of_decimals
+
+    # In order to safely cast NaNs
+    # First coerce series to float type: series.astype(float)
+    # Please see here:
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#nullable-integer-data-type
+    # Scale up and floor; the division further below scales back down
+    product_for_numerator = np.floor(
+        series.astype(float) * multiplication_factor
+    )
+
+    floored_series = np.where(
+        series.isnull(),
+        # Null positions stay null (None) in the output
+        None,
+        # Every other position gets the floored value, scaled back down
+        product_for_numerator / multiplication_factor,
+    )
+
+    return floored_series
--- a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
--- a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
@ -0,0 +1,72 @@
+import pandas as pd
+import numpy as np
+import pytest
+
+from data_pipeline.etl.score.etl_utils import floor_series
+
+
+def test_floor_series():
+    """Check floor_series on regular, extreme, all-null, empty and non-Series inputs."""
+    # test examples
+    series = pd.Series(data=[None, 1, 0.324534, 1.2341], dtype="float64")
+    # Extreme inputs: infinities, the literal string "None", a negative value,
+    # a value very close to zero and a very large value
+    series_exponentiated = pd.Series(
+        data=[
+            -np.inf,
+            np.inf,
+            "None",
+            -0.131321313123123,
+            5.62322441e-15,
+            1.2341123131313131312e12,
+        ]
+    )
+    series_of_nan_values = pd.Series(data=[None, None, None, None, None])
+    series_empty = pd.Series(data=[], dtype="float64")
+    # A plain list (not a pd.Series) of randomly generated values,
+    # used to trigger the type check
+    invalid_type = list(np.random.uniform(1, 1000000, size=15))
+
+    # NOTE(review): `series` is passed to floor_series three times and
+    # floor_series replaces values with inplace=True - confirm the calls
+    # do not influence each other
+    floored_series_1 = floor_series(series, number_of_decimals=2)
+    floored_series_2 = floor_series(series, number_of_decimals=3)
+    floored_series_3 = floor_series(series, number_of_decimals=1)
+    floored_series_4 = floor_series(series_of_nan_values, number_of_decimals=10)
+    floored_series_5 = floor_series(series_exponentiated, number_of_decimals=1)
+    # expected fixtures
+    expected_1 = np.array([None, 1.0, 0.32, 1.23])
+    expected_2 = np.array([None, 1.00, 0.324, 1.234])
+    expected_3 = np.array([None, 1.0, 0.3, 1.2])
+    expected_4 = np.array([None, None, None, None, None])
+    # Infinities and "None" are expected as None; -0.1313... is expected to
+    # floor down to -0.2 and 5.62e-15 to floor to 0.0
+    expected_5 = np.array([None, None, None, -0.2, 0.0, 1234112313131.3])
+
+    # Test for expected value with 2 decimal places
+    # Elementwise comparison to ensure all values are equal
+    all_elements_are_equal_one = np.equal(expected_1, floored_series_1)
+    assert all_elements_are_equal_one.all()
+
+    # Test for expected value with 3 decimal places
+    # Elementwise comparison to ensure all values are equal
+    all_elements_are_equal_two = np.equal(expected_2, floored_series_2)
+    assert all_elements_are_equal_two.all()
+
+    # Test for expected value with 1 decimal place
+    # Elementwise comparison to ensure all values are equal
+    all_elements_are_equal_three = np.equal(expected_3, floored_series_3)
+    assert all_elements_are_equal_three.all()
+
+    # Test for expected value for some arbitrary decimal place
+    # Elementwise comparison to ensure all values are equal for an all-null series
+    all_elements_are_equal_four = np.equal(expected_4, floored_series_4)
+    assert all_elements_are_equal_four.all()
+    # Test for expected value for some arbitrary decimal place
+    # Elementwise comparison to ensure all floating point imprecision
+    # is clamped to a certain number of decimal points
+    all_elements_are_equal_five = np.equal(expected_5, floored_series_5)
+    assert all_elements_are_equal_five.all()
+
+    # Test for empty series - should raise a ValueError exception
+    with pytest.raises(ValueError, match="Empty series provided."):
+        floor_series(series_empty, number_of_decimals=2)
+
+    # Test for invalid type - should raise a TypeError exception
+    with pytest.raises(
+        TypeError,
+        match="Argument series must be of type pandas series, not of type list.",
+    ):
+        floor_series(invalid_type, number_of_decimals=3)