From 56644698ff40b0a155ef1f872cd798bb3462f25f Mon Sep 17 00:00:00 2001
From: Saran Ahluwalia <94847739+saran-ahluwalia@users.noreply.github.com>
Date: Wed, 5 Jan 2022 17:03:37 -0500
Subject: [PATCH] Address rounding issue in Pandas series to floor numerically
 unstable values (#1085)

* wip - added tests - 1 failing

* added check for empty series + added test

* passing tests

* parallelism in variable assingnment choice

* resolve merge conflicts

* variable name changes

* cleanup logic and move comments out of main code execution + add one more test for an extreme example eith -np.inf

* cleanup logic and move comments out of main code execution + add one more test for an extreme example eith -np.inf

* revisions to handle type ambiguity

* fixing tests

* fix pytest

* fix linting

* fix pytest

* reword comments

* cleanup comments

* cleanup comments - fix typo

* added type check and corresponding test

* added type check and corresponding test

* language cleanup

* revert

* update picke fixture

Co-authored-by: Jorge Escobar <jorge.e.escobar@omb.eop.gov>
---
 .../data_pipeline/etl/score/etl_score_post.py |  15 ++--
 .../data_pipeline/etl/score/etl_utils.py      |  60 +++++++++++++++
 .../tests/snapshots/tile_data_expected.pkl    | Bin 3034 -> 3085 bytes
 .../etl/score/tests/test_etl_utils.py         |  72 ++++++++++++++++++
 4 files changed, 141 insertions(+), 6 deletions(-)
 create mode 100644 data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py

diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
index 8f527c7a..244cccbe 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py
@@ -3,6 +3,7 @@ import json
 import pandas as pd
 
 from data_pipeline.etl.base import ExtractTransformLoad
+from data_pipeline.etl.score.etl_utils import floor_series
 from data_pipeline.utils import get_module_logger, zip_files
 from data_pipeline.score import field_names
 
@@ -207,13 +208,15 @@ class PostScoreETL(ExtractTransformLoad):
         # filter the columns on full score
         score_tiles = score_county_state_merged_df[tiles_score_column_titles]
 
-        # round decimals
-        decimals = pd.Series(
-            [constants.TILES_ROUND_NUM_DECIMALS]
-            * len(constants.TILES_SCORE_FLOAT_COLUMNS),
-            index=constants.TILES_SCORE_FLOAT_COLUMNS,
+        score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[
+            constants.TILES_SCORE_FLOAT_COLUMNS
+        ].apply(
+            func=lambda series: floor_series(
+                series=series,
+                number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS,
+            ),
+            axis=0,
         )
-        score_tiles = score_tiles.round(decimals)
 
         # create indexes
         score_tiles = score_tiles.rename(
diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
index c3fec035..2219ab18 100644
--- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
+++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py
@@ -1,6 +1,9 @@
 import os
 import sys
 from pathlib import Path
+import numpy as np
+import pandas as pd
+
 
 from data_pipeline.config import settings
 from data_pipeline.utils import (
@@ -48,3 +51,60 @@ def check_score_data_source(
                 "No local score tiles data found. Please use '-d aws` to fetch from AWS"
             )
             sys.exit()
+
+
+def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series:
+    """Floors all non-null numerical values to a specific number of decimal points
+
+    Args:
+        series (pd.Series): Input pandas series
+        number_of_decimals (int): Number of decimal points to floor all numerical values to
+    Returns:
+        floored_series (pd.Series): A Pandas Series of numerical values with appropriate number of decimal points
+    """
+
+    # we perform many operations using the division operator
+    # as well as elementwise multiplication. The result of such
+    # operations can introduce such values, below, due to numerical
+    # instability. This results in unsafe type inference for numpy
+    # float types - exacerbated by panda's type inference engine.
+    # Hence, to handle such offending values we default to None
+    # Please see the reference, below, on nullable integer types for more details
+    unacceptable_values = [-np.inf, np.inf, "None", np.nan]
+    mapping = {
+        unacceptable_value: None for unacceptable_value in unacceptable_values
+    }
+
+    # ensure we are working with a numpy array (which is really what a pandas series is)
+    if not isinstance(series, pd.Series):
+        raise TypeError(
+            f"Argument series must be of type pandas series, not of type {type(series).__name__}."
+        )
+
+    # raise exception for handling empty series
+    if series.empty:
+        raise ValueError("Empty series provided.")
+
+    # if we have any values, just replace them with None
+    if series.isin(unacceptable_values).any():
+        series.replace(mapping, regex=False, inplace=True)
+
+    multiplication_factor = 10 ** number_of_decimals
+
+    # In order to safely cast NaNs
+    # First coerce series to float type: series.astype(float)
+    # Please see here:
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#nullable-integer-data-type
+    product_for_numerator = np.floor(
+        series.astype(float) * multiplication_factor
+    )
+
+    floored_series = np.where(
+        series.isnull(),
+        # For all null values default to null
+        None,
+        # The other default condition - floor non-null values
+        product_for_numerator / multiplication_factor,
+    )
+
+    return floored_series
diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl
index 8c47d6538afd0dfd2b146bce3b936c243ade83eb..d066a170c285542f993355c2d18925d057ef3394 100644
GIT binary patch
delta 755
zcmca5-Ydb{z%rGIXCrF}6Qk1PC?+jNm&vV6_G}CwFlF+2c2S`46QHobWPaw)let(F
zCU0gpWMrJ2!lEhTEU+SAp;If@R{I^&(>1>P9<!fPG9@X)##_)^XtElUF$<G7Q`=-4
zZgob5$-e9oBAj4Tf#yO~Nq8rDt4`KnGGvsQ{FL3izPCrfD>W@GwWv5hPr*6AG_Ry`
zN=Bc%{kt83uUqAIxZA&v(fE3(KgQkuwJ{jD+rR1u3%T1r=dvwWe4WeP{w;{IEp)ek
z0oD}cZvW;C2%H7#0vZAo1eya=`~arr5m0wMP#;hm$OesXK!d=F<=yQc!?Zn|2?Dd+
z?VnUVbYCY@?QVY+r1tA!Bop7Ifq}dIKA59HMuSa$1!uq<y$9?aO?Ug1AcN~cM#G)|
z7H0Hgpk|;6NX~x((*}3z9he9_JYGP88c7#C7~a4X-vapp=oNd2;(7*m`@68vdk2zO
zC*p2D1?E?{ez+5120>#LsQm-fRy-mwsZ>xJP{~mDW@w#~q1iqqXv*XV>?{&8KwfC4
z^8-exIwmMR*?_HJ@(MN&8C9r&HI#OQ(x5ao*@?r1an|H=4lTy6&5JmKnHc9x{>Ej<
QxL~pdcOc`-$yMA50Le=j-~a#s

delta 698
zcmeB`xFycoz%unb_eRzbCPtOXQA}El9+O*{>=hUppkPYLl%x!m40UgY)+rg9?NfrL
zXm~Svt4w|XRH`tUoB1=N_2lo&>Wq$)#aS#EB_;>3Xx2M50JU>nUtD0z_1=D+`@<@c
zb?@zs!Qiz$i0A(Bm3>VAp|2V-&+Q9sK`P(c2knq+eI5A1{_GhLcw-ObfJE$J=0M~h
z*n_q8KeGR(0W#qQTt7tpWBWIO^&ou@p$0v%KMb^{|Ik&qL124!yt7{dbl&yF2jB*R
z-C_F*?m$hj3-{R1nh63cq2|1W(<=fNI<<0bwcjB<UE{m&F?+CouRpfGUJvs4_1E?g
z1D@D>fW7<@>YzJN3t!noLihz#<7>FjA^P4x9dZk<4iZv#?LpyJCGyT5;*ojw@?fvO
zg@=S%HCV+Pd+c#10gTAdPUi!Y7qS;H3Qp$Y@SiNlV#vbm&D1tIpIcqpKv5XtvLbd^
zB7o4043qz{NNZsv24;wX5)x45s!-Y%oFteg2eF!~P3;lzN=-{kEh^5>Q*h2N%`2&#
zk})k6l(=kw>518!5tOthmvWde&YQf7LyK|J=BpgROw0><q$f9Uh)lNUmS$WsIhs3=
K@$KZb+z9}NzZqo!

diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
new file mode 100644
index 00000000..594f4856
--- /dev/null
+++ b/data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py
@@ -0,0 +1,72 @@
+import pandas as pd
+import numpy as np
+import pytest
+
+from data_pipeline.etl.score.etl_utils import floor_series
+
+
+def test_floor_series():
+    # test examples
+    series = pd.Series(data=[None, 1, 0.324534, 1.2341], dtype="float64")
+    series_exponentiated = pd.Series(
+        data=[
+            -np.inf,
+            np.inf,
+            "None",
+            -0.131321313123123,
+            5.62322441e-15,
+            1.2341123131313131312e12,
+        ]
+    )
+    series_of_nan_values = pd.Series(data=[None, None, None, None, None])
+    series_empty = pd.Series(data=[], dtype="float64")
+    # list of randomly generated values
+    invalid_type = list(np.random.uniform(1, 1000000, size=15))
+
+    floored_series_1 = floor_series(series, number_of_decimals=2)
+    floored_series_2 = floor_series(series, number_of_decimals=3)
+    floored_series_3 = floor_series(series, number_of_decimals=1)
+    floored_series_4 = floor_series(series_of_nan_values, number_of_decimals=10)
+    floored_series_5 = floor_series(series_exponentiated, number_of_decimals=1)
+    # expected fixtures
+    expected_1 = np.array([None, 1.0, 0.32, 1.23])
+    expected_2 = np.array([None, 1.00, 0.324, 1.234])
+    expected_3 = np.array([None, 1.0, 0.3, 1.2])
+    expected_4 = np.array([None, None, None, None, None])
+    expected_5 = np.array([None, None, None, -0.2, 0.0, 1234112313131.3])
+
+    # Test for expected value with 2 decimal places
+    # Elewentwise comparison to ensure all values are equal
+    all_elements_are_equal_one = np.equal(expected_1, floored_series_1)
+    assert all_elements_are_equal_one.all()
+
+    # Test for expected value with 3 decimal places
+    # Elewentwise comparison to ensure all values are equal
+    all_elements_are_equal_two = np.equal(expected_2, floored_series_2)
+    assert all_elements_are_equal_two.all()
+
+    # Test for expected value with 1 decimal place
+    # Elewentwise comparison to ensure all values are equal
+    all_elements_are_equal_three = np.equal(expected_3, floored_series_3)
+    assert all_elements_are_equal_three.all()
+
+    # Test for expected value for some arbitrary decimal place
+    # Elewentwise comparison to ensure all values are equal for NaN
+    all_elements_are_equal_four = np.equal(expected_4, floored_series_4)
+    assert all_elements_are_equal_four.all()
+    # Test for expected value for some arbitrary decimal place
+    # Elewentwise comparison to ensure all floating point imprecision
+    # is clamped to a certain number of decimal points
+    all_elements_are_equal_five = np.equal(expected_5, floored_series_5)
+    assert all_elements_are_equal_five.all()
+
+    # Test for empty series - should raise a ValueError exception
+    with pytest.raises(ValueError, match="Empty series provided."):
+        floor_series(series_empty, number_of_decimals=2)
+
+    # Test for invalid type - should raise a TypeError exception
+    with pytest.raises(
+        TypeError,
+        match="Argument series must be of type pandas series, not of type list.",
+    ):
+        floor_series(invalid_type, number_of_decimals=3)