From 56644698ff40b0a155ef1f872cd798bb3462f25f Mon Sep 17 00:00:00 2001 From: Saran Ahluwalia <94847739+saran-ahluwalia@users.noreply.github.com> Date: Wed, 5 Jan 2022 17:03:37 -0500 Subject: [PATCH] Address rounding issue in Pandas series to floor numerically unstable values (#1085) * wip - added tests - 1 failing * added check for empty series + added test * passing tests * parallelism in variable assingnment choice * resolve merge conflicts * variable name changes * cleanup logic and move comments out of main code execution + add one more test for an extreme example eith -np.inf * cleanup logic and move comments out of main code execution + add one more test for an extreme example eith -np.inf * revisions to handle type ambiguity * fixing tests * fix pytest * fix linting * fix pytest * reword comments * cleanup comments * cleanup comments - fix typo * added type check and corresponding test * added type check and corresponding test * language cleanup * revert * update picke fixture Co-authored-by: Jorge Escobar --- .../data_pipeline/etl/score/etl_score_post.py | 15 ++-- .../data_pipeline/etl/score/etl_utils.py | 60 +++++++++++++++ .../tests/snapshots/tile_data_expected.pkl | Bin 3034 -> 3085 bytes .../etl/score/tests/test_etl_utils.py | 72 ++++++++++++++++++ 4 files changed, 141 insertions(+), 6 deletions(-) create mode 100644 data/data-pipeline/data_pipeline/etl/score/tests/test_etl_utils.py diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py index 8f527c7a..244cccbe 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_score_post.py @@ -3,6 +3,7 @@ import json import pandas as pd from data_pipeline.etl.base import ExtractTransformLoad +from data_pipeline.etl.score.etl_utils import floor_series from data_pipeline.utils import get_module_logger, zip_files from data_pipeline.score import field_names @@ -207,13 +208,15 @@ class PostScoreETL(ExtractTransformLoad): # filter the columns on full score score_tiles = score_county_state_merged_df[tiles_score_column_titles] - # round decimals - decimals = pd.Series( - [constants.TILES_ROUND_NUM_DECIMALS] - * len(constants.TILES_SCORE_FLOAT_COLUMNS), - index=constants.TILES_SCORE_FLOAT_COLUMNS, + score_tiles[constants.TILES_SCORE_FLOAT_COLUMNS] = score_tiles[ + constants.TILES_SCORE_FLOAT_COLUMNS + ].apply( + func=lambda series: floor_series( + series=series, + number_of_decimals=constants.TILES_ROUND_NUM_DECIMALS, + ), + axis=0, ) - score_tiles = score_tiles.round(decimals) # create indexes score_tiles = score_tiles.rename( diff --git a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py index c3fec035..2219ab18 100644 --- a/data/data-pipeline/data_pipeline/etl/score/etl_utils.py +++ b/data/data-pipeline/data_pipeline/etl/score/etl_utils.py @@ -1,6 +1,9 @@ import os import sys from pathlib import Path +import numpy as np +import pandas as pd + from data_pipeline.config import settings from data_pipeline.utils import ( @@ -48,3 +51,60 @@ def check_score_data_source( "No local score tiles data found. Please use '-d aws` to fetch from AWS" ) sys.exit() + + +def floor_series(series: pd.Series, number_of_decimals: int) -> pd.Series: + """Floors all non-null numerical values to a specific number of decimal points + + Args: + series (pd.Series): Input pandas series + number_of_decimals (int): Number of decimal points to floor all numerical values to + Returns: + floored_series (pd.Series): A Pandas Series of numerical values with appropriate number of decimal points + """ + + # we perform many operations using the division operator + # as well as elementwise multiplication. The result of such + # operations can introduce such values, below, due to numerical + # instability. This results in unsafe type inference for numpy + # float types - exacerbated by panda's type inference engine. + # Hence, to handle such offending values we default to None + # Please see the reference, below, on nullable integer types for more details + unacceptable_values = [-np.inf, np.inf, "None", np.nan] + mapping = { + unacceptable_value: None for unacceptable_value in unacceptable_values + } + + # ensure we are working with a numpy array (which is really what a pandas series is) + if not isinstance(series, pd.Series): + raise TypeError( + f"Argument series must be of type pandas series, not of type {type(series).__name__}." + ) + + # raise exception for handling empty series + if series.empty: + raise ValueError("Empty series provided.") + + # if we have any values, just replace them with None + if series.isin(unacceptable_values).any(): + series.replace(mapping, regex=False, inplace=True) + + multiplication_factor = 10 ** number_of_decimals + + # In order to safely cast NaNs + # First coerce series to float type: series.astype(float) + # Please see here: + # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#nullable-integer-data-type + product_for_numerator = np.floor( + series.astype(float) * multiplication_factor + ) + + floored_series = np.where( + series.isnull(), + # For all null values default to null + None, + # The other default condition - floor non-null values + product_for_numerator / multiplication_factor, + ) + + return floored_series diff --git a/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl b/data/data-pipeline/data_pipeline/etl/score/tests/snapshots/tile_data_expected.pkl index 8c47d6538afd0dfd2b146bce3b936c243ade83eb..d066a170c285542f993355c2d18925d057ef3394 100644 GIT binary patch delta 755 zcmca5-Ydb{z%rGIXCrF}6Qk1PC?+jNm&vV6_G}CwFlF+2c2S`46QHobWPaw)let(F zCU0gpWMrJ2!lEhTEU+SAp;If@R{I^&(>1>P9W@GwWv5hPr*6AG_Ry` zN=Bc%{kt83uUqAIxZA&v(fE3(KgQkuwJ{jD+rR1u3%T1r=dvwWe4WeP{w;{IEp)ek z0oD}cZvW;C2%H7#0vZAo1eya=`~arr5m0wMP#;hm$OesXK!d=F<=yQc!?Zn|2?Dd+ z?VnUVbYCY@?QVY+r1tA!Bop7Ifq}dIKA59HMuSa$1!uq#LsQm-fRy-mwsZ>xJP{~mDW@w#~q1iqqXv*XV>?{&8KwfC4 z^8-exIwmMR*?_HJ@(MN&8C9r&HI#OQ(x5ao*@?r1an|H=4lTy6&5JmKnHc9x{>Ej< QxL~pdcOc`-$yMA50Le=j-~a#s delta 698 zcmeB`xFycoz%unb_eRzbCPtOXQA}El9+O*{>=hUppkPYLl%x!m40UgY)+rg9?NfrL zXm~Svt4w|XRH`tUoB1=N_2lo&>Wq$)#aS#EB_;>3Xx2M50JU>nUtD0z_1=D+`@<@c zb?@zs!Qiz$i0A(Bm3>VAp|2V-&+Q9sK`P(c2knq+eI5A1{_GhLcw-ObfJE$J=0M~h z*n_q8KeGR(0W#qQTt7tpWBWIO^&ou@p$0v%KMb^{|Ik&qL124!yt7{dbl&yF2jB*R z-C_F*?m$hj3-{R1nh63cq2|1W(<=fNI<<0bwcjBfW7<@>YzJN3t!noLihz#<7>FjA^P4x9dZk<4iZv#?LpyJCGyT5;*ojw@?fvO zg@=S%HCV+Pd+c#10gTAdPUi!Y7qS;H3Qp$Y@SiNlV#vbm&D1tIpIcqpKv5XtvLbd^ zB7o4043qz{NNZsv24;wX5)x45s!-Y%oFteg2eF!~P3;lzN=-{kEh^5>Q*h2N%`2&# zk})k6l(=kw>518!5tOthmvWde&YQf7LyK|J=BpgROw0>