From 2604b66cf73132be186583b43d29863eee4aff0c Mon Sep 17 00:00:00 2001 From: Saran Ahluwalia <94847739+saran-ahluwalia@users.noreply.github.com> Date: Fri, 14 Jan 2022 13:11:47 -0500 Subject: [PATCH] Fix errors and improve code quality and readability in Health Scores (#1147) * run black on health_score.py * to_numpy() versus values - see https://pandas.pydata.org/pandas-docs/version/0.24.0rc1/api/generated/pandas.Series.to_numpy.html --- .../experiment_4_weighting/health_scores.py | 102 +++++++++++------- 1 file changed, 66 insertions(+), 36 deletions(-) diff --git a/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/health_scores.py b/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/health_scores.py index f4cc7a3a..de6c2a8f 100644 --- a/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/health_scores.py +++ b/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/health_scores.py @@ -8,10 +8,11 @@ import numpy as np import pandas as pd from sklearn.preprocessing import MinMaxScaler -class HealthScores(): + +class HealthScores: """ Calculates health scores by calling the final_scaled_data() method - + Arguments: weights_1(np array) : weights of model 1. weights_2(np array) : weights of model 2. @@ -21,9 +22,18 @@ class HealthScores(): is_weighted_average(boolean) : weights calculation methodology(default is True) weightage(list) : weightage for each y-variable """ - - def __init__(self,weights_1,weights_2,weights_3,multiplied_data,geoid,is_weighted_average = True,weightage = [0.25,0.25,0.5]): - + + def __init__( + self, + weights_1, + weights_2, + weights_3, + multiplied_data, + geoid, + is_weighted_average=True, + weightage=[0.25, 0.25, 0.5], + ): + self.weights_1 = weights_1 self.weights_2 = weights_2 self.weights_3 = weights_3 @@ -31,74 +41,94 @@ class HealthScores(): self.geoid = geoid self.is_weighted_average = is_weighted_average self.weightage = weightage - self.weights = np.zeros(self.weights_1.shape).reshape(-1,1) - + self.weights = np.zeros(self.weights_1.shape).reshape(-1, 1) + def _get_weights(self): - ''' + """ Averages weights of all the models and performs transformation so that sum of all weights will be equal to 1. - + Arguments : None Returns : Averaged weights which totals to 1. - ''' - weights = np.hstack((self.weights_1.reshape(-1,1),self.weights_2.reshape(-1,1),self.weights_3.reshape(-1,1))) - weights = np.sum(weights,axis = 1) / weights.shape[1] + """ + weights = np.hstack( + ( + self.weights_1.reshape(-1, 1), + self.weights_2.reshape(-1, 1), + self.weights_3.reshape(-1, 1), + ) + ) + weights = np.sum(weights, axis=1) / weights.shape[1] self.weights = weights / np.sum(weights) return self.weights - + def _weighted_average(self): - ''' + """ weighted average: - Averages weights of all the models with specified weightage for each model. And performs transformation so that sum of + Averages weights of all the models with specified weightage for each model. And performs transformation so that sum of all weights will be equal to 1. Arguments : None Returns : Averaged weights which totals to 1.(np array) - ''' - weights = np.hstack((self.weights_1.reshape(-1,1) * self.weightage[0],self.weights_2.reshape(-1,1) * self.weightage[1],self.weights_3.reshape(-1,1) * self.weightage[2])) - weights = np.sum(weights,axis = 1) + """ + weights = np.hstack( + ( + self.weights_1.reshape(-1, 1) * self.weightage[0], + self.weights_2.reshape(-1, 1) * self.weightage[1], + self.weights_3.reshape(-1, 1) * self.weightage[2], + ) + ) + weights = np.sum(weights, axis=1) self.weights = weights / np.sum(weights) return self.weights - + def _health_score(self): - ''' + """ Converts data in (0 to 100)scale using min max scaler and multiiplying with 100. Then it calculates health scores by multiplying with the weights Returns : data frame with health score and x variables in (0 - 100)scale. - ''' - columns = list(self.multiplied_data.columns) + ['health_scores'] - scaled_data = MinMaxScaler().fit_transform(self.multiplied_data.values) * 100 - health_scores = np.dot(scaled_data,self.weights.reshape(-1,1)) + """ + columns = list(self.multiplied_data.columns) + ["health_scores"] + scaled_data = ( + MinMaxScaler().fit_transform(self.multiplied_data.to_numpy()) * 100 + ) + health_scores = np.dot(scaled_data, self.weights.reshape(-1, 1)) health_scores = MinMaxScaler().fit_transform(health_scores) * 100 - scaled_data = np.hstack((scaled_data,health_scores)) + scaled_data = np.hstack((scaled_data, health_scores)) - scaled_data = pd.DataFrame(data = scaled_data, columns = columns) + scaled_data = pd.DataFrame(data=scaled_data, columns=columns) return scaled_data - + def final_scaled_data(self): """ Calls appropriate methods in class based on arguments. Concatenates geoids and health scores. - + Arguments : None - Returns : + Returns : final_data(df) : dataframe with all health scores and geoids weights_tables(df) : dataframe with each y-variable weights and averaged weights """ - + # final scaled data if self.is_weighted_average: self._weighted_average() else: self._get_weights() - + scaled_data_100 = self._health_score() - final_data = pd.concat([self.geoid,scaled_data_100],axis = 1) - - weights_table = pd.DataFrame({'phy_health_weights' : self.weights_1, 'mntl_health_weights' : self.weights_2, 'life_expectancy_weights' : self.weights_3, 'averaged_weights' : self.weights},index = self.multiplied_data.columns) - - return final_data,weights_table + final_data = pd.concat([self.geoid, scaled_data_100], axis=1) + weights_table = pd.DataFrame( + { + "phy_health_weights": self.weights_1, + "mntl_health_weights": self.weights_2, + "life_expectancy_weights": self.weights_3, + "averaged_weights": self.weights, + }, + index=self.multiplied_data.columns, + ) + return final_data, weights_table