Add experimental Jupyter notebook with Health Scoring Methodology Example for Health Scores (#989)

Co-authored-by: Saran Ahluwalia <sarahluw@cisco.com>
This commit is contained in:
Saran Ahluwalia 2022-01-13 14:43:27 -05:00 committed by GitHub
parent 4cec1bb37e
commit 98ff4bd9d8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 1045 additions and 0 deletions

View file

@ -0,0 +1,104 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
class HealthScores():
    """
    Calculates health scores by calling the final_scaled_data() method.

    The weights of three models are combined into one weight vector that
    sums to 1, every x variable is rescaled to 0 - 100, and the health score
    is the weighted sum of the rescaled variables, itself rescaled to 0 - 100.

    Arguments:
        weights_1(np array) : weights of model 1.
        weights_2(np array) : weights of model 2.
        weights_3(np array) : weights of model 3.
        multiplied_data(df) : all census tracts data which needs to be multiplied with weights to get health scores
        geoid(Series) : geoids of census tracts to concatenate with our health scores data.
        is_weighted_average(boolean) : weights calculation methodology(default is True)
        weightage(list) : weightage for each y-variable(default is [0.25,0.25,0.5])
    """

    def __init__(self, weights_1, weights_2, weights_3, multiplied_data, geoid,
                 is_weighted_average=True, weightage=None):
        self.weights_1 = weights_1
        self.weights_2 = weights_2
        self.weights_3 = weights_3
        self.multiplied_data = multiplied_data
        self.geoid = geoid
        self.is_weighted_average = is_weighted_average
        # A fresh list is created for every instance: a list literal used as
        # the default value would be shared (and mutable) across all instances.
        self.weightage = [0.25, 0.25, 0.5] if weightage is None else weightage
        # Placeholder column of zeros; replaced by _get_weights() or
        # _weighted_average() when final_scaled_data() runs.
        self.weights = np.zeros(np.shape(self.weights_1)).reshape(-1, 1)

    @staticmethod
    def _as_column(values):
        '''
        Converts an array-like of weights into an (n, 1) numpy column.
        '''
        return np.asarray(values).reshape(-1, 1)

    def _get_weights(self):
        '''
        Averages weights of all the models and performs transformation so that sum of all weights will be equal to 1.
        Arguments : None
        Returns : Averaged weights which totals to 1.(np array)
        '''
        stacked = np.hstack((self._as_column(self.weights_1),
                             self._as_column(self.weights_2),
                             self._as_column(self.weights_3)))
        # Plain mean across the three models for each variable.
        averaged = np.sum(stacked, axis=1) / stacked.shape[1]
        # Normalise so that the combined weights add up to 1.
        self.weights = averaged / np.sum(averaged)
        return self.weights

    def _weighted_average(self):
        '''
        weighted average:
        Averages weights of all the models with specified weightage for each model. And performs transformation so that sum of
        all weights will be equal to 1.
        Arguments : None
        Returns : Averaged weights which totals to 1.(np array)
        '''
        # Only the first three entries of self.weightage are used, one per model.
        stacked = np.hstack((self._as_column(self.weights_1) * self.weightage[0],
                             self._as_column(self.weights_2) * self.weightage[1],
                             self._as_column(self.weights_3) * self.weightage[2]))
        combined = np.sum(stacked, axis=1)
        # Normalise so that the combined weights add up to 1.
        self.weights = combined / np.sum(combined)
        return self.weights

    def _health_score(self):
        '''
        Converts data in (0 to 100)scale using min max scaler and multiplying with 100.
        Then it calculates health scores by multiplying with the weights
        Arguments : None
        Returns : data frame with health score and x variables in (0 - 100)scale.
        '''
        columns = list(self.multiplied_data.columns) + ['health_scores']
        # Each x variable is min-max scaled column-wise, then stretched to 0 - 100.
        scaled_data = MinMaxScaler().fit_transform(self.multiplied_data.values) * 100
        # Weighted sum of the scaled variables: one raw score per row.
        health_scores = np.dot(scaled_data, self.weights.reshape(-1, 1))
        # The raw scores are themselves rescaled to span 0 - 100.
        health_scores = MinMaxScaler().fit_transform(health_scores) * 100
        scaled_data = np.hstack((scaled_data, health_scores))
        # No index is passed, so the result carries a default RangeIndex.
        scaled_data = pd.DataFrame(data=scaled_data, columns=columns)
        return scaled_data

    def final_scaled_data(self):
        """
        Calls appropriate methods in class based on arguments. Concatenates geoids and health scores.
        Arguments : None
        Returns :
            final_data(df) : dataframe with all health scores and geoids
            weights_tables(df) : dataframe with each y-variable weights and averaged weights
        """
        # Combine the three models' weights with the selected methodology.
        if self.is_weighted_average:
            self._weighted_average()
        else:
            self._get_weights()
        scaled_data_100 = self._health_score()
        # NOTE: pd.concat(axis = 1) aligns rows on index labels and the scaled
        # frame has a default RangeIndex, so geoid is expected to carry a
        # matching 0..n-1 index in the same row order as multiplied_data.
        final_data = pd.concat([self.geoid, scaled_data_100], axis=1)
        # One row per x variable: the weights of each model and the combined weights.
        weights_table = pd.DataFrame(
            {
                'phy_health_weights': self.weights_1,
                'mntl_health_weights': self.weights_2,
                'life_expectancy_weights': self.weights_3,
                'averaged_weights': self.weights,
            },
            index=self.multiplied_data.columns,
        )
        return final_data, weights_table

View file

@ -0,0 +1,941 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from health_scores import HealthScores\n",
"from Model import Model\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load the master dataset in raw format (all variables we have collected). Many variables have nulls. In the code, null values are handled by imputing with the group means of the first 8 digits of the geoid (which represent the nearest census tracts). "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1172, 55)\n",
"Index(['Unnamed: 0', 'COI_FOOD', 'COI_GREEN', 'COI_WALK', 'COI_VACANCY',\n",
" 'COI_SUPRFND', 'COI_RSEI', 'COI_PM25', 'COI_OZONE', 'COI_HEAT',\n",
" 'COI_HLTHINS', 'latitude', 'longitude', 'geoid', 'countyfips',\n",
" 'TractFIPS', 'County', 'StateAbbr', 'PlaceName', 'PlaceFIPS',\n",
" 'Place_TractID', 'Population2010', 'ACCESS2_CrudePrev',\n",
" 'ARTHRITIS_CrudePrev', 'BINGE_CrudePrev', 'BPHIGH_CrudePrev',\n",
" 'BPMED_CrudePrev', 'CANCER_CrudePrev', 'CASTHMA_CrudePrev',\n",
" 'CHD_CrudePrev', 'CHECKUP_CrudePrev', 'CHOLSCREEN_CrudePrev',\n",
" 'COLON_SCREEN_CrudePrev', 'COPD_CrudePrev', 'COREM_CrudePrev',\n",
" 'COREW_CrudePrev', 'CSMOKING_CrudePrev', 'DENTAL_CrudePrev',\n",
" 'DIABETES_CrudePrev', 'HIGHCHOL_CrudePrev', 'KIDNEY_CrudePrev',\n",
" 'LPA_CrudePrev', 'MAMMOUSE_CrudePrev', 'MHLTH_CrudePrev',\n",
" 'OBESITY_CrudePrev', 'PAPTEST_CrudePrev', 'PHLTH_CrudePrev',\n",
" 'SLEEP_CrudePrev', 'STROKE_CrudePrev', 'TEETHLOST_CrudePrev',\n",
" 'life expectancy', 'Alcohol Test', 'Drug Test', 'Pedalcyclist',\n",
" 'Pedastrian'],\n",
" dtype='object')\n"
]
}
],
"source": [
"df_raw = pd.read_csv('master_raw_data.csv') #file from the 'Data collection.ipynb'\n",
"print(df_raw.shape)\n",
"print(df_raw.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"From these variables you can pick any to include in the model and get the results. (The method is explained below.)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### More advanced imputation method "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Missing data are set equal to the values of the nearest neighborhood (the distance between latitude/longitude points was used; see https://www.movable-type.co.uk/scripts/latlong.html for details)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geoid</th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>ED_PRXECE</th>\n",
" <th>ED_PRXHQECE</th>\n",
" <th>ED_ECENROL</th>\n",
" <th>ED_READING</th>\n",
" <th>ED_MATH</th>\n",
" <th>ED_HSGRAD</th>\n",
" <th>ED_APENR</th>\n",
" <th>...</th>\n",
" <th>PAPTEST_CrudePrev</th>\n",
" <th>PHLTH_CrudePrev</th>\n",
" <th>SLEEP_CrudePrev</th>\n",
" <th>STROKE_CrudePrev</th>\n",
" <th>TEETHLOST_CrudePrev</th>\n",
" <th>Alcohol Test</th>\n",
" <th>Drug Test</th>\n",
" <th>Pedalcyclist</th>\n",
" <th>Pedastrian</th>\n",
" <th>life expectancy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>48085030100</td>\n",
" <td>-96.39820</td>\n",
" <td>33.29592</td>\n",
" <td>1.343954</td>\n",
" <td>-13.815511</td>\n",
" <td>30.9</td>\n",
" <td>217.85674</td>\n",
" <td>250.22748</td>\n",
" <td>94.002556</td>\n",
" <td>0.278373</td>\n",
" <td>...</td>\n",
" <td>78.30</td>\n",
" <td>11.90</td>\n",
" <td>35.40</td>\n",
" <td>2.70</td>\n",
" <td>18.80</td>\n",
" <td>0.821918</td>\n",
" <td>0.821918</td>\n",
" <td>0.000000</td>\n",
" <td>1.095890</td>\n",
" <td>76.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>48085030201</td>\n",
" <td>-96.53734</td>\n",
" <td>33.26331</td>\n",
" <td>2.069664</td>\n",
" <td>-13.815511</td>\n",
" <td>61.0</td>\n",
" <td>220.04181</td>\n",
" <td>246.44695</td>\n",
" <td>87.928993</td>\n",
" <td>0.287710</td>\n",
" <td>...</td>\n",
" <td>78.30</td>\n",
" <td>11.90</td>\n",
" <td>35.40</td>\n",
" <td>2.70</td>\n",
" <td>18.80</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>48085030202</td>\n",
" <td>-96.64279</td>\n",
" <td>33.34124</td>\n",
" <td>1.751906</td>\n",
" <td>-13.815511</td>\n",
" <td>0.0</td>\n",
" <td>233.74568</td>\n",
" <td>262.12021</td>\n",
" <td>80.740799</td>\n",
" <td>0.568353</td>\n",
" <td>...</td>\n",
" <td>81.40</td>\n",
" <td>8.80</td>\n",
" <td>35.00</td>\n",
" <td>1.60</td>\n",
" <td>9.50</td>\n",
" <td>4.464286</td>\n",
" <td>1.116071</td>\n",
" <td>0.000000</td>\n",
" <td>2.232143</td>\n",
" <td>78.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>48085030203</td>\n",
" <td>-96.54547</td>\n",
" <td>33.34176</td>\n",
" <td>2.338918</td>\n",
" <td>-13.815511</td>\n",
" <td>32.7</td>\n",
" <td>226.88499</td>\n",
" <td>261.43530</td>\n",
" <td>95.360466</td>\n",
" <td>0.290443</td>\n",
" <td>...</td>\n",
" <td>81.40</td>\n",
" <td>8.80</td>\n",
" <td>35.00</td>\n",
" <td>1.60</td>\n",
" <td>9.50</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.163881</td>\n",
" <td>0.327761</td>\n",
" <td>78.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>48085030302</td>\n",
" <td>-96.75005</td>\n",
" <td>33.24045</td>\n",
" <td>3.588619</td>\n",
" <td>-13.815511</td>\n",
" <td>62.3</td>\n",
" <td>250.81639</td>\n",
" <td>274.56683</td>\n",
" <td>96.399155</td>\n",
" <td>0.650187</td>\n",
" <td>...</td>\n",
" <td>82.05</td>\n",
" <td>9.15</td>\n",
" <td>32.95</td>\n",
" <td>1.95</td>\n",
" <td>7.75</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.407166</td>\n",
" <td>0.407166</td>\n",
" <td>82.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 65 columns</p>\n",
"</div>"
],
"text/plain": [
" geoid longitude latitude ED_PRXECE ED_PRXHQECE ED_ECENROL \\\n",
"0 48085030100 -96.39820 33.29592 1.343954 -13.815511 30.9 \n",
"1 48085030201 -96.53734 33.26331 2.069664 -13.815511 61.0 \n",
"2 48085030202 -96.64279 33.34124 1.751906 -13.815511 0.0 \n",
"3 48085030203 -96.54547 33.34176 2.338918 -13.815511 32.7 \n",
"4 48085030302 -96.75005 33.24045 3.588619 -13.815511 62.3 \n",
"\n",
" ED_READING ED_MATH ED_HSGRAD ED_APENR ... \\\n",
"0 217.85674 250.22748 94.002556 0.278373 ... \n",
"1 220.04181 246.44695 87.928993 0.287710 ... \n",
"2 233.74568 262.12021 80.740799 0.568353 ... \n",
"3 226.88499 261.43530 95.360466 0.290443 ... \n",
"4 250.81639 274.56683 96.399155 0.650187 ... \n",
"\n",
" PAPTEST_CrudePrev PHLTH_CrudePrev SLEEP_CrudePrev STROKE_CrudePrev \\\n",
"0 78.30 11.90 35.40 2.70 \n",
"1 78.30 11.90 35.40 2.70 \n",
"2 81.40 8.80 35.00 1.60 \n",
"3 81.40 8.80 35.00 1.60 \n",
"4 82.05 9.15 32.95 1.95 \n",
"\n",
" TEETHLOST_CrudePrev Alcohol Test Drug Test Pedalcyclist Pedastrian \\\n",
"0 18.80 0.821918 0.821918 0.000000 1.095890 \n",
"1 18.80 0.000000 0.000000 0.000000 0.000000 \n",
"2 9.50 4.464286 1.116071 0.000000 2.232143 \n",
"3 9.50 0.000000 0.000000 0.163881 0.327761 \n",
"4 7.75 0.000000 0.000000 0.407166 0.407166 \n",
"\n",
" life expectancy \n",
"0 76.7 \n",
"1 NaN \n",
"2 78.8 \n",
"3 78.2 \n",
"4 82.0 \n",
"\n",
"[5 rows x 65 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#More advanced imputation method\n",
"df_imputed_x = pd.read_excel('20200420_input_final.xlsx')\n",
"df_imputed_x.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Steps to get the model output:\n",
"\n",
"#### 1. Initialize Model() class by passing the data file.\n",
"#### 2. Call the model_output() method with the below arguments.\n",
" \n",
" columns_regress(list) : x variables to include in the regression model\n",
" target(string) : target variable(y) in the regression model\n",
" multiply_cols(dict): dictionary with columns and their multiplier(-1,1) as key-value pairs to rescale all variables\n",
" so that higher is better.\n",
"\n",
" Default Arguments : Has default values that can be changed as per requirement.\n",
" \n",
" columns_impute(list) : columns to be imputed (default is None)\n",
" winsorize_outliers(dict) : dictionary of limits for the respective columns{'col' : limit} (default is None)\n",
" winsorize_with_95(boolean) : winsorize all columns with 95 percentile(True or False) (default - False)\n",
" target_multiplier(int) : to change the direction of y variable if needed(default is 1)\n",
"\n",
"#### IMPORTANT : Follow the arguments order or specify the argument name when calling the method.\n",
"Example : model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)\n",
"\n",
"\n",
" Returns :returns all census tracts transformed data(high is good).\n",
" :prints model summary.\n",
" :returns model weights.\n",
" \n",
"##### Store the data and model weights to get the cumulative health score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example :"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y R-squared: 0.558\n",
"Model: OLS Adj. R-squared: 0.553\n",
"Method: Least Squares F-statistic: 103.9\n",
"Date: Tue, 28 Apr 2020 Prob (F-statistic): 1.35e-165\n",
"Time: 09:32:30 Log-Likelihood: -1009.2\n",
"No. Observations: 999 AIC: 2044.\n",
"Df Residuals: 986 BIC: 2108.\n",
"Df Model: 12 \n",
"Covariance Type: nonrobust \n",
"=====================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"-------------------------------------------------------------------------------------\n",
"const 5.117e-16 0.021 2.42e-14 1.000 -0.042 0.042\n",
"HE_FOOD 0.0891 0.028 3.134 0.002 0.033 0.145\n",
"HE_WALK 0.0926 0.025 3.646 0.000 0.043 0.142\n",
"HE_VACANCY 0.1124 0.026 4.300 0.000 0.061 0.164\n",
"HE_SUPRFND 0.1049 0.029 3.579 0.000 0.047 0.162\n",
"HE_HLTHINS 0.3111 0.056 5.533 0.000 0.201 0.421\n",
"BINGE_CrudePrev 0.1404 0.055 2.573 0.010 0.033 0.247\n",
"CHECKUP_CrudePrev 0.3370 0.119 2.841 0.005 0.104 0.570\n",
"BPHIGH_CrudePrev 0.2403 0.141 1.704 0.089 -0.036 0.517\n",
"SLEEP_CrudePrev 0.3075 0.037 8.209 0.000 0.234 0.381\n",
"STROKE_CrudePrev 0.2741 0.096 2.841 0.005 0.085 0.463\n",
"Drug Test 0.0294 0.027 1.107 0.269 -0.023 0.081\n",
"Pedalcyclist 0.0648 0.027 2.408 0.016 0.012 0.118\n",
"==============================================================================\n",
"Omnibus: 17.440 Durbin-Watson: 1.833\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.414\n",
"Skew: -0.074 Prob(JB): 2.49e-07\n",
"Kurtosis: 3.842 Cond. No. 18.5\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"data = df_imputed_x.copy() #data\n",
"model1 = Model(data) #Initializing class variable\n",
"\n",
"# variables list to include in our regression model.\n",
"columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
" 'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
"\n",
"# multipliers for each variable to rescale the variables as higher the value better for the health score.\n",
"multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
" 'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
"\n",
"#target variable\n",
"target = 'life expectancy'\n",
"\n",
"#storing the data and model weights to calculate health score.\n",
"multiplied_zscore_data_le, params_le = model1.model_output(columns_regress,target,multiply_cols)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y R-squared: 0.917\n",
"Model: OLS Adj. R-squared: 0.916\n",
"Method: Least Squares F-statistic: 906.4\n",
"Date: Tue, 28 Apr 2020 Prob (F-statistic): 0.00\n",
"Time: 09:32:30 Log-Likelihood: -175.01\n",
"No. Observations: 999 AIC: 376.0\n",
"Df Residuals: 986 BIC: 439.8\n",
"Df Model: 12 \n",
"Covariance Type: nonrobust \n",
"=====================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"-------------------------------------------------------------------------------------\n",
"const -4.337e-18 0.009 -4.72e-16 1.000 -0.018 0.018\n",
"HE_FOOD -0.0565 0.012 -4.585 0.000 -0.081 -0.032\n",
"HE_WALK -0.0731 0.011 -6.640 0.000 -0.095 -0.052\n",
"HE_VACANCY -0.0380 0.011 -3.355 0.001 -0.060 -0.016\n",
"HE_SUPRFND -0.0610 0.013 -4.795 0.000 -0.086 -0.036\n",
"HE_HLTHINS 0.2803 0.024 11.491 0.000 0.232 0.328\n",
"BINGE_CrudePrev 0.3286 0.024 13.881 0.000 0.282 0.375\n",
"CHECKUP_CrudePrev 0.4692 0.051 9.118 0.000 0.368 0.570\n",
"BPHIGH_CrudePrev 0.5301 0.061 8.664 0.000 0.410 0.650\n",
"SLEEP_CrudePrev 0.3841 0.016 23.633 0.000 0.352 0.416\n",
"STROKE_CrudePrev 0.5059 0.042 12.089 0.000 0.424 0.588\n",
"Drug Test -0.0137 0.012 -1.191 0.234 -0.036 0.009\n",
"Pedalcyclist -0.0416 0.012 -3.560 0.000 -0.064 -0.019\n",
"==============================================================================\n",
"Omnibus: 46.489 Durbin-Watson: 1.112\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 134.286\n",
"Skew: -0.136 Prob(JB): 6.92e-30\n",
"Kurtosis: 4.775 Cond. No. 18.5\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
" 'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
"\n",
"multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
" 'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
"\n",
"target = 'PHLTH_CrudePrev'\n",
"target_multiplier = -1\n",
"\n",
"multiplied_zscore_data_1, params_1 = model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y R-squared: 0.880\n",
"Model: OLS Adj. R-squared: 0.878\n",
"Method: Least Squares F-statistic: 600.6\n",
"Date: Tue, 28 Apr 2020 Prob (F-statistic): 0.00\n",
"Time: 09:32:30 Log-Likelihood: -359.89\n",
"No. Observations: 999 AIC: 745.8\n",
"Df Residuals: 986 BIC: 809.6\n",
"Df Model: 12 \n",
"Covariance Type: nonrobust \n",
"=====================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"-------------------------------------------------------------------------------------\n",
"const 5.169e-16 0.011 4.68e-14 1.000 -0.022 0.022\n",
"HE_FOOD -0.0148 0.015 -1.001 0.317 -0.044 0.014\n",
"HE_WALK -0.0438 0.013 -3.306 0.001 -0.070 -0.018\n",
"HE_VACANCY -0.0091 0.014 -0.666 0.506 -0.036 0.018\n",
"HE_SUPRFND -0.0544 0.015 -3.553 0.000 -0.084 -0.024\n",
"HE_HLTHINS 0.2667 0.029 9.086 0.000 0.209 0.324\n",
"BINGE_CrudePrev 0.4644 0.028 16.305 0.000 0.408 0.520\n",
"CHECKUP_CrudePrev 0.6262 0.062 10.113 0.000 0.505 0.748\n",
"BPHIGH_CrudePrev 0.2980 0.074 4.047 0.000 0.154 0.443\n",
"SLEEP_CrudePrev 0.7537 0.020 38.535 0.000 0.715 0.792\n",
"STROKE_CrudePrev 0.4109 0.050 8.161 0.000 0.312 0.510\n",
"Drug Test -0.0091 0.014 -0.656 0.512 -0.036 0.018\n",
"Pedalcyclist -0.0420 0.014 -2.994 0.003 -0.070 -0.014\n",
"==============================================================================\n",
"Omnibus: 104.229 Durbin-Watson: 1.055\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 329.031\n",
"Skew: -0.500 Prob(JB): 3.56e-72\n",
"Kurtosis: 5.628 Cond. No. 18.5\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"#regression of MHLTH_CrudePrev \n",
"\n",
"\n",
"columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
" 'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
"\n",
"multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
" 'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
"\n",
"\n",
"target = 'MHLTH_CrudePrev'\n",
"target_multiplier = -1\n",
"\n",
"multiplied_zscore_data_2, params_2 = model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Steps to get the health scores :\n",
"\n",
"#### 1. Initialize the HealthScores() by passing the below arguments.\n",
"\n",
" Arguments:\n",
" weights_1(np array) : weights of model 1 (Physical health).\n",
" weights_2(np array) : weights of model 2 (Mental health).\n",
" weights_3(np array) : weights of model 3 (Life Expectancy).\n",
" multiplied_data(df) : all census tracts data which needs to be multiplied with weights to get health scores\n",
" geoid(Series) : geoids of census tracts to concatenate with our health scores data.\n",
" \n",
" Default Arguments :\n",
" \n",
" is_weighted_average(boolean) : weights calculation methodology(default is True)\n",
" weightage(list) : weightage for each y-variable(default : [0.25,0.25,0.5])\n",
" \n",
"#### 2. Call the final_scaled_data() method\n",
"\n",
" Returns : \n",
" final_data(df) : dataframe with all health scores and geoids\n",
" weights_tables(df) : dataframe with each y-variable weights and averaged weights"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.0117727 , 0.00751751, 0.01959428, 0.01042273, 0.128936 ,\n",
" 0.1184022 , 0.19513146, 0.14433487, 0.19330818, 0.16154886,\n",
" 0.00396387, 0.00506735])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"health_scores1 = HealthScores(params_1,params_2,params_le,multiplied_zscore_data_1,data['geoid'])\n",
"final_data,weights_table = health_scores1.final_scaled_data()\n",
"\n",
"health_scores1.weights #Can access the weights directly like this."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>geoid</th>\n",
" <th>HE_FOOD</th>\n",
" <th>HE_WALK</th>\n",
" <th>HE_VACANCY</th>\n",
" <th>HE_SUPRFND</th>\n",
" <th>HE_HLTHINS</th>\n",
" <th>BINGE_CrudePrev</th>\n",
" <th>CHECKUP_CrudePrev</th>\n",
" <th>BPHIGH_CrudePrev</th>\n",
" <th>SLEEP_CrudePrev</th>\n",
" <th>STROKE_CrudePrev</th>\n",
" <th>Drug Test</th>\n",
" <th>Pedalcyclist</th>\n",
" <th>health_scores</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>48085030100</td>\n",
" <td>96.091281</td>\n",
" <td>2.544479</td>\n",
" <td>76.981603</td>\n",
" <td>50.0</td>\n",
" <td>13.890877</td>\n",
" <td>74.926254</td>\n",
" <td>9.292503</td>\n",
" <td>78.357236</td>\n",
" <td>78.620690</td>\n",
" <td>79.207921</td>\n",
" <td>98.520548</td>\n",
" <td>100.000000</td>\n",
" <td>69.027647</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>48085030201</td>\n",
" <td>98.439143</td>\n",
" <td>11.524690</td>\n",
" <td>92.643096</td>\n",
" <td>50.0</td>\n",
" <td>15.042605</td>\n",
" <td>74.926254</td>\n",
" <td>9.292503</td>\n",
" <td>78.357236</td>\n",
" <td>78.620690</td>\n",
" <td>79.207921</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>71.151708</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>48085030202</td>\n",
" <td>95.919226</td>\n",
" <td>7.482860</td>\n",
" <td>67.903118</td>\n",
" <td>50.0</td>\n",
" <td>18.821577</td>\n",
" <td>57.227139</td>\n",
" <td>5.385428</td>\n",
" <td>85.658409</td>\n",
" <td>79.310345</td>\n",
" <td>90.099010</td>\n",
" <td>97.991071</td>\n",
" <td>100.000000</td>\n",
" <td>71.239647</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>48085030203</td>\n",
" <td>98.935264</td>\n",
" <td>13.103215</td>\n",
" <td>83.265171</td>\n",
" <td>50.0</td>\n",
" <td>22.212623</td>\n",
" <td>57.227139</td>\n",
" <td>5.385428</td>\n",
" <td>85.658409</td>\n",
" <td>79.310345</td>\n",
" <td>90.099010</td>\n",
" <td>100.000000</td>\n",
" <td>99.414712</td>\n",
" <td>74.373851</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>48085030302</td>\n",
" <td>96.612195</td>\n",
" <td>13.885353</td>\n",
" <td>79.536820</td>\n",
" <td>50.0</td>\n",
" <td>28.758720</td>\n",
" <td>70.058997</td>\n",
" <td>9.926082</td>\n",
" <td>80.312907</td>\n",
" <td>82.844828</td>\n",
" <td>86.633663</td>\n",
" <td>100.000000</td>\n",
" <td>98.545835</td>\n",
" <td>83.926690</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" geoid HE_FOOD HE_WALK HE_VACANCY HE_SUPRFND HE_HLTHINS \\\n",
"0 48085030100 96.091281 2.544479 76.981603 50.0 13.890877 \n",
"1 48085030201 98.439143 11.524690 92.643096 50.0 15.042605 \n",
"2 48085030202 95.919226 7.482860 67.903118 50.0 18.821577 \n",
"3 48085030203 98.935264 13.103215 83.265171 50.0 22.212623 \n",
"4 48085030302 96.612195 13.885353 79.536820 50.0 28.758720 \n",
"\n",
" BINGE_CrudePrev CHECKUP_CrudePrev BPHIGH_CrudePrev SLEEP_CrudePrev \\\n",
"0 74.926254 9.292503 78.357236 78.620690 \n",
"1 74.926254 9.292503 78.357236 78.620690 \n",
"2 57.227139 5.385428 85.658409 79.310345 \n",
"3 57.227139 5.385428 85.658409 79.310345 \n",
"4 70.058997 9.926082 80.312907 82.844828 \n",
"\n",
" STROKE_CrudePrev Drug Test Pedalcyclist health_scores \n",
"0 79.207921 98.520548 100.000000 69.027647 \n",
"1 79.207921 100.000000 100.000000 71.151708 \n",
"2 90.099010 97.991071 100.000000 71.239647 \n",
"3 90.099010 100.000000 99.414712 74.373851 \n",
"4 86.633663 100.000000 98.545835 83.926690 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>phy_health_weights</th>\n",
" <th>mntl_health_weights</th>\n",
" <th>life_expectancy_weights</th>\n",
" <th>averaged_weights</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>HE_FOOD</th>\n",
" <td>-0.056542</td>\n",
" <td>-0.014846</td>\n",
" <td>0.089070</td>\n",
" <td>0.011773</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HE_WALK</th>\n",
" <td>-0.073147</td>\n",
" <td>-0.043824</td>\n",
" <td>0.092569</td>\n",
" <td>0.007518</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HE_VACANCY</th>\n",
" <td>-0.038049</td>\n",
" <td>-0.009083</td>\n",
" <td>0.112404</td>\n",
" <td>0.019594</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HE_SUPRFND</th>\n",
" <td>-0.060990</td>\n",
" <td>-0.054371</td>\n",
" <td>0.104936</td>\n",
" <td>0.010423</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HE_HLTHINS</th>\n",
" <td>0.280300</td>\n",
" <td>0.266710</td>\n",
" <td>0.311076</td>\n",
" <td>0.128936</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BINGE_CrudePrev</th>\n",
" <td>0.328551</td>\n",
" <td>0.464386</td>\n",
" <td>0.140354</td>\n",
" <td>0.118402</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CHECKUP_CrudePrev</th>\n",
" <td>0.469246</td>\n",
" <td>0.626203</td>\n",
" <td>0.336980</td>\n",
" <td>0.195131</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BPHIGH_CrudePrev</th>\n",
" <td>0.530143</td>\n",
" <td>0.298016</td>\n",
" <td>0.240319</td>\n",
" <td>0.144335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SLEEP_CrudePrev</th>\n",
" <td>0.384115</td>\n",
" <td>0.753666</td>\n",
" <td>0.307548</td>\n",
" <td>0.193308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>STROKE_CrudePrev</th>\n",
" <td>0.505871</td>\n",
" <td>0.410902</td>\n",
" <td>0.274058</td>\n",
" <td>0.161549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Drug Test</th>\n",
" <td>-0.013707</td>\n",
" <td>-0.009082</td>\n",
" <td>0.029366</td>\n",
" <td>0.003964</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pedalcyclist</th>\n",
" <td>-0.041551</td>\n",
" <td>-0.042050</td>\n",
" <td>0.064775</td>\n",
" <td>0.005067</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" phy_health_weights mntl_health_weights \\\n",
"HE_FOOD -0.056542 -0.014846 \n",
"HE_WALK -0.073147 -0.043824 \n",
"HE_VACANCY -0.038049 -0.009083 \n",
"HE_SUPRFND -0.060990 -0.054371 \n",
"HE_HLTHINS 0.280300 0.266710 \n",
"BINGE_CrudePrev 0.328551 0.464386 \n",
"CHECKUP_CrudePrev 0.469246 0.626203 \n",
"BPHIGH_CrudePrev 0.530143 0.298016 \n",
"SLEEP_CrudePrev 0.384115 0.753666 \n",
"STROKE_CrudePrev 0.505871 0.410902 \n",
"Drug Test -0.013707 -0.009082 \n",
"Pedalcyclist -0.041551 -0.042050 \n",
"\n",
" life_expectancy_weights averaged_weights \n",
"HE_FOOD 0.089070 0.011773 \n",
"HE_WALK 0.092569 0.007518 \n",
"HE_VACANCY 0.112404 0.019594 \n",
"HE_SUPRFND 0.104936 0.010423 \n",
"HE_HLTHINS 0.311076 0.128936 \n",
"BINGE_CrudePrev 0.140354 0.118402 \n",
"CHECKUP_CrudePrev 0.336980 0.195131 \n",
"BPHIGH_CrudePrev 0.240319 0.144335 \n",
"SLEEP_CrudePrev 0.307548 0.193308 \n",
"STROKE_CrudePrev 0.274058 0.161549 \n",
"Drug Test 0.029366 0.003964 \n",
"Pedalcyclist 0.064775 0.005067 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weights_table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}