Add experimental Jupyter notebook with Health Scoring Methodology Example for Health Scores (#989)

Co-authored-by: Saran Ahluwalia <sarahluw@cisco.com>
2025-02-23 10:04:18 -08:00 · 2022-01-13 14:43:27 -05:00 · 2022-01-13 14:43:27 -05:00 · 98ff4bd9d8
commit 98ff4bd9d8
parent 4cec1bb37e
2 changed files with 1045 additions and 0 deletions
--- a/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/health_scores.py
+++ b/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/health_scores.py
@ -0,0 +1,104 @@
 #!/usr/bin/env python
 # coding: utf-8
 # In[ ]:
 import numpy as np
 import pandas as pd
 from sklearn.preprocessing import MinMaxScaler
 class HealthScores():
    """
    Combines the weights of three models into a single weight vector and uses it
    to compute health scores on a 0 - 100 scale for every census tract.
    Calculates health scores by calling the final_scaled_data() method.

    Arguments:
    weights_1(np array) : weights of model 1.
    weights_2(np array) : weights of model 2.
    weights_3(np array) : weights of model 3.
    multiplied_data(df) : all census tracts data which needs to be multiplied with weights to get health scores
    geoid(Series) : geoids of census tracts to concatenate with our health scores data.
    is_weighted_average(boolean) : weights calculation methodology(default is True)
    weightage(list) : weightage for each y-variable, in the order (weights_1, weights_2, weights_3).
                      Default is [0.25, 0.25, 0.5].
    """

    def __init__(self, weights_1, weights_2, weights_3, multiplied_data, geoid,
                 is_weighted_average=True, weightage=None):
        self.weights_1 = weights_1
        self.weights_2 = weights_2
        self.weights_3 = weights_3
        self.multiplied_data = multiplied_data
        self.geoid = geoid
        self.is_weighted_average = is_weighted_average
        # Build the default list per instance: a list literal in the signature
        # would be one shared object, so mutating it on one instance would
        # silently change the default of every later instance.
        self.weightage = [0.25, 0.25, 0.5] if weightage is None else weightage
        # Placeholder column of zeros until one of the averaging methods runs.
        self.weights = np.zeros(np.shape(self.weights_1)).reshape(-1, 1)

    def _stack_weights(self):
        """
        Places the weights of the three models side by side, one column per model.
        Arguments : None
        Returns : 2-D np array with one row per x variable and three columns.
        """
        # np.asarray lets plain lists and pandas Series be used as weights too.
        return np.hstack([
            np.asarray(model_weights).reshape(-1, 1)
            for model_weights in (self.weights_1, self.weights_2, self.weights_3)
        ])

    def _get_weights(self):
        """
        Averages weights of all the models and performs transformation so that sum of all weights will be equal to 1.
        The result is also stored in self.weights.
        Arguments : None
        Returns : Averaged weights which totals to 1.(np array)
        """
        stacked = self._stack_weights()
        averaged = np.sum(stacked, axis=1) / stacked.shape[1]
        # NOTE: if the averaged weights sum to zero this division yields nan/inf.
        self.weights = averaged / np.sum(averaged)
        return self.weights

    def _weighted_average(self):
        """
        weighted average:
        Averages weights of all the models with specified weightage for each model. And performs transformation so that
        sum of all weights will be equal to 1. The result is also stored in self.weights.
        Arguments : None
        Returns : Averaged weights which totals to 1.(np array)
        """
        # Only the first three weightage entries are used, one per model.
        factors = np.array([self.weightage[0], self.weightage[1], self.weightage[2]])
        # Broadcasting multiplies every model column by its own weightage.
        combined = np.sum(self._stack_weights() * factors, axis=1)
        # NOTE: if the combined weights sum to zero this division yields nan/inf.
        self.weights = combined / np.sum(combined)
        return self.weights

    def _health_score(self):
        """
        Converts data in (0 to 100)scale using min max scaler and multiplying with 100.
        Then it calculates health scores by multiplying with the weights; the scores
        themselves are min max scaled to (0 to 100) as well.
        Arguments : None
        Returns : data frame with health score and x variables in (0 - 100)scale.
        """
        columns = list(self.multiplied_data.columns) + ['health_scores']
        scaled_data = MinMaxScaler().fit_transform(self.multiplied_data.values) * 100
        # Weighted sum of the scaled x variables, kept as a column vector so it
        # can be rescaled and appended as the last column.
        health_scores = np.dot(scaled_data, self.weights.reshape(-1, 1))
        health_scores = MinMaxScaler().fit_transform(health_scores) * 100
        scaled_data = np.hstack((scaled_data, health_scores))
        # The frame is built from a bare array, so it gets a default 0..n-1 index.
        return pd.DataFrame(data=scaled_data, columns=columns)

    def final_scaled_data(self):
        """
        Calls appropriate methods in class based on arguments. Concatenates geoids and health scores.
        Arguments : None
        Returns :
            final_data(df) : dataframe with all health scores and geoids
            weights_tables(df) : dataframe with each y-variable weights and averaged weights
        """
        # Pick the weight combination method; both store their result in self.weights.
        if self.is_weighted_average:
            self._weighted_average()
        else:
            self._get_weights()
        scaled_data_100 = self._health_score()
        # NOTE(review): concat aligns on index labels and the scaled frame has a
        # default 0..n-1 index, so geoid presumably needs the same index - confirm
        # against the caller.
        final_data = pd.concat([self.geoid, scaled_data_100], axis=1)
        weights_table = pd.DataFrame(
            {
                'phy_health_weights': self.weights_1,
                'mntl_health_weights': self.weights_2,
                'life_expectancy_weights': self.weights_3,
                'averaged_weights': self.weights,
            },
            index=self.multiplied_data.columns,
        )
        return final_data, weights_table
--- a/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/weighting_model_usage_12062021.ipynb
+++ b/data/data-pipeline/data_pipeline/ipython/experiment_4_weighting/weighting_model_usage_12062021.ipynb
@ -0,0 +1,941 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from health_scores import HealthScores\n",
    "from Model import Model\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load the master dataset in raw format (all variables we have collected). Many variables have nulls. In the code, null values are handled by imputing with the group means of tracts sharing the first 8 digits of the geoid (which represent the nearest census tracts). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1172, 55)\n",
      "Index(['Unnamed: 0', 'COI_FOOD', 'COI_GREEN', 'COI_WALK', 'COI_VACANCY',\n",
      "       'COI_SUPRFND', 'COI_RSEI', 'COI_PM25', 'COI_OZONE', 'COI_HEAT',\n",
      "       'COI_HLTHINS', 'latitude', 'longitude', 'geoid', 'countyfips',\n",
      "       'TractFIPS', 'County', 'StateAbbr', 'PlaceName', 'PlaceFIPS',\n",
      "       'Place_TractID', 'Population2010', 'ACCESS2_CrudePrev',\n",
      "       'ARTHRITIS_CrudePrev', 'BINGE_CrudePrev', 'BPHIGH_CrudePrev',\n",
      "       'BPMED_CrudePrev', 'CANCER_CrudePrev', 'CASTHMA_CrudePrev',\n",
      "       'CHD_CrudePrev', 'CHECKUP_CrudePrev', 'CHOLSCREEN_CrudePrev',\n",
      "       'COLON_SCREEN_CrudePrev', 'COPD_CrudePrev', 'COREM_CrudePrev',\n",
      "       'COREW_CrudePrev', 'CSMOKING_CrudePrev', 'DENTAL_CrudePrev',\n",
      "       'DIABETES_CrudePrev', 'HIGHCHOL_CrudePrev', 'KIDNEY_CrudePrev',\n",
      "       'LPA_CrudePrev', 'MAMMOUSE_CrudePrev', 'MHLTH_CrudePrev',\n",
      "       'OBESITY_CrudePrev', 'PAPTEST_CrudePrev', 'PHLTH_CrudePrev',\n",
      "       'SLEEP_CrudePrev', 'STROKE_CrudePrev', 'TEETHLOST_CrudePrev',\n",
      "       'life expectancy', 'Alcohol Test', 'Drug Test', 'Pedalcyclist',\n",
      "       'Pedastrian'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "df_raw = pd.read_csv('master_raw_data.csv') #file from the 'Data collection.ipynb'\n",
    "print(df_raw.shape)\n",
    "print(df_raw.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From these variables you can pick any to include in the model and get the results. (The method is explained below.)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### More advanced imputation method "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Missing data are set equal to the values of the nearest neighborhood (the distance between latitude/longitude points was used; see https://www.movable-type.co.uk/scripts/latlong.html for details)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>geoid</th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>ED_PRXECE</th>\n",
       "      <th>ED_PRXHQECE</th>\n",
       "      <th>ED_ECENROL</th>\n",
       "      <th>ED_READING</th>\n",
       "      <th>ED_MATH</th>\n",
       "      <th>ED_HSGRAD</th>\n",
       "      <th>ED_APENR</th>\n",
       "      <th>...</th>\n",
       "      <th>PAPTEST_CrudePrev</th>\n",
       "      <th>PHLTH_CrudePrev</th>\n",
       "      <th>SLEEP_CrudePrev</th>\n",
       "      <th>STROKE_CrudePrev</th>\n",
       "      <th>TEETHLOST_CrudePrev</th>\n",
       "      <th>Alcohol Test</th>\n",
       "      <th>Drug Test</th>\n",
       "      <th>Pedalcyclist</th>\n",
       "      <th>Pedastrian</th>\n",
       "      <th>life expectancy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>48085030100</td>\n",
       "      <td>-96.39820</td>\n",
       "      <td>33.29592</td>\n",
       "      <td>1.343954</td>\n",
       "      <td>-13.815511</td>\n",
       "      <td>30.9</td>\n",
       "      <td>217.85674</td>\n",
       "      <td>250.22748</td>\n",
       "      <td>94.002556</td>\n",
       "      <td>0.278373</td>\n",
       "      <td>...</td>\n",
       "      <td>78.30</td>\n",
       "      <td>11.90</td>\n",
       "      <td>35.40</td>\n",
       "      <td>2.70</td>\n",
       "      <td>18.80</td>\n",
       "      <td>0.821918</td>\n",
       "      <td>0.821918</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.095890</td>\n",
       "      <td>76.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>48085030201</td>\n",
       "      <td>-96.53734</td>\n",
       "      <td>33.26331</td>\n",
       "      <td>2.069664</td>\n",
       "      <td>-13.815511</td>\n",
       "      <td>61.0</td>\n",
       "      <td>220.04181</td>\n",
       "      <td>246.44695</td>\n",
       "      <td>87.928993</td>\n",
       "      <td>0.287710</td>\n",
       "      <td>...</td>\n",
       "      <td>78.30</td>\n",
       "      <td>11.90</td>\n",
       "      <td>35.40</td>\n",
       "      <td>2.70</td>\n",
       "      <td>18.80</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>48085030202</td>\n",
       "      <td>-96.64279</td>\n",
       "      <td>33.34124</td>\n",
       "      <td>1.751906</td>\n",
       "      <td>-13.815511</td>\n",
       "      <td>0.0</td>\n",
       "      <td>233.74568</td>\n",
       "      <td>262.12021</td>\n",
       "      <td>80.740799</td>\n",
       "      <td>0.568353</td>\n",
       "      <td>...</td>\n",
       "      <td>81.40</td>\n",
       "      <td>8.80</td>\n",
       "      <td>35.00</td>\n",
       "      <td>1.60</td>\n",
       "      <td>9.50</td>\n",
       "      <td>4.464286</td>\n",
       "      <td>1.116071</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.232143</td>\n",
       "      <td>78.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>48085030203</td>\n",
       "      <td>-96.54547</td>\n",
       "      <td>33.34176</td>\n",
       "      <td>2.338918</td>\n",
       "      <td>-13.815511</td>\n",
       "      <td>32.7</td>\n",
       "      <td>226.88499</td>\n",
       "      <td>261.43530</td>\n",
       "      <td>95.360466</td>\n",
       "      <td>0.290443</td>\n",
       "      <td>...</td>\n",
       "      <td>81.40</td>\n",
       "      <td>8.80</td>\n",
       "      <td>35.00</td>\n",
       "      <td>1.60</td>\n",
       "      <td>9.50</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.163881</td>\n",
       "      <td>0.327761</td>\n",
       "      <td>78.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>48085030302</td>\n",
       "      <td>-96.75005</td>\n",
       "      <td>33.24045</td>\n",
       "      <td>3.588619</td>\n",
       "      <td>-13.815511</td>\n",
       "      <td>62.3</td>\n",
       "      <td>250.81639</td>\n",
       "      <td>274.56683</td>\n",
       "      <td>96.399155</td>\n",
       "      <td>0.650187</td>\n",
       "      <td>...</td>\n",
       "      <td>82.05</td>\n",
       "      <td>9.15</td>\n",
       "      <td>32.95</td>\n",
       "      <td>1.95</td>\n",
       "      <td>7.75</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.407166</td>\n",
       "      <td>0.407166</td>\n",
       "      <td>82.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 65 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         geoid  longitude  latitude  ED_PRXECE  ED_PRXHQECE  ED_ECENROL  \\\n",
       "0  48085030100  -96.39820  33.29592   1.343954   -13.815511        30.9   \n",
       "1  48085030201  -96.53734  33.26331   2.069664   -13.815511        61.0   \n",
       "2  48085030202  -96.64279  33.34124   1.751906   -13.815511         0.0   \n",
       "3  48085030203  -96.54547  33.34176   2.338918   -13.815511        32.7   \n",
       "4  48085030302  -96.75005  33.24045   3.588619   -13.815511        62.3   \n",
       "\n",
       "   ED_READING    ED_MATH  ED_HSGRAD  ED_APENR       ...         \\\n",
       "0   217.85674  250.22748  94.002556  0.278373       ...          \n",
       "1   220.04181  246.44695  87.928993  0.287710       ...          \n",
       "2   233.74568  262.12021  80.740799  0.568353       ...          \n",
       "3   226.88499  261.43530  95.360466  0.290443       ...          \n",
       "4   250.81639  274.56683  96.399155  0.650187       ...          \n",
       "\n",
       "   PAPTEST_CrudePrev  PHLTH_CrudePrev  SLEEP_CrudePrev  STROKE_CrudePrev  \\\n",
       "0              78.30            11.90            35.40              2.70   \n",
       "1              78.30            11.90            35.40              2.70   \n",
       "2              81.40             8.80            35.00              1.60   \n",
       "3              81.40             8.80            35.00              1.60   \n",
       "4              82.05             9.15            32.95              1.95   \n",
       "\n",
       "   TEETHLOST_CrudePrev  Alcohol Test  Drug Test  Pedalcyclist  Pedastrian  \\\n",
       "0                18.80      0.821918   0.821918      0.000000    1.095890   \n",
       "1                18.80      0.000000   0.000000      0.000000    0.000000   \n",
       "2                 9.50      4.464286   1.116071      0.000000    2.232143   \n",
       "3                 9.50      0.000000   0.000000      0.163881    0.327761   \n",
       "4                 7.75      0.000000   0.000000      0.407166    0.407166   \n",
       "\n",
       "   life expectancy  \n",
       "0             76.7  \n",
       "1              NaN  \n",
       "2             78.8  \n",
       "3             78.2  \n",
       "4             82.0  \n",
       "\n",
       "[5 rows x 65 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#More advanced imputation method\n",
    "df_imputed_x = pd.read_excel('20200420_input_final.xlsx')\n",
    "df_imputed_x.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Steps to get the model output:\n",
    "\n",
    "#### 1. Initialize Model() class by passing the data file.\n",
    "#### 2. Call the model_output() method with the below arguments.\n",
    "            \n",
    "        columns_regress(list) : x variables to include in the regression model\n",
    "        target(string) : target variable(y) in the regression model\n",
    "        multiply_cols(dict): dictionary with columns and their multiplier(-1,1) as key-value pairs to rescale all variables\n",
    "        so that high is good.\n",
    "\n",
    "        Default Arguments : Has default values that can be changed as per requirement.\n",
    "        \n",
    "        columns_impute(list) : columns to be imputed(default is None)\n",
    "        winsorize_outliers(dict) : dictionary of limits for the respective columns{'col' : limit} (default is None)\n",
    "        winsorize_with_95(boolean) : winsorize all columns with 95 percentile(True or False) (default - False)\n",
    "        target_multiplier(int) : to change the direction of y variable if needed(default is 1)\n",
    "\n",
    "####        IMPORTANT : Follow the arguments order or specify the argument name when calling the method.\n",
    "Example : model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)\n",
    "\n",
    "\n",
    "        Returns :returns all census tracts transformed data(high is good).\n",
    "                :prints model summary.\n",
    "                :returns model weights.\n",
    "                \n",
    "#####        Store the data and model weights to get the cumulative health score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example :"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.558\n",
      "Model:                            OLS   Adj. R-squared:                  0.553\n",
      "Method:                 Least Squares   F-statistic:                     103.9\n",
      "Date:                Tue, 28 Apr 2020   Prob (F-statistic):          1.35e-165\n",
      "Time:                        09:32:30   Log-Likelihood:                -1009.2\n",
      "No. Observations:                 999   AIC:                             2044.\n",
      "Df Residuals:                     986   BIC:                             2108.\n",
      "Df Model:                          12                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "=====================================================================================\n",
      "                        coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------------\n",
      "const              5.117e-16      0.021   2.42e-14      1.000      -0.042       0.042\n",
      "HE_FOOD               0.0891      0.028      3.134      0.002       0.033       0.145\n",
      "HE_WALK               0.0926      0.025      3.646      0.000       0.043       0.142\n",
      "HE_VACANCY            0.1124      0.026      4.300      0.000       0.061       0.164\n",
      "HE_SUPRFND            0.1049      0.029      3.579      0.000       0.047       0.162\n",
      "HE_HLTHINS            0.3111      0.056      5.533      0.000       0.201       0.421\n",
      "BINGE_CrudePrev       0.1404      0.055      2.573      0.010       0.033       0.247\n",
      "CHECKUP_CrudePrev     0.3370      0.119      2.841      0.005       0.104       0.570\n",
      "BPHIGH_CrudePrev      0.2403      0.141      1.704      0.089      -0.036       0.517\n",
      "SLEEP_CrudePrev       0.3075      0.037      8.209      0.000       0.234       0.381\n",
      "STROKE_CrudePrev      0.2741      0.096      2.841      0.005       0.085       0.463\n",
      "Drug Test             0.0294      0.027      1.107      0.269      -0.023       0.081\n",
      "Pedalcyclist          0.0648      0.027      2.408      0.016       0.012       0.118\n",
      "==============================================================================\n",
      "Omnibus:                       17.440   Durbin-Watson:                   1.833\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):               30.414\n",
      "Skew:                          -0.074   Prob(JB):                     2.49e-07\n",
      "Kurtosis:                       3.842   Cond. No.                         18.5\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "data = df_imputed_x.copy()  #data\n",
    "model1 = Model(data)  #Initializing class variable\n",
    "\n",
    "# variables list to include in our regression model.\n",
    "columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
    "       'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
    "\n",
    "# multipliers for each variable to rescale the variables as higher the value better for the health score.\n",
    "multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
    "        'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
    "\n",
    "#target variable\n",
    "target = 'life expectancy'\n",
    "\n",
    "#storing the data and model weights to calculate health score.\n",
    "multiplied_zscore_data_le, params_le = model1.model_output(columns_regress,target,multiply_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.917\n",
      "Model:                            OLS   Adj. R-squared:                  0.916\n",
      "Method:                 Least Squares   F-statistic:                     906.4\n",
      "Date:                Tue, 28 Apr 2020   Prob (F-statistic):               0.00\n",
      "Time:                        09:32:30   Log-Likelihood:                -175.01\n",
      "No. Observations:                 999   AIC:                             376.0\n",
      "Df Residuals:                     986   BIC:                             439.8\n",
      "Df Model:                          12                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "=====================================================================================\n",
      "                        coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------------\n",
      "const             -4.337e-18      0.009  -4.72e-16      1.000      -0.018       0.018\n",
      "HE_FOOD              -0.0565      0.012     -4.585      0.000      -0.081      -0.032\n",
      "HE_WALK              -0.0731      0.011     -6.640      0.000      -0.095      -0.052\n",
      "HE_VACANCY           -0.0380      0.011     -3.355      0.001      -0.060      -0.016\n",
      "HE_SUPRFND           -0.0610      0.013     -4.795      0.000      -0.086      -0.036\n",
      "HE_HLTHINS            0.2803      0.024     11.491      0.000       0.232       0.328\n",
      "BINGE_CrudePrev       0.3286      0.024     13.881      0.000       0.282       0.375\n",
      "CHECKUP_CrudePrev     0.4692      0.051      9.118      0.000       0.368       0.570\n",
      "BPHIGH_CrudePrev      0.5301      0.061      8.664      0.000       0.410       0.650\n",
      "SLEEP_CrudePrev       0.3841      0.016     23.633      0.000       0.352       0.416\n",
      "STROKE_CrudePrev      0.5059      0.042     12.089      0.000       0.424       0.588\n",
      "Drug Test            -0.0137      0.012     -1.191      0.234      -0.036       0.009\n",
      "Pedalcyclist         -0.0416      0.012     -3.560      0.000      -0.064      -0.019\n",
      "==============================================================================\n",
      "Omnibus:                       46.489   Durbin-Watson:                   1.112\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              134.286\n",
      "Skew:                          -0.136   Prob(JB):                     6.92e-30\n",
      "Kurtosis:                       4.775   Cond. No.                         18.5\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
    "       'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
    "\n",
    "multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
    "        'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
    "\n",
    "target = 'PHLTH_CrudePrev'\n",
    "target_multiplier = -1\n",
    "\n",
    "multiplied_zscore_data_1, params_1 = model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.880\n",
      "Model:                            OLS   Adj. R-squared:                  0.878\n",
      "Method:                 Least Squares   F-statistic:                     600.6\n",
      "Date:                Tue, 28 Apr 2020   Prob (F-statistic):               0.00\n",
      "Time:                        09:32:30   Log-Likelihood:                -359.89\n",
      "No. Observations:                 999   AIC:                             745.8\n",
      "Df Residuals:                     986   BIC:                             809.6\n",
      "Df Model:                          12                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "=====================================================================================\n",
      "                        coef    std err          t      P>|t|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------------\n",
      "const              5.169e-16      0.011   4.68e-14      1.000      -0.022       0.022\n",
      "HE_FOOD              -0.0148      0.015     -1.001      0.317      -0.044       0.014\n",
      "HE_WALK              -0.0438      0.013     -3.306      0.001      -0.070      -0.018\n",
      "HE_VACANCY           -0.0091      0.014     -0.666      0.506      -0.036       0.018\n",
      "HE_SUPRFND           -0.0544      0.015     -3.553      0.000      -0.084      -0.024\n",
      "HE_HLTHINS            0.2667      0.029      9.086      0.000       0.209       0.324\n",
      "BINGE_CrudePrev       0.4644      0.028     16.305      0.000       0.408       0.520\n",
      "CHECKUP_CrudePrev     0.6262      0.062     10.113      0.000       0.505       0.748\n",
      "BPHIGH_CrudePrev      0.2980      0.074      4.047      0.000       0.154       0.443\n",
      "SLEEP_CrudePrev       0.7537      0.020     38.535      0.000       0.715       0.792\n",
      "STROKE_CrudePrev      0.4109      0.050      8.161      0.000       0.312       0.510\n",
      "Drug Test            -0.0091      0.014     -0.656      0.512      -0.036       0.018\n",
      "Pedalcyclist         -0.0420      0.014     -2.994      0.003      -0.070      -0.014\n",
      "==============================================================================\n",
      "Omnibus:                      104.229   Durbin-Watson:                   1.055\n",
      "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              329.031\n",
      "Skew:                          -0.500   Prob(JB):                     3.56e-72\n",
      "Kurtosis:                       5.628   Cond. No.                         18.5\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "#regression of MHLTH_CrudePrev \n",
    "\n",
    "\n",
    "columns_regress = ['HE_FOOD', 'HE_WALK', 'HE_VACANCY', 'HE_SUPRFND','HE_HLTHINS','BINGE_CrudePrev',\n",
    "       'CHECKUP_CrudePrev','BPHIGH_CrudePrev','SLEEP_CrudePrev', 'STROKE_CrudePrev', 'Drug Test', 'Pedalcyclist']\n",
    "\n",
    "multiply_cols = {'HE_FOOD': -1,'HE_WALK': 1,'HE_VACANCY': -1,'HE_SUPRFND':-1 , 'HE_HLTHINS': 1 ,'BINGE_CrudePrev': -1 , \n",
    "        'CHECKUP_CrudePrev': 1,'BPHIGH_CrudePrev': -1,'SLEEP_CrudePrev': -1,'STROKE_CrudePrev' : -1, 'Drug Test' : -1, 'Pedalcyclist' : -1,}\n",
    "\n",
    "\n",
    "target = 'MHLTH_CrudePrev'\n",
    "target_multiplier = -1\n",
    "\n",
    "multiplied_zscore_data_2, params_2 = model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Steps to get the health scores :\n",
    "\n",
    "#### 1. Initialize the HealthScores() by passing the below arguments.\n",
    "\n",
    "        Arguments:\n",
    "        weights_1(np array) : weights of model 1 (Physical health).\n",
    "        weights_2(np array) : weights of model 2 (Mental health).\n",
    "        weights_3(np array) : weights of model 3 (Life Expectancy).\n",
    "        multiplied_data(df) : all census tracts data which needs to be multiplied with weights to get health scores\n",
    "        geoid(Series) : geoids of census tracts to concatenate with our health scores data.\n",
    "        \n",
    "        Default Arguments :\n",
    "        \n",
    "        is_weighted_average(boolean) : weights calculation methodology(default is True)\n",
    "        weightage(list) : weightage for each y-variable(default : [0.25,0.25,0.5])\n",
    "        \n",
    "#### 2. Call the final_scaled_data() method\n",
    "\n",
    "        Returns : \n",
    "            final_data(df) : dataframe with all health scores and geoids\n",
    "            weights_tables(df) : dataframe with each y-variable weights and averaged weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.0117727 , 0.00751751, 0.01959428, 0.01042273, 0.128936  ,\n",
       "       0.1184022 , 0.19513146, 0.14433487, 0.19330818, 0.16154886,\n",
       "       0.00396387, 0.00506735])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "health_scores1 = HealthScores(params_1,params_2,params_le,multiplied_zscore_data_1,data['geoid'])\n",
    "final_data,weights_table = health_scores1.final_scaled_data()\n",
    "\n",
    "health_scores1.weights #Can access the weights directly like this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>geoid</th>\n",
       "      <th>HE_FOOD</th>\n",
       "      <th>HE_WALK</th>\n",
       "      <th>HE_VACANCY</th>\n",
       "      <th>HE_SUPRFND</th>\n",
       "      <th>HE_HLTHINS</th>\n",
       "      <th>BINGE_CrudePrev</th>\n",
       "      <th>CHECKUP_CrudePrev</th>\n",
       "      <th>BPHIGH_CrudePrev</th>\n",
       "      <th>SLEEP_CrudePrev</th>\n",
       "      <th>STROKE_CrudePrev</th>\n",
       "      <th>Drug Test</th>\n",
       "      <th>Pedalcyclist</th>\n",
       "      <th>health_scores</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>48085030100</td>\n",
       "      <td>96.091281</td>\n",
       "      <td>2.544479</td>\n",
       "      <td>76.981603</td>\n",
       "      <td>50.0</td>\n",
       "      <td>13.890877</td>\n",
       "      <td>74.926254</td>\n",
       "      <td>9.292503</td>\n",
       "      <td>78.357236</td>\n",
       "      <td>78.620690</td>\n",
       "      <td>79.207921</td>\n",
       "      <td>98.520548</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>69.027647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>48085030201</td>\n",
       "      <td>98.439143</td>\n",
       "      <td>11.524690</td>\n",
       "      <td>92.643096</td>\n",
       "      <td>50.0</td>\n",
       "      <td>15.042605</td>\n",
       "      <td>74.926254</td>\n",
       "      <td>9.292503</td>\n",
       "      <td>78.357236</td>\n",
       "      <td>78.620690</td>\n",
       "      <td>79.207921</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>71.151708</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>48085030202</td>\n",
       "      <td>95.919226</td>\n",
       "      <td>7.482860</td>\n",
       "      <td>67.903118</td>\n",
       "      <td>50.0</td>\n",
       "      <td>18.821577</td>\n",
       "      <td>57.227139</td>\n",
       "      <td>5.385428</td>\n",
       "      <td>85.658409</td>\n",
       "      <td>79.310345</td>\n",
       "      <td>90.099010</td>\n",
       "      <td>97.991071</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>71.239647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>48085030203</td>\n",
       "      <td>98.935264</td>\n",
       "      <td>13.103215</td>\n",
       "      <td>83.265171</td>\n",
       "      <td>50.0</td>\n",
       "      <td>22.212623</td>\n",
       "      <td>57.227139</td>\n",
       "      <td>5.385428</td>\n",
       "      <td>85.658409</td>\n",
       "      <td>79.310345</td>\n",
       "      <td>90.099010</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>99.414712</td>\n",
       "      <td>74.373851</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>48085030302</td>\n",
       "      <td>96.612195</td>\n",
       "      <td>13.885353</td>\n",
       "      <td>79.536820</td>\n",
       "      <td>50.0</td>\n",
       "      <td>28.758720</td>\n",
       "      <td>70.058997</td>\n",
       "      <td>9.926082</td>\n",
       "      <td>80.312907</td>\n",
       "      <td>82.844828</td>\n",
       "      <td>86.633663</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>98.545835</td>\n",
       "      <td>83.926690</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         geoid    HE_FOOD    HE_WALK  HE_VACANCY  HE_SUPRFND  HE_HLTHINS  \\\n",
       "0  48085030100  96.091281   2.544479   76.981603        50.0   13.890877   \n",
       "1  48085030201  98.439143  11.524690   92.643096        50.0   15.042605   \n",
       "2  48085030202  95.919226   7.482860   67.903118        50.0   18.821577   \n",
       "3  48085030203  98.935264  13.103215   83.265171        50.0   22.212623   \n",
       "4  48085030302  96.612195  13.885353   79.536820        50.0   28.758720   \n",
       "\n",
       "   BINGE_CrudePrev  CHECKUP_CrudePrev  BPHIGH_CrudePrev  SLEEP_CrudePrev  \\\n",
       "0        74.926254           9.292503         78.357236        78.620690   \n",
       "1        74.926254           9.292503         78.357236        78.620690   \n",
       "2        57.227139           5.385428         85.658409        79.310345   \n",
       "3        57.227139           5.385428         85.658409        79.310345   \n",
       "4        70.058997           9.926082         80.312907        82.844828   \n",
       "\n",
       "   STROKE_CrudePrev   Drug Test  Pedalcyclist  health_scores  \n",
       "0         79.207921   98.520548    100.000000      69.027647  \n",
       "1         79.207921  100.000000    100.000000      71.151708  \n",
       "2         90.099010   97.991071    100.000000      71.239647  \n",
       "3         90.099010  100.000000     99.414712      74.373851  \n",
       "4         86.633663  100.000000     98.545835      83.926690  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>phy_health_weights</th>\n",
       "      <th>mntl_health_weights</th>\n",
       "      <th>life_expectancy_weights</th>\n",
       "      <th>averaged_weights</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>HE_FOOD</th>\n",
       "      <td>-0.056542</td>\n",
       "      <td>-0.014846</td>\n",
       "      <td>0.089070</td>\n",
       "      <td>0.011773</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HE_WALK</th>\n",
       "      <td>-0.073147</td>\n",
       "      <td>-0.043824</td>\n",
       "      <td>0.092569</td>\n",
       "      <td>0.007518</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HE_VACANCY</th>\n",
       "      <td>-0.038049</td>\n",
       "      <td>-0.009083</td>\n",
       "      <td>0.112404</td>\n",
       "      <td>0.019594</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HE_SUPRFND</th>\n",
       "      <td>-0.060990</td>\n",
       "      <td>-0.054371</td>\n",
       "      <td>0.104936</td>\n",
       "      <td>0.010423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HE_HLTHINS</th>\n",
       "      <td>0.280300</td>\n",
       "      <td>0.266710</td>\n",
       "      <td>0.311076</td>\n",
       "      <td>0.128936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BINGE_CrudePrev</th>\n",
       "      <td>0.328551</td>\n",
       "      <td>0.464386</td>\n",
       "      <td>0.140354</td>\n",
       "      <td>0.118402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CHECKUP_CrudePrev</th>\n",
       "      <td>0.469246</td>\n",
       "      <td>0.626203</td>\n",
       "      <td>0.336980</td>\n",
       "      <td>0.195131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BPHIGH_CrudePrev</th>\n",
       "      <td>0.530143</td>\n",
       "      <td>0.298016</td>\n",
       "      <td>0.240319</td>\n",
       "      <td>0.144335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SLEEP_CrudePrev</th>\n",
       "      <td>0.384115</td>\n",
       "      <td>0.753666</td>\n",
       "      <td>0.307548</td>\n",
       "      <td>0.193308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>STROKE_CrudePrev</th>\n",
       "      <td>0.505871</td>\n",
       "      <td>0.410902</td>\n",
       "      <td>0.274058</td>\n",
       "      <td>0.161549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Drug Test</th>\n",
       "      <td>-0.013707</td>\n",
       "      <td>-0.009082</td>\n",
       "      <td>0.029366</td>\n",
       "      <td>0.003964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pedalcyclist</th>\n",
       "      <td>-0.041551</td>\n",
       "      <td>-0.042050</td>\n",
       "      <td>0.064775</td>\n",
       "      <td>0.005067</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   phy_health_weights  mntl_health_weights  \\\n",
       "HE_FOOD                     -0.056542            -0.014846   \n",
       "HE_WALK                     -0.073147            -0.043824   \n",
       "HE_VACANCY                  -0.038049            -0.009083   \n",
       "HE_SUPRFND                  -0.060990            -0.054371   \n",
       "HE_HLTHINS                   0.280300             0.266710   \n",
       "BINGE_CrudePrev              0.328551             0.464386   \n",
       "CHECKUP_CrudePrev            0.469246             0.626203   \n",
       "BPHIGH_CrudePrev             0.530143             0.298016   \n",
       "SLEEP_CrudePrev              0.384115             0.753666   \n",
       "STROKE_CrudePrev             0.505871             0.410902   \n",
       "Drug Test                   -0.013707            -0.009082   \n",
       "Pedalcyclist                -0.041551            -0.042050   \n",
       "\n",
       "                   life_expectancy_weights  averaged_weights  \n",
       "HE_FOOD                           0.089070          0.011773  \n",
       "HE_WALK                           0.092569          0.007518  \n",
       "HE_VACANCY                        0.112404          0.019594  \n",
       "HE_SUPRFND                        0.104936          0.010423  \n",
       "HE_HLTHINS                        0.311076          0.128936  \n",
       "BINGE_CrudePrev                   0.140354          0.118402  \n",
       "CHECKUP_CrudePrev                 0.336980          0.195131  \n",
       "BPHIGH_CrudePrev                  0.240319          0.144335  \n",
       "SLEEP_CrudePrev                   0.307548          0.193308  \n",
       "STROKE_CrudePrev                  0.274058          0.161549  \n",
       "Drug Test                         0.029366          0.003964  \n",
       "Pedalcyclist                      0.064775          0.005067  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "weights_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }