Unified Parkinson Disease Rating Scale (UPDRS) score prediction model

Introduction
Functions
Methods
Results
Discussions
Appendix

Introduction

For our analysis, we will be using the Oxford Parkinson’s Disease Telemonitoring Dataset from the data file parkinsons_updrs.csv.

The dataset contains observations of biomedical voice measurements for remote symptom progression from 42 persons with early-stage parkinsons disease over a period of six months.

parkinsons_updrs = read_csv("parkinsons_updrs.csv")

#Remove unused variables - subject#, motor_UPDRS
parkinsons_updrs = subset(x = parkinsons_updrs, select = -c(`subject#`, motor_UPDRS))

#Coerce to factor
parkinsons_updrs$sex = as.factor(parkinsons_updrs$sex)

#Simplified Headers
colnames(parkinsons_updrs) = c("age", "sex", "test_time", "total_updrs",
                               "jit_p", "jit_abs", "jit_rap",
                               "jit_ppq5", "jit_ddp", 
                               "shim", "shim_db", "shim_apq3", 
                               "shim_apq5", "shim_apq11", "shim_dda", 
                               "nhr", "hnr", "rpde", "dfa","ppe")

The dataset has 20 attributes and 5875 observations.
The dataset contains following attributes (18 out of 20 are listed)
- age : Age of the subject
- sex : Sex of the subject [0 - male, 1 - female]
- test_time : Number of days since recruitment into the trial
- total_updrs : Total Unified Parkinson Disease Rating Scale (UPDRS) score
- jit_p, jit_abs, jit_rap, jit_ppq5, jit_ddp
  - Measures of variation in fundamental frequency
- shim, shim_db, shim_apq3, shim_apq5, shim_apq11, shim_dda
  - Measures of variation in amplitude
- rpde : Dynamical complexity measure
- dfa : Signal fractal scaling exponent
- ppe : Fundamental frequency variation
The dataset was created by Athanasios Tsanas (tsanasthanasis ‘@’ gmail.com) and Max Little (littlem ‘@’ physics.ox.ac.uk) of the University of Oxford, in collaboration with 10 medical centers in the US and Intel Corporation who developed the telemonitoring device to record the speech signals.
The complete dataset and more details about it can be accessed online at http://archive.ics.uci.edu/ml/datasets/Parkinsons+Telemonitoring
UPDRS score is the measure of disablity of a subject due to Parkinsons disease.
The motivation behind the selection of this dataset is to try and build a model, which can predict the UPDRS score based on remote measurements of jitter and shimmer of the subject at an early stage.
This might prove helpful in better prognosis for Parkinson disease for the subject.

Functions

#Prepare training and test sets
train_index = sample(1:nrow(parkinsons_updrs), 3000)
pk_trn = parkinsons_updrs[train_index, ]
pk_tst = parkinsons_updrs[-train_index, ]

#Function to get LEAVE ONE OUT CROSS VALIDATION RMSE for a model fitted with least square method
loocv_rmse = function(model){
  sqrt(mean((resid(model) / (1 - hatvalues(model)))^2))
}

#Function to get RMSE for a model fitted with least square method
get_rmse = function(actual, fitted){
  sqrt(sum((actual - fitted) ^ 2) / length(actual))
}

# Utility functions to analyze high leverage points using hatvalues [h > 2*mean(h)]
leverage_high = function(model, return = "values" ){
  n = length(resid(model))
  h = hatvalues(model)
  
  if (return == "values"){
    h[h > 2 * mean(h)] 
  }else if(return == "prop"){
    mean(h > 2 * mean(h))
  }else{
    which(h > 2 * mean(h))
  }
}

# Utility function to extract lambda value from BOX-COX transformation
get_lambda_from_boxcox = function(model, plotit = FALSE){
  bx = boxcox(model, plotit = plotit)
  bx$x[which.max(bx$y)]
}

# Utility functions to analyze influential points using cooks.distance [cd > 4/n]
cooksdist_high = function(model, return = "values"){
  n = length(resid(model))
  cd = cooks.distance(model)
  
  if (return == "values"){
    cd[(cd > 4 / n)]
  }else if(return == "prop"){
    mean(cd > 4 / n)
  }else{
    which(cd > 4 / n)
  }
}

#Partial Correlation with added predictor
pcorr = function(response, test_predictor, dataset){
  model1 = lm(as.formula(paste(test_predictor, paste(". - ", response) ,sep=" ~ ")), 
              data = dataset)
  model2 = lm(as.formula(paste(response, paste(". - ", test_predictor) ,sep=" ~ ")), 
              data = dataset)
  cor(resid(model1), resid(model2))
}

#Performance Metrics with RMSE, LOOVC rmse, Cooks Distance, Adjusted R^2, Shapiro Wilk test and BP test
performance_metrics = function(model1, model2= 0, model3 = 0, trainset, testset, single_mod = FALSE){
  
  if (single_mod == TRUE){
       df = data.frame(Train.RMSE = c(get_rmse(trainset$total_updrs, fitted(model1))),
                      Test.RMSE = c(get_rmse(testset$total_updrs, predict(model1, testset))),
                      LOOCV.rmse = c(loocv_rmse(model1)),
                      Adj.R2 = c(summary(model1)$adj.r.squared),
                      Shapiro.Wilk = c(format(shapiro.test(resid(model1))$p.value, digits=2)),
                      BP.test = c(format(bptest(model1)$p.value, digits=2)))
       colnames(df) = c("Train RMSE", "Test RMSE", "LOOCV RMSE", "Adj R2", "Shapiro Wilk p-val", "BP p-val")
       rownames(df) = c("")
  }else{
      df = data.frame(Train.RMSE = c(get_rmse(trainset$total_updrs, fitted(model1)), 
                                     get_rmse(trainset$total_updrs, fitted(model2)),
                                     get_rmse(trainset$total_updrs, fitted(model3))),
                      Test.RMSE = c(get_rmse(testset$total_updrs, predict(model1, testset)), 
                                    get_rmse(testset$total_updrs, predict(model2, testset)),
                                    get_rmse(testset$total_updrs, predict(model3, testset))),
                      LOOCV.rmse = c(loocv_rmse(model1), loocv_rmse(model2),  loocv_rmse(model3)),
                      Adj.R2 = c(summary(model1)$adj.r.squared,
                                 summary(model2)$adj.r.squared,
                                 summary(model3)$adj.r.squared),
                      Shapiro.Wilk = c(format(shapiro.test(resid(model1))$p.value, digits=2),
                                       format(shapiro.test(resid(model2))$p.value, digits=2),
                                       format(shapiro.test(resid(model3))$p.value, digits=2)),
                      BP.test = c(format(bptest(model1)$p.value, digits=2),
                                  format(bptest(model2)$p.value, digits=2),
                                  format(bptest(model3)$p.value, digits=2)))
      
      rownames(df) = c("Small Model", "Interaction Model", "Int Model w/o Influential points")
      colnames(df) = c("Train RMSE", "Test RMSE", "LOOCV RMSE", "Adj R2", "Shapiro Wilk p-val", "BP p-val")
  }
  kable(df)
}

# Fitted-Residuals and Q-Q Plot
diagnostics = function(model, pcol = "darkorange4", lcol = "chartreuse4", plotit = TRUE, testit = TRUE) {
  if (plotit == TRUE) {
    # side-by-side plots (one row, two columns)
    par(mfrow = c(1, 2))
    # fitted versus residuals
    plot(fitted(model), resid(model), 
         col = pcol, pch = 20, cex = .5, 
         xlab = "Fitted", ylab = "Residuals", 
         main = "Fitted versus Residuals")
    abline(h = 0, col = lcol, lwd = 3)
    grid()
    # qq-plot
    qqnorm(resid(model), col = pcol, pch = 20, cex = .5)
    qqline(resid(model), col = lcol, lwd = 3)
    grid()
  }
  if (testit == TRUE) {
    sw_p_val = shapiro.test(resid(model))$p.value
    bp_p_val = bptest(model)$p.value
    adj_r_sq = summary(model)$adj.r.squared
    list(sw_p_val = sw_p_val, bp_p_val = bp_p_val, adj_r_sq = adj_r_sq)
  }
}

# Convert correlation into heat map
# Slice diagonally on correlation matrix
slice_upper_tri = function(corpak){
  corpak[lower.tri(corpak)] = NA
  corpak
}

reorder_matrix = function(corpk){
    dist = as.dist((1-corpk)/2)
    hcl = hclust(dist)
    corpk = corpk[hcl$order, hcl$order]
}

cor_heat_map_reorder = function(corpk){
  
  corpk = reorder_matrix(corpk)
  upper_tri = slice_upper_tri(corpk)
  # Melt the correlation matrix
  corpk_melt = melt(upper_tri, na.rm = TRUE)
  # ggheatmap
  ggheatmap = ggplot(corpk_melt, aes(Var2, Var1, fill = value)) +
    geom_tile(color = "black") +
    scale_fill_gradient2(low = "navy", high = "darkorange", mid = "white",
                         midpoint = 0, limit = c(-1,1), space = "Lab",
                         name="Correlation") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 30, vjust = 1,
                                     size = 12, hjust = 1)) + coord_fixed()
    ggheatmap + 
    geom_text(aes(Var2, Var1, label = value), color = "darkblue", size = 2.5) +
    theme(
       axis.title.x = element_blank(), axis.title.y = element_blank(),
       legend.justification = c(1, 0), legend.position = c(0.45, 0.6),
       legend.direction = "vertical") +
          guides(fill = guide_colorbar(barwidth = 1, barheight = 6,
                                 title.position = "top", title.hjust = 0.7))
}

#Partial Correlation with added predictor
pcorr_plot = function(orig_ml, dataset, added_predictor, plotit=TRUE, show_formula=FALSE){
  
  coef = names(summary(orig_ml)$coef[,1])[-1]
  pdt_string = ""
  for (i in 1:length(coef)){
    if (i != length(coef)){
      pdt_string = paste(pdt_string, paste(coef[i],"+"))
    }else{
      pdt_string = paste(pdt_string, coef[i])
    }
  }
  
  pdt_string = gsub("sex1","sex",pdt_string)
  
  # Fit new predictor 
  formula = as.formula(paste(added_predictor, pdt_string ,sep=" ~ "))
  ml_new = lm(formula, dataset)
  
  #Plot
  if (plotit == TRUE){
    #par(mfrow=c(1,1))
    #cor(resid(orig_ml), resid(ml_new))
    plot(resid(orig_ml) ~ resid(ml_new), col = "dodgerblue",
         cex=0.5, pch = 20,
         ylab = "Residuals - Original Model",
         xlab = paste("Residuals - Added Predictor", added_predictor, sep = " - "),
         main = "Variable Added Plot")
    abline(v=0, lty = 2)
    abline(h=0, lty = 2)
    abline(lm(resid(orig_ml) ~ resid(ml_new)), col='orange', lwd = 2)
  }
  
  if (show_formula != TRUE){
    list(pcor = cor(resid(orig_ml), resid(ml_new)))
  }else{
    formula
  }
}

Methods

For our analysis and model building, we will be using Least-Square approach.We start our analysis with full additive model.

full_model = lm(total_updrs ~ . , data = pk_trn)
diagnostics(full_model, testit = FALSE, plotit = TRUE)

Figure.1 - Additive Model (Fitted-Residuals & Q-Q Plot)

As we can observe in the above Fitted vs Residuals and Q-Q plot, the model seems to be voilating Normality assumption and Equal Variance assumption.
We start with a look at the possible collinearity issues in the model.

vif(full_model)

##        age       sex1  test_time      jit_p    jit_abs    jit_rap   jit_ppq5 
##  1.101e+00  1.353e+00  1.016e+00  1.029e+02  7.252e+00  1.560e+06  3.639e+01 
##    jit_ddp       shim    shim_db  shim_apq3  shim_apq5 shim_apq11   shim_dda 
##  1.560e+06  1.682e+02  7.623e+01  2.459e+07  5.808e+01  1.790e+01  2.459e+07 
##        nhr        hnr       rpde        dfa        ppe 
##  9.047e+00  5.548e+00  2.063e+00  1.625e+00  4.452e+00

It is very evident from the above results of vif that our model has collinearity issues with vif values much larger than 5 for many predictors.
We can have a quick look at the correlated predictors using below heatmap.

corpk = round(cor(pk_trn[, -2]), 2) #remove non-numeric, Sex variable
cor_heat_map_reorder(corpk)

Figure.1b - Variable Correlation

Clearly, many predictors are correlated among themselves.
We remove the predictors which are related with high vif values to obtain a new additive model.

full_model_non_collinear = lm(total_updrs ~  age + sex + test_time + jit_p +shim_apq11 + hnr + rpde + dfa + ppe, 
                              data = pk_trn)
vif(full_model_non_collinear)

##        age       sex1  test_time      jit_p shim_apq11        hnr       rpde 
##      1.073      1.106      1.009      2.362      2.851      4.668      1.877 
##        dfa        ppe 
##      1.255      3.293

Upon examining the vif values again, we can observe that the collinearity issue is fixed now. All the vif values are satisfactorily small.
We can confirm this by having a look at the partial correlation coefficient values for the selected predictors.

predictors = c("age", "test_time", "jit_p", "shim_apq11", "hnr", "rpde", "dfa", "ppe")
predictors = predictors[!predictors %in% c("total_updrs", "sex")]
sort(sapply(predictors, pcorr, response = "total_updrs", dataset = pk_trn ))

##        dfa        hnr      jit_p       rpde shim_apq11        ppe  test_time 
## -0.1681017 -0.1168197 -0.0006618  0.0232970  0.0502840  0.0568783  0.0912523 
##        age 
##  0.2444024

All the pcorr values are quite small in magnitude which indicates that the model does not have collinearity issues.
We can have a look at the diagnostics again for the improved model.

diagnostics(full_model_non_collinear, testit = FALSE, plotit = TRUE)

performance_metrics(full_model_non_collinear, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)

	Train RMSE	Test RMSE	LOOCV RMSE	Adj R2	Shapiro Wilk p-val	BP p-val
	9.835	9.834	9.868	0.141	9.2e-21	9.1e-17

The model still seems to voilate the assumptions as we can see on the plot above which is also confirmed by the p-values for both Shapiro.wilk and bptest for the model.
We try to further improve our model by using transformations.
We can use box-cox method to get the appropriate ransformation function for response transformation.

lambda = get_lambda_from_boxcox(model = full_model_non_collinear, plotit = TRUE )

We get the best lambda value for response as 0.6263 which is close to .5 . For simplicity, we consider sqrt as the function for response transformation.

resp_xformed_model = lm(sqrt(total_updrs) ~  age + sex + test_time + jit_p +shim_apq11 + hnr + rpde + dfa + ppe, data = pk_trn)
diagnostics(resp_xformed_model, testit = FALSE, plotit = TRUE)

performance_metrics(resp_xformed_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)

	Train RMSE	Test RMSE	LOOCV RMSE	Adj R2	Shapiro Wilk p-val	BP p-val
	26.1	25.79	0.9362	0.1547	2.8e-14	1.2e-17

There is definitely an improvement in the Q-Q plot for this model. The value for shapiro.wilk test has reduced considerably too.
However, the Equal variance assumption still seems to be voilated and even Normality assumption is suspect.
To look for the possibility of additional transformations with the available predictors, we can have a quick visual look at the relationships using pairs.

pairs(pk_trn[, c("age", "test_time", "jit_p", "shim_apq11", "hnr", "rpde", "dfa", "ppe")], col = "darkorange4")

After trying for a lot of combinations of different transformations for the predictors and multiple variable selection mesaures, below model shows significant improvement.

start_model = lm(sqrt(total_updrs) ~  (jit_p + shim_db + hnr + dfa + sqrt(ppe)), data = pk_trn)
xformed_model  = step(start_model, trace = 0)
diagnostics(xformed_model, testit = FALSE, plotit = TRUE)

The Fitted vs Residuals and Q-Q plot above for the model looks much better than the previous models.
We can also have a look at the p-values for shapiro.wilk and bptest values.

performance_metrics(xformed_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)

	Train RMSE	Test RMSE	LOOCV RMSE	Adj R2	Shapiro Wilk p-val	BP p-val
	26.13	25.83	0.9778	0.0767	8.1e-11	9e-07

The p-values for shapiro.wilk and bptest are still low, however, they have improved greatly compared to the previous models.
The summary of the selected model after transformations and variable selection is below.

summary(xformed_model)

## 
## Call:
## lm(formula = sqrt(total_updrs) ~ (jit_p + shim_db + hnr + dfa + 
##     sqrt(ppe)), data = pk_trn)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.078 -0.673 -0.090  0.740  2.396 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.79477    0.34799   22.40  < 2e-16 ***
## jit_p       -13.11839    4.54926   -2.88  0.00396 ** 
## shim_db      -0.48129    0.14491   -3.32  0.00091 ***
## hnr          -0.05322    0.00867   -6.14  9.5e-10 ***
## dfa          -3.15169    0.27693  -11.38  < 2e-16 ***
## sqrt(ppe)     2.08467    0.32104    6.49  9.8e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.977 on 2994 degrees of freedom
## Multiple R-squared:  0.0782, Adjusted R-squared:  0.0767 
## F-statistic: 50.8 on 5 and 2994 DF,  p-value: <2e-16

We can see the p-value for the predictor jit_p is comparatively higher which indicates that jit_p might be non-significant.
We can performe F test or t test to test the significance of the predictor jit_p.

xformed_small_model = lm(sqrt(total_updrs)  ~ shim_db + hnr + dfa+ sqrt(ppe), data = pk_trn)
(aov = anova(xformed_small_model, xformed_model))

## Analysis of Variance Table
## 
## Model 1: sqrt(total_updrs) ~ shim_db + hnr + dfa + sqrt(ppe)
## Model 2: sqrt(total_updrs) ~ (jit_p + shim_db + hnr + dfa + sqrt(ppe))
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)   
## 1   2995 2865                            
## 2   2994 2857  1      7.93 8.32  0.004 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on above test results , we fail to reject the hypothesis \(H_0: \beta_{jit_p} =0\) at \(\alpha = .01\).
Hence we choose smaller model for our analysis.
We can have a look at the possible interactions of the predictors within the model.

xformed_int_model = lm(sqrt(total_updrs)  ~ (shim_db + hnr + dfa+ sqrt(ppe))^2, 
                  data = pk_trn)
summary(xformed_int_model)

## 
## Call:
## lm(formula = sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2, 
##     data = pk_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1572 -0.6391 -0.0445  0.7193  2.6142 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        13.1149     2.6532    4.94  8.1e-07 ***
## shim_db            -0.5607     1.2419   -0.45   0.6517    
## hnr                -0.0764     0.0732   -1.04   0.2972    
## dfa                -0.9517     4.0264   -0.24   0.8132    
## sqrt(ppe)         -22.5828     3.5619   -6.34  2.6e-10 ***
## shim_db:hnr         0.0960     0.0193    4.98  6.7e-07 ***
## shim_db:dfa        -8.1018     1.9656   -4.12  3.9e-05 ***
## shim_db:sqrt(ppe)   8.6883     1.4986    5.80  7.4e-09 ***
## hnr:dfa            -0.3517     0.1178   -2.98   0.0029 ** 
## hnr:sqrt(ppe)       0.5099     0.0733    6.95  4.4e-12 ***
## dfa:sqrt(ppe)      15.7993     4.0092    3.94  8.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.962 on 2989 degrees of freedom
## Multiple R-squared:  0.107,  Adjusted R-squared:  0.104 
## F-statistic: 35.9 on 10 and 2989 DF,  p-value: <2e-16

performance_metrics(xformed_int_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)

	Train RMSE	Test RMSE	LOOCV RMSE	Adj R2	Shapiro Wilk p-val	BP p-val
	26.12	25.82	0.9634	0.1042	5.6e-10	1.6e-08

We can see that in the interaction model, there is an improvement in adjusted r-squared while p-values for the diagnostics are similar.

-We can perform F test to test the significance of the model.

(aov1 = anova(xformed_small_model, xformed_int_model))

## Analysis of Variance Table
## 
## Model 1: sqrt(total_updrs) ~ shim_db + hnr + dfa + sqrt(ppe)
## Model 2: sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)    
## 1   2995 2865                             
## 2   2989 2767  6      97.9 17.6 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on the above result of anova, p-value for F test = 3.700910^{-20} is very low. We reject the null hypothesis.
We can further analyze for influential points that might have impacted the model.

xformed_wo_infl_model = lm(sqrt(total_updrs)  ~ (shim_db + hnr + dfa+ sqrt(ppe))^2, 
                  data = pk_trn,
                  subset = cooks.distance(xformed_int_model) <= 4 / length(resid(xformed_int_model)))

performance_metrics(xformed_wo_infl_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)

	Train RMSE	Test RMSE	LOOCV RMSE	Adj R2	Shapiro Wilk p-val	BP p-val
	26.14	25.8	0.9189	0.1322	8.9e-09	2.3e-11

After removing the influential points, we do see some improvemnt in the adj r-squared and loocv values.
The model now looks pretty good. We can have one more iteration of variable selection. We choose BIC for this.

bic_model = step(xformed_wo_infl_model, k = log(length(resid(xformed_int_model))), trace = 0)
diagnostics(bic_model, testit = FALSE, plotit = TRUE)

summary(bic_model)

## 
## Call:
## lm(formula = sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2, 
##     data = pk_trn, subset = cooks.distance(xformed_int_model) <= 
##         4/length(resid(xformed_int_model)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8431 -0.6343 -0.0459  0.6904  2.5407 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        11.13461    2.76688    4.02  5.9e-05 ***
## shim_db             0.03299    1.46918    0.02  0.98209    
## hnr                 0.00942    0.07707    0.12  0.90274    
## dfa                 4.66837    4.15891    1.12  0.26174    
## sqrt(ppe)         -25.62689    3.63702   -7.05  2.3e-12 ***
## shim_db:hnr         0.12145    0.02203    5.51  3.8e-08 ***
## shim_db:dfa       -11.85481    2.36239   -5.02  5.5e-07 ***
## shim_db:sqrt(ppe)  11.51079    1.65014    6.98  3.8e-12 ***
## hnr:dfa            -0.57912    0.12242   -4.73  2.3e-06 ***
## hnr:sqrt(ppe)       0.63091    0.07486    8.43  < 2e-16 ***
## dfa:sqrt(ppe)      15.44803    4.03293    3.83  0.00013 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.918 on 2904 degrees of freedom
## Multiple R-squared:  0.135,  Adjusted R-squared:  0.132 
## F-statistic: 45.4 on 10 and 2904 DF,  p-value: <2e-16

performance_metrics(bic_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)

	Train RMSE	Test RMSE	LOOCV RMSE	Adj R2	Shapiro Wilk p-val	BP p-val
	26.14	25.8	0.9189	0.1322	8.9e-09	2.3e-11

We see that BIC gives us back the model with same values, as the one we start with. This indicates that the model might not need any more variable selection procedure.

Results

Based on our analysis we find 3 good candidate models for prediction of UPDRS (out of lots of models. See appendix for other tried models).

performance_metrics(xformed_small_model, xformed_int_model, xformed_wo_infl_model, trainset = pk_trn, testset = pk_tst)

	Train RMSE	Test RMSE	LOOCV RMSE	Adj R2	Shapiro Wilk p-val	BP p-val
Small Model	26.13	25.83	0.9787	0.0744	7.2e-11	1.2e-06
Interaction Model	26.12	25.82	0.9634	0.1042	5.6e-10	1.6e-08
Int Model w/o Influential points	26.14	25.80	0.9189	0.1322	8.9e-09	2.3e-11

The small model have good numbers for shapiro.wilk and bptest, it has comparatively lower value for adj r-squared. Also, loocv value is higher compared to other models.
The interaction model has better numbers compared to small model in all almost all the indiactors.
The interaction model with influential points removed has even more improved values for adj r-squared and loocv.
However, all the models have smaller p-values for both shapiro.wilk and bptest for our liking, but they are much much better compared to full model.
Though, the Fitted vs Residual and Q-Q plot are kind of ok. Not excellent but still okay.

diagnostics(xformed_int_model, testit = FALSE, plotit = TRUE)

It might also indicate we might need to apply other methods than least squares method to build an even better model.
Also, though the model with influential points removed has slightly better number, we generally tend to keep all the points in our model.
Based on the above metrics, we have 3 candidate models very similar to each other. We prefer the interaction model with transformed response and predictors.

Discussions

For our model, we have used mostly non-invasive indicators recorded remotely which can prove very effective for easy identification and better prognosis.
With our model, we show that updrs value may be predicted with reasonable accuracy even with the simplest methods like least squares.
The total UPDRS value ranges from 7 to 54.992. With our model, we got LOOCV RMSE = 0.9634 which is less that 1.
This value is very small compared to the range of updrs values which indicate that the predictions might prove very useful.
We started with a full model with total_updrs as the response and all other available variables as predictor.
Based on the diagnostics run for the full model we can see that normality as well as equal variance are suspect here looking at the plots.
Based on the fitted residuals plot, the variance was distributed to left side of the group.
The similar suspect on normality was also indicated in the Q-Q plot. Both ends of the curve are getting off from the line.
So based on the observations the full model is not a good model as it doesn’t satisfy the normality and equal variance assumption completely.
So next we tried finding if there is any correlation between the variables. We also found the VIF of the full model and we found very hight VIFs.
Hence to eliminate correlation and build a better model with lower vifs we eliminated some variables and tried building various models.
After finding the vif for initial, we removed predictors with higher vif and build a model. We ran the diagnostics and still didn’t find much change in the values. Hence we tried using boxcox and transforming the response variable
With this model we saw some change in the diagnostics. The p-values for Shapiro test increases as well as there was a bit change in adjusted R-squared.
We further tried different transformations with different predictors. We did see some improvements but not that drastic.
So next we tried removing the age predictor from the model and performed aic and ran the diagnostics we saw some good change in the p values for both the test.
We next tried eliminating the predictor sex from the model and performed aic and saw further improvement in the model.
We further applied more transformations on the model and performed aic and saw some improvements.
After adding various transformations we removed the influential points from the data and checked the diagnostics. It improved the p values much this time. The loocv rmse decreased as well.
After removing the influential points we tried adding 3 way interactions between the predictors. We saw improvements with this model based on the diagnostics.
We finally performed BIC on the model, to ensure minimal model and rejection of un-neccessary predictors. BIC resulted in the already selected model.
To obtain a good model we found out the correlation between the predictors removed some correlated variables. Used Box Cox transformation on the response. Applied various transformations on the predictors. Used vif to find and eliminate predictors. Added interactions and removed the influential points and performed aic and bic to find a model with is better than others.
To conclude, using least squares method, we attempted to get best model for prediction of total_updrs and the selected model does a reasonably good job.

Appendix

Below are some of the different models that we tried during our analysis:

vif(test_model)
test_model1 = lm(total_updrs ~  age + sex + test_time + jit_p + shim_db +shim_apq5+ shim_apq11 + hnr + rpde + dfa + ppe, data = pk_trn)
diagnostics(test_model1, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model1)

test_model4 = lm(sqrt(total_updrs) ~  age + sex + test_time + log(jit_p) +log(shim_apq11) + hnr + rpde + dfa + ppe, data = pk_trn)
diagnostics(test_model4, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model4)

start_model = lm(sqrt(total_updrs) ~  age + sex + test_time + log(jit_p) +log(shim_apq11) + hnr + rpde + dfa + ppe, data = pk_trn)
test_model5  = step(start_model, trace = 0)
diagnostics(test_model5, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model5)
summary(test_model5)

###Check again
start_model = lm(sqrt(total_updrs) ~ sex + test_time + log(jit_p) + shim_apq11 + hnr + log(rpde) + dfa + sqrt(ppe), data = pk_trn)
test_model6  = step(start_model, trace = 0)
diagnostics(test_model6, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model6)
get_rmse(pk_trn$total_updrs, fitted(test_model6))
get_rmse(pk_tst$total_updrs, predict(test_model6, pk_tst))

start_model = lm(sqrt(total_updrs) ~  log(jit_p) + shim_db + hnr + log(rpde) + dfa + sqrt(ppe), data = pk_trn)
test_model7  = step(start_model, trace = 0)
diagnostics(test_model7, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model7)
get_rmse(pk_trn$total_updrs, fitted(test_model7))
get_rmse(pk_tst$total_updrs, predict(test_model7, pk_tst))

start_model = lm(sqrt(total_updrs) ~  (log(jit_p) + shim_db + hnr + log(rpde) + dfa + sqrt(ppe)), data = pk_trn)
test_model8  = step(start_model, trace = 0)
diagnostics(test_model8, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model8)
get_rmse(pk_trn$total_updrs, fitted(test_model8))
get_rmse(pk_tst$total_updrs, predict(test_model8, pk_tst))

start_model = lm(sqrt(total_updrs) ~  (log(jit_p)+ jit_p + shim_db + hnr + dfa + sqrt(ppe) + ppe), data = pk_trn)
test_model10  = step(start_model, trace = 0)
diagnostics(test_model10, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model10)
get_rmse(pk_trn$total_updrs, fitted(test_model10))
get_rmse(pk_tst$total_updrs, predict(test_model10, pk_tst))

mod_small_init = lm(total_updrs ~
                      age + log(jit_abs)+ log(shim_apq5)
                      + hnr + dfa + ppe, data = pk_trn)

diagnostics(mod_small_init)

mod_small = lm(total_updrs ~
                age + log(jit_abs) + log(shim_apq5) + log(shim_apq11)
                + hnr + dfa + ppe, data = pk_trn)

diagnostics(mod_small)

mod_big = lm(total_updrs ~
             (age + rpde+ jit_abs + shim + nhr + hnr + dfa + ppe)^4
           + (log(jit_abs) + log(shim) + log(nhr) + log(dfa) + log(ppe) + log(nhr) + log(rpde))^2, pk_trn)

mod_big_aic = step(mod_big, trace = 0) # AIC backward

diagnostics(mod_big_aic)

mod_good = mod_sqrt = lm(formula = sqrt(total_updrs) ~ shim + log(jit_abs) + log(nhr) +
     log(dfa) + sqrt(ppe) + shim:log(nhr) + shim:log(dfa) + shim:sqrt(ppe) +
     log(jit_abs):log(dfa) + log(jit_abs):sqrt(ppe) + log(dfa):sqrt(ppe),
   data = pk_trn)

diagnostics(mod_good)

Team Members :

Prashanti Nilayam
Heta Desai
Oran Chan