Introduction

For our analysis, we use the Oxford Parkinson’s Disease Telemonitoring Dataset, provided in the data file parkinsons_updrs.csv.

The dataset contains biomedical voice measurements collected for remote monitoring of symptom progression in 42 people with early-stage Parkinson’s disease over a six-month period.
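The setup chunk that loads the required packages is not shown; based on the functions used throughout the report (read_csv, vif, bptest, boxcox, kable, melt, and the ggplot2 heat map), the analysis assumes roughly the following libraries (a reconstructed sketch, not the original chunk):

library(readr)    #read_csv
library(faraway)  #vif (car::vif would also work, with slightly different labelling)
library(lmtest)   #bptest
library(MASS)     #boxcox
library(knitr)    #kable
library(reshape2) #melt
library(ggplot2)  #correlation heat map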

parkinsons_updrs = read_csv("parkinsons_updrs.csv")

#Remove unused variables - subject#, motor_UPDRS
parkinsons_updrs = subset(x = parkinsons_updrs, select = -c(`subject#`, motor_UPDRS))

#Coerce to factor
parkinsons_updrs$sex = as.factor(parkinsons_updrs$sex)

#Simplified Headers
colnames(parkinsons_updrs) = c("age", "sex", "test_time", "total_updrs",
                               "jit_p", "jit_abs", "jit_rap",
                               "jit_ppq5", "jit_ddp", 
                               "shim", "shim_db", "shim_apq3", 
                               "shim_apq5", "shim_apq11", "shim_dda", 
                               "nhr", "hnr", "rpde", "dfa","ppe")

Functions

#Prepare training and test sets
train_index = sample(1:nrow(parkinsons_updrs), 3000)
pk_trn = parkinsons_updrs[train_index, ]
pk_tst = parkinsons_updrs[-train_index, ]
#Function to get LEAVE ONE OUT CROSS VALIDATION RMSE for a model fitted with least square method
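# For a least-squares fit the leave-one-out residual for observation i equals
# resid_i / (1 - h_ii), where h_ii is the leverage, so the LOOCV RMSE can be
# computed from a single fit without refitting the model n times.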
loocv_rmse = function(model){
  sqrt(mean((resid(model) / (1 - hatvalues(model)))^2))
}
#Function to get RMSE for a model fitted with least square method
get_rmse = function(actual, fitted){
  sqrt(sum((actual - fitted) ^ 2) / length(actual))
}
# Utility functions to analyze high leverage points using hatvalues [h > 2*mean(h)]
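# The leverages h_ii sum to the number of model coefficients p, so mean(h) = p/n
# and this rule flags observations with h_ii > 2p/n.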
leverage_high = function(model, return = "values" ){
  n = length(resid(model))
  h = hatvalues(model)
  
  if (return == "values"){
    h[h > 2 * mean(h)] 
  }else if(return == "prop"){
    mean(h > 2 * mean(h))
  }else{
    which(h > 2 * mean(h))
  }
}
# Utility function to extract lambda value from BOX-COX transformation
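# boxcox() (from the MASS package) profiles the log-likelihood over a grid of
# lambda values; roughly, lambda near 0 suggests a log transform, near 0.5 a
# square-root transform, and near 1 no transformation of the response.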
get_lambda_from_boxcox = function(model, plotit = FALSE){
  bx = boxcox(model, plotit = plotit)
  bx$x[which.max(bx$y)]
}
# Utility functions to analyze influential points using cooks.distance [cd > 4/n]
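# Cook's distance measures how much all fitted values change when observation i
# is removed; cd > 4/n is a common rule-of-thumb cutoff for influential points.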
cooksdist_high = function(model, return = "values"){
  n = length(resid(model))
  cd = cooks.distance(model)
  
  if (return == "values"){
    cd[(cd > 4 / n)]
  }else if(return == "prop"){
    mean(cd > 4 / n)
  }else{
    which(cd > 4 / n)
  }
}
#Partial Correlation with added predictor
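# Given a response and a candidate predictor, model1 regresses the candidate
# predictor on the remaining predictors and model2 regresses the response on
# those same predictors; the correlation of their residuals is the partial
# correlation of the candidate predictor with the response.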
pcorr = function(response, test_predictor, dataset){
  model1 = lm(as.formula(paste(test_predictor, paste(". - ", response) ,sep=" ~ ")), 
              data = dataset)
  model2 = lm(as.formula(paste(response, paste(". - ", test_predictor) ,sep=" ~ ")), 
              data = dataset)
  cor(resid(model1), resid(model2))
}
#Performance metrics: train/test RMSE, LOOCV RMSE, adjusted R^2, Shapiro-Wilk and Breusch-Pagan p-values
performance_metrics = function(model1, model2= 0, model3 = 0, trainset, testset, single_mod = FALSE){
  
  if (single_mod == TRUE){
       df = data.frame(Train.RMSE = c(get_rmse(trainset$total_updrs, fitted(model1))),
                      Test.RMSE = c(get_rmse(testset$total_updrs, predict(model1, testset))),
                      LOOCV.rmse = c(loocv_rmse(model1)),
                      Adj.R2 = c(summary(model1)$adj.r.squared),
                      Shapiro.Wilk = c(format(shapiro.test(resid(model1))$p.value, digits=2)),
                      BP.test = c(format(bptest(model1)$p.value, digits=2)))
       colnames(df) = c("Train RMSE", "Test RMSE", "LOOCV RMSE", "Adj R2", "Shapiro Wilk p-val", "BP p-val")
       rownames(df) = c("")
  }else{
      df = data.frame(Train.RMSE = c(get_rmse(trainset$total_updrs, fitted(model1)), 
                                     get_rmse(trainset$total_updrs, fitted(model2)),
                                     get_rmse(trainset$total_updrs, fitted(model3))),
                      Test.RMSE = c(get_rmse(testset$total_updrs, predict(model1, testset)), 
                                    get_rmse(testset$total_updrs, predict(model2, testset)),
                                    get_rmse(testset$total_updrs, predict(model3, testset))),
                      LOOCV.rmse = c(loocv_rmse(model1), loocv_rmse(model2),  loocv_rmse(model3)),
                      Adj.R2 = c(summary(model1)$adj.r.squared,
                                 summary(model2)$adj.r.squared,
                                 summary(model3)$adj.r.squared),
                      Shapiro.Wilk = c(format(shapiro.test(resid(model1))$p.value, digits=2),
                                       format(shapiro.test(resid(model2))$p.value, digits=2),
                                       format(shapiro.test(resid(model3))$p.value, digits=2)),
                      BP.test = c(format(bptest(model1)$p.value, digits=2),
                                  format(bptest(model2)$p.value, digits=2),
                                  format(bptest(model3)$p.value, digits=2)))
      
      rownames(df) = c("Small Model", "Interaction Model", "Int Model w/o Influential points")
      colnames(df) = c("Train RMSE", "Test RMSE", "LOOCV RMSE", "Adj R2", "Shapiro Wilk p-val", "BP p-val")
  }
  kable(df)
}
# Fitted-Residuals and Q-Q Plot
diagnostics = function(model, pcol = "darkorange4", lcol = "chartreuse4", plotit = TRUE, testit = TRUE) {
  if (plotit == TRUE) {
    # side-by-side plots (one row, two columns)
    par(mfrow = c(1, 2))
    # fitted versus residuals
    plot(fitted(model), resid(model), 
         col = pcol, pch = 20, cex = .5, 
         xlab = "Fitted", ylab = "Residuals", 
         main = "Fitted versus Residuals")
    abline(h = 0, col = lcol, lwd = 3)
    grid()
    # qq-plot
    qqnorm(resid(model), col = pcol, pch = 20, cex = .5)
    qqline(resid(model), col = lcol, lwd = 3)
    grid()
  }
  if (testit == TRUE) {
    sw_p_val = shapiro.test(resid(model))$p.value
    bp_p_val = bptest(model)$p.value
    adj_r_sq = summary(model)$adj.r.squared
    list(sw_p_val = sw_p_val, bp_p_val = bp_p_val, adj_r_sq = adj_r_sq)
  }
}
# Convert correlation into heat map
# Slice diagonally on correlation matrix
slice_upper_tri = function(corpak){
  corpak[lower.tri(corpak)] = NA
  corpak
}

# Reorder the correlation matrix with hierarchical clustering so that
# highly correlated variables are placed next to each other
reorder_matrix = function(corpk){
  d = as.dist((1 - corpk) / 2)  # map correlation to a distance in [0, 1]
  hcl = hclust(d)
  corpk[hcl$order, hcl$order]
}

cor_heat_map_reorder = function(corpk){
  
  corpk = reorder_matrix(corpk)
  upper_tri = slice_upper_tri(corpk)
  # Melt the correlation matrix
  corpk_melt = melt(upper_tri, na.rm = TRUE)
  # ggheatmap
  ggheatmap = ggplot(corpk_melt, aes(Var2, Var1, fill = value)) +
    geom_tile(color = "black") +
    scale_fill_gradient2(low = "navy", high = "darkorange", mid = "white",
                         midpoint = 0, limit = c(-1,1), space = "Lab",
                         name="Correlation") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 30, vjust = 1,
                                     size = 12, hjust = 1)) + coord_fixed()
  ggheatmap +
    geom_text(aes(Var2, Var1, label = value), color = "darkblue", size = 2.5) +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank(),
          legend.justification = c(1, 0), legend.position = c(0.45, 0.6),
          legend.direction = "vertical") +
    guides(fill = guide_colorbar(barwidth = 1, barheight = 6,
                                 title.position = "top", title.hjust = 0.7))
}

#Partial Correlation with added predictor
pcorr_plot = function(orig_ml, dataset, added_predictor, plotit=TRUE, show_formula=FALSE){
  
  coef = names(summary(orig_ml)$coef[,1])[-1]
  pdt_string = ""
  for (i in 1:length(coef)){
    if (i != length(coef)){
      pdt_string = paste(pdt_string, paste(coef[i],"+"))
    }else{
      pdt_string = paste(pdt_string, coef[i])
    }
  }
  
  pdt_string = gsub("sex1","sex",pdt_string)
  
  # Fit new predictor 
  formula = as.formula(paste(added_predictor, pdt_string ,sep=" ~ "))
  ml_new = lm(formula, dataset)
  
  #Plot
  if (plotit == TRUE){
    #par(mfrow=c(1,1))
    #cor(resid(orig_ml), resid(ml_new))
    plot(resid(orig_ml) ~ resid(ml_new), col = "dodgerblue",
         cex=0.5, pch = 20,
         ylab = "Residuals - Original Model",
         xlab = paste("Residuals - Added Predictor", added_predictor, sep = " - "),
         main = "Variable Added Plot")
    abline(v=0, lty = 2)
    abline(h=0, lty = 2)
    abline(lm(resid(orig_ml) ~ resid(ml_new)), col='orange', lwd = 2)
  }
  
  if (show_formula != TRUE){
    list(pcor = cor(resid(orig_ml), resid(ml_new)))
  }else{
    formula
  }
}


Methods

For our analysis and model building, we use the least-squares approach. We start with the full additive model.
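As a brief reminder of the estimation criterion (standard least squares, nothing specific to this report), the coefficients are chosen to minimize the residual sum of squares:

$$\hat{\beta} = \underset{\beta}{\arg\min} \sum_{i=1}^{n} \left( y_i - x_i^\top \beta \right)^2$$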

full_model = lm(total_updrs ~ . , data = pk_trn)
diagnostics(full_model, testit = FALSE, plotit = TRUE)
Figure 1 - Additive Model (Fitted versus Residuals & Q-Q Plot)


vif(full_model)
##        age       sex1  test_time      jit_p    jit_abs    jit_rap   jit_ppq5 
##  1.101e+00  1.353e+00  1.016e+00  1.029e+02  7.252e+00  1.560e+06  3.639e+01 
##    jit_ddp       shim    shim_db  shim_apq3  shim_apq5 shim_apq11   shim_dda 
##  1.560e+06  1.682e+02  7.623e+01  2.459e+07  5.808e+01  1.790e+01  2.459e+07 
##        nhr        hnr       rpde        dfa        ppe 
##  9.047e+00  5.548e+00  2.063e+00  1.625e+00  4.452e+00
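For reference, the variance inflation factor of predictor $j$ is $\mathrm{VIF}_j = 1/(1 - R_j^2)$, where $R_j^2$ is obtained by regressing that predictor on all of the others. The extremely large, paired VIF values for jit_rap/jit_ddp and for shim_apq3/shim_dda suggest that each pair is a near-exact linear combination, so much of the jitter and shimmer information is redundant.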
corpk = round(cor(pk_trn[, -2]), 2) #remove non-numeric, Sex variable
cor_heat_map_reorder(corpk)

Figure 1b - Variable Correlation


Guided by the VIF values and the correlation heat map, we refit an additive model that keeps only one measure from each group of highly correlated voice features:

full_model_non_collinear = lm(total_updrs ~ age + sex + test_time + jit_p + shim_apq11 + hnr + rpde + dfa + ppe, 
                              data = pk_trn)
vif(full_model_non_collinear)
##        age       sex1  test_time      jit_p shim_apq11        hnr       rpde 
##      1.073      1.106      1.009      2.362      2.851      4.668      1.877 
##        dfa        ppe 
##      1.255      3.293
predictors = c("age", "test_time", "jit_p", "shim_apq11", "hnr", "rpde", "dfa", "ppe")
predictors = predictors[!predictors %in% c("total_updrs", "sex")]
sort(sapply(predictors, pcorr, response = "total_updrs", dataset = pk_trn ))
##        dfa        hnr      jit_p       rpde shim_apq11        ppe  test_time 
## -0.1681017 -0.1168197 -0.0006618  0.0232970  0.0502840  0.0568783  0.0912523 
##        age 
##  0.2444024
diagnostics(full_model_non_collinear, testit = FALSE, plotit = TRUE)

performance_metrics(full_model_non_collinear, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)
| Train RMSE | Test RMSE | LOOCV RMSE | Adj R2 | Shapiro Wilk p-val | BP p-val |
|-----------:|----------:|-----------:|-------:|-------------------:|---------:|
| 9.835 | 9.834 | 9.868 | 0.141 | 9.2e-21 | 9.1e-17 |
lambda = get_lambda_from_boxcox(model = full_model_non_collinear, plotit = TRUE )

Based on the Box-Cox estimate above, we refit the model with a square-root transformation of the response:

resp_xformed_model = lm(sqrt(total_updrs) ~ age + sex + test_time + jit_p + shim_apq11 + hnr + rpde + dfa + ppe, data = pk_trn)
diagnostics(resp_xformed_model, testit = FALSE, plotit = TRUE)

performance_metrics(resp_xformed_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)
| Train RMSE | Test RMSE | LOOCV RMSE | Adj R2 | Shapiro Wilk p-val | BP p-val |
|-----------:|----------:|-----------:|-------:|-------------------:|---------:|
| 26.1 | 25.79 | 0.9362 | 0.1547 | 2.8e-14 | 1.2e-17 |

Note that for the transformed-response models, the Train and Test RMSE columns compare total_updrs against fitted values and predictions on the square-root scale, and the LOOCV RMSE is also on the square-root scale, so these columns are not directly comparable to the untransformed model above.
pairs(pk_trn[, c("age", "test_time", "jit_p", "shim_apq11", "hnr", "rpde", "dfa", "ppe")], col = "darkorange4")

Next, we consider a smaller model built from voice measures only, with a square-root transformation of ppe, and apply backward selection using AIC:

start_model = lm(sqrt(total_updrs) ~ (jit_p + shim_db + hnr + dfa + sqrt(ppe)), data = pk_trn)
xformed_model  = step(start_model, trace = 0)
diagnostics(xformed_model, testit = FALSE, plotit = TRUE)

performance_metrics(xformed_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)
| Train RMSE | Test RMSE | LOOCV RMSE | Adj R2 | Shapiro Wilk p-val | BP p-val |
|-----------:|----------:|-----------:|-------:|-------------------:|---------:|
| 26.13 | 25.83 | 0.9778 | 0.0767 | 8.1e-11 | 9e-07 |
summary(xformed_model)
## 
## Call:
## lm(formula = sqrt(total_updrs) ~ (jit_p + shim_db + hnr + dfa + 
##     sqrt(ppe)), data = pk_trn)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.078 -0.673 -0.090  0.740  2.396 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.79477    0.34799   22.40  < 2e-16 ***
## jit_p       -13.11839    4.54926   -2.88  0.00396 ** 
## shim_db      -0.48129    0.14491   -3.32  0.00091 ***
## hnr          -0.05322    0.00867   -6.14  9.5e-10 ***
## dfa          -3.15169    0.27693  -11.38  < 2e-16 ***
## sqrt(ppe)     2.08467    0.32104    6.49  9.8e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.977 on 2994 degrees of freedom
## Multiple R-squared:  0.0782, Adjusted R-squared:  0.0767 
## F-statistic: 50.8 on 5 and 2994 DF,  p-value: <2e-16
xformed_small_model = lm(sqrt(total_updrs)  ~ shim_db + hnr + dfa+ sqrt(ppe), data = pk_trn)
(aov = anova(xformed_small_model, xformed_model))
## Analysis of Variance Table
## 
## Model 1: sqrt(total_updrs) ~ shim_db + hnr + dfa + sqrt(ppe)
## Model 2: sqrt(total_updrs) ~ (jit_p + shim_db + hnr + dfa + sqrt(ppe))
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)   
## 1   2995 2865                            
## 2   2994 2857  1      7.93 8.32  0.004 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We then extend the smaller model with all two-way interactions among its predictors:

xformed_int_model = lm(sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2, 
                  data = pk_trn)
summary(xformed_int_model)
## 
## Call:
## lm(formula = sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2, 
##     data = pk_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1572 -0.6391 -0.0445  0.7193  2.6142 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        13.1149     2.6532    4.94  8.1e-07 ***
## shim_db            -0.5607     1.2419   -0.45   0.6517    
## hnr                -0.0764     0.0732   -1.04   0.2972    
## dfa                -0.9517     4.0264   -0.24   0.8132    
## sqrt(ppe)         -22.5828     3.5619   -6.34  2.6e-10 ***
## shim_db:hnr         0.0960     0.0193    4.98  6.7e-07 ***
## shim_db:dfa        -8.1018     1.9656   -4.12  3.9e-05 ***
## shim_db:sqrt(ppe)   8.6883     1.4986    5.80  7.4e-09 ***
## hnr:dfa            -0.3517     0.1178   -2.98   0.0029 ** 
## hnr:sqrt(ppe)       0.5099     0.0733    6.95  4.4e-12 ***
## dfa:sqrt(ppe)      15.7993     4.0092    3.94  8.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.962 on 2989 degrees of freedom
## Multiple R-squared:  0.107,  Adjusted R-squared:  0.104 
## F-statistic: 35.9 on 10 and 2989 DF,  p-value: <2e-16
performance_metrics(xformed_int_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)
| Train RMSE | Test RMSE | LOOCV RMSE | Adj R2 | Shapiro Wilk p-val | BP p-val |
|-----------:|----------:|-----------:|-------:|-------------------:|---------:|
| 26.12 | 25.82 | 0.9634 | 0.1042 | 5.6e-10 | 1.6e-08 |

We can perform an F-test to assess whether the interaction terms are jointly significant.

(aov1 = anova(xformed_small_model, xformed_int_model))
## Analysis of Variance Table
## 
## Model 1: sqrt(total_updrs) ~ shim_db + hnr + dfa + sqrt(ppe)
## Model 2: sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)    
## 1   2995 2865                             
## 2   2989 2767  6      97.9 17.6 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
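For reference, the F statistic above is the usual nested-model comparison; plugging in the values from the ANOVA table,

$$F = \frac{(RSS_1 - RSS_2)/6}{RSS_2/2989} = \frac{97.9/6}{2767/2989} \approx 17.6,$$

which matches the value reported by anova().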
Finally, we refit the interaction model after removing influential observations, identified as points with Cook’s distance greater than 4/n:

xformed_wo_infl_model = lm(sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2, 
                  data = pk_trn,
                  subset = cooks.distance(xformed_int_model) <= 4 / length(resid(xformed_int_model)))
performance_metrics(xformed_wo_infl_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)
| Train RMSE | Test RMSE | LOOCV RMSE | Adj R2 | Shapiro Wilk p-val | BP p-val |
|-----------:|----------:|-----------:|-------:|-------------------:|---------:|
| 26.14 | 25.8 | 0.9189 | 0.1322 | 8.9e-09 | 2.3e-11 |
# Backward selection using the BIC penalty, k = log(n) (n taken from the full training set)
bic_model = step(xformed_wo_infl_model, k = log(length(resid(xformed_int_model))), trace = 0)
diagnostics(bic_model, testit = FALSE, plotit = TRUE)

summary(bic_model)
## 
## Call:
## lm(formula = sqrt(total_updrs) ~ (shim_db + hnr + dfa + sqrt(ppe))^2, 
##     data = pk_trn, subset = cooks.distance(xformed_int_model) <= 
##         4/length(resid(xformed_int_model)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8431 -0.6343 -0.0459  0.6904  2.5407 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        11.13461    2.76688    4.02  5.9e-05 ***
## shim_db             0.03299    1.46918    0.02  0.98209    
## hnr                 0.00942    0.07707    0.12  0.90274    
## dfa                 4.66837    4.15891    1.12  0.26174    
## sqrt(ppe)         -25.62689    3.63702   -7.05  2.3e-12 ***
## shim_db:hnr         0.12145    0.02203    5.51  3.8e-08 ***
## shim_db:dfa       -11.85481    2.36239   -5.02  5.5e-07 ***
## shim_db:sqrt(ppe)  11.51079    1.65014    6.98  3.8e-12 ***
## hnr:dfa            -0.57912    0.12242   -4.73  2.3e-06 ***
## hnr:sqrt(ppe)       0.63091    0.07486    8.43  < 2e-16 ***
## dfa:sqrt(ppe)      15.44803    4.03293    3.83  0.00013 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.918 on 2904 degrees of freedom
## Multiple R-squared:  0.135,  Adjusted R-squared:  0.132 
## F-statistic: 45.4 on 10 and 2904 DF,  p-value: <2e-16
BIC-based backward selection retains all of the interaction terms, so bic_model matches the interaction model fit without influential points:

performance_metrics(bic_model, trainset = pk_trn, testset = pk_tst, single_mod = TRUE)
| Train RMSE | Test RMSE | LOOCV RMSE | Adj R2 | Shapiro Wilk p-val | BP p-val |
|-----------:|----------:|-----------:|-------:|-------------------:|---------:|
| 26.14 | 25.8 | 0.9189 | 0.1322 | 8.9e-09 | 2.3e-11 |

Results

Based on our analysis, we identify three good candidate models for predicting total UPDRS, out of the many models we fit (see the Appendix for other models we tried).

performance_metrics(xformed_small_model, xformed_int_model, xformed_wo_infl_model, trainset = pk_trn, testset = pk_tst)
| | Train RMSE | Test RMSE | LOOCV RMSE | Adj R2 | Shapiro Wilk p-val | BP p-val |
|---|---:|---:|---:|---:|---:|---:|
| Small Model | 26.13 | 25.83 | 0.9787 | 0.0744 | 7.2e-11 | 1.2e-06 |
| Interaction Model | 26.12 | 25.82 | 0.9634 | 0.1042 | 5.6e-10 | 1.6e-08 |
| Int Model w/o Influential points | 26.14 | 25.80 | 0.9189 | 0.1322 | 8.9e-09 | 2.3e-11 |
diagnostics(xformed_int_model, testit = FALSE, plotit = TRUE)

Discussions

Appendix

Below are some of the different models that we tried during our analysis:

test_model1 = lm(total_updrs ~ age + sex + test_time + jit_p + shim_db + shim_apq5 + shim_apq11 + hnr + rpde + dfa + ppe, data = pk_trn)
vif(test_model1)
diagnostics(test_model1, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model1)
test_model4 = lm(sqrt(total_updrs) ~  age + sex + test_time + log(jit_p) +log(shim_apq11) + hnr + rpde + dfa + ppe, data = pk_trn)
diagnostics(test_model4, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model4)
start_model = lm(sqrt(total_updrs) ~  age + sex + test_time + log(jit_p) +log(shim_apq11) + hnr + rpde + dfa + ppe, data = pk_trn)
test_model5  = step(start_model, trace = 0)
diagnostics(test_model5, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model5)
summary(test_model5)
###Check again
start_model = lm(sqrt(total_updrs) ~ sex + test_time + log(jit_p) + shim_apq11 + hnr + log(rpde) + dfa + sqrt(ppe), data = pk_trn)
test_model6  = step(start_model, trace = 0)
diagnostics(test_model6, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model6)
get_rmse(pk_trn$total_updrs, fitted(test_model6))
get_rmse(pk_tst$total_updrs, predict(test_model6, pk_tst))
start_model = lm(sqrt(total_updrs) ~  log(jit_p) + shim_db + hnr + log(rpde) + dfa + sqrt(ppe), data = pk_trn)
test_model7  = step(start_model, trace = 0)
diagnostics(test_model7, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model7)
get_rmse(pk_trn$total_updrs, fitted(test_model7))
get_rmse(pk_tst$total_updrs, predict(test_model7, pk_tst))
start_model = lm(sqrt(total_updrs) ~  (log(jit_p) + shim_db + hnr + log(rpde) + dfa + sqrt(ppe)), data = pk_trn)
test_model8  = step(start_model, trace = 0)
diagnostics(test_model8, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model8)
get_rmse(pk_trn$total_updrs, fitted(test_model8))
get_rmse(pk_tst$total_updrs, predict(test_model8, pk_tst))
start_model = lm(sqrt(total_updrs) ~  (log(jit_p)+ jit_p + shim_db + hnr + dfa + sqrt(ppe) + ppe), data = pk_trn)
test_model10  = step(start_model, trace = 0)
diagnostics(test_model10, testit = TRUE, plotit = FALSE)
loocv_rmse(test_model10)
get_rmse(pk_trn$total_updrs, fitted(test_model10))
get_rmse(pk_tst$total_updrs, predict(test_model10, pk_tst))
mod_small_init = lm(total_updrs ~
                      age + log(jit_abs)+ log(shim_apq5)
                      + hnr + dfa + ppe, data = pk_trn)

diagnostics(mod_small_init)
mod_small = lm(total_updrs ~
                age + log(jit_abs) + log(shim_apq5) + log(shim_apq11)
                + hnr + dfa + ppe, data = pk_trn)

diagnostics(mod_small)
mod_big = lm(total_updrs ~
             (age + rpde+ jit_abs + shim + nhr + hnr + dfa + ppe)^4
           + (log(jit_abs) + log(shim) + log(nhr) + log(dfa) + log(ppe) + log(rpde))^2, pk_trn)

mod_big_aic = step(mod_big, trace = 0) # AIC backward

diagnostics(mod_big_aic)
mod_good = lm(formula = sqrt(total_updrs) ~ shim + log(jit_abs) + log(nhr) +
     log(dfa) + sqrt(ppe) + shim:log(nhr) + shim:log(dfa) + shim:sqrt(ppe) +
     log(jit_abs):log(dfa) + log(jit_abs):sqrt(ppe) + log(dfa):sqrt(ppe),
   data = pk_trn)

diagnostics(mod_good)

Team Members:

  • Prashanti Nilayam
  • Heta Desai
  • Oran Chan