1 Structured Big Data Analytics Case-Study

library(caret)
library(mice)
library(foreach)
library(doParallel)
library(randomForest)
library(rpart)
library(rpart.plot)
library(rattle)
library(misc3d)

Next, we will look at another interesting example of a large structured tabular dataset. The goal remains the same - examine the effects of indexing complex data only using kime-order (time) and comparing the data representations as well as the subsequent data analytics. In this case-study, we will use the UK Biobank (UKBB) data.

A previous investigation [CITE 2018 SOCR UKBB paper], based on \(7,614\) imaging, clinical, and phenotypic features and neuroimaging data of \(9,914\) UKBB subjects reported the twenty most salient derived imaging biomarkers. By jointly representing and modeling the significant clinical and demographic variables along with specific salient neuroimaging features, the researchers predicted the presence and progression of depression and mental health of participating volunteers. We will explore the effects of kime-direction on the findings based on the same data and methods. For ease of demonstration, efficient calculations, and direct interpretation, we start by transforming the data into a tighter computable object of dimensions \(9,914\times 107\).

1.1 Data Source Type

For more information about the data, please refer to: https://www.nature.com/articles/s41598-019-41634-y

UKBB_data <- get(load("UKBB_data_cluster_label.Rdata")) 
# str(UKBB_data)
UKBB_Colnames <- colnames(UKBB_data); View(UKBB_Colnames); dim(UKBB_data)   # 9914 7615
# Extract the top-50 derived NI biomarkers (data_summary_cluster_2.xlsx), per
# https://drive.google.com/drive/folders/1SdAtefp_taabNL70JvwJZSexTkzXiEKD 
top50_NI_Biomarkers <- c("lh_BA_exvivo_area__lh_WhiteSurfArea_area", "rh_BA_exvivo_area__rh_WhiteSurfArea_area", "rh_aparc.a2009s_area__rh_WhiteSurfArea_area", "rh_aparc_area__rh_WhiteSurfArea_area", "lh_aparc_area__lh_WhiteSurfArea_area", "lh_aparc.a2009s_area__lh_WhiteSurfArea_area", "aseg__SupraTentorialVol", "aseg__SupraTentorialVolNotVent", "aseg__SupraTentorialVolNotVentVox", "aseg__BrainSegVol", "aseg__BrainSegVolNotVentSurf", "aseg__BrainSegVolNotVent", "aseg__CortexVol", "aseg__rhCortexVol", "aseg__lhCortexVol", "aseg__TotalGrayVol", "aseg__MaskVol", "rh_aparc.DKTatlas_area__rh_superiortemporal_area", "rh_aparc.DKTatlas_area__rh_superiorfrontal_area", "lh_aparc.DKTatlas_area__lh_superiorfrontal_area", "lh_aparc.DKTatlas_area__lh_lateralorbitofrontal_area", "lh_aparc.DKTatlas_area__lh_superiortemporal_area", "aseg__EstimatedTotalIntraCranialVol", "lh_aparc_area__lh_lateralorbitofrontal_area", "rh_aparc.DKTatlas_area__rh_lateralorbitofrontal_area", "lh_aparc_area__lh_superiorfrontal_area", "rh_aparc_area__rh_superiortemporal_area", "rh_aparc.a2009s_area__rh_G.S_cingul.Ant_area", "rh_aparc_area__rh_superiorfrontal_area", "lh_aparc_area__lh_rostralmiddlefrontal_area", "wmparc__wm.lh.lateralorbitofrontal", "wmparc__wm.lh.insula", "rh_aparc_area__rh_medialorbitofrontal_area", "lh_BA_exvivo_area__lh_BA3b_exvivo_area", "lh_aparc.DKTatlas_area__lh_postcentral_area", "lh_aparc.DKTatlas_volume__lh_lateralorbitofrontal_volume", "lh_aparc.DKTatlas_area__lh_insula_area", "aseg__SubCortGrayVol", "lh_aparc.a2009s_area__lh_G_orbital_area", "lh_aparc_area__lh_superiortemporal_area", "rh_aparc.DKTatlas_area__rh_insula_area", "lh_aparc.DKTatlas_area__lh_precentral_area", "lh_aparc.pial_area__lh_lateralorbitofrontal_area", "lh_aparc.DKTatlas_area__lh_rostralmiddlefrontal_area", "lh_aparc_area__lh_postcentral_area", "lh_aparc.pial_area__lh_superiorfrontal_area", "rh_aparc_area__rh_rostralmiddlefrontal_area", "wmparc__wm.lh.superiortemporal", "lh_aparc.pial_area__lh_rostralmiddlefrontal_area", "rh_aparc.DKTatlas_volume__rh_lateralorbitofrontal_volume")

# Extract the main clinical features (binary/dichotomous and categorical/polytomous)
#### binary
top25_BinaryClinical_Biomarkers <- 
  c("X1200.0.0", "X1200.2.0", "X1170.0.0", "X1190.2.0", "X1170.2.0","X2080.0.0", "X6138.2.2",
    "X20117.0.0", "X6138.0.2", "X2877.0.0", "X20117.2.0", "X2877.2.0","X1190.0.0", "X4968.2.0",
    "X1249.2.0", "X1190.1.0", "X1170.1.0", "X2080.2.0", "X4292.2.0","X2050.0.0", "X1628.0.0",
    "X1200.1.0", "X20018.2.0", "X4292.0.0", "X3446.0.0")
#### polytomous
top31_PolytomousClinical_Biomarkers <- 
  c("X31.0.0", "X22001.0.0", "X1950.0.0", "X1950.2.0", "X1980.0.0", "X2040.2.0", "X1980.2.0",
    "X2030.0.0", "X2090.0.0", "X2040.0.0", "X1618.2.0", "X1618.0.0", "X1210.0.0", "X2030.2.0",
    "X2000.0.0", "X1930.0.0", "X2090.2.0", "X2000.2.0", "X1210.2.0", "X1618.1.0", "X4653.2.0",
    "X1970.2.0", "X1970.0.0", "X1980.1.0", "X1930.2.0", "X4598.2.0", "X4598.0.0", "X4653.0.0",
    "X2090.1.0", "X2040.1.0", "X4631.2.0")

# Extract derived computed phenotype 
derivedComputedPhenotype <- UKBB_Colnames[length(UKBB_Colnames)]

# Construct the Computable data object including all salient predictors and derived cluster phenotype
ColNameList <- 
  c(top50_NI_Biomarkers, top25_BinaryClinical_Biomarkers, 
    top31_PolytomousClinical_Biomarkers, derivedComputedPhenotype)
length(ColNameList)
col.index <- which(colnames(UKBB_data) %in% ColNameList)
length(col.index)
tight107_UKBB_data <- UKBB_data[ , col.index]; dim(tight107_UKBB_data)
## View(tight107_UKBB_data[1:10, ])   # Confirm the tight data object organization

We can investigate the effects of the kime-phase on the resulting data analytic inference obtained using the UKBB data.

################################ 
#  1. First deal with the missing values  # summary(tight107_UKBB_data)
##### "mi" imputation - LONG
#library(mi) # use pmm (predictive mean matching) imputation method for the missing variables.
#mdf <- missing_data.frame(tight107_UKBB_data)
#show(mdf)   # ; head(mdf); dim(mdf)
#options(mc.cores = 4)
#imputations <- mi(mdf, n.iter=4, n.chains=1, verbose=T)
#imp.data.frames <- complete(imputations, 1)
#imp_tight107_UKBB_data <- imp.data.frames[[1]]

# split-off the computed phenotype (avoid manipulationg hte outcome that will be predicted during analysis phase
colnames(tight107_UKBB_data)[length(tight107_UKBB_data)] <- "cluster_2_cluster"
colnames(tight107_UKBB_data)[length(tight107_UKBB_data)]   # fix the problem with $ in variable name
y_pheno  <- tight107_UKBB_data[ , length(tight107_UKBB_data)]
tight106_UKBB_data <- tight107_UKBB_data[, -length(tight107_UKBB_data)]
dim(tight106_UKBB_data)

# MICE imputation (With parallel core computing)
# all predictors with absolute correlation over 0.4 AND at least 30% usable cases
# library(mice)
# library(foreach)
# library(doParallel)
# set-up local parallel cluster
number_cores <- detectCores() - 2
clustUKBB <- makeCluster(number_cores)
clusterSetRNGStream(clustUKBB, 1234)
registerDoParallel(clustUKBB)

imp_tight106_UKBB_data <-
  foreach(no = 1:number_cores, 
          .combine = ibind, 
          .export = "tight106_UKBB_data",
          .packages = "mice") %dopar%
{
  mice(tight106_UKBB_data, m=2,maxit=3, printFlag=T, seed=1234, method = 'cart')
}

# imp_tight106_UKBB_data <- mice(tight106_UKBB_data, m=2,maxit=3, printFlag=T, seed=1234, method = 'cart')
### MICE NOTE: When the column features have a number of unbalanced factors, these categorical variables are transformed into dummy indicator-variables.
### There is a high probability that the resulting columns may be linear combinations of one-another. 
### The default MICE imputation methods, linear regression, may not be able to solve the linear matrix equations due to matrix low rank, i.e., matrices may cannot be inverted causing errors like "system is computationally singular". RTo solve that problem, we can change the method to "cart". 
### Also use seed before/during imputation for reproducible results.
comp_imp_tight106_UKBB_data <- 
  as.matrix(complete(imp_tight106_UKBB_data), 
            dimnames = list(NULL, colnames(tight106_UKBB_data)))

Data preprocessing is basically done here, readers can just load “Fig5.10_to_13.Rdata” to start here.

1.2 Finish Data Imputation

# imp_tight106_UKBB_data and comp_imp_tight106_UKBB_data are saved in "Fig5.10_to_13.Rdata"
load("Fig5.10_to_13.Rdata")

################################################
##  2. Split the UKBB into 9 epochs
# First normalize all 50 derived NI biomarkers *using scale* as the NI biomarkers have vastly different distributions
for (i in 1:50) {
  comp_imp_tight106_UKBB_data[, i] <- scale(comp_imp_tight106_UKBB_data[, i])
}

# Next configure the 11 epochs
comp_imp_tight106_UKBB_data <- comp_imp_tight106_UKBB_data[1:9900, ] # remove the last 14 cases to make the 11 epochs of size 900 observations yeach
is.matrix(comp_imp_tight106_UKBB_data)

## [1] TRUE

dim(comp_imp_tight106_UKBB_data)

## [1] 9900  106

dim(comp_imp_tight106_UKBB_data) <- c(11, 900, dim(comp_imp_tight106_UKBB_data)[2])
dim(comp_imp_tight106_UKBB_data)

## [1]  11 900 106

# double check epoch split worked 
# identical(comp_imp_tight106_UKBB_data[10, 900, 12], as.matrix(complete(imp_tight106_UKBB_data))[10*900, 12])
epochs_tight106_UKBB_data_1 <- comp_imp_tight106_UKBB_data[1, , ]
dim(epochs_tight106_UKBB_data_1)

## [1] 900 106

# 3. Transform all 9 epochs (Big datasets/signals) to k-space (Fourier domain)
x1 <- c(1:900)
FT_epochs_tight106_UKBB <- array(complex(), c(11, 900, dim(comp_imp_tight106_UKBB_data)[3]))
mag_FT_epochs_tight106_UKBB <- array(complex(), c(11, 900, dim(comp_imp_tight106_UKBB_data)[3]))
phase_FT_epochs_tight106_UKBB <- array(complex(), c(11, 900, dim(comp_imp_tight106_UKBB_data)[3]))
for (i in 1:11) {
  FT_epochs_tight106_UKBB[i, , ] <- fft(comp_imp_tight106_UKBB_data[i, , ])
  X2 <- FT_epochs_tight106_UKBB[i, , ]
  # plot(fftshift1D(log(Re(X2)+2)), main = "log(fftshift1D(Re(FFT(tight106_UKBB))))") 
  mag_FT_epochs_tight106_UKBB[i, , ] <- sqrt(Re(X2)^2+Im(X2)^2); 
  # plot(log(fftshift1D(Re(X2_mag))), main = "log(Magnitude(FFT(tight106_UKBB)))") 
  phase_FT_epochs_tight106_UKBB[i, , ] <- atan2(Im(X2), Re(X2)); 
  # plot(fftshift1D(X2_phase), main = "Shift(Phase(FFT(tight106_UKBB)))")
}

# Compute the Average Phase of all 11 epochs (this will be needed later to confirm better data analytics)
avgPhase_FT_epochs_tight106_UKBB <- apply(phase_FT_epochs_tight106_UKBB, c(2,3), mean)
dim(avgPhase_FT_epochs_tight106_UKBB)

## [1] 900 106

### Test the process to confirm calculations
# X2<-FT_epochs_tight106_UKBB[1,,];X2_mag<-mag_FT_epochs_tight106_UKBB[1,,];X2_phase<-phase_FT_epochs_tight106_UKBB[1,,]
# Real2 = X2_mag * cos(X2_phase)
# Imaginary2 = X2_mag * sin(X2_phase)
# man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2))
# ifelse(abs(man_hat_X2[5,10] - comp_imp_tight106_UKBB_data[1, 5, 10]) < 0.001, "Perfect Syntesis", "Problems!!!")
#######

##### 4. Invert back to spacetime the epochs_tight106_UKBB_data_1 signal with NIL and Average phase
# Start with Nil Phase
Real = mag_FT_epochs_tight106_UKBB[1, , ] * cos(0)  # cos(mag_FT_epochs_tight106_UKBB[1, , ])
Imaginary = mag_FT_epochs_tight106_UKBB[1, , ] * sin(0)   # sin(mag_FT_epochs_tight106_UKBB[1, , ])
ift_NilPhase_X2mag = Re(fft(Real+1i*Imaginary, inverse = T)/length(mag_FT_epochs_tight106_UKBB[1,,]))
# display(ift_NilPhase_X2mag, method = "raster")
# dim(ift_NilPhase_X2mag); View(ift_NilPhase_X2mag); # compare to View(comp_imp_tight106_UKBB_data[1, , ])

# Next, synthesize the data using Average Phase
Real_Avg = mag_FT_epochs_tight106_UKBB[1, , ] * cos(avgPhase_FT_epochs_tight106_UKBB)
Imaginary_Avg = mag_FT_epochs_tight106_UKBB[1, , ] * sin(avgPhase_FT_epochs_tight106_UKBB)
ift_AvgPhase_X2mag = 
  Re(fft(Real_Avg+1i*Imaginary_Avg, inverse = T)/length(mag_FT_epochs_tight106_UKBB[1,,]))
# is.complex(ift_AvgPhase_X2mag); dim(ift_AvgPhase_X2mag)

# To Transform the entire UKBB data to k-space (Fourier domain)
# library(EBImage)
#FT_UKBB_data <- fft(comp_imp_tight106_UKBB_data)
#X2 <- FT_UKBB_data  # display(FT_UKBB_data, method = "raster") 
#mag_FT_UKBB_data <- sqrt(Re(X2)^2+Im(X2)^2) 
###  # plot(log(fftshift1D(Re(X2_mag))), main = "log(Magnitude(FFT(timeseries)))") 
#phase_FT_UKBB_data <- atan2(Im(X2), Re(X2)) 
### Test the process to confirm calculations
# X2<-FT_UKBB_data; X2_mag <- mag_FT_UKBB_data; X2_phase<-phase_FT_UKBB_data
# Real2 = X2_mag * cos(X2_phase)
# Imaginary2 = X2_mag * sin(X2_phase)
# man_hat_X2 = Re(fft(Real2 + 1i*Imaginary2, inverse = T)/length(X2))
# ifelse(abs(man_hat_X2[5,10] - comp_imp_tight106_UKBB_data[5, 10]) < 0.001, "Perfect Syntesis", "Problems!!!")
#######
# Then we can Invert back the complete UKBB FT data into spacetime using nil phase
#Real = mag_FT_UKBB_data * cos(0)  # cos(phase_FT_UKBB_data)
#Imaginary = mag_FT_UKBB_data * sin(0)   # sin(phase_FT_UKBB_data)
#ift_NilPhase_X2mag = Re(fft(Real+1i*Imaginary, inverse = T)/length(FT_UKBB_data))
# display(ift_NilPhase_X2mag, method = "raster")
# dim(ift_NilPhase_X2mag); View(ift_NilPhase_X2mag); # compare to View(aqi_data1)
#summary(comp_imp_tight106_UKBB_data); summary(ift_NilPhase_X2mag)

# 5. Epoch 1: Perform Random Forest prediction (based on ift_TruePhase_X2mag==Original==epochs_tight106_UKBB_data_1) of:
##### Ever depressed for a whole week 1 #########################################
# library(randomForest)
# y_pheno <- comp_imp_tight106_UKBB_data[,"X4598.2.0"] ### Ever depressed for a whole week 1
# y_pheno <- as.factor(y_pheno)
colnames(epochs_tight106_UKBB_data_1) <- colnames(tight106_UKBB_data)
set.seed(1234)
rf_depressed <- 
  randomForest(as.factor(epochs_tight106_UKBB_data_1[ ,"X4598.2.0"]) ~ . , 
               data=epochs_tight106_UKBB_data_1[ , !(colnames(epochs_tight106_UKBB_data_1) %in%
                                                       c("X4598.0.0", "X4598.2.0"))])
rf_depressed

## 
## Call:
##  randomForest(formula = as.factor(epochs_tight106_UKBB_data_1[,      "X4598.2.0"]) ~ ., data = epochs_tight106_UKBB_data_1[, !(colnames(epochs_tight106_UKBB_data_1) %in%      c("X4598.0.0", "X4598.2.0"))]) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##         OOB estimate of  error rate: 21.44%
## Confusion matrix:
##     0   1 class.error
## 0 370  71   0.1609977
## 1 122 337   0.2657952

pred1 = predict(rf_depressed, type="class")
confusionMatrix(pred1, as.factor(epochs_tight106_UKBB_data_1[ ,"X4598.2.0"]))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 370 122
##          1  71 337
##                                          
##                Accuracy : 0.7856         
##                  95% CI : (0.7573, 0.812)
##     No Information Rate : 0.51           
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5719         
##                                          
##  Mcnemar's Test P-Value : 0.0003193      
##                                          
##             Sensitivity : 0.8390         
##             Specificity : 0.7342         
##          Pos Pred Value : 0.7520         
##          Neg Pred Value : 0.8260         
##              Prevalence : 0.4900         
##          Detection Rate : 0.4111         
##    Detection Prevalence : 0.5467         
##       Balanced Accuracy : 0.7866         
##                                          
##        'Positive' Class : 0              
##

############## Accuracy : 0.7911 #############################

##### plot a simple decision tree
# library(rpart); library(rpart.plot)
## there is an error in installing the rattle package
# library(rattle)  ## this is for the fancyRpartPlot
label <- as.factor(epochs_tight106_UKBB_data_1[ ,"X4598.2.0"])
data1 <- 
  as.data.frame(epochs_tight106_UKBB_data_1[ , !(colnames(epochs_tight106_UKBB_data_1) %in%
                                                   c("X4598.0.0", "X4598.2.0", "cluster_2_cluster"))])
data1$label <- label
depress_tree <- rpart(label ~ ., control=rpart.control(minsplit=30, cp=0.001, maxdepth=30), 
                      data=data1)
# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users

1.3 Figure 5.10A

rpart.plot(depress_tree, type = 4, extra = 1, clip.right.labs = F, tweak=1.3)

## quartz_off_screen 
##                 2

pred1 = predict(depress_tree, type="class")

# table(pred1, label)
# library(caret)
confusionMatrix(pred1, label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 362  60
##          1  79 399
##                                           
##                Accuracy : 0.8456          
##                  95% CI : (0.8203, 0.8686)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6907          
##                                           
##  Mcnemar's Test P-Value : 0.1268          
##                                           
##             Sensitivity : 0.8209          
##             Specificity : 0.8693          
##          Pos Pred Value : 0.8578          
##          Neg Pred Value : 0.8347          
##              Prevalence : 0.4900          
##          Detection Rate : 0.4022          
##    Detection Prevalence : 0.4689          
##       Balanced Accuracy : 0.8451          
##                                           
##        'Positive' Class : 0               
##

########################### Accuracy : 0.8511  ##############################

### Prune decision tree
prune_depress_tree <- 
  prune(depress_tree,
        cp=depress_tree$cptable[which.min(depress_tree$cptable[,"xerror"]),"CP"])

# plot the pruned tree
# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users
plot(prune_depress_tree, uniform=TRUE, main="Pruned UKBB Decision Tree")
text(prune_depress_tree, use.n=TRUE, all=TRUE, cex=.8)

# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users

1.4 Figure 5.10B

rpart.plot(prune_depress_tree, type = 4, extra = 1, clip.right.labs = F, tweak=2)

pred2 = predict(prune_depress_tree, type="class")
confusionMatrix(pred2, label)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 398 129
##          1  43 330
##                                           
##                Accuracy : 0.8089          
##                  95% CI : (0.7817, 0.8341)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6191          
##                                           
##  Mcnemar's Test P-Value : 9.1e-11         
##                                           
##             Sensitivity : 0.9025          
##             Specificity : 0.7190          
##          Pos Pred Value : 0.7552          
##          Neg Pred Value : 0.8847          
##              Prevalence : 0.4900          
##          Detection Rate : 0.4422          
##    Detection Prevalence : 0.5856          
##       Balanced Accuracy : 0.8107          
##                                           
##        'Positive' Class : 0               
##

######################### Accuracy : 0.7867 ###############################

## quartz_off_screen 
##                 2

###### 6. FOR the COMPLETE UKBB data: plot a simple decision tree
#library(rpart); library(rpart.plot)
#library(rattle)  ## this is for the fancyRpartPlot
dim(comp_imp_tight106_UKBB_data) <- c(11*900, dim(comp_imp_tight106_UKBB_data)[3])
colnames(comp_imp_tight106_UKBB_data) <- colnames(tight106_UKBB_data)
label3 <- as.factor(comp_imp_tight106_UKBB_data[ ,"X4598.2.0"])
data3 <- 
  as.data.frame(comp_imp_tight106_UKBB_data[ , !(colnames(comp_imp_tight106_UKBB_data) %in%
                                                   c("X4598.0.0", "X4598.2.0", "cluster_2_cluster"))])

depress_tree3 <- 
  rpart(label3 ~ ., control=rpart.control(minsplit=30, cp=0.001, maxdepth=30), data=data3) 

# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users
rpart.plot(depress_tree3, type = 4, extra = 1, clip.right.labs = F, tweak=2)

## Warning: labs do not fit even at cex 0.15, there may be some overplotting

pred3 = predict(depress_tree3, type="class")
confusionMatrix(pred3, label3)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4412 1256
##          1  465 3767
##                                           
##                Accuracy : 0.8262          
##                  95% CI : (0.8186, 0.8336)
##     No Information Rate : 0.5074          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6531          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9047          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.7784          
##          Neg Pred Value : 0.8901          
##              Prevalence : 0.4926          
##          Detection Rate : 0.4457          
##    Detection Prevalence : 0.5725          
##       Balanced Accuracy : 0.8273          
##                                           
##        'Positive' Class : 0               
##

#################### Accuracy : 0.8238 ###########################

### Prune decision tree
prune_depress_tree3 <- 
  prune(depress_tree3,
        cp=depress_tree3$cptable[which.min(depress_tree3$cptable[,"xerror"]),"CP"])
# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users
rpart.plot(prune_depress_tree3, type = 4, extra = 1, clip.right.labs = F, tweak=2,  main="Pruned UKBB Decision Tree")

pred3 = predict(prune_depress_tree3, type="class")
confusionMatrix(pred3, label3)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4274 1281
##          1  603 3742
##                                           
##                Accuracy : 0.8097          
##                  95% CI : (0.8018, 0.8174)
##     No Information Rate : 0.5074          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6201          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8764          
##             Specificity : 0.7450          
##          Pos Pred Value : 0.7694          
##          Neg Pred Value : 0.8612          
##              Prevalence : 0.4926          
##          Detection Rate : 0.4317          
##    Detection Prevalence : 0.5611          
##       Balanced Accuracy : 0.8107          
##                                           
##        'Positive' Class : 0               
##

#################### Accuracy : 0.8132 ###########################

# RF
rf_depressed_Complete <- randomForest(label3 ~ . ,  data=data3)
rf_depressed_Complete

## 
## Call:
##  randomForest(formula = label3 ~ ., data = data3) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##         OOB estimate of  error rate: 19.22%
## Confusion matrix:
##      0    1 class.error
## 0 4211  666   0.1365594
## 1 1237 3786   0.2462672

pred3 = predict(rf_depressed_Complete, type="class")
confusionMatrix(pred3, label3)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4211 1237
##          1  666 3786
##                                           
##                Accuracy : 0.8078          
##                  95% CI : (0.7999, 0.8155)
##     No Information Rate : 0.5074          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6161          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8634          
##             Specificity : 0.7537          
##          Pos Pred Value : 0.7729          
##          Neg Pred Value : 0.8504          
##              Prevalence : 0.4926          
##          Detection Rate : 0.4254          
##    Detection Prevalence : 0.5503          
##       Balanced Accuracy : 0.8086          
##                                           
##        'Positive' Class : 0               
##

# In random forests, there is no need for cross-validation or a separate test set to get an unbiased estimate of the test set error. The out-of-bag (oob) error estimate is internally computed during the run... In particular, when newdata is not provided, the *predict.randomForest()==predict()* method automatically returns the out-of-bag prediction.
##################### Accuracy : 0.808 #########################

# 7. Perform Random Forest prediction (based on ift_AvgPhase_X2mag) of:
##### Ever depressed for a whole week 1 #########################################
# library("randomForest")
# y_pheno <- comp_imp_tight106_UKBB_data[,"X4598.2.0"] ### Ever depressed for a whole week 1
# y_pheno <- as.factor(y_pheno)
label2 <- as.factor(epochs_tight106_UKBB_data_1[ ,"X4598.2.0"]) 
# use the real outcome not synthesized (as.factor(epochs_tight106_UKBB_data_1[ ,"X4598.2.0"]))
colnames(ift_AvgPhase_X2mag) <- colnames(tight106_UKBB_data)
data2 <- 
  as.data.frame(ift_AvgPhase_X2mag[ , !(colnames(ift_AvgPhase_X2mag) %in%
                                          c("X4598.0.0", "X4598.2.0", "cluster_2_cluster"))])
set.seed(1234)
rf_depressed_AvgPhase <- randomForest(label2 ~ . , importance=T, nodesize=30, mtry=100, ntree= 10000, data=data2)
rf_depressed_AvgPhase

## 
## Call:
##  randomForest(formula = label2 ~ ., data = data2, importance = T,      nodesize = 30, mtry = 100, ntree = 10000) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 100
## 
##         OOB estimate of  error rate: 50.22%
## Confusion matrix:
##     0   1 class.error
## 0 192 249   0.5646259
## 1 203 256   0.4422658

pred1_AvgPhase <- predict(rf_depressed_AvgPhase, type="class")
confusionMatrix(pred1_AvgPhase, label2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 192 203
##          1 249 256
##                                          
##                Accuracy : 0.4978         
##                  95% CI : (0.4646, 0.531)
##     No Information Rate : 0.51           
##     P-Value [Acc > NIR] : 0.77842        
##                                          
##                   Kappa : -0.0069        
##                                          
##  Mcnemar's Test P-Value : 0.03429        
##                                          
##             Sensitivity : 0.4354         
##             Specificity : 0.5577         
##          Pos Pred Value : 0.4861         
##          Neg Pred Value : 0.5069         
##              Prevalence : 0.4900         
##          Detection Rate : 0.2133         
##    Detection Prevalence : 0.4389         
##       Balanced Accuracy : 0.4966         
##                                          
##        'Positive' Class : 0              
##

##### plot a simple decision tree
# library(rpart); library(rpart.plot)
## there is an error in installing the rattle package
# library(rattle)  ## this is for the fancyRpartPlot
set.seed(1234)
depress_tree_AvgPhase <- rpart(label2 ~ ., control=rpart.control(minsplit=30,cp=0.001,maxdepth=30),data=data2) 
pred1_AvgPhase <- predict(depress_tree_AvgPhase, type="class")
# library(caret)
confusionMatrix(pred1_AvgPhase, label2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 354  85
##          1  87 374
##                                           
##                Accuracy : 0.8089          
##                  95% CI : (0.7817, 0.8341)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6176          
##                                           
##  Mcnemar's Test P-Value : 0.9392          
##                                           
##             Sensitivity : 0.8027          
##             Specificity : 0.8148          
##          Pos Pred Value : 0.8064          
##          Neg Pred Value : 0.8113          
##              Prevalence : 0.4900          
##          Detection Rate : 0.3933          
##    Detection Prevalence : 0.4878          
##       Balanced Accuracy : 0.8088          
##                                           
##        'Positive' Class : 0               
##

# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users

1.5 Figure 5.11A

rpart.plot(depress_tree_AvgPhase, type = 2, extra = 1, clip.right.labs = F, varlen=5, faclen=5, tweak=4)

################### Accuracy : 0.7956 #############################

## quartz_off_screen 
##                 2

### Prune decision tree
prune_depress_tree_AvgPhase <- 
  prune(depress_tree_AvgPhase,
        cp=depress_tree_AvgPhase$cptable[which.min(depress_tree_AvgPhase$cptable[,"xerror"]),"CP"]/(1.5))
# plot the pruned tree
#plot(prune_depress_tree_AvgPhase, uniform=TRUE, main="Pruned UKBB Decision Tree (Avg-Phase Synthesis)")
#text(prune_depress_tree_AvgPhase, use.n=TRUE, all=TRUE, cex=.8)
# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users

1.6 Figure 5.11B

rpart.plot(prune_depress_tree_AvgPhase, type = 4, extra = 1, clip.right.labs = F, varlen=5, faclen=5, tweak=1.5)

pred2_AvgPhase = predict(prune_depress_tree_AvgPhase, type="class")
confusionMatrix(pred2_AvgPhase, label2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 354  85
##          1  87 374
##                                           
##                Accuracy : 0.8089          
##                  95% CI : (0.7817, 0.8341)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6176          
##                                           
##  Mcnemar's Test P-Value : 0.9392          
##                                           
##             Sensitivity : 0.8027          
##             Specificity : 0.8148          
##          Pos Pred Value : 0.8064          
##          Neg Pred Value : 0.8113          
##              Prevalence : 0.4900          
##          Detection Rate : 0.3933          
##    Detection Prevalence : 0.4878          
##       Balanced Accuracy : 0.8088          
##                                           
##        'Positive' Class : 0               
##

## quartz_off_screen 
##                 2

# 8. Perform Random Forest prediction (based on ift_NilPhase_X2mag) of:
##### Ever depressed for a whole week 1 #########################################
# library("randomForest")
# y_pheno <- comp_imp_tight106_UKBB_data[,"X4598.2.0"] ### Ever depressed for a whole week 1
# y_pheno <- as.factor(y_pheno)
label2 <- as.factor(epochs_tight106_UKBB_data_1[ ,"X4598.2.0"]) 
# use the real outcome not synthesized (as.factor(epochs_tight106_UKBB_data_1[ ,"X4598.2.0"]))
colnames(ift_NilPhase_X2mag) <- colnames(tight106_UKBB_data)
data2 <- 
  as.data.frame(ift_NilPhase_X2mag[ , !(colnames(ift_NilPhase_X2mag) %in%
                      c("X4598.0.0", "X4598.2.0", "cluster_2_cluster"))])
set.seed(1234)
rf_depressed_NilPhase <- 
  randomForest(label2 ~ . , importance=T, nodesize=30, mtry=100, ntree= 10000, data=data2)
rf_depressed_NilPhase

## 
## Call:
##  randomForest(formula = label2 ~ ., data = data2, importance = T,      nodesize = 30, mtry = 100, ntree = 10000) 
##                Type of random forest: classification
##                      Number of trees: 10000
## No. of variables tried at each split: 100
## 
##         OOB estimate of  error rate: 50.33%
## Confusion matrix:
##     0   1 class.error
## 0 193 248   0.5623583
## 1 205 254   0.4466231

pred1_NilPhase <- predict(rf_depressed_NilPhase, type="class")
confusionMatrix(pred1_NilPhase, label2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 193 205
##          1 248 254
##                                           
##                Accuracy : 0.4967          
##                  95% CI : (0.4635, 0.5299)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : 0.79773         
##                                           
##                   Kappa : -0.009          
##                                           
##  Mcnemar's Test P-Value : 0.04846         
##                                           
##             Sensitivity : 0.4376          
##             Specificity : 0.5534          
##          Pos Pred Value : 0.4849          
##          Neg Pred Value : 0.5060          
##              Prevalence : 0.4900          
##          Detection Rate : 0.2144          
##    Detection Prevalence : 0.4422          
##       Balanced Accuracy : 0.4955          
##                                           
##        'Positive' Class : 0               
##

##### plot a simple decision tree
# library(rpart); library(rpart.plot)
## there is an error in installing the rattle package
# library(rattle)  ## this is for the fancyRpartPlot
set.seed(1234)
depress_tree_NilPhase <- rpart(label2 ~ ., control=rpart.control(minsplit=30,cp=0.001,maxdepth=30),data=data2) 
pred1_NilPhase <- predict(depress_tree_NilPhase, type="class")
# library(caret)
confusionMatrix(pred1_NilPhase, label2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 341  86
##          1 100 373
##                                           
##                Accuracy : 0.7933          
##                  95% CI : (0.7654, 0.8193)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5862          
##                                           
##  Mcnemar's Test P-Value : 0.3405          
##                                           
##             Sensitivity : 0.7732          
##             Specificity : 0.8126          
##          Pos Pred Value : 0.7986          
##          Neg Pred Value : 0.7886          
##              Prevalence : 0.4900          
##          Detection Rate : 0.3789          
##    Detection Prevalence : 0.4744          
##       Balanced Accuracy : 0.7929          
##                                           
##        'Positive' Class : 0               
##

# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users

1.7 Figure 5.12A

rpart.plot(depress_tree_NilPhase, type = 2, extra = 1, clip.right.labs = F, varlen=5, faclen=5, tweak=2)

################### Accuracy : 0.79  #############################

## quartz_off_screen 
##                 2

### Prune decision tree
prune_depress_tree_NilPhase <- 
  prune(depress_tree_NilPhase,
        cp=depress_tree_NilPhase$cptable[which.min(depress_tree_NilPhase$cptable[,"xerror"]),"CP"]/(1.5))
# plot the pruned tree
#plot(prune_depress_tree_NilPhase, uniform=TRUE, main="Pruned UKBB Decision Tree (Nil-Phase Synthesis)")
#text(prune_depress_tree_NilPhase, use.n=TRUE, all=TRUE, cex=.8)
# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users

1.8 Figure 5.12B

rpart.plot(prune_depress_tree_NilPhase, type = 4, extra = 1, clip.right.labs = F, varlen=5, faclen=5, tweak=2)

pred2_NilPhase = predict(prune_depress_tree_AvgPhase, type="class")
confusionMatrix(pred2_NilPhase, label2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 354  85
##          1  87 374
##                                           
##                Accuracy : 0.8089          
##                  95% CI : (0.7817, 0.8341)
##     No Information Rate : 0.51            
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6176          
##                                           
##  Mcnemar's Test P-Value : 0.9392          
##                                           
##             Sensitivity : 0.8027          
##             Specificity : 0.8148          
##          Pos Pred Value : 0.8064          
##          Neg Pred Value : 0.8113          
##              Prevalence : 0.4900          
##          Detection Rate : 0.3933          
##    Detection Prevalence : 0.4878          
##       Balanced Accuracy : 0.8088          
##                                           
##        'Positive' Class : 0               
##

## quartz_off_screen 
##                 2

# 9. Compare the analytics results from #3, ..., #8!
# Compare the original against nil-phase and avg-phase synthesized data
origNilNil_6rows_Compare <- rbind(head(epochs_tight106_UKBB_data_1), head(ift_NilPhase_X2mag), head(ift_AvgPhase_X2mag))
x1 <- 1:dim(epochs_tight106_UKBB_data_1)[2]
orig.mean <- apply(epochs_tight106_UKBB_data_1, 2, mean)
nilPhase.mean <- apply(ift_NilPhase_X2mag, 2, mean)
avgPhase.mean <- apply(ift_AvgPhase_X2mag, 2, mean)
# windows(width=12, height=12) # For windows users
x11(width=10, height=8) # For Mac users

1.9 Figure 5.13

plot(x1, orig.mean, main = "Comparing original UKBB against nil-phase and avg-phase synthesized data",
     col="green", lwd = 3, type="l", lty=1, xlab = "Features", ylab = "Averages across cases")
lines(x1, nilPhase.mean, col = "red", lwd = 3, lty=1)
lines(x1, avgPhase.mean, col = "blue", lwd = 3, lty=1)
legend("top", bty="n", legend=c(
  sprintf("Original"), sprintf("Nil-Phase Reconstruction"), 
  sprintf("Average-Phase Reconstruction")), 
  col=c("green", "red", "blue"), lty=c(1,1,1), lwd=c(3,3,3), cex=0.9)

## quartz_off_screen 
##                 2

stopCluster(clustUKBB)

As we have the benefit of hind-side, the prior study [Cite SOCR UKBB Paper] had identified the most salient derived neuroimaging and clinical biomarkers (\(k=107\)) in this study, including the physician identified clinical outcomes and the computed phenotypes using unsupervised machine learning methods. Therefore, for simplicity, this demonstration only focuses on these features, without a previous feature-selection preprocessing step. We are examining the effects of the kime phase on the scientific inference.

Note that averaging the phases in the Fourier domain assumes that either we have a very large number of samples that effectively span the range of kime-angles, or that the sampling is uniform on the kime-angles space. When these assumptions are violated, other phase-aggregation or phase-ensembling methods (e.g., weighted mean, non-parametric measures of centrality, or Bayesian strategies) may need to utilized to ensure the reliability of the final inference.

Figure 5.10_to_13

Back To Index

SOCR Team

June 2019