ensembl.rand.patients <- function(data,data.y,b){ # b is the number os bootstrap samples nvar.selected <- matrix(0,1,b) var.selected <- vector() MSE <- matrix(0,1,b) Miscl <- matrix(0,dim(data)[1],b) CookD <- matrix(0,dim(data)[1],b) CookD.influent <- matrix(0,dim(data)[1],b) for (i in 1:b){ data.boot.id <- sample(1:dim(data)[1],round(nrow(data)*0.8),replace=FALSE) data.boot <- data[data.boot.id,] data.y.boot <- data.y[data.boot.id] # Logistic model fitting with EN regularization, 10-fold cross-validation: fit.cv <- cv.glmnet(as.matrix(data.boot),as.factor(data.y.boot),family="binomial",alpha=0.7) variables <- which(fit.cv$glmnet.fit$beta[,which(fit.cv$cvm == min(fit.cv$cvm))] != 0) nvar.selected[i] <- length(variables) var.selected <- c(var.selected,variables) # Predictions obtained by model i: pred.cv <- predict(fit.cv,as.matrix(data),s="lambda.min",type="response") # Mean squared error of prediction (MSE) MSE[i] <- mean((data.y-pred.cv)^2) # Misclassified individuals (‘1’) based on method i: Miscl[,i] <- abs(data.y - round(pred.cv)) # The Cook’s distance for each individual based on method i: V <- diag(as.vector(sqrt(pred.cv*(1-pred.cv)))) H <- V %*% data[,variables] %*% (solve(t(data[,variables]) %*% V %*% data[,variables])) %*% t(data[,variables]) %*% V # the hat matrix CookD[,i] <- (data.y - round(pred.cv))^2 * diag(H) / ((pred.cv*(-pred.cv+1))*((-diag(H)+1))^2) CookD.influent[which(CookD[,i] > (4/nrow(data))),i] <- 1 } # The rank product: id <- which(apply(CookD,1,sum)!=0) CookD_nonzero <- CookD[id,] rank.matrix <- apply(CookD_nonzero,2, function(CookD_nonzero) rank(-(CookD_nonzero),ties.method = "first")) rank.product <- apply(rank.matrix, 1, prod) rho <-rank.product n <- dim(data[id,])[1] k <- dim(rank.matrix)[2] # The p-values: pvalues <- as.vector(rankprodbounds(rho,n,k,Delta ='geometric')) # The q-values: pvalues[pvalues>1] <- 1 qobj <- qvalue(pvalues) qvalues <- qobj$qvalues # n most influent by the Cook's Distance CookD.influent.by.method <- apply(CookD,2,function(CookD) length(which(CookD > (4/nrow(data))))) # Misclassifications Miscl.percent <- apply(Miscl,1, function(Miscl) round((sum(Miscl)/k)*100)) Miscl.by.method <- apply(Miscl,2, sum) # Building the rank matrix outliers.rank <-as.data.frame(cbind(id, rank.matrix,rank.product,pvalues,qvalues,Miscl.percent)) outliers.rank <- outliers.rank[order(qvalues),] list(nvariables.selected=nvar.selected, var.selected = var.selected, MSE = MSE, outliers.rank=outliers.rank, Misclassifications=Miscl, Misclassifications.percent=Miscl.percent, Misclassifications.by.method=Miscl.by.method, CookDistance=CookD, CookDistance.influent=CookD.influent, CookDistance.influent.by.method=CookD.influent.by.method) }