1 The BRCA dataset
- 1.1 TNBC dataset construction
2 Exploratory data analysis
3 Ensemble outlier detection

The data analysis described below requires R version 3.3.2 and Bioconductor version 3.4.

1 The BRCA dataset

The Cancer Genome Atlas Breast Invasive Carcinoma (TCGA-BRCA) data collection is part of a larger effort to build a research community focused on connecting cancer phenotypes to genotypes by providing clinical images matched to subjects from The Cancer Genome Atlas (TCGA).

The BRCA data is publicly available (https://cancergenome.nih.gov/) and is decomposed into two datasets:

the gene expression data, composed of 57251 variables for a total of 1222 samples with 1097 individuals. From those samples, 1102 with primary solid tumor, 7 metastatic and 113 with normal tissue;
the clinical data is composed of 114 variables obtained from the same individuals (much more is available in the gdc data variable, such as follow_up, drug, radiation, …)

The BRCA datasets will be imported using the package ‘brca.data’.

# Package developed by our group
#library(devtools)
#devtools::install_url('https://github.com/averissimo/brca.data/releases/download/1.0/brca.data_1.0.tar.gz')
library(brca.data)

1.1 TNBC dataset construction

Triple negative breast cancer (TNBC) refers to any BRCA that does not express the genes for the estrogen receptor (ESR1; ENSG00000091831), progesterone receptor (PGR; ENSG00000082175) and which does not overexpress the human epidermal growth factor receptor 2 (HER2; ENSG00000141736).

From the clinical data, the columns related to gene expression status of ESR1, PGR and HER2 must be selected. There are three possible variables for HER2:

‘her2_immunohistochemistry_level_result’, which refers to the ImmunoHistoChemistry (IHC) HER2 result into levels ‘0’ (negative), ‘1’ (negative), ‘2+’ (indeterminate) and ‘3+’(positive);
‘lab_proc_her2_neu_immunohistochemistry_receptor_status’, corresponding to the IHC result into status indeterminate, equivocal, negative and positive;
‘lab_procedure_her2_neu_in_situ_hybrid_outcome_type’, the HER2 individuals’ classiﬁcation by the ﬂuorescence in situ hybridization (FISH) method, considered more accurate and often performed when IHC results do not show a clear classiﬁcation of cells into HER2-positive or negative.

data("fpkm.per.tissue","fpkm.per.tissue.barcode", 'clinical', package = 'brca.data')

tnbc.vars <- c('breast_carcinoma_estrogen_receptor_status',
               'breast_carcinoma_progesterone_receptor_status',
               'her2_immunohistochemistry_level_result',
               'lab_proc_her2_neu_immunohistochemistry_receptor_status',
               'lab_procedure_her2_neu_in_situ_hybrid_outcome_type'
)
clinical.tnbc <- clinical$primary.solid.tumor[, tnbc.vars]

clinical.tnbc.aux <- clinical.tnbc
names(clinical.tnbc.aux) <- c("ESR1","PGR","HER2_level","HER2_status","HER2_FISH")

summary(clinical.tnbc.aux)

##             ESR1                PGR      HER2_level        HER2_status 
##               : 48                : 49       :471                :177  
##  Indeterminate:  2   Indeterminate:  4   0   : 60   Equivocal    :178  
##  Negative     :237   Negative     :343   1+  :271   Indeterminate: 12  
##  Positive     :803   Positive     :694   2+  :198   Negative     :559  
##  NA's         :  1   NA's         :  1   3+  : 90   Positive     :164  
##                                          NA's:  1   NA's         :  1  
##          HER2_FISH  
##               :673  
##  Equivocal    :  5  
##  Indeterminate:  4  
##  Negative     :330  
##  Positive     : 78  
##  NA's         :  1

which(is.na(clinical.tnbc.aux[,1]))

## [1] 321

clinical.tnbc.aux <- clinical.tnbc.aux[-(which(is.na(clinical.tnbc.aux[,1]))),]

Finding the concordance between the HER2 variables measured.

Concordance between HER2 (IHC) level and HER2 (IHC) status:

table(as.factor(clinical.tnbc.aux[,3]),clinical.tnbc.aux[,4])

##     
##          Equivocal Indeterminate Negative Positive
##      176         7             6      241       41
##   0    0         0             0       60        0
##   1+   0         4             1      255       11
##   2+   0       166             4        1       27
##   3+   1         1             1        2       85

Creating a vector of HER2 (IHC) status based on HER2 (ICH) level:

clinical.tnbc.aux$my_HER2_level <- as.character(clinical.tnbc.aux$HER2_level)

clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '0'),6] <- 'Negative'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '1+'),6] <- 'Negative'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '2+'),6] <- 'Indeterminate'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '3+'),6] <- 'Positive'

Individuals for which HER2 (IHC) status different from myHER2 (IHC) status (based on HER2 IHC level)

HER2status.dif.HER2level <- clinical.tnbc.aux[c(rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,6] == 'Negative' & clinical.tnbc.aux[,4] == 'Positive'),]),rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,6] == 'Positive' & clinical.tnbc.aux[,4] == 'Negative'),])),]

# getting the individuals short name, as in the clinical data
clinical.tnbc.barcode.pattern <- '(TCGA-[A-Z0-9a-z]{2}-[a-zA-Z0-9]{4})-([0-9]{2}).+'
clinical.tnbc.shortname <- gsub(clinical.tnbc.barcode.pattern,'\\1',rownames(clinical.tnbc))

# individuals with discordant HER2 (IHC) status and HER2 (IHC) level
HER2status.dif.HER2level.names <- clinical.tnbc.shortname[which(clinical.tnbc.shortname %in% rownames(HER2status.dif.HER2level))]
length(HER2status.dif.HER2level.names)

## [1] 13

Concordance between HER2 (IHC) status and HER2 (FISH):

table(clinical.tnbc.aux[,5],clinical.tnbc.aux[,4])

##                
##                     Equivocal Indeterminate Negative Positive
##                 115        18             4      432      104
##   Equivocal       0         3             0        0        2
##   Indeterminate   0         0             0        3        1
##   Negative       53       138             6      121       12
##   Positive        9        19             2        3       45

Individuals for which HER2 (IHC) status different from HER2 (FISH):

HER2status.dif.FISH <- clinical.tnbc.aux[c(rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,4] == 'Negative' & clinical.tnbc.aux[,5] == 'Positive'),]),rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,4] == 'Positive' & clinical.tnbc.aux[,5] == 'Negative'),])),]

HER2status.dif.HER2FISH.names <- clinical.tnbc.shortname[which(clinical.tnbc.shortname %in% rownames( HER2status.dif.FISH))]
length(HER2status.dif.HER2FISH.names)

## [1] 15

# suspect individuals with discordante HER2 labels from different lab testing
suspect <- c(HER2status.dif.HER2level.names,HER2status.dif.HER2FISH.names)
length(suspect)

## [1] 28

Replacing HER2 (IHC) status by HER2 (FISH) (a more accurate test) whenever available:

clinical.tnbc.aux$HER2_status_plus_FISH <- clinical.tnbc.aux[,4]
clinical.tnbc.aux[clinical.tnbc.aux[,5] !='',7] <- clinical.tnbc.aux[clinical.tnbc.aux[,5] !='',5]

Concordance between HER2 (IHC) status and HER2 (IHC status + FISH):

table(clinical.tnbc.aux[,4],clinical.tnbc.aux[,7])

##                
##                     Equivocal Indeterminate Negative Positive
##                 115         0             0       53        9
##   Equivocal       0        21             0      138       19
##   Indeterminate   0         0             4        6        2
##   Negative        0         0             3      553        3
##   Positive        0         2             1       12      149

Building the final clinical TNBC dataset, composed of all individuals and gene expression status of ESR1, PGR and HER2:

clinical.tnbc <- clinical.tnbc.aux[,c(1,2,7)]

# replace all '' or NA with 'Indeterminate'
clinical.tnbc[is.na(clinical.tnbc) | clinical.tnbc == '' | clinical.tnbc == 'Equivocal'] <- 'Indeterminate'

# individuals with at least one 'Positive' and keep all are marked as not TNBC
tnbc.status.pos <- clinical.tnbc == 'Positive'
#tnbc.status.pos <- clinical.tnbc[,c(1,2,3)] == 'Positive'
not.tnbc.id     <- rownames(tnbc.status.pos[which(sapply(seq(nrow(tnbc.status.pos)), function(ix) { any(tnbc.status.pos[ix,]) })),])

# individuals with at least one 'Indeterminate' and excluded from dataset
tnbc.status.ind <- clinical.tnbc[!(rownames(clinical.tnbc) %in% not.tnbc.id),] == 'Indeterminate'
not.tnbc.id     <- rownames(tnbc.status.ind[which(sapply(seq(nrow(tnbc.status.ind)), function(ix) { any(tnbc.status.ind[ix,]) })),])
clinical.tnbc <- clinical.tnbc[!(rownames(clinical.tnbc) %in% not.tnbc.id), ]

tnbc.status.neg <- clinical.tnbc == 'Negative'
tnbc.ix <- sapply(seq(nrow(clinical.tnbc)), function(ix) { all(tnbc.status.neg[ix,]) })

# set cases with TNBC
clinical.tnbc$tnbc <- 'NO_TNBC'
clinical.tnbc[tnbc.ix, 'tnbc'] <- 'TNBC'

Building the Y (binary data, i.e., TNBC vs. nonTNBC) response vector:

# building the ydata (binary)
ydata <- data.frame(tnbc = clinical.tnbc$tnbc, row.names = rownames(clinical.tnbc))
ydata$tnbc <- factor(ydata$tnbc)

Protein coding genes catalog for building the X gene expression matrix

I. Install packages

source("http://bioconductor.org/biocLite.R")

## Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help

biocLite(c('ensembldb', 'EnsDb.Hsapiens.v86'))

## BioC_mirror: https://bioconductor.org

## Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.2 (2017-09-28).

## Installing package(s) 'ensembldb', 'EnsDb.Hsapiens.v86'

## Old packages: 'biomaRt', 'chron', 'circlize', 'cowplot', 'dbplyr',
##   'digest', 'dnet', 'dotCall64', 'DRR', 'DT', 'edgeR', 'ellipse', 'fgsea',
##   'fpc', 'GenomicRanges', 'git2r', 'haven', 'hexbin', 'Hmisc', 'hms',
##   'htmlTable', 'htmlwidgets', 'irlba', 'knitr', 'lava', 'lhs', 'limma',
##   'littler', 'lme4', 'MASS', 'matrixStats', 'mgcv', 'microbenchmark',
##   'mvabund', 'pcaPP', 'psichomics', 'Rcpp', 'RCurl', 'recipes', 'rlang',
##   'rms', 'rpart', 'rprojroot', 'rtracklayer', 'shinyjs', 'sp', 'spam',
##   'SummarizedExperiment', 'TCGAbiolinks', 'tibble', 'TTR', 'viridis',
##   'withr', 'xml2', 'xts', 'zoo'

# Load libraries

suppressPackageStartupMessages({
  library(ensembldb)
  library(EnsDb.Hsapiens.v86)
  library(futile.logger)
  .Last.value <- flog.layout(layout.format('~m'))
})

Load ensembl data

Ensembl is a genome browser for vertebrate genomes that supports research in comparative genomics, evolution, sequence variation and transcriptional regulation. Ensembl annotate genes, computes multiple alignments, predicts regulatory function and collects disease data. Ensembl tools include BLAST, BLAT, BioMart and the Variant Effect Predictor (VEP) for all supported species.

Home page

edb <- EnsDb.Hsapiens.v86
# hasProteinData(edb) # from devel only, as of 3.4
ensembl.protein.coding <- genes(edb, 
                                filter = list(GenebiotypeFilter('protein_coding')), 
                                columns = c('gene_id', 'gene_name'))
{
  flog.info('         Granges: %d', nrow(ensembl.protein.coding@elementMetadata))
  flog.info('Metadata columns: %d', ncol(ensembl.protein.coding@elementMetadata))
}

##          Granges: 22285
## Metadata columns: 3

Load ccds data

The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long term goal is to support convergence towards a standard set of gene annotations.

ccds <- read.table(url('ftp://ftp.ncbi.nih.gov/pub/CCDS/current_human/CCDS.current.txt'),
                   sep = '\t', 
                   header = T, 
                   comment.char = "|", # necessary as header line has a # character
                   stringsAsFactors = FALSE)
flog.info('Size of ccds: %d x %d', nrow(ccds), ncol(ccds))

## Size of ccds: 34245 x 11

Build unique gene set

ensembl.genes         <- sort(unique(ensembl.protein.coding@elementMetadata$gene_name))
ensembl.genes.ensg.id <- sort(ensembl.protein.coding@elementMetadata$gene_id)

ccds.genes       <- sort(unique(ccds$gene))
ccds.extra.genes <- sort(ccds.genes[(!ccds.genes %in% ensembl.genes)])
ccds.extra.genes.ensg.id <- genes(edb, filter = list(GenenameFilter(ccds.extra.genes)), 
                                  columns = c('gene_id', 'gene_name'))

ensg.id <- sort(unique(c(ensembl.protein.coding@elementMetadata$gene_id, ensembl.genes.ensg.id)))

all.genes <- sort(unique(c(ensembl.genes, ccds.extra.genes)))

Building the X gene expression data matrix:

xdata <- t(fpkm.per.tissue$primary.solid.tumor[which(rownames(fpkm.per.tissue$primary.solid.tumor) %in% ensg.id),-which(duplicated(fpkm.per.tissue.barcode$primary.solid.tumor))])
common.rows <- getParticipantCode(rownames(xdata)) %in% rownames(ydata)
xdata <- xdata[common.rows,]

# removing variables with sd=0
xdata.sd <- sapply(seq(ncol(xdata)), function(ix) {sd(xdata[,ix])})
xdata <- xdata[,xdata.sd != 0]

# Gene expression data for ESR1, PGR and HER2 from the suspect (potential outlier) cases identified above

# Clinical and gene expression data of individuals with discordant HER2 level and HER2 status
xdata.shortname <- getParticipantCode(rownames(xdata))

xdata.plus.clinical.HER2status.dif.HER2level <- cbind(xdata[which(xdata.shortname %in% HER2status.dif.HER2level.names), c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[which(rownames(clinical.tnbc.aux) %in% HER2status.dif.HER2level.names),],ydata$tnbc[which(rownames(ydata) %in% HER2status.dif.HER2level.names)])

# clinical and gene expression data of individuals with discordant HER2 status and HER2 FISH
xdata.shortname <- getParticipantCode(rownames(xdata))
xdata.plus.clinical.HER2status.dif.HER2FISH <- cbind(xdata[which(xdata.shortname %in% HER2status.dif.HER2FISH.names), c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[which(rownames(clinical.tnbc.aux) %in% HER2status.dif.HER2FISH.names),],ydata$tnbc[which(rownames(ydata) %in% HER2status.dif.HER2FISH.names)])

Data transformation to be used in further analysis:

data.Y <- as.matrix(ydata)
data.Y[data.Y =="NO_TNBC"] <- 0
data.Y[data.Y =="TNBC"] <- 1
data.Y <- as.numeric(data.Y)

# log-transform xdata
data <- log2(xdata+1)

# normalizing xdata
data <- (data-matrix(apply(data,2,mean),nrow(data),ncol(data),byrow=TRUE))/matrix(apply(data,2,sd),nrow(data),ncol(data),byrow=TRUE)

2 Exploratory data analysis

Summary of TNBC biomarkers:

summary(xdata[which(data.Y=="1"),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])

##  ENSG00000091831    ENSG00000082175     ENSG00000141736  
##  Min.   : 0.01938   Min.   : 0.001169   Min.   :  1.561  
##  1st Qu.: 0.16063   1st Qu.: 0.037205   1st Qu.: 13.964  
##  Median : 0.35070   Median : 0.079403   Median : 19.776  
##  Mean   : 1.52980   Mean   : 0.711833   Mean   : 21.991  
##  3rd Qu.: 0.82760   3rd Qu.: 0.186198   3rd Qu.: 26.058  
##  Max.   :29.97893   Max.   :22.977867   Max.   :103.689

summary(xdata[which(data.Y=="0"),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])

##  ENSG00000091831     ENSG00000082175    ENSG00000141736    
##  Min.   :  0.01585   Min.   :  0.0079   Min.   :   0.6054  
##  1st Qu.: 16.14359   1st Qu.:  0.6003   1st Qu.:  26.5798  
##  Median : 36.66739   Median :  4.2282   Median :  38.7319  
##  Mean   : 47.88104   Mean   : 12.0115   Mean   :  99.7406  
##  3rd Qu.: 69.64894   3rd Qu.: 15.3263   3rd Qu.:  58.8010  
##  Max.   :272.20332   Max.   :327.9133   Max.   :1668.3526

# Building the training and test sets; keep in the training set observations ESR1- and PGR- that would have the opposite classification if HER2 (FISH) had not replaced the HER2 (IHC) status; and observations ESR1- and PGR- for which FISH was not measured and have no concordance between HER2 (IHC) status and HER2 (IHC) level (having a potential impact on the final TNBC/nonTNBC classification)

# taking individuals ID from the 'HER2status.dif.FISH' and 'HER2status.dif.HER2level' created above
HER2status.dif.FISH

##                  ESR1      PGR HER2_level HER2_status HER2_FISH
## TCGA-LL-A5YP Positive Negative         1+    Negative  Positive
## TCGA-AO-A0JL Negative Negative         1+    Negative  Positive
## TCGA-A2-A04U Negative Negative         1+    Negative  Positive
## TCGA-A2-A0EQ Negative Negative         3+    Positive  Negative
## TCGA-BH-A18T Negative Negative         2+    Positive  Negative
## TCGA-AN-A0XV Positive Positive         2+    Positive  Negative
## TCGA-AO-A12C Positive Positive         3+    Positive  Negative
## TCGA-BH-A18Q Negative Negative               Positive  Negative
## TCGA-LL-A5YL Positive Negative         3+    Positive  Negative
## TCGA-E2-A10A Positive Positive         2+    Positive  Negative
## TCGA-AO-A03L Positive Positive         2+    Positive  Negative
## TCGA-AO-A12G Positive Positive         2+    Positive  Negative
## TCGA-BH-A1EX Positive Positive               Positive  Negative
## TCGA-BH-A0AU Positive Positive               Positive  Negative
## TCGA-AN-A0XW Positive Positive         2+    Positive  Negative
##              my_HER2_level
## TCGA-LL-A5YP      Negative
## TCGA-AO-A0JL      Negative
## TCGA-A2-A04U      Negative
## TCGA-A2-A0EQ      Positive
## TCGA-BH-A18T Indeterminate
## TCGA-AN-A0XV Indeterminate
## TCGA-AO-A12C      Positive
## TCGA-BH-A18Q              
## TCGA-LL-A5YL      Positive
## TCGA-E2-A10A Indeterminate
## TCGA-AO-A03L Indeterminate
## TCGA-AO-A12G Indeterminate
## TCGA-BH-A1EX              
## TCGA-BH-A0AU              
## TCGA-AN-A0XW Indeterminate

HER2status.dif.HER2level

##                  ESR1      PGR HER2_level HER2_status HER2_FISH
## TCGA-AC-A8OS Positive Positive         1+    Positive          
## TCGA-AN-A0FL Negative Negative         1+    Positive          
## TCGA-AN-A0FX Negative Negative         1+    Positive          
## TCGA-AN-A0FK Positive Positive         1+    Positive          
## TCGA-E9-A295 Positive Positive         1+    Positive          
## TCGA-AC-A3YI Positive Positive         1+    Positive          
## TCGA-JL-A3YW Positive Positive         1+    Positive          
## TCGA-AN-A0FS Positive Negative         1+    Positive          
## TCGA-AN-A0FN Positive Positive         1+    Positive          
## TCGA-AN-A0FJ Positive Negative         1+    Positive          
## TCGA-AN-A03X Positive Positive         1+    Positive          
## TCGA-LL-A73Y Negative Negative         3+    Negative          
## TCGA-AC-A3W6 Positive Positive         3+    Negative          
##              my_HER2_level
## TCGA-AC-A8OS      Negative
## TCGA-AN-A0FL      Negative
## TCGA-AN-A0FX      Negative
## TCGA-AN-A0FK      Negative
## TCGA-E9-A295      Negative
## TCGA-AC-A3YI      Negative
## TCGA-JL-A3YW      Negative
## TCGA-AN-A0FS      Negative
## TCGA-AN-A0FN      Negative
## TCGA-AN-A0FJ      Negative
## TCGA-AN-A03X      Negative
## TCGA-LL-A73Y      Positive
## TCGA-AC-A3W6      Positive

set.seed(2010)

test.id <- sample((1:nrow(data))[-which(getParticipantCode(rownames(data)) %in% c("TCGA-A2-A0EQ","TCGA-BH-A18T","TCGA-AO-A0JL","TCGA-BH-A18Q","TCGA-A2-A04U","TCGA-LL-A73Y","TCGA-AN-A0FL","TCGA-AN-A0FX"))],round(nrow(data)/4),replace=FALSE)

data.train <- data[-test.id,]
data.Y.train <- data.Y[-test.id]

data.test <- data[test.id,]
data.Y.test <- data.Y[test.id]

Fitting a logistic model based on the three biomarker variables (ESR1, PGR and HER2):

glm.data.train.3var <- data.frame(data.Y.train,data.train[,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
colnames(glm.data.train.3var)[1] <- 'data.y'

# without interaction
glm.fit.3var <- glm(data.y~.,family=binomial,data=glm.data.train.3var,control = glm.control(maxit = 1000))
summary(glm.fit.3var)

## 
## Call:
## glm(formula = data.y ~ ., family = binomial, data = glm.data.train.3var, 
##     control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.92289  -0.17395  -0.08124  -0.03853   3.15187  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.94274    0.35293 -11.171  < 2e-16 ***
## ENSG00000091831 -2.76484    0.35260  -7.841 4.46e-15 ***
## ENSG00000082175 -0.05407    0.37973  -0.142    0.887    
## ENSG00000141736 -1.04076    0.18347  -5.673 1.40e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 667.69  on 763  degrees of freedom
## Residual deviance: 233.49  on 760  degrees of freedom
## AIC: 241.49
## 
## Number of Fisher Scoring iterations: 7

# with interaction
glm.fit.3var <- glm(data.y~ENSG00000091831*ENSG00000082175*ENSG00000141736,family=binomial,data=glm.data.train.3var,control = glm.control(maxit = 1000))
summary(glm.fit.3var)

## 
## Call:
## glm(formula = data.y ~ ENSG00000091831 * ENSG00000082175 * ENSG00000141736, 
##     family = binomial, data = glm.data.train.3var, control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.93100  -0.17283  -0.09933  -0.04878   3.13543  
## 
## Coefficients:
##                                                 Estimate Std. Error
## (Intercept)                                     -4.10968    0.42940
## ENSG00000091831                                 -2.22048    0.62773
## ENSG00000082175                                 -0.18193    0.48687
## ENSG00000141736                                 -1.26152    0.50488
## ENSG00000091831:ENSG00000082175                  0.58039    0.52550
## ENSG00000091831:ENSG00000141736                  0.07795    0.75757
## ENSG00000082175:ENSG00000141736                 -0.37518    0.56887
## ENSG00000091831:ENSG00000082175:ENSG00000141736 -0.01215    0.66538
##                                                 z value Pr(>|z|)    
## (Intercept)                                      -9.571  < 2e-16 ***
## ENSG00000091831                                  -3.537 0.000404 ***
## ENSG00000082175                                  -0.374 0.708646    
## ENSG00000141736                                  -2.499 0.012466 *  
## ENSG00000091831:ENSG00000082175                   1.104 0.269396    
## ENSG00000091831:ENSG00000141736                   0.103 0.918045    
## ENSG00000082175:ENSG00000141736                  -0.660 0.509563    
## ENSG00000091831:ENSG00000082175:ENSG00000141736  -0.018 0.985436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 667.69  on 763  degrees of freedom
## Residual deviance: 230.81  on 756  degrees of freedom
## AIC: 246.81
## 
## Number of Fisher Scoring iterations: 8

# predictions for the train and test sets
pred.train.glm.3var <- glm.fit.3var$fitted.values
pred.test.glm.3var <- predict(glm.fit.3var,as.data.frame(data.test[,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")]),type="response")

Misclassifications in the training and test sets:

# misclassified
glm.3var.miscl.train <- rownames(data.train[abs(data.Y.train - round(pred.train.glm.3var)) == 1,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
length(glm.3var.miscl.train)

## [1] 45

glm.3var.miscl.test <- rownames(data.test[abs(data.Y.test - round(pred.test.glm.3var)) == 1,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
length(glm.3var.miscl.test)

## [1] 12

45 misclassifications in the training set and 12 in the test set.

Dubious training cases which are misclassified:

# misclassified and suspect in the train set
glm.3var.train.miscl.suspect <- glm.3var.miscl.train[which(getParticipantCode(glm.3var.miscl.train) %in% suspect)]
glm.3var.train.miscl.suspect

## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
## [3] "TCGA-AO-A0JL-01A-11R-A056-07" "TCGA-AN-A0FL-01A-11R-A034-07"
## [5] "TCGA-AN-A0FX-01A-11R-A034-07" "TCGA-BH-A18Q-01A-12R-A12D-07"
## [7] "TCGA-AN-A0FJ-01A-11R-A00Z-07" "TCGA-A2-A04U-01A-11R-A115-07"

# misclassified and suspect in the test set
glm.3var.test.miscl.suspect <- glm.3var.miscl.test[which(getParticipantCode(glm.3var.miscl.test) %in% suspect)]
glm.3var.test.miscl.suspect

## [1] "TCGA-JL-A3YW-01A-12R-A239-07"

Several suspect cases with discordant HER2 (IHC) status and HER2 (FISH), as well as discordant HER2 (IHC) level and status are indeed influent and misclassified, based on the three biomarkers.

Looking for possible confounding variables in the clinical data:

# categorical variables identified as potential interesting variables
clinical.interest <- c('age_at_initial_pathologic_diagnosis','race_list','history_of_neoadjuvant_treatment','gender','person_neoplasm_cancer_status','stage_event_pathologic_stage','stage_event_tnm_categories','menopause_status')
TNBC.glm.clinical <- clinical$primary.solid.tumor[, clinical.interest]

TNBC.glm.clinical <- data.frame(data.Y,TNBC.glm.clinical[rownames(clinical.tnbc), ])

# Univariate logistic models
TNBC.glm.fit <- glm(data.Y~age_at_initial_pathologic_diagnosis,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)# significant

## 
## Call:
## glm(formula = data.Y ~ age_at_initial_pathologic_diagnosis, family = binomial, 
##     data = TNBC.glm.clinical, control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8111  -0.6258  -0.5557  -0.4711   2.2501  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)                         -0.331167   0.391982  -0.845 0.398194
## age_at_initial_pathologic_diagnosis -0.023526   0.006829  -3.445 0.000571
##                                        
## (Intercept)                            
## age_at_initial_pathologic_diagnosis ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.56  on 1017  degrees of freedom
## Residual deviance: 873.31  on 1016  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 877.31
## 
## Number of Fisher Scoring iterations: 4

TNBC.glm.fit <- glm(data.Y~gender,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)

## 
## Call:
## glm(formula = data.Y ~ gender, family = binomial, data = TNBC.glm.clinical, 
##     control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5883  -0.5883  -0.5883  -0.5883   1.9181  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.6665     0.0862 -19.333   <2e-16 ***
## genderMALE  -13.8995   420.1371  -0.033    0.974    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.90  on 1018  degrees of freedom
## Residual deviance: 881.77  on 1017  degrees of freedom
## AIC: 885.77
## 
## Number of Fisher Scoring iterations: 14

TNBC.glm.fit <- glm(data.Y~race_list,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit) # significant

## 
## Call:
## glm(formula = data.Y ~ race_list, family = binomial, data = TNBC.glm.clinical, 
##     control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8712  -0.5286  -0.5286  -0.5286   2.2792  
## 
## Coefficients:
##                                           Estimate Std. Error z value
## (Intercept)                                -2.5200     0.3929  -6.414
## race_listAMERICAN INDIAN OR ALASKA NATIVE -11.0461   535.4113  -0.021
## race_listASIAN                              0.7493     0.5483   1.367
## race_listBLACK OR AFRICAN AMERICAN          1.7468     0.4259   4.101
## race_listWHITE                              0.6223     0.4086   1.523
##                                           Pr(>|z|)    
## (Intercept)                               1.42e-10 ***
## race_listAMERICAN INDIAN OR ALASKA NATIVE    0.984    
## race_listASIAN                               0.172    
## race_listBLACK OR AFRICAN AMERICAN        4.11e-05 ***
## race_listWHITE                               0.128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.90  on 1018  degrees of freedom
## Residual deviance: 849.12  on 1014  degrees of freedom
## AIC: 859.12
## 
## Number of Fisher Scoring iterations: 12

TNBC.glm.fit <- glm(data.Y~history_of_neoadjuvant_treatment,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)

## 
## Call:
## glm(formula = data.Y ~ history_of_neoadjuvant_treatment, family = binomial, 
##     data = TNBC.glm.clinical, control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1774  -0.5872  -0.5872  -0.5872   1.9198  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)                          8.277e-15  1.414e+00   0.000    1.000
## history_of_neoadjuvant_treatmentNo  -1.670e+00  1.417e+00  -1.179    0.238
## history_of_neoadjuvant_treatmentYes -1.557e+01  4.037e+02  -0.039    0.969
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.90  on 1018  degrees of freedom
## Residual deviance: 880.17  on 1016  degrees of freedom
## AIC: 886.17
## 
## Number of Fisher Scoring iterations: 14

TNBC.glm.fit <- glm(data.Y~person_neoplasm_cancer_status,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))

TNBC.glm.fit <- glm(data.Y~stage_event_pathologic_stage ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))

TNBC.glm.fit <- glm(data.Y~stage_event_tnm_categories ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))

TNBC.glm.fit <- glm(data.Y~menopause_status ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))

# building a multivariate glm model accouting for all potential interesting categorical variables
TNBC.glm.fit <- glm(data.Y~age_at_initial_pathologic_diagnosis+gender+race_list+history_of_neoadjuvant_treatment+menopause_status+person_neoplasm_cancer_status+stage_event_pathologic_stage,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
#summary(TNBC.glm.fit)

# Chi-squared test between response and race (significant)
chisq.test(table(TNBC.glm.clinical$data.Y,TNBC.glm.clinical$race_list))

## 
##  Pearson's Chi-squared test
## 
## data:  table(TNBC.glm.clinical$data.Y, TNBC.glm.clinical$race_list)
## X-squared = 41.392, df = 4, p-value = 2.229e-08

# Fisher test (more accurate - significant)
fisher.test(table(TNBC.glm.clinical$data.Y,TNBC.glm.clinical$race_list))

## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(TNBC.glm.clinical$data.Y, TNBC.glm.clinical$race_list)
## p-value = 8.796e-08
## alternative hypothesis: two.sided

# building the final matrix (gene expression plus age and race)
data.plus.race.age <- data
names(TNBC.glm.clinical)

## [1] "data.Y"                             
## [2] "age_at_initial_pathologic_diagnosis"
## [3] "race_list"                          
## [4] "history_of_neoadjuvant_treatment"   
## [5] "gender"                             
## [6] "person_neoplasm_cancer_status"      
## [7] "stage_event_pathologic_stage"       
## [8] "stage_event_tnm_categories"         
## [9] "menopause_status"

age.vect <- TNBC.glm.clinical[,2] 
race.vect <- as.character(TNBC.glm.clinical[,3])
race.vect[race.vect==""] <- ""
race.vect[race.vect=="AMERICAN INDIAN OR ALASKA NATIVE"] <- "0"
race.vect[race.vect=="WHITE"] <- "0"
race.vect[race.vect=="ASIAN"] <- "0"
race.vect[race.vect=="BLACK OR AFRICAN AMERICAN"] <- "1"
data.plus.race.age <- cbind(data.plus.race.age,as.factor(race.vect),age.vect,as.factor(data.Y))
dim(data.plus.race.age)

## [1]  1019 19691

# changing binary code for race ("1" is "african american or black")
data.plus.race.age <- data.plus.race.age[data.plus.race.age[,19689]!="1",]# eliminates individuals without race
data.plus.race.age[data.plus.race.age[,19689]=="2",19689] <- 0
data.plus.race.age[data.plus.race.age[,19689]=="3",19689] <- 1

# changing binary code for Y (response)
data.plus.race.age[data.plus.race.age[,19691]=="1",19691] <- 0
data.plus.race.age[data.plus.race.age[,19691]=="2",19691] <- 1

summary(data.plus.race.age[,19690])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   26.00   49.00   58.00   58.06   67.00   90.00       1

data.plus.race.age <- data.plus.race.age[-which(is.na(data.plus.race.age[,19690])),]
# eliminates individuals without age record

dim(data.plus.race.age)

## [1]   924 19691

data.plus.race.age.rank <- data.plus.race.age

# normalizing age
data.plus.race.age.rank[,19690] <- (data.plus.race.age.rank[,19690] - mean(data.plus.race.age.rank[,19690]))/ sd(data.plus.race.age.rank[,19690]) # normalized

data.Y.rank <- data.plus.race.age.rank[,19691]
length(data.Y.rank)

## [1] 924

data.plus.race.age.rank <- data.plus.race.age.rank[,-19691]
dim(data.plus.race.age.rank)

## [1]   924 19690

3 Ensemble outlier detection

3.1 TNBC original data

set.seed(2010)

library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:S4Vectors':
## 
##     expand

## Loading required package: foreach

## Loaded glmnet 2.0-13

my.alpha <- seq(0.1,0.9,0.1) 

## Optimizing alpha and lambda for logistic regression with Elastic net regularization

nvar.selected.EN <- matrix(0,1,length(my.alpha))
pred.EN <- matrix(0,dim(data.plus.race.age.rank)[1],length(my.alpha))
MSE.EN <- matrix(0,1,length(my.alpha))

# assigning samples to folds, to be used in cross-validation when tunning alpha
set.seed(2010)
foldid.tnbc <- sample(1:10,size=length(data.Y.rank),replace=TRUE)

for (j in 1:length(my.alpha)){
  
  # Logistic model fitting with 10-fold cross-validation for glmnet:
  fit.EN.cv <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[j])
  
  var.selected.EN <- which(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))] != 0)
  nvar.selected.EN[j] <- length(var.selected.EN)
  
  # Predictions obtained by model i
  pred.EN[,j] <- predict(fit.EN.cv,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")
  
  # Mean squared error of prediction (MSE)
  MSE.EN[j] <- mean((data.Y.rank-pred.EN)^2)
}

MSE.EN

##           [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
## [1,] 0.1492157 0.1326005 0.1157746 0.09898047 0.08214026 0.06548295
##            [,7]       [,8]      [,9]
## [1,] 0.04891806 0.03246128 0.0162588

nvar.selected.EN

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,]  438  282  230  197  175  152  143  126  107

my.alpha[which(MSE.EN == min(MSE.EN))]

## [1] 0.9

## Model ensemble

n.models <- 3
nvar.selected <- matrix(0,1,n.models)
MSE <- matrix(0,1,n.models)
Miscl <- matrix(0,dim(data.plus.race.age.rank)[1],n.models)
CookD <- matrix(0,dim(data.plus.race.age.rank)[1],n.models)

# logistic regression with Elastic net regularization (LOGIT-EN)

i <- 1

my.alpha.opt <- my.alpha[which(MSE.EN == min(MSE.EN))]

fit.EN.cv <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[which(MSE.EN == min(MSE.EN))])

var.selected.EN <- which(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))] != 0)
nvar.selected[i] <- length(var.selected.EN)

assign(paste("var.selected", i, sep = ""), var.selected.EN)

# Predictions obtained by model i:
pred.EN <- predict(fit.EN.cv,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")

# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.EN)^2)

# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.EN))

table(data.Y.rank,round(pred.EN))

##            
## data.Y.rank   0   1
##           0 763   8
##           1   8 145

# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.EN*(1-pred.EN))))
H <- V %*% data.plus.race.age.rank[,var.selected.EN] %*% (solve(t(data.plus.race.age.rank[,var.selected.EN]) %*% V %*% data.plus.race.age.rank[,var.selected.EN])) %*% t(data.plus.race.age.rank[,var.selected.EN]) %*% V # the hat matrix

CookD[,i] <- (data.Y.rank - pred.EN)^2 * diag(H) / ((pred.EN*(-pred.EN+1))*((-diag(H)+1))^2)


# SPLSDA

i <- 2

library(spls)

## Sparse Partial Least Squares (SPLS) Regression and
## Classification (version 2.2-1)

# Optimizing K and eta by cross-validation
#fit.splsda.LOGIT.cv <- cv.splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank), fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), kappa=0.5, classifier="logistic", scale.x=FALSE,n.core=10)

#fit.splsda.LOGIT.cv$K.opt
#fit.splsda.LOGIT.cv$eta.opt

# fixing the optimum K and eta obtained by cross-validation (for reproducibility)
fit.splsda.LOGIT <- splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank), K = 4, eta=0.8, classifier="logistic", scale.x=FALSE)

var.selected.splsda.LOGIT <- fit.splsda.LOGIT$A
nvar.selected[i] <- length(var.selected.splsda.LOGIT)

assign(paste("var.selected", i, sep = ""), var.selected.splsda.LOGIT)

# Predictions obtained by model i:
pred.splsda.LOGIT <- predict(fit.splsda.LOGIT, as.matrix(data.plus.race.age.rank),type = "fit","coefficient", fit.type = "response")

table(data.Y.rank,round(pred.splsda.LOGIT))

##            
## data.Y.rank   0   1
##           0 756  15
##           1  14 139

# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.splsda.LOGIT)^2)

# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.splsda.LOGIT))

# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.splsda.LOGIT*(1-pred.splsda.LOGIT))))
H <- V %*% (fit.splsda.LOGIT$T) %*% (solve(t(fit.splsda.LOGIT$T) %*% V %*% fit.splsda.LOGIT$T)) %*% t(fit.splsda.LOGIT$T) %*% V # the hat matrix

CookD[,i] <- (data.Y.rank - pred.splsda.LOGIT)^2 * diag(H) / ((pred.splsda.LOGIT*(-pred.splsda.LOGIT+1))*((-diag(H)+1))^2)


# SGPLS

i <- 3

library(spls)
library(parallel)

# Optimizing K and eta by cross-validation
#fit.sgpls.LOGIT.cv <- cv.sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank, fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), scale.x=FALSE,n.core=10)

# fixing the optimum K and eta obtained by cross-validation (for reproducibility)
fit.sgpls.LOGIT <- sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank, K = 4, eta=0.7, scale.x=FALSE)

var.selected.sgpls.LOGIT <- fit.sgpls.LOGIT$A
nvar.selected[i] <- length(var.selected.sgpls.LOGIT)

assign(paste("var.selected", i, sep = ""), var.selected.sgpls.LOGIT)

# Predictions obtained by model i:
pred.sgpls.LOGIT <- predict(fit.sgpls.LOGIT, as.matrix(data.plus.race.age.rank),type="fit",fit.type="response")

table(data.Y.rank,round(pred.sgpls.LOGIT))

##            
## data.Y.rank   0   1
##           0 763   8
##           1  15 138

# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.sgpls.LOGIT)^2)

# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.sgpls.LOGIT))

# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.sgpls.LOGIT*(1-pred.sgpls.LOGIT))))
H <- V %*% (data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% (solve(t(data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% V %*% data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W)) %*% t(data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% V 

CookD[,i] <- (data.Y.rank - pred.sgpls.LOGIT)^2 * diag(H) / ((pred.sgpls.LOGIT*(-pred.sgpls.LOGIT+1))*((-diag(H)+1))^2)

# Mean squared error for the three models:
MSE

##            [,1]       [,2]       [,3]
## [1,] 0.01976205 0.02500814 0.08362941

# Misclassifications for the three models:
apply(Miscl,2,sum)

## [1] 16 29 23

# Number of variables selected by the three models:
nvar.selected

##      [,1] [,2] [,3]
## [1,]  107 2945  551

# Variables selected by the three models:
var.selected=list(var.selected1,var.selected2,var.selected3)


# The rank product:
rank.matrix <- apply(CookD,2, function(CookD) rank(-(CookD),ties.method = "first"))
rank.product <- apply(rank.matrix, 1, prod)

rho <-rank.product
n <- dim(data.plus.race.age.rank)[1]
k <- dim(rank.matrix)[2]

source("~/Heskes_pvalues.R")

# The p-values:
pvalues <- as.vector(rankprodbounds(rho,n,k,Delta ='geometric'))

# The q-values:
library(qvalue)
qobj <- qvalue(pvalues)
qvalues <- qobj$qvalues

# Misclassifications (%) by the three models
Miscl.percent <- apply(Miscl,1, function(Miscl) round((sum(Miscl)/k)*100))

# suspect individuals within the misclassified by model
Miscl.suspect.LOGIT.EN <- rownames(data.plus.race.age.rank[which(Miscl[,1]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,1]==1),])) %in% suspect)]
Miscl.suspect.LOGIT.EN

## [1] "TCGA-LL-A5YP-01A-21R-A28M-07"

Miscl.suspect.SPLSDA <- rownames(data.plus.race.age.rank[which(Miscl[,2]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,2]==1),])) %in% suspect)]
Miscl.suspect.SPLSDA

## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"

Miscl.suspect.SGPLS <- rownames(data.plus.race.age.rank[which(Miscl[,3]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,3]==1),])) %in% suspect)]
Miscl.suspect.SGPLS

## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
## [3] "TCGA-AN-A0FX-01A-11R-A034-07"

# Building the rank matrix
id <- as.vector(seq(1:dim(data.plus.race.age.rank)[1]))

outliers.rank <- as.data.frame(cbind(id, rank.matrix,rank.product,pvalues,qvalues,Miscl.percent))

# suspect in the top 25 logistic with EN model rank
rank.LOGIT.EN <- outliers.rank[order(outliers.rank[,2]),]
rownames(data.plus.race.age.rank[rank.LOGIT.EN[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.LOGIT.EN[1:25,1],])) %in% suspect)]

## [1] "TCGA-A2-A0EQ-01A-11R-A034-07"

# suspect in the top 25 SPLSDA model rank
rank.SPLSDA <- outliers.rank[order(outliers.rank[,3]),]
rownames(data.plus.race.age.rank[rank.SPLSDA[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.SPLSDA[1:25,1],])) %in% suspect)]

## [1] "TCGA-A2-A0EQ-01A-11R-A034-07" "TCGA-LL-A5YP-01A-21R-A28M-07"

# suspect in the top 25 SGPLS model rank
rank.SGPLS <- outliers.rank[order(outliers.rank[,4]),]
rownames(data.plus.race.age.rank[rank.SGPLS[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.SGPLS[1:25,1],])) %in% suspect)]

## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-AN-A0FJ-01A-11R-A00Z-07"

outliers.rank <- outliers.rank[order(qvalues),]

# influent patients
TNBC.influent <- outliers.rank[which(outliers.rank[,7] < 0.05),1]
length(TNBC.influent)

## [1] 24

# brca type of the influent patients
data.Y.rank[TNBC.influent]

## TCGA-AC-A2QJ-01A-12R-A19W-07 TCGA-E9-A22G-01A-11R-A157-07 
##                            1                            0 
## TCGA-AR-A251-01A-12R-A169-07 TCGA-AR-A1AJ-01A-21R-A12P-07 
##                            0                            0 
## TCGA-A2-A3Y0-01A-11R-A239-07 TCGA-A2-A3XV-01A-21R-A239-07 
##                            0                            0 
## TCGA-E9-A1ND-01A-11R-A144-07 TCGA-EW-A1P1-01A-31R-A14D-07 
##                            0                            1 
## TCGA-E2-A1II-01A-11R-A144-07 TCGA-C8-A3M7-01A-12R-A21T-07 
##                            0                            1 
## TCGA-D8-A1JF-01A-11R-A13Q-07 TCGA-BH-A42U-01A-12R-A24H-07 
##                            1                            1 
## TCGA-LL-A740-01A-21R-A32P-07 TCGA-A2-A1G6-01A-11R-A13Q-07 
##                            1                            1 
## TCGA-OL-A5S0-01A-11R-A28M-07 TCGA-A2-A0EQ-01A-11R-A034-07 
##                            0                            1 
## TCGA-A2-A0YJ-01A-11R-A109-07 TCGA-AR-A1AH-01A-11R-A12D-07 
##                            0                            0 
## TCGA-AC-A62X-01A-11R-A29R-07 TCGA-E2-A1LB-01A-11R-A144-07 
##                            0                            0 
## TCGA-OL-A97C-01A-32R-A41B-07 TCGA-LL-A5YP-01A-21R-A28M-07 
##                            1                            0 
## TCGA-C8-A26X-01A-31R-A16F-07 TCGA-D8-A1XW-01A-11R-A14M-07 
##                            1                            0

sum(data.Y.rank[TNBC.influent]) # 10 TNBC and 14 non-TNBC patients

## [1] 10

# influent patients which are suspect
TNBC.influent.suspect <- rownames(data.plus.race.age.rank[TNBC.influent,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])) %in% suspect)]

# brca type of the influent, suspect patients
data.Y.rank[TNBC.influent][which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])) %in% suspect)]

## TCGA-A2-A0EQ-01A-11R-A034-07 TCGA-LL-A5YP-01A-21R-A28M-07 
##                            1                            0

# patients which are always misclassified
TNBC.alw.miscl <- which(apply(Miscl,1,sum)==3)
length(TNBC.alw.miscl)

## [1] 12

# which are suspect
TNBC.miscl.suspect <- rownames(data.plus.race.age.rank[TNBC.alw.miscl,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.alw.miscl,])) %in% suspect)]
length(TNBC.miscl.suspect)

## [1] 1

# TNBC-associated variables (ER, PG and HER2) selected by the three models:
which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected1

## [1] TRUE

which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected2

## [1] TRUE

which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected3

## [1] TRUE

# ER selected by the three models

which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected1

## [1] FALSE

which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected2

## [1] TRUE

which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected3

## [1] FALSE

# PR only selected by SPLSDA

which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected1

## [1] FALSE

which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected2

## [1] TRUE

which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected3

## [1] TRUE

# HER2 selected by SPLSDA and SGPLS 

# Table 8 (Lopes et al.)
cbind(xdata[rownames(data.plus.race.age.rank[TNBC.influent,]),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])),c(1:5)],clinical.tnbc[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])),4],outliers.rank[1:length(TNBC.influent),2:8])

##                              ENSG00000091831 ENSG00000082175
## TCGA-AC-A2QJ-01A-12R-A19W-07      0.09981546     0.001169463
## TCGA-E9-A22G-01A-11R-A157-07      0.44347936     0.019834637
## TCGA-AR-A251-01A-12R-A169-07      1.56609545     0.098195977
## TCGA-AR-A1AJ-01A-21R-A12P-07      1.46653566     0.066805316
## TCGA-A2-A3Y0-01A-11R-A239-07      2.18089099     0.027241671
## TCGA-A2-A3XV-01A-21R-A239-07      0.01584832     0.028223807
## TCGA-E9-A1ND-01A-11R-A144-07      1.43733608     0.052120773
## TCGA-EW-A1P1-01A-31R-A14D-07      3.01820732     2.558892237
## TCGA-E2-A1II-01A-11R-A144-07      0.14270408     0.194132777
## TCGA-C8-A3M7-01A-12R-A21T-07      4.26609694     0.757809876
## TCGA-D8-A1JF-01A-11R-A13Q-07      1.25673293     0.110233707
## TCGA-BH-A42U-01A-12R-A24H-07      9.18726081     1.827635514
## TCGA-LL-A740-01A-21R-A32P-07      0.30222325     0.115332941
## TCGA-A2-A1G6-01A-11R-A13Q-07     23.89538282    21.452681110
## TCGA-OL-A5S0-01A-11R-A28M-07      0.09174435     0.063196003
## TCGA-A2-A0EQ-01A-11R-A034-07      2.12986768     0.039939245
## TCGA-A2-A0YJ-01A-11R-A109-07      0.08534338     0.031144565
## TCGA-AR-A1AH-01A-11R-A12D-07      0.03165734     0.030357178
## TCGA-AC-A62X-01A-11R-A29R-07      0.18607614     0.021666988
## TCGA-E2-A1LB-01A-11R-A144-07      0.41506846     0.091132159
## TCGA-OL-A97C-01A-32R-A41B-07     16.24660389     8.561992129
## TCGA-LL-A5YP-01A-21R-A28M-07      0.15598544     0.051391118
## TCGA-C8-A26X-01A-31R-A16F-07      0.41547024     0.127914672
## TCGA-D8-A1XW-01A-11R-A14M-07      0.31874784     0.111179573
##                              ENSG00000141736     ESR1      PGR HER2_level
## TCGA-AC-A2QJ-01A-12R-A19W-07        7.344798 Negative Negative           
## TCGA-E9-A22G-01A-11R-A157-07       15.317028 Negative Negative           
## TCGA-AR-A251-01A-12R-A169-07       14.019308 Positive Negative         2+
## TCGA-AR-A1AJ-01A-21R-A12P-07        9.742704 Positive Negative           
## TCGA-A2-A3Y0-01A-11R-A239-07       11.336328 Positive Negative         1+
## TCGA-A2-A3XV-01A-21R-A239-07      137.937244 Positive Negative         2+
## TCGA-E9-A1ND-01A-11R-A144-07       13.054248 Negative Negative           
## TCGA-EW-A1P1-01A-31R-A14D-07       23.641926 Negative Negative         2+
## TCGA-E2-A1II-01A-11R-A144-07       10.728130 Negative Positive         1+
## TCGA-C8-A3M7-01A-12R-A21T-07       25.470280 Negative Negative           
## TCGA-D8-A1JF-01A-11R-A13Q-07       32.930532 Negative Negative         1+
## TCGA-BH-A42U-01A-12R-A24H-07       38.367228 Negative Negative           
## TCGA-LL-A740-01A-21R-A32P-07       68.557840 Negative Negative         2+
## TCGA-A2-A1G6-01A-11R-A13Q-07       29.740007 Negative Negative         1+
## TCGA-OL-A5S0-01A-11R-A28M-07       31.919459 Positive Negative           
## TCGA-A2-A0EQ-01A-11R-A034-07       30.146140 Negative Negative         3+
## TCGA-A2-A0YJ-01A-11R-A109-07      240.243961 Positive Negative          0
## TCGA-AR-A1AH-01A-11R-A12D-07       34.123225 Positive Negative           
## TCGA-AC-A62X-01A-11R-A29R-07       28.531373 Positive Negative           
## TCGA-E2-A1LB-01A-11R-A144-07     1129.864921 Negative Negative         3+
## TCGA-OL-A97C-01A-32R-A41B-07       24.037033 Negative Negative           
## TCGA-LL-A5YP-01A-21R-A28M-07       15.096317 Positive Negative         1+
## TCGA-C8-A26X-01A-31R-A16F-07       60.117188 Negative Negative         1+
## TCGA-D8-A1XW-01A-11R-A14M-07       21.029425 Negative Positive         1+
##                              HER2_status HER2_FISH
## TCGA-AC-A2QJ-01A-12R-A19W-07    Negative          
## TCGA-E9-A22G-01A-11R-A157-07    Positive          
## TCGA-AR-A251-01A-12R-A169-07   Equivocal  Negative
## TCGA-AR-A1AJ-01A-21R-A12P-07    Negative          
## TCGA-A2-A3Y0-01A-11R-A239-07    Negative          
## TCGA-A2-A3XV-01A-21R-A239-07   Equivocal  Positive
## TCGA-E9-A1ND-01A-11R-A144-07    Positive          
## TCGA-EW-A1P1-01A-31R-A14D-07   Equivocal  Negative
## TCGA-E2-A1II-01A-11R-A144-07    Negative          
## TCGA-C8-A3M7-01A-12R-A21T-07    Negative          
## TCGA-D8-A1JF-01A-11R-A13Q-07    Negative          
## TCGA-BH-A42U-01A-12R-A24H-07    Negative          
## TCGA-LL-A740-01A-21R-A32P-07   Equivocal  Negative
## TCGA-A2-A1G6-01A-11R-A13Q-07    Negative          
## TCGA-OL-A5S0-01A-11R-A28M-07              Positive
## TCGA-A2-A0EQ-01A-11R-A034-07    Positive  Negative
## TCGA-A2-A0YJ-01A-11R-A109-07    Negative          
## TCGA-AR-A1AH-01A-11R-A12D-07    Negative          
## TCGA-AC-A62X-01A-11R-A29R-07                      
## TCGA-E2-A1LB-01A-11R-A144-07    Positive          
## TCGA-OL-A97C-01A-32R-A41B-07              Negative
## TCGA-LL-A5YP-01A-21R-A28M-07    Negative  Positive
## TCGA-C8-A26X-01A-31R-A16F-07    Negative          
## TCGA-D8-A1XW-01A-11R-A14M-07    Negative          
##                              clinical.tnbc[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent, 
## TCGA-AC-A2QJ-01A-12R-A19W-07                                                                              TNBC
## TCGA-E9-A22G-01A-11R-A157-07                                                                           NO_TNBC
## TCGA-AR-A251-01A-12R-A169-07                                                                           NO_TNBC
## TCGA-AR-A1AJ-01A-21R-A12P-07                                                                           NO_TNBC
## TCGA-A2-A3Y0-01A-11R-A239-07                                                                           NO_TNBC
## TCGA-A2-A3XV-01A-21R-A239-07                                                                           NO_TNBC
## TCGA-E9-A1ND-01A-11R-A144-07                                                                           NO_TNBC
## TCGA-EW-A1P1-01A-31R-A14D-07                                                                              TNBC
## TCGA-E2-A1II-01A-11R-A144-07                                                                           NO_TNBC
## TCGA-C8-A3M7-01A-12R-A21T-07                                                                              TNBC
## TCGA-D8-A1JF-01A-11R-A13Q-07                                                                              TNBC
## TCGA-BH-A42U-01A-12R-A24H-07                                                                              TNBC
## TCGA-LL-A740-01A-21R-A32P-07                                                                              TNBC
## TCGA-A2-A1G6-01A-11R-A13Q-07                                                                              TNBC
## TCGA-OL-A5S0-01A-11R-A28M-07                                                                           NO_TNBC
## TCGA-A2-A0EQ-01A-11R-A034-07                                                                              TNBC
## TCGA-A2-A0YJ-01A-11R-A109-07                                                                           NO_TNBC
## TCGA-AR-A1AH-01A-11R-A12D-07                                                                           NO_TNBC
## TCGA-AC-A62X-01A-11R-A29R-07                                                                           NO_TNBC
## TCGA-E2-A1LB-01A-11R-A144-07                                                                           NO_TNBC
## TCGA-OL-A97C-01A-32R-A41B-07                                                                              TNBC
## TCGA-LL-A5YP-01A-21R-A28M-07                                                                           NO_TNBC
## TCGA-C8-A26X-01A-31R-A16F-07                                                                              TNBC
## TCGA-D8-A1XW-01A-11R-A14M-07                                                                           NO_TNBC
##                               V2  V3  V4 rank.product      pvalues
## TCGA-AC-A2QJ-01A-12R-A19W-07  13   1  34          442 1.395626e-05
## TCGA-E9-A22G-01A-11R-A157-07   6  13   7          546 1.828758e-05
## TCGA-AR-A251-01A-12R-A169-07  11   5   9          495 1.613518e-05
## TCGA-AR-A1AJ-01A-21R-A12P-07   2  10  44          880 3.347280e-05
## TCGA-A2-A3Y0-01A-11R-A239-07   8   7  22         1232 5.007570e-05
## TCGA-A2-A3XV-01A-21R-A239-07 166   2   5         1660 7.091672e-05
## TCGA-E9-A1ND-01A-11R-A144-07   5  11  48         2640 1.204479e-04
## TCGA-EW-A1P1-01A-31R-A14D-07  16  24   8         3072 1.427763e-04
## TCGA-E2-A1II-01A-11R-A144-07   9  21  20         3780 1.797824e-04
## TCGA-C8-A3M7-01A-12R-A21T-07   1  28 153         4284 2.063567e-04
## TCGA-D8-A1JF-01A-11R-A13Q-07  19 109   2         4142 1.988535e-04
## TCGA-BH-A42U-01A-12R-A24H-07   7   3 223         4683 2.274980e-04
## TCGA-LL-A740-01A-21R-A32P-07  47 125   1         5875 2.910510e-04
## TCGA-A2-A1G6-01A-11R-A13Q-07  10   4 204         8160 4.138327e-04
## TCGA-OL-A5S0-01A-11R-A28M-07  46   8  21         7728 3.905611e-04
## TCGA-A2-A0EQ-01A-11R-A034-07   4  12 216        10368 5.329668e-04
## TCGA-A2-A0YJ-01A-11R-A109-07  27  34  12        11016 5.679550e-04
## TCGA-AR-A1AH-01A-11R-A12D-07  21  16  38        12768 6.625311e-04
## TCGA-AC-A62X-01A-11R-A29R-07  22  35  19        14630 7.629362e-04
## TCGA-E2-A1LB-01A-11R-A144-07 230   6  11        15180 7.925618e-04
## TCGA-OL-A97C-01A-32R-A41B-07  14  41  30        17220 9.022870e-04
## TCGA-LL-A5YP-01A-21R-A28M-07  50  22  18        19800 1.040644e-03
## TCGA-C8-A26X-01A-31R-A16F-07  53  64   6        20352 1.070180e-03
## TCGA-D8-A1XW-01A-11R-A14M-07  23  14  68        21896 1.152666e-03
##                                  qvalues Miscl.percent
## TCGA-AC-A2QJ-01A-12R-A19W-07 0.005632575            67
## TCGA-E9-A22G-01A-11R-A157-07 0.005632575            33
## TCGA-AR-A251-01A-12R-A169-07 0.005632575            67
## TCGA-AR-A1AJ-01A-21R-A12P-07 0.007732217           100
## TCGA-A2-A3Y0-01A-11R-A239-07 0.009253989            33
## TCGA-A2-A3XV-01A-21R-A239-07 0.010921174            33
## TCGA-E9-A1ND-01A-11R-A144-07 0.015899128           100
## TCGA-EW-A1P1-01A-31R-A14D-07 0.016490665           100
## TCGA-E2-A1II-01A-11R-A144-07 0.017333964            33
## TCGA-C8-A3M7-01A-12R-A21T-07 0.017333964           100
## TCGA-D8-A1JF-01A-11R-A13Q-07 0.017333964             0
## TCGA-BH-A42U-01A-12R-A24H-07 0.017517346           100
## TCGA-LL-A740-01A-21R-A32P-07 0.020687007             0
## TCGA-A2-A1G6-01A-11R-A13Q-07 0.025492092           100
## TCGA-OL-A5S0-01A-11R-A28M-07 0.025492092            67
## TCGA-A2-A0EQ-01A-11R-A034-07 0.030778830            67
## TCGA-A2-A0YJ-01A-11R-A109-07 0.030870023             0
## TCGA-AR-A1AH-01A-11R-A12D-07 0.034009932           100
## TCGA-AC-A62X-01A-11R-A29R-07 0.036616355             0
## TCGA-E2-A1LB-01A-11R-A144-07 0.036616355            33
## TCGA-OL-A97C-01A-32R-A41B-07 0.039700630            33
## TCGA-LL-A5YP-01A-21R-A28M-07 0.042993316           100
## TCGA-C8-A26X-01A-31R-A16F-07 0.042993316            33
## TCGA-D8-A1XW-01A-11R-A14M-07 0.044377638            33

# plot SPLSDA-scores (Lopes et al.)
TNBC.matrix.splsda12.plot <- data.frame(LV1 = fit.splsda.LOGIT$T[,1],LV2 = fit.splsda.LOGIT$T[,2], Class = as.character(data.Y.rank))

TNBC.influent.matrix.splsda12.plot <- data.frame(LV1 =  fit.splsda.LOGIT$T[TNBC.influent,1],LV2 = fit.splsda.LOGIT$T[TNBC.influent,2], Class = as.character(data.Y.rank[TNBC.influent]))

TNBC.suspect.matrix.splsda12.plot <- data.frame(LV1 = fit.splsda.LOGIT$T[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),1],LV2 = fit.splsda.LOGIT$T[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),2], Class = as.character(TNBC.matrix.splsda12.plot[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),3]))

library(ggplot2)
#postscript("splsda.ps")
#tiff("splsda.tiff")
ggplot(TNBC.matrix.splsda12.plot, aes(LV1, LV2)) + geom_point(aes(color = Class,shape = Class),size=3) + scale_shape_manual(values=c(21,24)) + scale_colour_manual(values=c("black", "black","black","black")) + theme(legend.text=element_text(size=2.5)) + theme_minimal() + theme(axis.text=element_text(size=14)) + theme(axis.title.y = element_text(size = 20, angle = 90)) + theme(axis.title.x = element_text(size = 20, angle = 0)) + 
  geom_point(data=TNBC.influent.matrix.splsda12.plot,aes(x=LV1, y=LV2,shape = Class,col="black",bg="red"),size=3) + 
  geom_point(data=TNBC.suspect.matrix.splsda12.plot,aes(x=LV1, y=LV2,shape = Class,col="black",bg="blue"),size=3) + guides(colour=FALSE)

#dev.off()

# PCA on TNBC data
ytnbc.col <- data.Y.rank
ytnbc.col[ytnbc.col==1] <- "black"
ytnbc.col[ytnbc.col==0] <- "black"
ytnbc.col[TNBC.influent] <- "blue"
ytnbc.col[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect)] <- "red"
 
ytnbc.shape <- data.Y.rank
ytnbc.shape[ytnbc.shape==1] <- 2
ytnbc.shape[ytnbc.shape==0] <- 1

ytnbc.shape <- as.factor(ytnbc.shape)
ytnbc.col <- as.factor(ytnbc.col)

tnbc.pca.matrix <- as.data.frame(cbind(data.plus.race.age.rank,ytnbc.shape,ytnbc.col))

tnbc.pca <- prcomp(tnbc.pca.matrix[,1:19690], center=FALSE,scale=FALSE)

library(ggfortify)
#postscript("pca.ps")
#tiff("pca.tiff")
autoplot(tnbc.pca, data = tnbc.pca.matrix, colour = ytnbc.col, shape= ytnbc.shape, size=3)  + theme_minimal()

#dev.off()

Evaluating the ensemble performance for synthetic datasets built based on mean and covariance matrices from the TNBC and non-TNBC classes

n.synthetic <- 1000

# taking the covariance from the original TNBC data, to make simulation more realistic
xdata.synthetic.cov <- cov(data.plus.race.age.rank)
xdata.synthetic.cov.event <- cov(data.plus.race.age.rank[which(data.Y.rank==1),])
xdata.synthetic.cov.noevent <- cov(data.plus.race.age.rank[which(data.Y.rank==0),])

# for p = 5000 randomly chosen variables
p.synthetic <- 5000
set.seed(7)
cov.vars <- sample(c(1:dim(xdata.synthetic.cov)[2]),p.synthetic)
xdata.synthetic.cov.event5000 <- xdata.synthetic.cov.event[cov.vars,cov.vars]
xdata.synthetic.cov.noevent5000 <- xdata.synthetic.cov.noevent[cov.vars,cov.vars]

# for p = full TNBC dimensionality
p.synthetic <- dim(data.plus.race.age.rank[,1:19688])[2]
xdata.synthetic.cov.eventfull <- xdata.synthetic.cov.event[1:p.synthetic,1:p.synthetic]
xdata.synthetic.cov.noeventfull <- xdata.synthetic.cov.noevent[1:p.synthetic,1:p.synthetic]

# taking the covariance from the original TNBC data, to make simulation more realistic
# for p = 5000 randomly chosen variables
mean.synthetic.event5000 <- apply(data.plus.race.age.rank[which(data.Y.rank==1),],2,mean)[cov.vars]
mean.synthetic.noevent5000 <- apply(data.plus.race.age.rank[which(data.Y.rank==0),],2,mean)[cov.vars]

# for p = full TNBC dimensionality
mean.synthetic.eventfull <- apply(data.plus.race.age.rank[which(data.Y.rank==1),],2,mean)[1:p.synthetic]
mean.synthetic.noeventfull <- apply(data.plus.race.age.rank[which(data.Y.rank==0),],2,mean)[1:p.synthetic]

# generate  a random X
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ensembldb':
## 
##     select

## The following object is masked from 'package:AnnotationDbi':
## 
##     select

set.seed(8)
xdata.synthetic.event5000 <- mvrnorm(n = 500, mean.synthetic.event5000, xdata.synthetic.cov.event5000, tol = 1e-6, empirical = FALSE)
set.seed(9)
xdata.synthetic.noevent5000 <- mvrnorm(n = 500, mean.synthetic.noevent5000, xdata.synthetic.cov.noevent5000, tol = 1e-6, empirical = FALSE)

xdata.synthetic <- rbind(xdata.synthetic.event5000,xdata.synthetic.noevent5000)
ydata.synthetic <- c(rep(1,500), rep(0,500))
data.Y.synthetic.out <- ydata.synthetic
set.seed(9)
data.synthetic.outliers <- sample(c(1:1000),25, replace=FALSE) # simulating 25 outliers
data.synthetic.outliers

##  [1] 222  25 207 216 442 134 389 367 664 984 117   9 873 298 486 494 396
## [18] 961 352 483 882  21 313 111 516

data.Y.synthetic.out[data.synthetic.outliers] <- abs(1-data.Y.synthetic.out[data.synthetic.outliers])

data.synthetic <- xdata.synthetic
# normalized X data
data.synthetic <- (data.synthetic-matrix(apply(data.synthetic,2,mean),nrow(data.synthetic),ncol(data.synthetic),byrow=TRUE))/matrix(apply(data.synthetic,2,sd),nrow(data.synthetic),ncol(data.synthetic),byrow=TRUE)

# Plots

#postscript("Fig_synthetic_outliers_PCA_n1000_p5000_muTNBC_covTNBC_out25.ps")
#tiff("Fig_synthetic_outliers_PCA_n1000_p5000_muTNBC_covTNBC_out25.tiff")
ysynthetic.col <- data.Y.synthetic.out
ysynthetic.col[ysynthetic.col==1] <- "black"
ysynthetic.col[ysynthetic.col==0] <- "gray"
ysynthetic.col[data.synthetic.outliers] <- "red"
 
synthetic.pca.matrix <- as.data.frame(data.synthetic)
synthetic.pca.matrix.groups <- factor(ysynthetic.col)
synthetic.pca.matrix <- as.matrix(synthetic.pca.matrix)

synthetic.pca <- prcomp(synthetic.pca.matrix, center=FALSE,scale=FALSE)
 
library(ggfortify)
autoplot(synthetic.pca, data = synthetic.pca.matrix, colour = synthetic.pca.matrix.groups)

#dev.off()


# Ensemble outlier detection

# e.g., for p=5000 and % of outliers = 2.5

set.seed(2010)

my.alpha.synthetic <- seq(0.1,0.9,0.1)

# Optimizing alpha and lambda for logistic regression with Elastic net regularization
nvar.selected.EN.synthetic <- matrix(0,1,length(my.alpha.synthetic) )
MSE.EN.synthetic <- matrix(0,1,length(my.alpha.synthetic))
MSE.EN.synthetic <- matrix(0,1,length(my.alpha.synthetic))

# assigning samples to folds, to be used in cross-validation when tunning alpha
set.seed(2010)
foldid.synthetic <-sample(1:10,size=length(data.Y.synthetic.out),replace=TRUE)

library(glmnet)

for (j in 1:length(my.alpha.synthetic)){
  
  # Logistic model fitting with 10-fold cross-validation for glmnet:
  fit.EN.cv.synthetic <- cv.glmnet(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out),family="binomial",foldid=foldid.synthetic,alpha=my.alpha.synthetic[j])
  var.selected.EN.synthetic <- which(fit.EN.cv.synthetic$glmnet.fit$beta[,which(fit.EN.cv.synthetic$cvm == min(fit.EN.cv.synthetic$cvm))] != 0)
  nvar.selected.EN.synthetic[j] <- length(var.selected.EN.synthetic)
  
  # Predictions obtained by model i:
  pred.EN.cv.synthetic <- predict(fit.EN.cv.synthetic,as.matrix(data.synthetic),s="lambda.min",type="response")
  
  # Mean squared error of prediction (MSE):
  MSE.EN.synthetic[j] <- mean((data.Y.synthetic.out-pred.EN.cv.synthetic)^2)
}

MSE.EN.synthetic

##            [,1]       [,2]       [,3]       [,4]       [,5]       [,6]
## [1,] 0.01742921 0.01800071 0.01827128 0.01767633 0.01793559 0.01767032
##            [,7]       [,8]       [,9]
## [1,] 0.01886815 0.01874049 0.01934937

nvar.selected.EN.synthetic

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,]  557  328  245  209  183  159  142  135  131

my.alpha.synthetic[which(MSE.EN.synthetic == min(MSE.EN.synthetic))]

## [1] 0.1

## Model ensemble

n.models.synthetic <- 3
nvar.selected.synthetic <- matrix(0,1,n.models.synthetic)
MSE.synthetic <- matrix(0,1,n.models.synthetic)
Miscl.synthetic <- matrix(0,dim(data.synthetic)[1],n.models.synthetic)
CookD.synthetic <- matrix(0,dim(data.synthetic)[1],n.models.synthetic)

# logistic regression with Elastic net regularization (LOGIT-EN)

i <- 1

my.alpha.opt.synthetic <- my.alpha.synthetic[which(MSE.EN.synthetic == min(MSE.EN.synthetic))]

fit.EN.cv.synthetic <- cv.glmnet(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out),family="binomial",foldid=foldid.synthetic,alpha=my.alpha.opt.synthetic)    
var.selected.EN.synthetic <- which(fit.EN.cv.synthetic$glmnet.fit$beta[,which(fit.EN.cv.synthetic$cvm == min(fit.EN.cv.synthetic$cvm))] != 0)
nvar.selected.synthetic[i] <- length(var.selected.EN.synthetic)

assign(paste("var.selected.synthetic", i, sep = ""), which(fit.EN.cv.synthetic$glmnet.fit$beta[,which(fit.EN.cv.synthetic$cvm == min(fit.EN.cv.synthetic$cvm))] != 0))

# Predictions obtained by model i:
pred.EN.cv.synthetic <- predict(fit.EN.cv.synthetic,as.matrix(data.synthetic),s="lambda.min",type="response")

# Mean squared error of prediction (MSE)
MSE.synthetic[i] <- mean((data.Y.synthetic.out-pred.EN.cv.synthetic)^2)

# Misclassified individuals (‘1’) based on method i:
Miscl.synthetic[,i] <- abs(data.Y.synthetic.out - round(pred.EN.cv.synthetic))

table(data.Y.synthetic.out,round(pred.EN.cv.synthetic))

##                     
## data.Y.synthetic.out   0   1
##                    0 496  17
##                    1   5 482

# The Cook’s distance for each individual based on method i:
V <- diag(as.vector(sqrt(pred.EN.cv.synthetic*(1-pred.EN.cv.synthetic))))
H <- V %*% data.synthetic[,var.selected.EN.synthetic] %*% (solve(t(data.synthetic[,var.selected.EN.synthetic]) %*% V %*% data.synthetic[,var.selected.EN.synthetic])) %*% t(data.synthetic[,var.selected.EN.synthetic]) %*% V 

CookD.synthetic[,i] <- (data.Y.synthetic.out - pred.EN.cv.synthetic)^2 * diag(H) / ((pred.EN.cv.synthetic*(-pred.EN.cv.synthetic+1))*((-diag(H)+1))^2)

which(Miscl.synthetic[,i]==1)

##  [1]   9  21 111 117 134 207 216 222 313 352 367 389 396 442 483 486 494
## [18] 516 664 873 882 961

which(data.synthetic.outliers %in%  which(Miscl.synthetic[,i]==1))

##  [1]  1  3  4  5  6  7  8  9 11 12 13 15 16 17 18 19 20 21 22 23 24 25

# SPLS-DA

library(spls)

i <- 2

# Optimizing the parameters by cross-validation
#fit.splsda.synthetic.cv <- cv.splsda(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out), fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), kappa=0.5, classifier="logistic", scale.x=FALSE)

# using the parameters optimized above
fit.splsda.synthetic <- splsda(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out), K = 5, eta=0.7, classifier="logistic", scale.x=FALSE)

var.selected.splsda.synthetic <- fit.splsda.synthetic$A
nvar.selected.synthetic[i] <- length(var.selected.splsda.synthetic)

assign(paste("var.selected.synthetic", i, sep = ""), var.selected.splsda.synthetic)

# Predictions obtained by model i:
pred.splsda.synthetic<- predict(fit.splsda.synthetic, as.matrix(data.synthetic),type = "fit","coefficient", fit.type = "response")

# Mean squared error of prediction (MSE)
MSE.synthetic[i] <- mean((data.Y.synthetic.out-pred.splsda.synthetic)^2)

# Misclassified individuals (‘1’) based on method i:
Miscl.synthetic[,i] <- abs(data.Y.synthetic.out - round(pred.splsda.synthetic))

table(data.Y.synthetic.out,round(pred.splsda.synthetic))

##                     
## data.Y.synthetic.out   0   1
##                    0 493  20
##                    1   8 479

# The Cook’s distance for each individual based on method i:
V <- diag(as.vector(sqrt(pred.splsda.synthetic*(1-pred.splsda.synthetic))))
H <- V %*% (fit.splsda.synthetic$T) %*% (solve(t(fit.splsda.synthetic$T) %*% V %*% fit.splsda.synthetic$T)) %*% t(fit.splsda.synthetic$T) %*% V # the hat matrix

CookD.synthetic[,i] <- (data.Y.synthetic.out - pred.splsda.synthetic)^2 * diag(H) / ((pred.splsda.synthetic*(-pred.splsda.synthetic+1))*((-diag(H)+1))^2)

which(Miscl.synthetic[,i]==1)

##  [1]   9  21  25  66 111 117 130 134 207 216 222 298 313 352 367 389 396
## [18] 442 486 494 516 575 664 873 882 961 984 995

which(data.synthetic.outliers %in%  which(Miscl.synthetic[,i]==1))

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 21 22 23 24
## [24] 25

# SGPLS

i <- 3

# Optimizing the parameters by cross-validation
#fit.sgpls.synthetic.cv <- cv.sgpls(as.matrix(data.synthetic),data.Y.synthetic.out, fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), scale.x=FALSE)

# using the parameters optimized above
fit.sgpls.synthetic <- sgpls(as.matrix(data.synthetic),data.Y.synthetic.out, K = 5, eta=0.7, scale.x=FALSE)

var.selected.sgpls.synthetic <- fit.sgpls.synthetic$A
nvar.selected.synthetic[i] <- length(var.selected.sgpls.synthetic)

assign(paste("var.selected.synthetic", i, sep = ""), var.selected.sgpls.synthetic)

# Predictions obtained for the train set:
pred.sgpls.synthetic <- predict(fit.sgpls.synthetic,as.matrix(data.synthetic),type="fit",fit.type="response")

# Mean squared error of prediction (MSE)
MSE.synthetic[i] <- mean((data.Y.synthetic.out-pred.sgpls.synthetic)^2)

# Misclassified individuals (‘1’) based on method i:
Miscl.synthetic[,i] <- abs(data.Y.synthetic.out - round(pred.sgpls.synthetic))

table(data.Y.synthetic.out,round(pred.sgpls.synthetic))

##                     
## data.Y.synthetic.out   0   1
##                    0 495  18
##                    1   4 483

# The Cook’s distance for each individual based on method i:
V <- diag(as.vector(sqrt(pred.sgpls.synthetic*(1-pred.sgpls.synthetic))))
H <- V %*% (data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W) %*% (solve(t(data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W) %*% V %*% data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W)) %*% t(data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W) %*% V # the hat matrix

CookD.synthetic[,i] <- (data.Y.synthetic.out - pred.sgpls.synthetic)^2 * diag(H) / ((pred.sgpls.synthetic*(-pred.sgpls.synthetic+1))*((-diag(H)+1))^2)

which(Miscl.synthetic[,i]==1)

##  [1]  21  25 111 117 134 207 216 222 298 313 352 367 389 396 486 494 516
## [18] 559 664 815 961 984

which(data.synthetic.outliers %in%  which(Miscl.synthetic[,i]==1))

##  [1]  1  2  3  4  6  7  8  9 10 11 14 15 16 17 18 19 22 23 24 25

nvar.selected.synthetic

##      [,1] [,2] [,3]
## [1,]  557 2897  249

var.selected.synthetic=list(var.selected.synthetic1,var.selected.synthetic2,var.selected.synthetic3)

source("~/Heskes_pvalues.R")

# The rank product:
rank.matrix.synthetic <- apply(CookD.synthetic,2, function(CookD.synthetic) rank(-(CookD.synthetic),ties.method = "first"))
rank.product.synthetic <- apply(rank.matrix.synthetic, 1, prod)

rho.synthetic <-rank.product.synthetic
n.synthetic <- dim(data.synthetic)[1]
k.synthetic <- dim(rank.matrix.synthetic)[2]

# The p-values:
pvalues.synthetic <- as.vector(rankprodbounds(rho.synthetic,n.synthetic,k.synthetic,Delta ='geometric'))

library(qvalue)
# The q-values:
qobj.synthetic <- qvalue(pvalues.synthetic)
qvalues.synthetic <- qobj.synthetic$qvalues

# Misclassifications
Miscl.train.percent.synthetic <- apply(Miscl.synthetic,1, function(Miscl.synthetic) round((sum(Miscl.synthetic)/k.synthetic)*100))
Miscl.train.by.method.synthetic <- apply(Miscl.synthetic,2, sum)

# Building the rank matrix
id.synthetic <- as.vector(seq(1:n.synthetic))

outliers.rank.synthetic <-as.data.frame(cbind(id.synthetic, rank.matrix.synthetic,rank.product.synthetic,pvalues.synthetic,qvalues.synthetic,Miscl.train.percent.synthetic))
outliers.rank.synthetic <- outliers.rank.synthetic[order(qvalues.synthetic),]
outliers.rank.synthetic[1:25,]

##     id.synthetic V2 V3  V4 rank.product.synthetic pvalues.synthetic
## 9              9 15  1   2                     30      2.880181e-07
## 367          367 18  2   1                     36      3.745618e-07
## 961          961  1  8  10                     80      1.146239e-06
## 664          664  5  7   4                    140      2.445232e-06
## 494          494  9  6   5                    270      5.819688e-06
## 117          117  7  5  16                    560      1.490003e-05
## 389          389 16 13   3                    624      1.709914e-05
## 216          216 17  3  14                    714      2.028412e-05
## 207          207  6 11  13                    858      2.557809e-05
## 873          873  2 19  23                    874      2.617996e-05
## 442          442 20  9   6                   1080      3.398691e-05
## 111          111 14 12   8                   1344      4.402013e-05
## 486          486 10 14  15                   2100      7.389366e-05
## 21            21 11 18  18                   3564      1.342893e-04
## 882          882  4 22  48                   4224      1.621111e-04
## 222          222 24 17  11                   4488      1.733067e-04
## 352          352 23 10  31                   7130      2.866408e-04
## 483          483 21 27  17                   9639      3.953695e-04
## 995          995 36 21  19                  14364      6.007910e-04
## 516          516  3 24 221                  15912      6.680386e-04
## 313          313  8 20 145                  23200      9.833004e-04
## 298          298 25  4 268                  26800      1.137984e-03
## 134          134 13 15 197                  38415      1.631746e-03
## 396          396 27 25  65                  43875      1.861032e-03
## 38            38 38 33  49                  61446      2.587232e-03
##     qvalues.synthetic Miscl.train.percent.synthetic
## 9        0.0001872809                            67
## 367      0.0001872809                           100
## 961      0.0003820797                           100
## 664      0.0006113080                           100
## 494      0.0011639376                           100
## 117      0.0024427342                           100
## 389      0.0024427342                           100
## 216      0.0025355148                           100
## 207      0.0026179960                           100
## 873      0.0026179960                            67
## 442      0.0030897192                            67
## 111      0.0036683444                           100
## 486      0.0056841275                           100
## 21       0.0095920929                           100
## 882      0.0108074080                            67
## 222      0.0108316700                           100
## 352      0.0168612245                           100
## 483      0.0219649745                            33
## 995      0.0316205789                            33
## 516      0.0334019320                           100
## 313      0.0468238287                           100
## 298      0.0517265592                            67
## 134      0.0709454757                           100
## 396      0.0775430185                           100
## 38       0.0997775327                             0

synthetic.influent <- outliers.rank.synthetic[which(outliers.rank.synthetic[,7] < 0.05),1]

which(data.synthetic.outliers %in% synthetic.influent)

##  [1]  1  3  4  5  7  8  9 11 12 13 15 16 18 19 20 21 22 23 24 25

length(which(data.synthetic.outliers %in% synthetic.influent))

## [1] 20

length(which(outliers.rank.synthetic[1:length(data.synthetic.outliers),1] %in% data.synthetic.outliers))

## [1] 23

length(which(outliers.rank.synthetic[which(outliers.rank.synthetic[,2]<=length(data.synthetic.outliers)),1] %in% data.synthetic.outliers))

## [1] 23

length(which(outliers.rank.synthetic[which(outliers.rank.synthetic[,3]<=length(data.synthetic.outliers)),1] %in% data.synthetic.outliers))

## [1] 23

length(which(outliers.rank.synthetic[which(outliers.rank.synthetic[,4]<=length(data.synthetic.outliers)),1] %in% data.synthetic.outliers))

## [1] 16

3.2 Genes selected

Genes selected by LOGIT-EN, SPLS-DA and SGPLS

# variables in common between EN and sPLSDA and sgPLS
TNBC.vars.in.common <- as.data.frame(table(c(var.selected1,var.selected2,var.selected3)))
TNBC.vars.in.common <- as.numeric(as.character(TNBC.vars.in.common[which(TNBC.vars.in.common[,2] == 3),1]))
length(TNBC.vars.in.common)

## [1] 26

# 26 vars in common

TNBC.vars.in.common.code <- as.matrix(mcols(ensembl.protein.coding[colnames(data.plus.race.age.rank[,TNBC.vars.in.common]),2]))
TNBC.vars.in.common.code

##       gene_name 
##  [1,] "CA12"    
##  [2,] "ESR1"    
##  [3,] "BPI"     
##  [4,] "TAF7L"   
##  [5,] "ASPN"    
##  [6,] "KCNS1"   
##  [7,] "ACE2"    
##  [8,] "CLDN10"  
##  [9,] "PDX1"    
## [10,] "PIF1"    
## [11,] "HAPLN3"  
## [12,] "MFGE8"   
## [13,] "DMRTA2"  
## [14,] "LYPD1"   
## [15,] "ZIC1"    
## [16,] "UBASH3B" 
## [17,] "SRSF12"  
## [18,] "POM121L2"
## [19,] "FTCD"    
## [20,] "PGAP3"   
## [21,] "VSNL1"   
## [22,] "KLK6"    
## [23,] "NRG4"    
## [24,] "CXXC5"   
## [25,] "KIR2DL4" 
## [26,] "PPP1R14C"

# summary of gene expression in TNBC and non-TNBC for the 26 genes selected
TNBC.vars.in.common.expression.TNBC <- xdata[ydata[,1]=="TNBC",TNBC.vars.in.common]
colnames(TNBC.vars.in.common.expression.TNBC) <- as.vector(TNBC.vars.in.common.code)
summary(TNBC.vars.in.common.expression.TNBC)

##       CA12               ESR1               BPI          
##  Min.   :  0.1188   Min.   : 0.01938   Min.   : 0.00000  
##  1st Qu.:  0.9628   1st Qu.: 0.16063   1st Qu.: 0.08627  
##  Median :  2.2454   Median : 0.35070   Median : 0.57210  
##  Mean   :  7.1957   Mean   : 1.52980   Mean   : 1.68510  
##  3rd Qu.:  4.7820   3rd Qu.: 0.82760   3rd Qu.: 1.63253  
##  Max.   :198.8447   Max.   :29.97893   Max.   :28.90646  
##      TAF7L              ASPN              KCNS1          
##  Min.   :0.00979   Min.   :  0.1891   Min.   : 0.006663  
##  1st Qu.:0.08815   1st Qu.:  3.7304   1st Qu.: 0.094688  
##  Median :0.14208   Median :  9.5310   Median : 0.332931  
##  Mean   :0.35790   Mean   : 17.7950   Mean   : 1.040426  
##  3rd Qu.:0.28278   3rd Qu.: 20.6949   3rd Qu.: 0.863642  
##  Max.   :7.30063   Max.   :136.1170   Max.   :16.869534  
##       ACE2              CLDN10              PDX1        
##  Min.   :  0.0000   Min.   : 0.00000   Min.   :0.00000  
##  1st Qu.:  0.1258   1st Qu.: 0.02278   1st Qu.:0.00000  
##  Median :  0.4727   Median : 0.05433   Median :0.01086  
##  Mean   :  4.1248   Mean   : 2.65009   Mean   :0.22273  
##  3rd Qu.:  1.4384   3rd Qu.: 0.46768   3rd Qu.:0.15051  
##  Max.   :149.4913   Max.   :91.48534   Max.   :3.92918  
##       PIF1              HAPLN3            MFGE8         
##  Min.   : 0.07712   Min.   :  0.919   Min.   :   5.714  
##  1st Qu.: 0.82028   1st Qu.:  8.362   1st Qu.:  20.410  
##  Median : 1.34009   Median : 19.276   Median :  36.383  
##  Mean   : 1.84860   Mean   : 26.925   Mean   :  81.941  
##  3rd Qu.: 2.25936   3rd Qu.: 38.167   3rd Qu.:  93.235  
##  Max.   :13.11623   Max.   :118.904   Max.   :1024.815  
##      DMRTA2             LYPD1               ZIC1            UBASH3B       
##  Min.   :0.000000   Min.   : 0.07908   Min.   : 0.0000   Min.   : 0.1274  
##  1st Qu.:0.009053   1st Qu.: 0.44511   1st Qu.: 0.5488   1st Qu.: 0.9382  
##  Median :0.030162   Median : 0.78513   Median : 1.8950   Median : 1.8369  
##  Mean   :0.353153   Mean   : 1.91865   Mean   : 2.3420   Mean   : 2.4892  
##  3rd Qu.:0.213414   3rd Qu.: 1.82027   3rd Qu.: 3.3878   3rd Qu.: 3.2079  
##  Max.   :6.013674   Max.   :36.65243   Max.   :15.0472   Max.   :15.3149  
##      SRSF12           POM121L2            FTCD             PGAP3       
##  Min.   :0.04384   Min.   :0.00000   Min.   :0.00000   Min.   : 1.250  
##  1st Qu.:0.38949   1st Qu.:0.00000   1st Qu.:0.07846   1st Qu.: 5.600  
##  Median :0.92863   Median :0.01160   Median :0.17386   Median : 7.402  
##  Mean   :1.32648   Mean   :0.03602   Mean   :0.60743   Mean   : 8.647  
##  3rd Qu.:1.91600   3rd Qu.:0.03978   3rd Qu.:0.51036   3rd Qu.:10.509  
##  Max.   :8.57894   Max.   :0.42138   Max.   :8.20286   Max.   :48.840  
##      VSNL1               KLK6              NRG4              CXXC5        
##  Min.   : 0.05713   Min.   :  0.000   Min.   :0.009831   Min.   : 0.9359  
##  1st Qu.: 1.02624   1st Qu.:  1.118   1st Qu.:0.059571   1st Qu.: 5.3429  
##  Median : 2.36642   Median : 11.205   Median :0.103041   Median : 8.3877  
##  Mean   : 4.77549   Mean   : 42.474   Mean   :0.239635   Mean   :10.3194  
##  3rd Qu.: 4.58127   3rd Qu.: 49.892   3rd Qu.:0.179789   3rd Qu.:13.0758  
##  Max.   :41.35774   Max.   :596.303   Max.   :4.219244   Max.   :60.0577  
##     KIR2DL4           PPP1R14C       
##  Min.   :0.00000   Min.   :  0.0361  
##  1st Qu.:0.04302   1st Qu.:  4.4508  
##  Median :0.14618   Median : 11.2454  
##  Mean   :0.40262   Mean   : 18.4007  
##  3rd Qu.:0.45270   3rd Qu.: 27.1789  
##  Max.   :4.30483   Max.   :107.9462

TNBC.vars.in.common.expression.nonTNBC <- xdata[ydata[,1]=="NO_TNBC",TNBC.vars.in.common]
colnames(TNBC.vars.in.common.expression.nonTNBC) <- as.vector(TNBC.vars.in.common.code)
summary(TNBC.vars.in.common.expression.nonTNBC)

##       CA12               ESR1                BPI         
##  Min.   :  0.0322   Min.   :  0.01585   Min.   :0.00000  
##  1st Qu.: 34.4695   1st Qu.: 16.14359   1st Qu.:0.01528  
##  Median : 65.3680   Median : 36.66739   Median :0.03789  
##  Mean   : 78.1013   Mean   : 47.88104   Mean   :0.12478  
##  3rd Qu.:107.0139   3rd Qu.: 69.64894   3rd Qu.:0.08706  
##  Max.   :706.3150   Max.   :272.20332   Max.   :8.19235  
##      TAF7L              ASPN             KCNS1         
##  Min.   :0.00000   Min.   :  0.231   Min.   : 0.00000  
##  1st Qu.:0.03541   1st Qu.: 12.960   1st Qu.: 0.01456  
##  Median :0.06448   Median : 33.650   Median : 0.03252  
##  Mean   :0.09564   Mean   : 56.929   Mean   : 0.11886  
##  3rd Qu.:0.11493   3rd Qu.: 73.519   3rd Qu.: 0.07619  
##  Max.   :1.31195   Max.   :541.074   Max.   :16.74360  
##       ACE2               CLDN10             PDX1              PIF1        
##  Min.   :  0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.01165  
##  1st Qu.:  0.01594   1st Qu.:0.01306   1st Qu.:0.00000   1st Qu.:0.21551  
##  Median :  0.04167   Median :0.03601   Median :0.00000   Median :0.38055  
##  Mean   :  0.50085   Mean   :0.11311   Mean   :0.01332   Mean   :0.55070  
##  3rd Qu.:  0.09604   3rd Qu.:0.09566   3rd Qu.:0.00000   3rd Qu.:0.71488  
##  Max.   :114.41918   Max.   :6.81840   Max.   :3.05159   Max.   :5.43608  
##      HAPLN3            MFGE8             DMRTA2             LYPD1        
##  Min.   : 0.1285   Min.   :  1.203   Min.   :0.000000   Min.   :0.01422  
##  1st Qu.: 1.9119   1st Qu.:  8.542   1st Qu.:0.000000   1st Qu.:0.20975  
##  Median : 3.2393   Median : 13.611   Median :0.004165   Median :0.33774  
##  Mean   : 4.7742   Mean   : 17.810   Mean   :0.018961   Mean   :0.45644  
##  3rd Qu.: 5.3090   3rd Qu.: 20.190   3rd Qu.:0.010962   3rd Qu.:0.52752  
##  Max.   :92.6516   Max.   :511.829   Max.   :0.895248   Max.   :4.31328  
##       ZIC1              UBASH3B             SRSF12       
##  Min.   : 0.000000   Min.   : 0.05978   Min.   :0.00000  
##  1st Qu.: 0.006696   1st Qu.: 0.32953   1st Qu.:0.05075  
##  Median : 0.021372   Median : 0.50439   Median :0.08674  
##  Mean   : 0.319268   Mean   : 0.67541   Mean   :0.15722  
##  3rd Qu.: 0.070787   3rd Qu.: 0.73772   3rd Qu.:0.15102  
##  Max.   :10.116302   Max.   :11.28874   Max.   :4.45669  
##     POM121L2             FTCD             PGAP3        
##  Min.   :0.000000   Min.   :0.00000   Min.   :  2.909  
##  1st Qu.:0.000000   1st Qu.:0.02111   1st Qu.: 10.087  
##  Median :0.000000   Median :0.04947   Median : 14.315  
##  Mean   :0.005048   Mean   :0.10523   Mean   : 29.555  
##  3rd Qu.:0.006027   3rd Qu.:0.11361   3rd Qu.: 20.498  
##  Max.   :0.188325   Max.   :3.98651   Max.   :579.352  
##      VSNL1                KLK6               NRG4        
##  Min.   : 0.005134   Min.   :  0.0000   Min.   :0.00000  
##  1st Qu.: 0.364670   1st Qu.:  0.0829   1st Qu.:0.03939  
##  Median : 0.697485   Median :  0.3741   Median :0.06092  
##  Mean   : 1.062458   Mean   :  3.6199   Mean   :0.08119  
##  3rd Qu.: 1.262764   3rd Qu.:  1.3324   3rd Qu.:0.09877  
##  Max.   :14.256126   Max.   :420.5274   Max.   :0.97459  
##      CXXC5            KIR2DL4           PPP1R14C       
##  Min.   :  1.666   Min.   :0.00000   Min.   :  0.0000  
##  1st Qu.: 22.886   1st Qu.:0.01187   1st Qu.:  0.1573  
##  Median : 31.383   Median :0.03795   Median :  0.4416  
##  Mean   : 34.598   Mean   :0.10067   Mean   :  1.9582  
##  3rd Qu.: 42.703   3rd Qu.:0.09911   3rd Qu.:  1.1118  
##  Max.   :135.424   Max.   :2.17800   Max.   :152.0083

# ranking coefficients of the variables selected in common by the three methods

EN.coef <- abs(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))])
EN.coef.matrix <- cbind(c(1:dim(data.plus.race.age.rank)[2]), EN.coef)
rank.EN.coef <- cbind(EN.coef.matrix[order(EN.coef.matrix[,2], decreasing = TRUE),],c(1:dim(data.plus.race.age.rank)[2]))
rank.EN.coef <- rank.EN.coef[which(rank.EN.coef[,1] %in% TNBC.vars.in.common),]

splsda.coef <- abs(coef(fit.splsda.LOGIT))[-1]
splsda.coef.matrix <- cbind(c(1: dim(data.plus.race.age.rank)[2]), splsda.coef)
rank.splsda.coef <- cbind(splsda.coef.matrix[order(splsda.coef.matrix[,2], decreasing = TRUE),],c(1:dim(data.plus.race.age.rank)[2]))
rank.splsda.coef <- rank.splsda.coef[which(rank.splsda.coef[,1] %in% TNBC.vars.in.common),]

sgpls.coef <- abs(coef(fit.sgpls.LOGIT))[-1]
sgpls.coef.matrix <- cbind(c(1: dim(data.plus.race.age.rank)[2]), sgpls.coef)
rank.sgpls.coef <- cbind(sgpls.coef.matrix[order(sgpls.coef.matrix[,2] , decreasing = TRUE),],c(1:dim(data.plus.race.age.rank)[2]))
rank.sgpls.coef <- rank.sgpls.coef[which(rank.sgpls.coef[,1] %in% TNBC.vars.in.common),]

variables.rank.matrix <- cbind(sort(rank.EN.coef[,1]),rank.EN.coef[order(rank.EN.coef[,1]),3], rank.splsda.coef[order(rank.splsda.coef[,1]),3], rank.sgpls.coef[order(rank.sgpls.coef[,1]),3])
colnames(variables.rank.matrix) <- c("variable","LOGIT-EN","SPLSDA","SGPLS")
#variables.rank.matrix <- cbind(as.matrix(mcols(ensembl.protein.coding[rownames(variables.rank.matrix),2])),variables.rank.matrix)

combined.variables.rank.matrix <- cbind(TNBC.vars.in.common.code,apply(variables.rank.matrix[,2:4],1,prod))
combined.variables.rank.matrix

##                 gene_name           
## ENSG00000074410 "CA12"     "1669570"
## ENSG00000091831 "ESR1"     "38889"  
## ENSG00000101425 "BPI"      "964070" 
## ENSG00000102387 "TAF7L"    "33626"  
## ENSG00000106819 "ASPN"     "281637" 
## ENSG00000124134 "KCNS1"    "672"    
## ENSG00000130234 "ACE2"     "3312"   
## ENSG00000134873 "CLDN10"   "1416"   
## ENSG00000139515 "PDX1"     "85436"  
## ENSG00000140451 "PIF1"     "153594" 
## ENSG00000140511 "HAPLN3"   "622544" 
## ENSG00000140545 "MFGE8"    "873828" 
## ENSG00000142700 "DMRTA2"   "12155"  
## ENSG00000150551 "LYPD1"    "72"     
## ENSG00000152977 "ZIC1"     "35200"  
## ENSG00000154127 "UBASH3B"  "264740" 
## ENSG00000154548 "SRSF12"   "600210" 
## ENSG00000158553 "POM121L2" "1521"   
## ENSG00000160282 "FTCD"     "9900"   
## ENSG00000161395 "PGAP3"    "100"    
## ENSG00000163032 "VSNL1"    "25380"  
## ENSG00000167755 "KLK6"     "2092960"
## ENSG00000169752 "NRG4"     "23780"  
## ENSG00000171604 "CXXC5"    "55626"  
## ENSG00000189013 "KIR2DL4"  "22344"  
## ENSG00000198729 "PPP1R14C" "454230"

combined.variables.rank.matrix[order(as.numeric(combined.variables.rank.matrix[,2])),1]

## ENSG00000150551 ENSG00000161395 ENSG00000124134 ENSG00000134873 
##         "LYPD1"         "PGAP3"         "KCNS1"        "CLDN10" 
## ENSG00000158553 ENSG00000130234 ENSG00000160282 ENSG00000142700 
##      "POM121L2"          "ACE2"          "FTCD"        "DMRTA2" 
## ENSG00000189013 ENSG00000169752 ENSG00000163032 ENSG00000102387 
##       "KIR2DL4"          "NRG4"         "VSNL1"         "TAF7L" 
## ENSG00000152977 ENSG00000091831 ENSG00000171604 ENSG00000139515 
##          "ZIC1"          "ESR1"         "CXXC5"          "PDX1" 
## ENSG00000140451 ENSG00000154127 ENSG00000106819 ENSG00000198729 
##          "PIF1"       "UBASH3B"          "ASPN"      "PPP1R14C" 
## ENSG00000154548 ENSG00000140511 ENSG00000140545 ENSG00000101425 
##        "SRSF12"        "HAPLN3"         "MFGE8"           "BPI" 
## ENSG00000074410 ENSG00000167755 
##          "CA12"          "KLK6"

Genes selected by LOGIT-EN, SPLS-DA and SGPLS when changing the label of the influential observations misclassified in more than 2 methods

data.Y.rank.change <- data.Y.rank
data.Y.rank.change[outliers.rank[which(outliers.rank[1:24,8] >= 67),1]
] <- 1-data.Y.rank.change[outliers.rank[which(outliers.rank[1:24,8] >= 67),1]]

# LOGIT-EN

my.alpha <- seq(0.1,0.9,0.1) 

## Optimizing alpha and lambda for logistic regression with Elastic net regularization

nvar.selected.EN.outliers.change <- matrix(0,1,length(my.alpha))
pred.EN.outliers.change <- matrix(0,dim(data.plus.race.age.rank)[1],length(my.alpha))
MSE.EN.outliers.change <- matrix(0,1,length(my.alpha))

set.seed(2010)

for (j in 1:length(my.alpha)){
  
  # Logistic model fitting with 10-fold cross-validation for glmnet:
  fit.EN.cv.outliers.change <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[j])
  
  var.selected.EN.outliers.change <- which(fit.EN.cv.outliers.change$glmnet.fit$beta[,which(fit.EN.cv.outliers.change$cvm == min(fit.EN.cv.outliers.change$cvm))] != 0)
  nvar.selected.EN.outliers.change[j] <- length(var.selected.EN.outliers.change)
  
  # Predictions obtained by model i
  pred.EN.outliers.change[,j] <- predict(fit.EN.cv.outliers.change,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")
  
  # Mean squared error of prediction (MSE)
  MSE.EN.outliers.change[j] <- mean((data.Y.rank.change-pred.EN.outliers.change)^2)
}

MSE.EN.outliers.change

##           [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
## [1,] 0.1482493 0.1311899 0.1141429 0.09705398 0.08017282 0.06327089
##            [,7]       [,8]       [,9]
## [1,] 0.04627029 0.02925698 0.01223522

nvar.selected.EN.outliers.change

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,]  495  256  202  170  137  122  112  104   94

my.alpha[which(MSE.EN.outliers.change == min(MSE.EN.outliers.change))]

## [1] 0.9

my.alpha.opt <- my.alpha[which(MSE.EN.outliers.change == min(MSE.EN.outliers.change))]

fit.EN.cv.outliers.change <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change),family="binomial",foldid=foldid.tnbc,alpha=my.alpha.opt)
var.selected.EN.outliers.change <- which(fit.EN.cv.outliers.change$glmnet.fit$beta[,which(fit.EN.cv.outliers.change$cvm == min(fit.EN.cv.outliers.change$cvm))] != 0)

# Predictions
pred.EN.outliers.change <- predict(fit.EN.cv.outliers.change,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")

# Mean squared error of prediction (MSE):
MSE.EN.outliers.change <- mean((data.Y.rank.change-pred.EN.outliers.change)^2)
MSE.EN.outliers.change

## [1] 0.01238853

# SPLSDA

#fit.splsda.LOGIT.cv.outliers.change <- cv.splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change), fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), kappa=0.5, classifier="logistic", scale.x=FALSE,n.core=10)
fit.splsda.LOGIT.outliers.change <- splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change), K = 5, eta=0.7, classifier="logistic", scale.x=FALSE) # parameters K and eta obtained by cross-validation above
var.selected.splsda.LOGIT.outliers.change <- fit.splsda.LOGIT.outliers.change$A

# Predictions
pred.splsda.LOGIT.outliers.change <- predict(fit.splsda.LOGIT.outliers.change, as.matrix(data.plus.race.age.rank),type = "fit","coefficient", fit.type = "response")

# Mean squared error of prediction (MSE):
MSE.splsda.LOGIT.outliers.change <- mean((data.Y.rank.change-pred.splsda.LOGIT.outliers.change)^2)
MSE.splsda.LOGIT.outliers.change

## [1] 0.009926286

# SGPLS

#fit.sgpls.LOGIT.cv.outliers.out <- cv.sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank.change, fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), scale.x=FALSE,n.core=10)
fit.sgpls.LOGIT.outliers.change <- sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank.change, K = 5, eta=0.8, scale.x=FALSE)
var.selected.sgpls.LOGIT.outliers.change <- fit.sgpls.LOGIT.outliers.change$A

# Predictions
pred.sgpls.LOGIT.outliers.change <- predict(fit.sgpls.LOGIT.outliers.change, as.matrix(data.plus.race.age.rank),type="fit",fit.type="response")

# Mean squared error of prediction (MSE):
MSE.sgpls.LOGIT.outliers.change <- mean((data.Y.rank.change-pred.sgpls.LOGIT.outliers.change)^2)
MSE.sgpls.LOGIT.outliers.change

## [1] 0.07822628

# variables in common between EN and SPLSDA and SGPLS
outliers.change.vars.in.common <- as.data.frame(table(c(var.selected.EN.outliers.change,var.selected.splsda.LOGIT.outliers.change,var.selected.sgpls.LOGIT.outliers.change)))
outliers.change.vars.in.common <- as.numeric(as.character(outliers.change.vars.in.common[which(outliers.change.vars.in.common[,2] == 3),1]))
length(outliers.change.vars.in.common)

## [1] 16

outliers.change.vars.in.common.code <- as.matrix(mcols(ensembl.protein.coding[colnames(data.plus.race.age.rank[,outliers.change.vars.in.common]),2]))
outliers.change.vars.in.common.code

##       gene_name 
##  [1,] "ESR1"    
##  [2,] "TAF7L"   
##  [3,] "VSX2"    
##  [4,] "ACE2"    
##  [5,] "SLCO1B1" 
##  [6,] "CLDN10"  
##  [7,] "TTLL4"   
##  [8,] "HAPLN3"  
##  [9,] "DMRTA2"  
## [10,] "LYPD1"   
## [11,] "SRSF12"  
## [12,] "PGAP3"   
## [13,] "CXXC5"   
## [14,] "PPP1R14C"
## [15,] "CT83"    
## [16,] "NOTO"

# variables in common with the previous 26 relevant selected by the three methods
outliers.change.vars.in.common.TNBC <- outliers.change.vars.in.common[which(outliers.change.vars.in.common %in% TNBC.vars.in.common)]
outliers.change.vars.in.common.TNBC.code <- as.matrix(mcols(ensembl.protein.coding[colnames(data.plus.race.age.rank[,outliers.change.vars.in.common.TNBC]),2]))
outliers.change.vars.in.common.TNBC.code

##       gene_name 
##  [1,] "ESR1"    
##  [2,] "TAF7L"   
##  [3,] "ACE2"    
##  [4,] "CLDN10"  
##  [5,] "HAPLN3"  
##  [6,] "DMRTA2"  
##  [7,] "LYPD1"   
##  [8,] "SRSF12"  
##  [9,] "PGAP3"   
## [10,] "CXXC5"   
## [11,] "PPP1R14C"

# summary of gene expression in TNBC and non-TNBC for the 10 genes selected

dim(xdata)

## [1]  1019 19688

outliers.change.vars.in.common.name <- colnames(xdata[,outliers.change.vars.in.common])

outliers.change.vars.in.common.expression.TNBC <- xdata[data.Y=="1",outliers.change.vars.in.common.name]
colnames(outliers.change.vars.in.common.expression.TNBC) <- as.vector(outliers.change.vars.in.common.code)
summary(outliers.change.vars.in.common.expression.TNBC)

##       ESR1              TAF7L              VSX2              ACE2         
##  Min.   : 0.01938   Min.   :0.00979   Min.   :0.00000   Min.   :  0.0000  
##  1st Qu.: 0.16063   1st Qu.:0.08815   1st Qu.:0.00000   1st Qu.:  0.1258  
##  Median : 0.35070   Median :0.14208   Median :0.01006   Median :  0.4727  
##  Mean   : 1.52980   Mean   :0.35790   Mean   :0.08516   Mean   :  4.1248  
##  3rd Qu.: 0.82760   3rd Qu.:0.28278   3rd Qu.:0.04406   3rd Qu.:  1.4384  
##  Max.   :29.97893   Max.   :7.30063   Max.   :2.47960   Max.   :149.4913  
##     SLCO1B1            CLDN10             TTLL4            HAPLN3       
##  Min.   :0.00000   Min.   : 0.00000   Min.   : 1.614   Min.   :  0.919  
##  1st Qu.:0.00000   1st Qu.: 0.02278   1st Qu.: 5.727   1st Qu.:  8.362  
##  Median :0.00000   Median : 0.05433   Median : 9.034   Median : 19.276  
##  Mean   :0.06594   Mean   : 2.65009   Mean   :10.526   Mean   : 26.925  
##  3rd Qu.:0.01368   3rd Qu.: 0.46768   3rd Qu.:14.144   3rd Qu.: 38.167  
##  Max.   :2.23565   Max.   :91.48534   Max.   :42.183   Max.   :118.904  
##      DMRTA2             LYPD1              SRSF12            PGAP3       
##  Min.   :0.000000   Min.   : 0.07908   Min.   :0.04384   Min.   : 1.250  
##  1st Qu.:0.009053   1st Qu.: 0.44511   1st Qu.:0.38949   1st Qu.: 5.600  
##  Median :0.030162   Median : 0.78513   Median :0.92863   Median : 7.402  
##  Mean   :0.353153   Mean   : 1.91865   Mean   :1.32648   Mean   : 8.647  
##  3rd Qu.:0.213414   3rd Qu.: 1.82027   3rd Qu.:1.91600   3rd Qu.:10.509  
##  Max.   :6.013674   Max.   :36.65243   Max.   :8.57894   Max.   :48.840  
##      CXXC5            PPP1R14C             CT83             NOTO         
##  Min.   : 0.9359   Min.   :  0.0361   Min.   : 0.000   Min.   :0.000000  
##  1st Qu.: 5.3429   1st Qu.:  4.4508   1st Qu.: 0.000   1st Qu.:0.000000  
##  Median : 8.3877   Median : 11.2454   Median : 7.716   Median :0.004625  
##  Mean   :10.3194   Mean   : 18.4007   Mean   :10.697   Mean   :0.042799  
##  3rd Qu.:13.0758   3rd Qu.: 27.1789   3rd Qu.:16.792   3rd Qu.:0.009891  
##  Max.   :60.0577   Max.   :107.9462   Max.   :67.889   Max.   :2.293503

outliers.change.vars.in.common.expression.nonTNBC <- xdata[data.Y=="0",outliers.change.vars.in.common.name]
colnames(outliers.change.vars.in.common.expression.nonTNBC) <- as.vector(outliers.change.vars.in.common.code)
summary(outliers.change.vars.in.common.expression.nonTNBC)

##       ESR1               TAF7L              VSX2         
##  Min.   :  0.01585   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.: 16.14359   1st Qu.:0.03541   1st Qu.:0.000000  
##  Median : 36.66739   Median :0.06448   Median :0.006088  
##  Mean   : 47.88104   Mean   :0.09564   Mean   :0.018380  
##  3rd Qu.: 69.64894   3rd Qu.:0.11493   3rd Qu.:0.015692  
##  Max.   :272.20332   Max.   :1.31195   Max.   :0.922559  
##       ACE2              SLCO1B1            CLDN10            TTLL4        
##  Min.   :  0.00000   Min.   :0.00000   Min.   :0.00000   Min.   : 0.7379  
##  1st Qu.:  0.01594   1st Qu.:0.00000   1st Qu.:0.01306   1st Qu.: 2.1871  
##  Median :  0.04167   Median :0.00000   Median :0.03601   Median : 2.9417  
##  Mean   :  0.50085   Mean   :0.01287   Mean   :0.11311   Mean   : 3.4643  
##  3rd Qu.:  0.09604   3rd Qu.:0.00000   3rd Qu.:0.09566   3rd Qu.: 3.9226  
##  Max.   :114.41918   Max.   :2.05987   Max.   :6.81840   Max.   :32.5132  
##      HAPLN3            DMRTA2             LYPD1             SRSF12       
##  Min.   : 0.1285   Min.   :0.000000   Min.   :0.01422   Min.   :0.00000  
##  1st Qu.: 1.9119   1st Qu.:0.000000   1st Qu.:0.20975   1st Qu.:0.05075  
##  Median : 3.2393   Median :0.004165   Median :0.33774   Median :0.08674  
##  Mean   : 4.7742   Mean   :0.018961   Mean   :0.45644   Mean   :0.15722  
##  3rd Qu.: 5.3090   3rd Qu.:0.010962   3rd Qu.:0.52752   3rd Qu.:0.15102  
##  Max.   :92.6516   Max.   :0.895248   Max.   :4.31328   Max.   :4.45669  
##      PGAP3             CXXC5            PPP1R14C             CT83        
##  Min.   :  2.909   Min.   :  1.666   Min.   :  0.0000   Min.   : 0.0000  
##  1st Qu.: 10.087   1st Qu.: 22.886   1st Qu.:  0.1573   1st Qu.: 0.0000  
##  Median : 14.315   Median : 31.383   Median :  0.4416   Median : 0.0000  
##  Mean   : 29.555   Mean   : 34.598   Mean   :  1.9582   Mean   : 0.7144  
##  3rd Qu.: 20.498   3rd Qu.: 42.703   3rd Qu.:  1.1118   3rd Qu.: 0.0000  
##  Max.   :579.352   Max.   :135.424   Max.   :152.0083   Max.   :60.3399  
##       NOTO         
##  Min.   :0.000000  
##  1st Qu.:0.000000  
##  Median :0.000000  
##  Mean   :0.010222  
##  3rd Qu.:0.007316  
##  Max.   :0.713836

3.3 TNBC data with varying sets of patients

The different models used for ensemble analysis are sparse logistic model with alpha = 0.7 (EN) based on randonly selected 80 % of patients.

source("~/ensembl.rand.patients.R")
library('qvalue')

set.seed(2010)

n.models.rand <- 100
outliers.rand.patients.EN <- ensembl.rand.patients(data.plus.race.age.rank,data.Y.rank,n.models.rand)

# mean number of variables selected in the 100 runs
mean(outliers.rand.patients.EN$nvariables.selected)

## [1] 81.97

# mean MSE for the 100 runs
mean(outliers.rand.patients.EN$MSE)

## [1] 0.03168607

# mean number of misclassification in the 100 runs
mean(outliers.rand.patients.EN$Misclassifications.by.method)

## [1] 34.81

# influent patients
TNBC.rand.patient.influent <- outliers.rand.patients.EN$outliers.rank[which(outliers.rand.patients.EN$outliers.rank[,n.models.rand+4] < 0.05),1]
length(TNBC.rand.patient.influent)

## [1] 40

# influent patients which are suspect
rownames(data.plus.race.age.rank[TNBC.rand.patient.influent,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.patient.influent,])) %in% suspect)]

## [1] "TCGA-A2-A0EQ-01A-11R-A034-07" "TCGA-A2-A04U-01A-11R-A115-07"
## [3] "TCGA-AN-A0FJ-01A-11R-A00Z-07" "TCGA-LL-A5YP-01A-21R-A28M-07"
## [5] "TCGA-AN-A0FX-01A-11R-A034-07" "TCGA-AO-A0JL-01A-11R-A056-07"

TNBC.rand.patient.var.select <- as.data.frame(table(outliers.rand.patients.EN$var.selected))
TNBC.rand.patient.var.select$Var1 <- as.numeric(as.character(TNBC.rand.patient.var.select[,1]))
TNBC.rand.patient.var.select$var.name <- colnames(data.plus.race.age.rank[,TNBC.rand.patient.var.select[,1]])

# variabes selected in more that 75% of the runs
TNBC.rand.patient.var.select.more75 <- TNBC.rand.patient.var.select[which(TNBC.rand.patient.var.select[,2]>75),3]
length(TNBC.rand.patient.var.select.more75)

## [1] 23

# variables selected in common by the ensembles for the original TNBC and for the 'random patient' strategy
TNBC.rand.patient.var.select.common.originalTNBC <- TNBC.rand.patient.var.select.more75[which(TNBC.rand.patient.var.select.more75 %in% colnames(data.plus.race.age.rank[,TNBC.vars.in.common]))]
length(TNBC.rand.patient.var.select.common.originalTNBC)

## [1] 15

# variables code
ensembl.protein.coding[TNBC.rand.patient.var.select.common.originalTNBC,2][,1]

## GRanges object with 15 ranges and 1 metadata column:
##                   seqnames                 ranges strand |   gene_name
##                      <Rle>              <IRanges>  <Rle> | <character>
##   ENSG00000074410       15 [ 63321378,  63382161]      - |        CA12
##   ENSG00000091831        6 [151656691, 152129619]      + |        ESR1
##   ENSG00000124134       20 [ 45092310,  45101112]      - |       KCNS1
##   ENSG00000130234        X [ 15561033,  15602148]      - |        ACE2
##   ENSG00000140451       15 [ 64815632,  64825668]      - |        PIF1
##               ...      ...                    ...    ... .         ...
##   ENSG00000160282       21 [ 46136262,  46155567]      - |        FTCD
##   ENSG00000161395       17 [ 39671122,  39696797]      - |       PGAP3
##   ENSG00000171604        5 [139647299, 139683882]      + |       CXXC5
##   ENSG00000189013       19 [ 54803535,  54814517]      + |     KIR2DL4
##   ENSG00000198729        6 [150143076, 150250357]      + |    PPP1R14C
##   -------
##   seqinfo: 287 sequences from GRCh38 genome

as.matrix(mcols(ensembl.protein.coding[TNBC.rand.patient.var.select.common.originalTNBC,2]))

##       gene_name 
##  [1,] "CA12"    
##  [2,] "ESR1"    
##  [3,] "KCNS1"   
##  [4,] "ACE2"    
##  [5,] "PIF1"    
##  [6,] "HAPLN3"  
##  [7,] "LYPD1"   
##  [8,] "ZIC1"    
##  [9,] "UBASH3B" 
## [10,] "POM121L2"
## [11,] "FTCD"    
## [12,] "PGAP3"   
## [13,] "CXXC5"   
## [14,] "KIR2DL4" 
## [15,] "PPP1R14C"

3.4 TNBC data with varying sets of variables

The different models used for ensemble analysis are sparse logistic model with alpha = 0.7 (EN), departing from a set of 1000 randomly selected variables.

source("~/ensembl.rand.variables.R")
times <- 100
vars.rand <- 1000
outliers.rand.variables.EN <- ensembl.rand.variables(data.plus.race.age.rank,data.Y.rank, times, vars.rand)

# mean number of variables selected in the 100 runs
mean(outliers.rand.variables.EN$nvariables.selected)

## [1] 64.52

# mean MSE for the 100 runs
mean(outliers.rand.variables.EN$MSE)

## [1] 0.0347633

# mean number of misclassification in the 100 runs
mean(outliers.rand.variables.EN$Misclassifications.by.method)

## [1] 41.1

# influent patients
TNBC.rand.variables.influent <- outliers.rand.variables.EN$outliers.rank[which(outliers.rand.variables.EN$outliers.rank[,times+4] < 0.05),1]
length(TNBC.rand.variables.influent)

## [1] 37

# influent patients which are suspect
rownames(data.plus.race.age.rank[TNBC.rand.variables.influent,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.variables.influent,])) %in% suspect)]

## [1] "TCGA-A2-A04U-01A-11R-A115-07" "TCGA-AN-A0FJ-01A-11R-A00Z-07"
## [3] "TCGA-A2-A0EQ-01A-11R-A034-07" "TCGA-LL-A5YP-01A-21R-A28M-07"
## [5] "TCGA-AO-A0JL-01A-11R-A056-07" "TCGA-AN-A0FX-01A-11R-A034-07"

# which influent patients in common with the ensemble for the original TNBC data and by the 'random patient' and 'random variable' strategies
TNBC.rand.outliers.in.common <- as.data.frame(table(c(TNBC.rand.variables.influent,TNBC.rand.patient.influent,TNBC.influent)))
TNBC.rand.outliers.in.common <- as.numeric(as.character(TNBC.rand.outliers.in.common[which(TNBC.rand.outliers.in.common[,2] == 3),1]))
length(TNBC.rand.outliers.in.common)

## [1] 18

getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.outliers.in.common,]))

##  [1] "TCGA-AC-A2QJ" "TCGA-OL-A97C" "TCGA-AC-A62X" "TCGA-A2-A0YJ"
##  [5] "TCGA-LL-A5YP" "TCGA-E2-A1II" "TCGA-A2-A0EQ" "TCGA-AR-A1AH"
##  [9] "TCGA-LL-A740" "TCGA-A2-A3Y0" "TCGA-C8-A3M7" "TCGA-E9-A1ND"
## [13] "TCGA-A2-A1G6" "TCGA-D8-A1JF" "TCGA-OL-A5S0" "TCGA-E9-A22G"
## [17] "TCGA-AR-A251" "TCGA-AR-A1AJ"

# 18 patients in common (TNBC original + rand patients + rand variables) detected as outliers 

# influent patients in common for the three emsemble strategies which are suspect
rownames(data.plus.race.age.rank[TNBC.rand.outliers.in.common,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.outliers.in.common,])) %in% suspect)]

## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"

Outlier Ensemble Analysis of Triple-Negative Breast Cancer (TNBC) Data

Marta B. Lopes, André Veríssimo, Eunice Carrasquinha, Sandra Casimiro, Niko Beerenwinkel, Susana Vinga

16-01-2018