The data analysis described below requires R version 3.3.2 and Bioconductor version 3.4.

1 The BRCA dataset

The Cancer Genome Atlas Breast Invasive Carcinoma (TCGA-BRCA) data collection is part of a larger effort to build a research community focused on connecting cancer phenotypes to genotypes by providing clinical images matched to subjects from The Cancer Genome Atlas (TCGA).

The BRCA data is publicly available (https://cancergenome.nih.gov/) and is decomposed into two datasets:

  1. the gene expression data, composed of 57251 variables for a total of 1222 samples with 1097 individuals. From those samples, 1102 with primary solid tumor, 7 metastatic and 113 with normal tissue;

  2. the clinical data is composed of 114 variables obtained from the same individuals (much more is available in the gdc data variable, such as follow_up, drug, radiation, …)

The BRCA datasets will be imported using the package ‘brca.data’.

# Package developed by our group
#library(devtools)
#devtools::install_url('https://github.com/averissimo/brca.data/releases/download/1.0/brca.data_1.0.tar.gz')
library(brca.data)

1.1 TNBC dataset construction

Triple negative breast cancer (TNBC) refers to any BRCA that does not express the genes for the estrogen receptor (ESR1; ENSG00000091831), progesterone receptor (PGR; ENSG00000082175) and which does not overexpress the human epidermal growth factor receptor 2 (HER2; ENSG00000141736).

From the clinical data, the columns related to gene expression status of ESR1, PGR and HER2 must be selected. There are three possible variables for HER2:

  1. ‘her2_immunohistochemistry_level_result’, which refers to the ImmunoHistoChemistry (IHC) HER2 result into levels ‘0’ (negative), ‘1’ (negative), ‘2+’ (indeterminate) and ‘3+’(positive);

  2. ‘lab_proc_her2_neu_immunohistochemistry_receptor_status’, corresponding to the IHC result into status indeterminate, equivocal, negative and positive;

  3. ‘lab_procedure_her2_neu_in_situ_hybrid_outcome_type’, the HER2 individuals’ classification by the fluorescence in situ hybridization (FISH) method, considered more accurate and often performed when IHC results do not show a clear classification of cells into HER2-positive or negative.

data("fpkm.per.tissue","fpkm.per.tissue.barcode", 'clinical', package = 'brca.data')

tnbc.vars <- c('breast_carcinoma_estrogen_receptor_status',
               'breast_carcinoma_progesterone_receptor_status',
               'her2_immunohistochemistry_level_result',
               'lab_proc_her2_neu_immunohistochemistry_receptor_status',
               'lab_procedure_her2_neu_in_situ_hybrid_outcome_type'
)
clinical.tnbc <- clinical$primary.solid.tumor[, tnbc.vars]

clinical.tnbc.aux <- clinical.tnbc
names(clinical.tnbc.aux) <- c("ESR1","PGR","HER2_level","HER2_status","HER2_FISH")

summary(clinical.tnbc.aux)
##             ESR1                PGR      HER2_level        HER2_status 
##               : 48                : 49       :471                :177  
##  Indeterminate:  2   Indeterminate:  4   0   : 60   Equivocal    :178  
##  Negative     :237   Negative     :343   1+  :271   Indeterminate: 12  
##  Positive     :803   Positive     :694   2+  :198   Negative     :559  
##  NA's         :  1   NA's         :  1   3+  : 90   Positive     :164  
##                                          NA's:  1   NA's         :  1  
##          HER2_FISH  
##               :673  
##  Equivocal    :  5  
##  Indeterminate:  4  
##  Negative     :330  
##  Positive     : 78  
##  NA's         :  1
which(is.na(clinical.tnbc.aux[,1]))
## [1] 321
clinical.tnbc.aux <- clinical.tnbc.aux[-(which(is.na(clinical.tnbc.aux[,1]))),]

Finding the concordance between the HER2 variables measured.

Concordance between HER2 (IHC) level and HER2 (IHC) status:

table(as.factor(clinical.tnbc.aux[,3]),clinical.tnbc.aux[,4])
##     
##          Equivocal Indeterminate Negative Positive
##      176         7             6      241       41
##   0    0         0             0       60        0
##   1+   0         4             1      255       11
##   2+   0       166             4        1       27
##   3+   1         1             1        2       85

Creating a vector of HER2 (IHC) status based on HER2 (ICH) level:

clinical.tnbc.aux$my_HER2_level <- as.character(clinical.tnbc.aux$HER2_level)

clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '0'),6] <- 'Negative'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '1+'),6] <- 'Negative'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '2+'),6] <- 'Indeterminate'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '3+'),6] <- 'Positive'

Individuals for which HER2 (IHC) status different from myHER2 (IHC) status (based on HER2 IHC level)

HER2status.dif.HER2level <- clinical.tnbc.aux[c(rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,6] == 'Negative' & clinical.tnbc.aux[,4] == 'Positive'),]),rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,6] == 'Positive' & clinical.tnbc.aux[,4] == 'Negative'),])),]

# getting the individuals short name, as in the clinical data
clinical.tnbc.barcode.pattern <- '(TCGA-[A-Z0-9a-z]{2}-[a-zA-Z0-9]{4})-([0-9]{2}).+'
clinical.tnbc.shortname <- gsub(clinical.tnbc.barcode.pattern,'\\1',rownames(clinical.tnbc))

# individuals with discordant HER2 (IHC) status and HER2 (IHC) level
HER2status.dif.HER2level.names <- clinical.tnbc.shortname[which(clinical.tnbc.shortname %in% rownames(HER2status.dif.HER2level))]
length(HER2status.dif.HER2level.names)
## [1] 13

Concordance between HER2 (IHC) status and HER2 (FISH):

table(clinical.tnbc.aux[,5],clinical.tnbc.aux[,4])
##                
##                     Equivocal Indeterminate Negative Positive
##                 115        18             4      432      104
##   Equivocal       0         3             0        0        2
##   Indeterminate   0         0             0        3        1
##   Negative       53       138             6      121       12
##   Positive        9        19             2        3       45

Individuals for which HER2 (IHC) status different from HER2 (FISH):

HER2status.dif.FISH <- clinical.tnbc.aux[c(rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,4] == 'Negative' & clinical.tnbc.aux[,5] == 'Positive'),]),rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,4] == 'Positive' & clinical.tnbc.aux[,5] == 'Negative'),])),]

HER2status.dif.HER2FISH.names <- clinical.tnbc.shortname[which(clinical.tnbc.shortname %in% rownames( HER2status.dif.FISH))]
length(HER2status.dif.HER2FISH.names)
## [1] 15
# suspect individuals with discordante HER2 labels from different lab testing
suspect <- c(HER2status.dif.HER2level.names,HER2status.dif.HER2FISH.names)
length(suspect)
## [1] 28

Replacing HER2 (IHC) status by HER2 (FISH) (a more accurate test) whenever available:

clinical.tnbc.aux$HER2_status_plus_FISH <- clinical.tnbc.aux[,4]
clinical.tnbc.aux[clinical.tnbc.aux[,5] !='',7] <- clinical.tnbc.aux[clinical.tnbc.aux[,5] !='',5]

Concordance between HER2 (IHC) status and HER2 (IHC status + FISH):

table(clinical.tnbc.aux[,4],clinical.tnbc.aux[,7])
##                
##                     Equivocal Indeterminate Negative Positive
##                 115         0             0       53        9
##   Equivocal       0        21             0      138       19
##   Indeterminate   0         0             4        6        2
##   Negative        0         0             3      553        3
##   Positive        0         2             1       12      149

Building the final clinical TNBC dataset, composed of all individuals and gene expression status of ESR1, PGR and HER2:

clinical.tnbc <- clinical.tnbc.aux[,c(1,2,7)]

# replace all '' or NA with 'Indeterminate'
clinical.tnbc[is.na(clinical.tnbc) | clinical.tnbc == '' | clinical.tnbc == 'Equivocal'] <- 'Indeterminate'

# individuals with at least one 'Positive' and keep all are marked as not TNBC
tnbc.status.pos <- clinical.tnbc == 'Positive'
#tnbc.status.pos <- clinical.tnbc[,c(1,2,3)] == 'Positive'
not.tnbc.id     <- rownames(tnbc.status.pos[which(sapply(seq(nrow(tnbc.status.pos)), function(ix) { any(tnbc.status.pos[ix,]) })),])

# individuals with at least one 'Indeterminate' and excluded from dataset
tnbc.status.ind <- clinical.tnbc[!(rownames(clinical.tnbc) %in% not.tnbc.id),] == 'Indeterminate'
not.tnbc.id     <- rownames(tnbc.status.ind[which(sapply(seq(nrow(tnbc.status.ind)), function(ix) { any(tnbc.status.ind[ix,]) })),])
clinical.tnbc <- clinical.tnbc[!(rownames(clinical.tnbc) %in% not.tnbc.id), ]

tnbc.status.neg <- clinical.tnbc == 'Negative'
tnbc.ix <- sapply(seq(nrow(clinical.tnbc)), function(ix) { all(tnbc.status.neg[ix,]) })

# set cases with TNBC
clinical.tnbc$tnbc <- 'NO_TNBC'
clinical.tnbc[tnbc.ix, 'tnbc'] <- 'TNBC'

Building the Y (binary data, i.e., TNBC vs. nonTNBC) response vector:

# building the ydata (binary)
ydata <- data.frame(tnbc = clinical.tnbc$tnbc, row.names = rownames(clinical.tnbc))
ydata$tnbc <- factor(ydata$tnbc)

Protein coding genes catalog for building the X gene expression matrix

I. Install packages

source("http://bioconductor.org/biocLite.R")  
## Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
biocLite(c('ensembldb', 'EnsDb.Hsapiens.v86'))
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.2 (2017-09-28).
## Installing package(s) 'ensembldb', 'EnsDb.Hsapiens.v86'
## Old packages: 'biomaRt', 'chron', 'circlize', 'cowplot', 'dbplyr',
##   'digest', 'dnet', 'dotCall64', 'DRR', 'DT', 'edgeR', 'ellipse', 'fgsea',
##   'fpc', 'GenomicRanges', 'git2r', 'haven', 'hexbin', 'Hmisc', 'hms',
##   'htmlTable', 'htmlwidgets', 'irlba', 'knitr', 'lava', 'lhs', 'limma',
##   'littler', 'lme4', 'MASS', 'matrixStats', 'mgcv', 'microbenchmark',
##   'mvabund', 'pcaPP', 'psichomics', 'Rcpp', 'RCurl', 'recipes', 'rlang',
##   'rms', 'rpart', 'rprojroot', 'rtracklayer', 'shinyjs', 'sp', 'spam',
##   'SummarizedExperiment', 'TCGAbiolinks', 'tibble', 'TTR', 'viridis',
##   'withr', 'xml2', 'xts', 'zoo'
# Load libraries

suppressPackageStartupMessages({
  library(ensembldb)
  library(EnsDb.Hsapiens.v86)
  library(futile.logger)
  .Last.value <- flog.layout(layout.format('~m'))
})
  1. Load ensembl data

Ensembl is a genome browser for vertebrate genomes that supports research in comparative genomics, evolution, sequence variation and transcriptional regulation. Ensembl annotate genes, computes multiple alignments, predicts regulatory function and collects disease data. Ensembl tools include BLAST, BLAT, BioMart and the Variant Effect Predictor (VEP) for all supported species.

edb <- EnsDb.Hsapiens.v86
# hasProteinData(edb) # from devel only, as of 3.4
ensembl.protein.coding <- genes(edb, 
                                filter = list(GenebiotypeFilter('protein_coding')), 
                                columns = c('gene_id', 'gene_name'))
{
  flog.info('         Granges: %d', nrow(ensembl.protein.coding@elementMetadata))
  flog.info('Metadata columns: %d', ncol(ensembl.protein.coding@elementMetadata))
}
##          Granges: 22285
## Metadata columns: 3
  1. Load ccds data

The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long term goal is to support convergence towards a standard set of gene annotations.

ccds <- read.table(url('ftp://ftp.ncbi.nih.gov/pub/CCDS/current_human/CCDS.current.txt'),
                   sep = '\t', 
                   header = T, 
                   comment.char = "|", # necessary as header line has a # character
                   stringsAsFactors = FALSE)
flog.info('Size of ccds: %d x %d', nrow(ccds), ncol(ccds))
## Size of ccds: 34245 x 11
  1. Build unique gene set
ensembl.genes         <- sort(unique(ensembl.protein.coding@elementMetadata$gene_name))
ensembl.genes.ensg.id <- sort(ensembl.protein.coding@elementMetadata$gene_id)

ccds.genes       <- sort(unique(ccds$gene))
ccds.extra.genes <- sort(ccds.genes[(!ccds.genes %in% ensembl.genes)])
ccds.extra.genes.ensg.id <- genes(edb, filter = list(GenenameFilter(ccds.extra.genes)), 
                                  columns = c('gene_id', 'gene_name'))

ensg.id <- sort(unique(c(ensembl.protein.coding@elementMetadata$gene_id, ensembl.genes.ensg.id)))

all.genes <- sort(unique(c(ensembl.genes, ccds.extra.genes)))

Building the X gene expression data matrix:

xdata <- t(fpkm.per.tissue$primary.solid.tumor[which(rownames(fpkm.per.tissue$primary.solid.tumor) %in% ensg.id),-which(duplicated(fpkm.per.tissue.barcode$primary.solid.tumor))])
common.rows <- getParticipantCode(rownames(xdata)) %in% rownames(ydata)
xdata <- xdata[common.rows,]

# removing variables with sd=0
xdata.sd <- sapply(seq(ncol(xdata)), function(ix) {sd(xdata[,ix])})
xdata <- xdata[,xdata.sd != 0]

# Gene expression data for ESR1, PGR and HER2 from the suspect (potential outlier) cases identified above

# Clinical and gene expression data of individuals with discordant HER2 level and HER2 status
xdata.shortname <- getParticipantCode(rownames(xdata))

xdata.plus.clinical.HER2status.dif.HER2level <- cbind(xdata[which(xdata.shortname %in% HER2status.dif.HER2level.names), c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[which(rownames(clinical.tnbc.aux) %in% HER2status.dif.HER2level.names),],ydata$tnbc[which(rownames(ydata) %in% HER2status.dif.HER2level.names)])

# clinical and gene expression data of individuals with discordant HER2 status and HER2 FISH
xdata.shortname <- getParticipantCode(rownames(xdata))
xdata.plus.clinical.HER2status.dif.HER2FISH <- cbind(xdata[which(xdata.shortname %in% HER2status.dif.HER2FISH.names), c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[which(rownames(clinical.tnbc.aux) %in% HER2status.dif.HER2FISH.names),],ydata$tnbc[which(rownames(ydata) %in% HER2status.dif.HER2FISH.names)])

Data transformation to be used in further analysis:

data.Y <- as.matrix(ydata)
data.Y[data.Y =="NO_TNBC"] <- 0
data.Y[data.Y =="TNBC"] <- 1
data.Y <- as.numeric(data.Y)

# log-transform xdata
data <- log2(xdata+1)

# normalizing xdata
data <- (data-matrix(apply(data,2,mean),nrow(data),ncol(data),byrow=TRUE))/matrix(apply(data,2,sd),nrow(data),ncol(data),byrow=TRUE)

2 Exploratory data analysis

Summary of TNBC biomarkers:

summary(xdata[which(data.Y=="1"),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
##  ENSG00000091831    ENSG00000082175     ENSG00000141736  
##  Min.   : 0.01938   Min.   : 0.001169   Min.   :  1.561  
##  1st Qu.: 0.16063   1st Qu.: 0.037205   1st Qu.: 13.964  
##  Median : 0.35070   Median : 0.079403   Median : 19.776  
##  Mean   : 1.52980   Mean   : 0.711833   Mean   : 21.991  
##  3rd Qu.: 0.82760   3rd Qu.: 0.186198   3rd Qu.: 26.058  
##  Max.   :29.97893   Max.   :22.977867   Max.   :103.689
summary(xdata[which(data.Y=="0"),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
##  ENSG00000091831     ENSG00000082175    ENSG00000141736    
##  Min.   :  0.01585   Min.   :  0.0079   Min.   :   0.6054  
##  1st Qu.: 16.14359   1st Qu.:  0.6003   1st Qu.:  26.5798  
##  Median : 36.66739   Median :  4.2282   Median :  38.7319  
##  Mean   : 47.88104   Mean   : 12.0115   Mean   :  99.7406  
##  3rd Qu.: 69.64894   3rd Qu.: 15.3263   3rd Qu.:  58.8010  
##  Max.   :272.20332   Max.   :327.9133   Max.   :1668.3526
# Building the training and test sets; keep in the training set observations ESR1- and PGR- that would have the opposite classification if HER2 (FISH) had not replaced the HER2 (IHC) status; and observations ESR1- and PGR- for which FISH was not measured and have no concordance between HER2 (IHC) status and HER2 (IHC) level (having a potential impact on the final TNBC/nonTNBC classification)

# taking individuals ID from the 'HER2status.dif.FISH' and 'HER2status.dif.HER2level' created above
HER2status.dif.FISH
##                  ESR1      PGR HER2_level HER2_status HER2_FISH
## TCGA-LL-A5YP Positive Negative         1+    Negative  Positive
## TCGA-AO-A0JL Negative Negative         1+    Negative  Positive
## TCGA-A2-A04U Negative Negative         1+    Negative  Positive
## TCGA-A2-A0EQ Negative Negative         3+    Positive  Negative
## TCGA-BH-A18T Negative Negative         2+    Positive  Negative
## TCGA-AN-A0XV Positive Positive         2+    Positive  Negative
## TCGA-AO-A12C Positive Positive         3+    Positive  Negative
## TCGA-BH-A18Q Negative Negative               Positive  Negative
## TCGA-LL-A5YL Positive Negative         3+    Positive  Negative
## TCGA-E2-A10A Positive Positive         2+    Positive  Negative
## TCGA-AO-A03L Positive Positive         2+    Positive  Negative
## TCGA-AO-A12G Positive Positive         2+    Positive  Negative
## TCGA-BH-A1EX Positive Positive               Positive  Negative
## TCGA-BH-A0AU Positive Positive               Positive  Negative
## TCGA-AN-A0XW Positive Positive         2+    Positive  Negative
##              my_HER2_level
## TCGA-LL-A5YP      Negative
## TCGA-AO-A0JL      Negative
## TCGA-A2-A04U      Negative
## TCGA-A2-A0EQ      Positive
## TCGA-BH-A18T Indeterminate
## TCGA-AN-A0XV Indeterminate
## TCGA-AO-A12C      Positive
## TCGA-BH-A18Q              
## TCGA-LL-A5YL      Positive
## TCGA-E2-A10A Indeterminate
## TCGA-AO-A03L Indeterminate
## TCGA-AO-A12G Indeterminate
## TCGA-BH-A1EX              
## TCGA-BH-A0AU              
## TCGA-AN-A0XW Indeterminate
HER2status.dif.HER2level
##                  ESR1      PGR HER2_level HER2_status HER2_FISH
## TCGA-AC-A8OS Positive Positive         1+    Positive          
## TCGA-AN-A0FL Negative Negative         1+    Positive          
## TCGA-AN-A0FX Negative Negative         1+    Positive          
## TCGA-AN-A0FK Positive Positive         1+    Positive          
## TCGA-E9-A295 Positive Positive         1+    Positive          
## TCGA-AC-A3YI Positive Positive         1+    Positive          
## TCGA-JL-A3YW Positive Positive         1+    Positive          
## TCGA-AN-A0FS Positive Negative         1+    Positive          
## TCGA-AN-A0FN Positive Positive         1+    Positive          
## TCGA-AN-A0FJ Positive Negative         1+    Positive          
## TCGA-AN-A03X Positive Positive         1+    Positive          
## TCGA-LL-A73Y Negative Negative         3+    Negative          
## TCGA-AC-A3W6 Positive Positive         3+    Negative          
##              my_HER2_level
## TCGA-AC-A8OS      Negative
## TCGA-AN-A0FL      Negative
## TCGA-AN-A0FX      Negative
## TCGA-AN-A0FK      Negative
## TCGA-E9-A295      Negative
## TCGA-AC-A3YI      Negative
## TCGA-JL-A3YW      Negative
## TCGA-AN-A0FS      Negative
## TCGA-AN-A0FN      Negative
## TCGA-AN-A0FJ      Negative
## TCGA-AN-A03X      Negative
## TCGA-LL-A73Y      Positive
## TCGA-AC-A3W6      Positive
set.seed(2010)

test.id <- sample((1:nrow(data))[-which(getParticipantCode(rownames(data)) %in% c("TCGA-A2-A0EQ","TCGA-BH-A18T","TCGA-AO-A0JL","TCGA-BH-A18Q","TCGA-A2-A04U","TCGA-LL-A73Y","TCGA-AN-A0FL","TCGA-AN-A0FX"))],round(nrow(data)/4),replace=FALSE)

data.train <- data[-test.id,]
data.Y.train <- data.Y[-test.id]

data.test <- data[test.id,]
data.Y.test <- data.Y[test.id]

Fitting a logistic model based on the three biomarker variables (ESR1, PGR and HER2):

glm.data.train.3var <- data.frame(data.Y.train,data.train[,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
colnames(glm.data.train.3var)[1] <- 'data.y'

# without interaction
glm.fit.3var <- glm(data.y~.,family=binomial,data=glm.data.train.3var,control = glm.control(maxit = 1000))
summary(glm.fit.3var)
## 
## Call:
## glm(formula = data.y ~ ., family = binomial, data = glm.data.train.3var, 
##     control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.92289  -0.17395  -0.08124  -0.03853   3.15187  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.94274    0.35293 -11.171  < 2e-16 ***
## ENSG00000091831 -2.76484    0.35260  -7.841 4.46e-15 ***
## ENSG00000082175 -0.05407    0.37973  -0.142    0.887    
## ENSG00000141736 -1.04076    0.18347  -5.673 1.40e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 667.69  on 763  degrees of freedom
## Residual deviance: 233.49  on 760  degrees of freedom
## AIC: 241.49
## 
## Number of Fisher Scoring iterations: 7
# with interaction
glm.fit.3var <- glm(data.y~ENSG00000091831*ENSG00000082175*ENSG00000141736,family=binomial,data=glm.data.train.3var,control = glm.control(maxit = 1000))
summary(glm.fit.3var)
## 
## Call:
## glm(formula = data.y ~ ENSG00000091831 * ENSG00000082175 * ENSG00000141736, 
##     family = binomial, data = glm.data.train.3var, control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.93100  -0.17283  -0.09933  -0.04878   3.13543  
## 
## Coefficients:
##                                                 Estimate Std. Error
## (Intercept)                                     -4.10968    0.42940
## ENSG00000091831                                 -2.22048    0.62773
## ENSG00000082175                                 -0.18193    0.48687
## ENSG00000141736                                 -1.26152    0.50488
## ENSG00000091831:ENSG00000082175                  0.58039    0.52550
## ENSG00000091831:ENSG00000141736                  0.07795    0.75757
## ENSG00000082175:ENSG00000141736                 -0.37518    0.56887
## ENSG00000091831:ENSG00000082175:ENSG00000141736 -0.01215    0.66538
##                                                 z value Pr(>|z|)    
## (Intercept)                                      -9.571  < 2e-16 ***
## ENSG00000091831                                  -3.537 0.000404 ***
## ENSG00000082175                                  -0.374 0.708646    
## ENSG00000141736                                  -2.499 0.012466 *  
## ENSG00000091831:ENSG00000082175                   1.104 0.269396    
## ENSG00000091831:ENSG00000141736                   0.103 0.918045    
## ENSG00000082175:ENSG00000141736                  -0.660 0.509563    
## ENSG00000091831:ENSG00000082175:ENSG00000141736  -0.018 0.985436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 667.69  on 763  degrees of freedom
## Residual deviance: 230.81  on 756  degrees of freedom
## AIC: 246.81
## 
## Number of Fisher Scoring iterations: 8
# predictions for the train and test sets
pred.train.glm.3var <- glm.fit.3var$fitted.values
pred.test.glm.3var <- predict(glm.fit.3var,as.data.frame(data.test[,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")]),type="response")

Misclassifications in the training and test sets:

# misclassified
glm.3var.miscl.train <- rownames(data.train[abs(data.Y.train - round(pred.train.glm.3var)) == 1,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
length(glm.3var.miscl.train)
## [1] 45
glm.3var.miscl.test <- rownames(data.test[abs(data.Y.test - round(pred.test.glm.3var)) == 1,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
length(glm.3var.miscl.test)
## [1] 12

45 misclassifications in the training set and 12 in the test set.

Dubious training cases which are misclassified:

# misclassified and suspect in the train set
glm.3var.train.miscl.suspect <- glm.3var.miscl.train[which(getParticipantCode(glm.3var.miscl.train) %in% suspect)]
glm.3var.train.miscl.suspect
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
## [3] "TCGA-AO-A0JL-01A-11R-A056-07" "TCGA-AN-A0FL-01A-11R-A034-07"
## [5] "TCGA-AN-A0FX-01A-11R-A034-07" "TCGA-BH-A18Q-01A-12R-A12D-07"
## [7] "TCGA-AN-A0FJ-01A-11R-A00Z-07" "TCGA-A2-A04U-01A-11R-A115-07"
# misclassified and suspect in the test set
glm.3var.test.miscl.suspect <- glm.3var.miscl.test[which(getParticipantCode(glm.3var.miscl.test) %in% suspect)]
glm.3var.test.miscl.suspect
## [1] "TCGA-JL-A3YW-01A-12R-A239-07"

Several suspect cases with discordant HER2 (IHC) status and HER2 (FISH), as well as discordant HER2 (IHC) level and status are indeed influent and misclassified, based on the three biomarkers.

Looking for possible confounding variables in the clinical data:

# categorical variables identified as potential interesting variables
clinical.interest <- c('age_at_initial_pathologic_diagnosis','race_list','history_of_neoadjuvant_treatment','gender','person_neoplasm_cancer_status','stage_event_pathologic_stage','stage_event_tnm_categories','menopause_status')
TNBC.glm.clinical <- clinical$primary.solid.tumor[, clinical.interest]

TNBC.glm.clinical <- data.frame(data.Y,TNBC.glm.clinical[rownames(clinical.tnbc), ])

# Univariate logistic models
TNBC.glm.fit <- glm(data.Y~age_at_initial_pathologic_diagnosis,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)# significant
## 
## Call:
## glm(formula = data.Y ~ age_at_initial_pathologic_diagnosis, family = binomial, 
##     data = TNBC.glm.clinical, control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8111  -0.6258  -0.5557  -0.4711   2.2501  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)                         -0.331167   0.391982  -0.845 0.398194
## age_at_initial_pathologic_diagnosis -0.023526   0.006829  -3.445 0.000571
##                                        
## (Intercept)                            
## age_at_initial_pathologic_diagnosis ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.56  on 1017  degrees of freedom
## Residual deviance: 873.31  on 1016  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 877.31
## 
## Number of Fisher Scoring iterations: 4
TNBC.glm.fit <- glm(data.Y~gender,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)
## 
## Call:
## glm(formula = data.Y ~ gender, family = binomial, data = TNBC.glm.clinical, 
##     control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5883  -0.5883  -0.5883  -0.5883   1.9181  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.6665     0.0862 -19.333   <2e-16 ***
## genderMALE  -13.8995   420.1371  -0.033    0.974    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.90  on 1018  degrees of freedom
## Residual deviance: 881.77  on 1017  degrees of freedom
## AIC: 885.77
## 
## Number of Fisher Scoring iterations: 14
TNBC.glm.fit <- glm(data.Y~race_list,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit) # significant
## 
## Call:
## glm(formula = data.Y ~ race_list, family = binomial, data = TNBC.glm.clinical, 
##     control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8712  -0.5286  -0.5286  -0.5286   2.2792  
## 
## Coefficients:
##                                           Estimate Std. Error z value
## (Intercept)                                -2.5200     0.3929  -6.414
## race_listAMERICAN INDIAN OR ALASKA NATIVE -11.0461   535.4113  -0.021
## race_listASIAN                              0.7493     0.5483   1.367
## race_listBLACK OR AFRICAN AMERICAN          1.7468     0.4259   4.101
## race_listWHITE                              0.6223     0.4086   1.523
##                                           Pr(>|z|)    
## (Intercept)                               1.42e-10 ***
## race_listAMERICAN INDIAN OR ALASKA NATIVE    0.984    
## race_listASIAN                               0.172    
## race_listBLACK OR AFRICAN AMERICAN        4.11e-05 ***
## race_listWHITE                               0.128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.90  on 1018  degrees of freedom
## Residual deviance: 849.12  on 1014  degrees of freedom
## AIC: 859.12
## 
## Number of Fisher Scoring iterations: 12
TNBC.glm.fit <- glm(data.Y~history_of_neoadjuvant_treatment,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)
## 
## Call:
## glm(formula = data.Y ~ history_of_neoadjuvant_treatment, family = binomial, 
##     data = TNBC.glm.clinical, control = glm.control(maxit = 1000))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1774  -0.5872  -0.5872  -0.5872   1.9198  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)                          8.277e-15  1.414e+00   0.000    1.000
## history_of_neoadjuvant_treatmentNo  -1.670e+00  1.417e+00  -1.179    0.238
## history_of_neoadjuvant_treatmentYes -1.557e+01  4.037e+02  -0.039    0.969
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 885.90  on 1018  degrees of freedom
## Residual deviance: 880.17  on 1016  degrees of freedom
## AIC: 886.17
## 
## Number of Fisher Scoring iterations: 14
TNBC.glm.fit <- glm(data.Y~person_neoplasm_cancer_status,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))

TNBC.glm.fit <- glm(data.Y~stage_event_pathologic_stage ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))

TNBC.glm.fit <- glm(data.Y~stage_event_tnm_categories ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))

TNBC.glm.fit <- glm(data.Y~menopause_status ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
# building a multivariate glm model accouting for all potential interesting categorical variables
TNBC.glm.fit <- glm(data.Y~age_at_initial_pathologic_diagnosis+gender+race_list+history_of_neoadjuvant_treatment+menopause_status+person_neoplasm_cancer_status+stage_event_pathologic_stage,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
#summary(TNBC.glm.fit)

# Chi-squared test between response and race (significant)
chisq.test(table(TNBC.glm.clinical$data.Y,TNBC.glm.clinical$race_list))
## 
##  Pearson's Chi-squared test
## 
## data:  table(TNBC.glm.clinical$data.Y, TNBC.glm.clinical$race_list)
## X-squared = 41.392, df = 4, p-value = 2.229e-08
# Fisher test (more accurate - significant)
fisher.test(table(TNBC.glm.clinical$data.Y,TNBC.glm.clinical$race_list))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(TNBC.glm.clinical$data.Y, TNBC.glm.clinical$race_list)
## p-value = 8.796e-08
## alternative hypothesis: two.sided
# building the final matrix (gene expression plus age and race)
data.plus.race.age <- data
names(TNBC.glm.clinical)
## [1] "data.Y"                             
## [2] "age_at_initial_pathologic_diagnosis"
## [3] "race_list"                          
## [4] "history_of_neoadjuvant_treatment"   
## [5] "gender"                             
## [6] "person_neoplasm_cancer_status"      
## [7] "stage_event_pathologic_stage"       
## [8] "stage_event_tnm_categories"         
## [9] "menopause_status"
age.vect <- TNBC.glm.clinical[,2] 
race.vect <- as.character(TNBC.glm.clinical[,3])
race.vect[race.vect==""] <- ""
race.vect[race.vect=="AMERICAN INDIAN OR ALASKA NATIVE"] <- "0"
race.vect[race.vect=="WHITE"] <- "0"
race.vect[race.vect=="ASIAN"] <- "0"
race.vect[race.vect=="BLACK OR AFRICAN AMERICAN"] <- "1"
data.plus.race.age <- cbind(data.plus.race.age,as.factor(race.vect),age.vect,as.factor(data.Y))
dim(data.plus.race.age)
## [1]  1019 19691
# changing binary code for race ("1" is "african american or black")
data.plus.race.age <- data.plus.race.age[data.plus.race.age[,19689]!="1",]# eliminates individuals without race
data.plus.race.age[data.plus.race.age[,19689]=="2",19689] <- 0
data.plus.race.age[data.plus.race.age[,19689]=="3",19689] <- 1

# changing binary code for Y (response)
data.plus.race.age[data.plus.race.age[,19691]=="1",19691] <- 0
data.plus.race.age[data.plus.race.age[,19691]=="2",19691] <- 1

summary(data.plus.race.age[,19690])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   26.00   49.00   58.00   58.06   67.00   90.00       1
data.plus.race.age <- data.plus.race.age[-which(is.na(data.plus.race.age[,19690])),]
# eliminates individuals without age record

dim(data.plus.race.age)
## [1]   924 19691
data.plus.race.age.rank <- data.plus.race.age

# normalizing age
data.plus.race.age.rank[,19690] <- (data.plus.race.age.rank[,19690] - mean(data.plus.race.age.rank[,19690]))/ sd(data.plus.race.age.rank[,19690]) # normalized

data.Y.rank <- data.plus.race.age.rank[,19691]
length(data.Y.rank)
## [1] 924
data.plus.race.age.rank <- data.plus.race.age.rank[,-19691]
dim(data.plus.race.age.rank)
## [1]   924 19690

3 Ensemble outlier detection

3.1 TNBC original data

set.seed(2010)

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## Loading required package: foreach
## Loaded glmnet 2.0-13
my.alpha <- seq(0.1,0.9,0.1) 

## Optimizing alpha and lambda for logistic regression with Elastic net regularization

nvar.selected.EN <- matrix(0,1,length(my.alpha))
pred.EN <- matrix(0,dim(data.plus.race.age.rank)[1],length(my.alpha))
MSE.EN <- matrix(0,1,length(my.alpha))

# assigning samples to folds, to be used in cross-validation when tunning alpha
set.seed(2010)
foldid.tnbc <- sample(1:10,size=length(data.Y.rank),replace=TRUE)

for (j in 1:length(my.alpha)){
  
  # Logistic model fitting with 10-fold cross-validation for glmnet:
  fit.EN.cv <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[j])
  
  var.selected.EN <- which(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))] != 0)
  nvar.selected.EN[j] <- length(var.selected.EN)
  
  # Predictions obtained by model i
  pred.EN[,j] <- predict(fit.EN.cv,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")
  
  # Mean squared error of prediction (MSE)
  MSE.EN[j] <- mean((data.Y.rank-pred.EN)^2)
}

MSE.EN
##           [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
## [1,] 0.1492157 0.1326005 0.1157746 0.09898047 0.08214026 0.06548295
##            [,7]       [,8]      [,9]
## [1,] 0.04891806 0.03246128 0.0162588
nvar.selected.EN
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,]  438  282  230  197  175  152  143  126  107
my.alpha[which(MSE.EN == min(MSE.EN))]
## [1] 0.9
## Model ensemble

n.models <- 3
nvar.selected <- matrix(0,1,n.models)
MSE <- matrix(0,1,n.models)
Miscl <- matrix(0,dim(data.plus.race.age.rank)[1],n.models)
CookD <- matrix(0,dim(data.plus.race.age.rank)[1],n.models)

# logistic regression with Elastic net regularization (LOGIT-EN)

i <- 1

my.alpha.opt <- my.alpha[which(MSE.EN == min(MSE.EN))]

fit.EN.cv <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[which(MSE.EN == min(MSE.EN))])

var.selected.EN <- which(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))] != 0)
nvar.selected[i] <- length(var.selected.EN)

assign(paste("var.selected", i, sep = ""), var.selected.EN)

# Predictions obtained by model i:
pred.EN <- predict(fit.EN.cv,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")

# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.EN)^2)

# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.EN))

table(data.Y.rank,round(pred.EN))
##            
## data.Y.rank   0   1
##           0 763   8
##           1   8 145
# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.EN*(1-pred.EN))))
H <- V %*% data.plus.race.age.rank[,var.selected.EN] %*% (solve(t(data.plus.race.age.rank[,var.selected.EN]) %*% V %*% data.plus.race.age.rank[,var.selected.EN])) %*% t(data.plus.race.age.rank[,var.selected.EN]) %*% V # the hat matrix

CookD[,i] <- (data.Y.rank - pred.EN)^2 * diag(H) / ((pred.EN*(-pred.EN+1))*((-diag(H)+1))^2)


# SPLSDA

i <- 2

library(spls)
## Sparse Partial Least Squares (SPLS) Regression and
## Classification (version 2.2-1)
# Optimizing K and eta by cross-validation
#fit.splsda.LOGIT.cv <- cv.splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank), fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), kappa=0.5, classifier="logistic", scale.x=FALSE,n.core=10)

#fit.splsda.LOGIT.cv$K.opt
#fit.splsda.LOGIT.cv$eta.opt

# fixing the optimum K and eta obtained by cross-validation (for reproducibility)
fit.splsda.LOGIT <- splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank), K = 4, eta=0.8, classifier="logistic", scale.x=FALSE)

var.selected.splsda.LOGIT <- fit.splsda.LOGIT$A
nvar.selected[i] <- length(var.selected.splsda.LOGIT)

assign(paste("var.selected", i, sep = ""), var.selected.splsda.LOGIT)

# Predictions obtained by model i:
pred.splsda.LOGIT <- predict(fit.splsda.LOGIT, as.matrix(data.plus.race.age.rank),type = "fit","coefficient", fit.type = "response")

table(data.Y.rank,round(pred.splsda.LOGIT))
##            
## data.Y.rank   0   1
##           0 756  15
##           1  14 139
# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.splsda.LOGIT)^2)

# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.splsda.LOGIT))

# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.splsda.LOGIT*(1-pred.splsda.LOGIT))))
H <- V %*% (fit.splsda.LOGIT$T) %*% (solve(t(fit.splsda.LOGIT$T) %*% V %*% fit.splsda.LOGIT$T)) %*% t(fit.splsda.LOGIT$T) %*% V # the hat matrix

CookD[,i] <- (data.Y.rank - pred.splsda.LOGIT)^2 * diag(H) / ((pred.splsda.LOGIT*(-pred.splsda.LOGIT+1))*((-diag(H)+1))^2)


# SGPLS

i <- 3

library(spls)
library(parallel)

# Optimizing K and eta by cross-validation
#fit.sgpls.LOGIT.cv <- cv.sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank, fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), scale.x=FALSE,n.core=10)

# fixing the optimum K and eta obtained by cross-validation (for reproducibility)
fit.sgpls.LOGIT <- sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank, K = 4, eta=0.7, scale.x=FALSE)

var.selected.sgpls.LOGIT <- fit.sgpls.LOGIT$A
nvar.selected[i] <- length(var.selected.sgpls.LOGIT)

assign(paste("var.selected", i, sep = ""), var.selected.sgpls.LOGIT)

# Predictions obtained by model i:
pred.sgpls.LOGIT <- predict(fit.sgpls.LOGIT, as.matrix(data.plus.race.age.rank),type="fit",fit.type="response")

table(data.Y.rank,round(pred.sgpls.LOGIT))
##            
## data.Y.rank   0   1
##           0 763   8
##           1  15 138
# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.sgpls.LOGIT)^2)

# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.sgpls.LOGIT))

# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.sgpls.LOGIT*(1-pred.sgpls.LOGIT))))
H <- V %*% (data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% (solve(t(data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% V %*% data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W)) %*% t(data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% V 

CookD[,i] <- (data.Y.rank - pred.sgpls.LOGIT)^2 * diag(H) / ((pred.sgpls.LOGIT*(-pred.sgpls.LOGIT+1))*((-diag(H)+1))^2)

# Mean squared error for the three models:
MSE
##            [,1]       [,2]       [,3]
## [1,] 0.01976205 0.02500814 0.08362941
# Misclassifications for the three models:
apply(Miscl,2,sum)
## [1] 16 29 23
# Number of variables selected by the three models:
nvar.selected
##      [,1] [,2] [,3]
## [1,]  107 2945  551
# Variables selected by the three models:
var.selected=list(var.selected1,var.selected2,var.selected3)


# The rank product:
rank.matrix <- apply(CookD,2, function(CookD) rank(-(CookD),ties.method = "first"))
rank.product <- apply(rank.matrix, 1, prod)

rho <-rank.product
n <- dim(data.plus.race.age.rank)[1]
k <- dim(rank.matrix)[2]

source("~/Heskes_pvalues.R")

# The p-values:
pvalues <- as.vector(rankprodbounds(rho,n,k,Delta ='geometric'))

# The q-values:
library(qvalue)
qobj <- qvalue(pvalues)
qvalues <- qobj$qvalues

# Misclassifications (%) by the three models
Miscl.percent <- apply(Miscl,1, function(Miscl) round((sum(Miscl)/k)*100))

# suspect individuals within the misclassified by model
Miscl.suspect.LOGIT.EN <- rownames(data.plus.race.age.rank[which(Miscl[,1]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,1]==1),])) %in% suspect)]
Miscl.suspect.LOGIT.EN
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07"
Miscl.suspect.SPLSDA <- rownames(data.plus.race.age.rank[which(Miscl[,2]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,2]==1),])) %in% suspect)]
Miscl.suspect.SPLSDA
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
Miscl.suspect.SGPLS <- rownames(data.plus.race.age.rank[which(Miscl[,3]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,3]==1),])) %in% suspect)]
Miscl.suspect.SGPLS
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
## [3] "TCGA-AN-A0FX-01A-11R-A034-07"
# Building the rank matrix
id <- as.vector(seq(1:dim(data.plus.race.age.rank)[1]))

outliers.rank <- as.data.frame(cbind(id, rank.matrix,rank.product,pvalues,qvalues,Miscl.percent))

# suspect in the top 25 logistic with EN model rank
rank.LOGIT.EN <- outliers.rank[order(outliers.rank[,2]),]
rownames(data.plus.race.age.rank[rank.LOGIT.EN[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.LOGIT.EN[1:25,1],])) %in% suspect)]
## [1] "TCGA-A2-A0EQ-01A-11R-A034-07"
# suspect in the top 25 SPLSDA model rank
rank.SPLSDA <- outliers.rank[order(outliers.rank[,3]),]
rownames(data.plus.race.age.rank[rank.SPLSDA[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.SPLSDA[1:25,1],])) %in% suspect)]
## [1] "TCGA-A2-A0EQ-01A-11R-A034-07" "TCGA-LL-A5YP-01A-21R-A28M-07"
# suspect in the top 25 SGPLS model rank
rank.SGPLS <- outliers.rank[order(outliers.rank[,4]),]
rownames(data.plus.race.age.rank[rank.SGPLS[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.SGPLS[1:25,1],])) %in% suspect)]
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-AN-A0FJ-01A-11R-A00Z-07"
outliers.rank <- outliers.rank[order(qvalues),]

# influent patients
TNBC.influent <- outliers.rank[which(outliers.rank[,7] < 0.05),1]
length(TNBC.influent)
## [1] 24
# brca type of the influent patients
data.Y.rank[TNBC.influent]
## TCGA-AC-A2QJ-01A-12R-A19W-07 TCGA-E9-A22G-01A-11R-A157-07 
##                            1                            0 
## TCGA-AR-A251-01A-12R-A169-07 TCGA-AR-A1AJ-01A-21R-A12P-07 
##                            0                            0 
## TCGA-A2-A3Y0-01A-11R-A239-07 TCGA-A2-A3XV-01A-21R-A239-07 
##                            0                            0 
## TCGA-E9-A1ND-01A-11R-A144-07 TCGA-EW-A1P1-01A-31R-A14D-07 
##                            0                            1 
## TCGA-E2-A1II-01A-11R-A144-07 TCGA-C8-A3M7-01A-12R-A21T-07 
##                            0                            1 
## TCGA-D8-A1JF-01A-11R-A13Q-07 TCGA-BH-A42U-01A-12R-A24H-07 
##                            1                            1 
## TCGA-LL-A740-01A-21R-A32P-07 TCGA-A2-A1G6-01A-11R-A13Q-07 
##                            1                            1 
## TCGA-OL-A5S0-01A-11R-A28M-07 TCGA-A2-A0EQ-01A-11R-A034-07 
##                            0                            1 
## TCGA-A2-A0YJ-01A-11R-A109-07 TCGA-AR-A1AH-01A-11R-A12D-07 
##                            0                            0 
## TCGA-AC-A62X-01A-11R-A29R-07 TCGA-E2-A1LB-01A-11R-A144-07 
##                            0                            0 
## TCGA-OL-A97C-01A-32R-A41B-07 TCGA-LL-A5YP-01A-21R-A28M-07 
##                            1                            0 
## TCGA-C8-A26X-01A-31R-A16F-07 TCGA-D8-A1XW-01A-11R-A14M-07 
##                            1                            0
sum(data.Y.rank[TNBC.influent]) # 10 TNBC and 14 non-TNBC patients
## [1] 10
# influent patients which are suspect
TNBC.influent.suspect <- rownames(data.plus.race.age.rank[TNBC.influent,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])) %in% suspect)]

# brca type of the influent, suspect patients
data.Y.rank[TNBC.influent][which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])) %in% suspect)]
## TCGA-A2-A0EQ-01A-11R-A034-07 TCGA-LL-A5YP-01A-21R-A28M-07 
##                            1                            0
# patients which are always misclassified
TNBC.alw.miscl <- which(apply(Miscl,1,sum)==3)
length(TNBC.alw.miscl)
## [1] 12
# which are suspect
TNBC.miscl.suspect <- rownames(data.plus.race.age.rank[TNBC.alw.miscl,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.alw.miscl,])) %in% suspect)]
length(TNBC.miscl.suspect)
## [1] 1
# TNBC-associated variables (ER, PG and HER2) selected by the three models:
which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected1
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected2
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected3
## [1] TRUE
# ER selected by the three models

which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected1
## [1] FALSE
which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected2
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected3
## [1] FALSE
# PR only selected by SPLSDA

which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected1
## [1] FALSE
which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected2
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected3
## [1] TRUE
# HER2 selected by SPLSDA and SGPLS 

# Table 8 (Lopes et al.)
cbind(xdata[rownames(data.plus.race.age.rank[TNBC.influent,]),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])),c(1:5)],clinical.tnbc[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])),4],outliers.rank[1:length(TNBC.influent),2:8])
##                              ENSG00000091831 ENSG00000082175
## TCGA-AC-A2QJ-01A-12R-A19W-07      0.09981546     0.001169463
## TCGA-E9-A22G-01A-11R-A157-07      0.44347936     0.019834637
## TCGA-AR-A251-01A-12R-A169-07      1.56609545     0.098195977
## TCGA-AR-A1AJ-01A-21R-A12P-07      1.46653566     0.066805316
## TCGA-A2-A3Y0-01A-11R-A239-07      2.18089099     0.027241671
## TCGA-A2-A3XV-01A-21R-A239-07      0.01584832     0.028223807
## TCGA-E9-A1ND-01A-11R-A144-07      1.43733608     0.052120773
## TCGA-EW-A1P1-01A-31R-A14D-07      3.01820732     2.558892237
## TCGA-E2-A1II-01A-11R-A144-07      0.14270408     0.194132777
## TCGA-C8-A3M7-01A-12R-A21T-07      4.26609694     0.757809876
## TCGA-D8-A1JF-01A-11R-A13Q-07      1.25673293     0.110233707
## TCGA-BH-A42U-01A-12R-A24H-07      9.18726081     1.827635514
## TCGA-LL-A740-01A-21R-A32P-07      0.30222325     0.115332941
## TCGA-A2-A1G6-01A-11R-A13Q-07     23.89538282    21.452681110
## TCGA-OL-A5S0-01A-11R-A28M-07      0.09174435     0.063196003
## TCGA-A2-A0EQ-01A-11R-A034-07      2.12986768     0.039939245
## TCGA-A2-A0YJ-01A-11R-A109-07      0.08534338     0.031144565
## TCGA-AR-A1AH-01A-11R-A12D-07      0.03165734     0.030357178
## TCGA-AC-A62X-01A-11R-A29R-07      0.18607614     0.021666988
## TCGA-E2-A1LB-01A-11R-A144-07      0.41506846     0.091132159
## TCGA-OL-A97C-01A-32R-A41B-07     16.24660389     8.561992129
## TCGA-LL-A5YP-01A-21R-A28M-07      0.15598544     0.051391118
## TCGA-C8-A26X-01A-31R-A16F-07      0.41547024     0.127914672
## TCGA-D8-A1XW-01A-11R-A14M-07      0.31874784     0.111179573
##                              ENSG00000141736     ESR1      PGR HER2_level
## TCGA-AC-A2QJ-01A-12R-A19W-07        7.344798 Negative Negative           
## TCGA-E9-A22G-01A-11R-A157-07       15.317028 Negative Negative           
## TCGA-AR-A251-01A-12R-A169-07       14.019308 Positive Negative         2+
## TCGA-AR-A1AJ-01A-21R-A12P-07        9.742704 Positive Negative           
## TCGA-A2-A3Y0-01A-11R-A239-07       11.336328 Positive Negative         1+
## TCGA-A2-A3XV-01A-21R-A239-07      137.937244 Positive Negative         2+
## TCGA-E9-A1ND-01A-11R-A144-07       13.054248 Negative Negative           
## TCGA-EW-A1P1-01A-31R-A14D-07       23.641926 Negative Negative         2+
## TCGA-E2-A1II-01A-11R-A144-07       10.728130 Negative Positive         1+
## TCGA-C8-A3M7-01A-12R-A21T-07       25.470280 Negative Negative           
## TCGA-D8-A1JF-01A-11R-A13Q-07       32.930532 Negative Negative         1+
## TCGA-BH-A42U-01A-12R-A24H-07       38.367228 Negative Negative           
## TCGA-LL-A740-01A-21R-A32P-07       68.557840 Negative Negative         2+
## TCGA-A2-A1G6-01A-11R-A13Q-07       29.740007 Negative Negative         1+
## TCGA-OL-A5S0-01A-11R-A28M-07       31.919459 Positive Negative           
## TCGA-A2-A0EQ-01A-11R-A034-07       30.146140 Negative Negative         3+
## TCGA-A2-A0YJ-01A-11R-A109-07      240.243961 Positive Negative          0
## TCGA-AR-A1AH-01A-11R-A12D-07       34.123225 Positive Negative           
## TCGA-AC-A62X-01A-11R-A29R-07       28.531373 Positive Negative           
## TCGA-E2-A1LB-01A-11R-A144-07     1129.864921 Negative Negative         3+
## TCGA-OL-A97C-01A-32R-A41B-07       24.037033 Negative Negative           
## TCGA-LL-A5YP-01A-21R-A28M-07       15.096317 Positive Negative         1+
## TCGA-C8-A26X-01A-31R-A16F-07       60.117188 Negative Negative         1+
## TCGA-D8-A1XW-01A-11R-A14M-07       21.029425 Negative Positive         1+
##                              HER2_status HER2_FISH
## TCGA-AC-A2QJ-01A-12R-A19W-07    Negative          
## TCGA-E9-A22G-01A-11R-A157-07    Positive          
## TCGA-AR-A251-01A-12R-A169-07   Equivocal  Negative
## TCGA-AR-A1AJ-01A-21R-A12P-07    Negative          
## TCGA-A2-A3Y0-01A-11R-A239-07    Negative          
## TCGA-A2-A3XV-01A-21R-A239-07   Equivocal  Positive
## TCGA-E9-A1ND-01A-11R-A144-07    Positive          
## TCGA-EW-A1P1-01A-31R-A14D-07   Equivocal  Negative
## TCGA-E2-A1II-01A-11R-A144-07    Negative          
## TCGA-C8-A3M7-01A-12R-A21T-07    Negative          
## TCGA-D8-A1JF-01A-11R-A13Q-07    Negative          
## TCGA-BH-A42U-01A-12R-A24H-07    Negative          
## TCGA-LL-A740-01A-21R-A32P-07   Equivocal  Negative
## TCGA-A2-A1G6-01A-11R-A13Q-07    Negative          
## TCGA-OL-A5S0-01A-11R-A28M-07              Positive
## TCGA-A2-A0EQ-01A-11R-A034-07    Positive  Negative
## TCGA-A2-A0YJ-01A-11R-A109-07    Negative          
## TCGA-AR-A1AH-01A-11R-A12D-07    Negative          
## TCGA-AC-A62X-01A-11R-A29R-07                      
## TCGA-E2-A1LB-01A-11R-A144-07    Positive          
## TCGA-OL-A97C-01A-32R-A41B-07              Negative
## TCGA-LL-A5YP-01A-21R-A28M-07    Negative  Positive
## TCGA-C8-A26X-01A-31R-A16F-07    Negative          
## TCGA-D8-A1XW-01A-11R-A14M-07    Negative          
##                              clinical.tnbc[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent, 
## TCGA-AC-A2QJ-01A-12R-A19W-07                                                                              TNBC
## TCGA-E9-A22G-01A-11R-A157-07                                                                           NO_TNBC
## TCGA-AR-A251-01A-12R-A169-07                                                                           NO_TNBC
## TCGA-AR-A1AJ-01A-21R-A12P-07                                                                           NO_TNBC
## TCGA-A2-A3Y0-01A-11R-A239-07                                                                           NO_TNBC
## TCGA-A2-A3XV-01A-21R-A239-07                                                                           NO_TNBC
## TCGA-E9-A1ND-01A-11R-A144-07                                                                           NO_TNBC
## TCGA-EW-A1P1-01A-31R-A14D-07                                                                              TNBC
## TCGA-E2-A1II-01A-11R-A144-07                                                                           NO_TNBC
## TCGA-C8-A3M7-01A-12R-A21T-07                                                                              TNBC
## TCGA-D8-A1JF-01A-11R-A13Q-07                                                                              TNBC
## TCGA-BH-A42U-01A-12R-A24H-07                                                                              TNBC
## TCGA-LL-A740-01A-21R-A32P-07                                                                              TNBC
## TCGA-A2-A1G6-01A-11R-A13Q-07                                                                              TNBC
## TCGA-OL-A5S0-01A-11R-A28M-07                                                                           NO_TNBC
## TCGA-A2-A0EQ-01A-11R-A034-07                                                                              TNBC
## TCGA-A2-A0YJ-01A-11R-A109-07                                                                           NO_TNBC
## TCGA-AR-A1AH-01A-11R-A12D-07                                                                           NO_TNBC
## TCGA-AC-A62X-01A-11R-A29R-07                                                                           NO_TNBC
## TCGA-E2-A1LB-01A-11R-A144-07                                                                           NO_TNBC
## TCGA-OL-A97C-01A-32R-A41B-07                                                                              TNBC
## TCGA-LL-A5YP-01A-21R-A28M-07                                                                           NO_TNBC
## TCGA-C8-A26X-01A-31R-A16F-07                                                                              TNBC
## TCGA-D8-A1XW-01A-11R-A14M-07                                                                           NO_TNBC
##                               V2  V3  V4 rank.product      pvalues
## TCGA-AC-A2QJ-01A-12R-A19W-07  13   1  34          442 1.395626e-05
## TCGA-E9-A22G-01A-11R-A157-07   6  13   7          546 1.828758e-05
## TCGA-AR-A251-01A-12R-A169-07  11   5   9          495 1.613518e-05
## TCGA-AR-A1AJ-01A-21R-A12P-07   2  10  44          880 3.347280e-05
## TCGA-A2-A3Y0-01A-11R-A239-07   8   7  22         1232 5.007570e-05
## TCGA-A2-A3XV-01A-21R-A239-07 166   2   5         1660 7.091672e-05
## TCGA-E9-A1ND-01A-11R-A144-07   5  11  48         2640 1.204479e-04
## TCGA-EW-A1P1-01A-31R-A14D-07  16  24   8         3072 1.427763e-04
## TCGA-E2-A1II-01A-11R-A144-07   9  21  20         3780 1.797824e-04
## TCGA-C8-A3M7-01A-12R-A21T-07   1  28 153         4284 2.063567e-04
## TCGA-D8-A1JF-01A-11R-A13Q-07  19 109   2         4142 1.988535e-04
## TCGA-BH-A42U-01A-12R-A24H-07   7   3 223         4683 2.274980e-04
## TCGA-LL-A740-01A-21R-A32P-07  47 125   1         5875 2.910510e-04
## TCGA-A2-A1G6-01A-11R-A13Q-07  10   4 204         8160 4.138327e-04
## TCGA-OL-A5S0-01A-11R-A28M-07  46   8  21         7728 3.905611e-04
## TCGA-A2-A0EQ-01A-11R-A034-07   4  12 216        10368 5.329668e-04
## TCGA-A2-A0YJ-01A-11R-A109-07  27  34  12        11016 5.679550e-04
## TCGA-AR-A1AH-01A-11R-A12D-07  21  16  38        12768 6.625311e-04
## TCGA-AC-A62X-01A-11R-A29R-07  22  35  19        14630 7.629362e-04
## TCGA-E2-A1LB-01A-11R-A144-07 230   6  11        15180 7.925618e-04
## TCGA-OL-A97C-01A-32R-A41B-07  14  41  30        17220 9.022870e-04
## TCGA-LL-A5YP-01A-21R-A28M-07  50  22  18        19800 1.040644e-03
## TCGA-C8-A26X-01A-31R-A16F-07  53  64   6        20352 1.070180e-03
## TCGA-D8-A1XW-01A-11R-A14M-07  23  14  68        21896 1.152666e-03
##                                  qvalues Miscl.percent
## TCGA-AC-A2QJ-01A-12R-A19W-07 0.005632575            67
## TCGA-E9-A22G-01A-11R-A157-07 0.005632575            33
## TCGA-AR-A251-01A-12R-A169-07 0.005632575            67
## TCGA-AR-A1AJ-01A-21R-A12P-07 0.007732217           100
## TCGA-A2-A3Y0-01A-11R-A239-07 0.009253989            33
## TCGA-A2-A3XV-01A-21R-A239-07 0.010921174            33
## TCGA-E9-A1ND-01A-11R-A144-07 0.015899128           100
## TCGA-EW-A1P1-01A-31R-A14D-07 0.016490665           100
## TCGA-E2-A1II-01A-11R-A144-07 0.017333964            33
## TCGA-C8-A3M7-01A-12R-A21T-07 0.017333964           100
## TCGA-D8-A1JF-01A-11R-A13Q-07 0.017333964             0
## TCGA-BH-A42U-01A-12R-A24H-07 0.017517346           100
## TCGA-LL-A740-01A-21R-A32P-07 0.020687007             0
## TCGA-A2-A1G6-01A-11R-A13Q-07 0.025492092           100
## TCGA-OL-A5S0-01A-11R-A28M-07 0.025492092            67
## TCGA-A2-A0EQ-01A-11R-A034-07 0.030778830            67
## TCGA-A2-A0YJ-01A-11R-A109-07 0.030870023             0
## TCGA-AR-A1AH-01A-11R-A12D-07 0.034009932           100
## TCGA-AC-A62X-01A-11R-A29R-07 0.036616355             0
## TCGA-E2-A1LB-01A-11R-A144-07 0.036616355            33
## TCGA-OL-A97C-01A-32R-A41B-07 0.039700630            33
## TCGA-LL-A5YP-01A-21R-A28M-07 0.042993316           100
## TCGA-C8-A26X-01A-31R-A16F-07 0.042993316            33
## TCGA-D8-A1XW-01A-11R-A14M-07 0.044377638            33
# plot SPLSDA-scores (Lopes et al.)
TNBC.matrix.splsda12.plot <- data.frame(LV1 = fit.splsda.LOGIT$T[,1],LV2 = fit.splsda.LOGIT$T[,2], Class = as.character(data.Y.rank))

TNBC.influent.matrix.splsda12.plot <- data.frame(LV1 =  fit.splsda.LOGIT$T[TNBC.influent,1],LV2 = fit.splsda.LOGIT$T[TNBC.influent,2], Class = as.character(data.Y.rank[TNBC.influent]))

TNBC.suspect.matrix.splsda12.plot <- data.frame(LV1 = fit.splsda.LOGIT$T[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),1],LV2 = fit.splsda.LOGIT$T[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),2], Class = as.character(TNBC.matrix.splsda12.plot[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),3]))

library(ggplot2)
#postscript("splsda.ps")
#tiff("splsda.tiff")
ggplot(TNBC.matrix.splsda12.plot, aes(LV1, LV2)) + geom_point(aes(color = Class,shape = Class),size=3) + scale_shape_manual(values=c(21,24)) + scale_colour_manual(values=c("black", "black","black","black")) + theme(legend.text=element_text(size=2.5)) + theme_minimal() + theme(axis.text=element_text(size=14)) + theme(axis.title.y = element_text(size = 20, angle = 90)) + theme(axis.title.x = element_text(size = 20, angle = 0)) + 
  geom_point(data=TNBC.influent.matrix.splsda12.plot,aes(x=LV1, y=LV2,shape = Class,col="black",bg="red"),size=3) + 
  geom_point(data=TNBC.suspect.matrix.splsda12.plot,aes(x=LV1, y=LV2,shape = Class,col="black",bg="blue"),size=3) + guides(colour=FALSE)