The data analysis described below requires R version 3.3.2 and Bioconductor version 3.4.
The Cancer Genome Atlas Breast Invasive Carcinoma (TCGA-BRCA) data collection is part of a larger effort to build a research community focused on connecting cancer phenotypes to genotypes by providing clinical images matched to subjects from The Cancer Genome Atlas (TCGA).
The BRCA data is publicly available (https://cancergenome.nih.gov/) and is decomposed into two datasets:
the gene expression data, composed of 57251 variables for a total of 1222 samples with 1097 individuals. From those samples, 1102 with primary solid tumor, 7 metastatic and 113 with normal tissue;
the clinical data is composed of 114 variables obtained from the same individuals (much more is available in the gdc data variable, such as follow_up, drug, radiation, …)
The BRCA datasets will be imported using the package ‘brca.data’.
# Package developed by our group
#library(devtools)
#devtools::install_url('https://github.com/averissimo/brca.data/releases/download/1.0/brca.data_1.0.tar.gz')
library(brca.data)
Triple negative breast cancer (TNBC) refers to any BRCA that does not express the genes for the estrogen receptor (ESR1; ENSG00000091831), progesterone receptor (PGR; ENSG00000082175) and which does not overexpress the human epidermal growth factor receptor 2 (HER2; ENSG00000141736).
From the clinical data, the columns related to gene expression status of ESR1, PGR and HER2 must be selected. There are three possible variables for HER2:
‘her2_immunohistochemistry_level_result’, which refers to the ImmunoHistoChemistry (IHC) HER2 result into levels ‘0’ (negative), ‘1’ (negative), ‘2+’ (indeterminate) and ‘3+’(positive);
‘lab_proc_her2_neu_immunohistochemistry_receptor_status’, corresponding to the IHC result into status indeterminate, equivocal, negative and positive;
‘lab_procedure_her2_neu_in_situ_hybrid_outcome_type’, the HER2 individuals’ classification by the fluorescence in situ hybridization (FISH) method, considered more accurate and often performed when IHC results do not show a clear classification of cells into HER2-positive or negative.
data("fpkm.per.tissue","fpkm.per.tissue.barcode", 'clinical', package = 'brca.data')
tnbc.vars <- c('breast_carcinoma_estrogen_receptor_status',
'breast_carcinoma_progesterone_receptor_status',
'her2_immunohistochemistry_level_result',
'lab_proc_her2_neu_immunohistochemistry_receptor_status',
'lab_procedure_her2_neu_in_situ_hybrid_outcome_type'
)
clinical.tnbc <- clinical$primary.solid.tumor[, tnbc.vars]
clinical.tnbc.aux <- clinical.tnbc
names(clinical.tnbc.aux) <- c("ESR1","PGR","HER2_level","HER2_status","HER2_FISH")
summary(clinical.tnbc.aux)
## ESR1 PGR HER2_level HER2_status
## : 48 : 49 :471 :177
## Indeterminate: 2 Indeterminate: 4 0 : 60 Equivocal :178
## Negative :237 Negative :343 1+ :271 Indeterminate: 12
## Positive :803 Positive :694 2+ :198 Negative :559
## NA's : 1 NA's : 1 3+ : 90 Positive :164
## NA's: 1 NA's : 1
## HER2_FISH
## :673
## Equivocal : 5
## Indeterminate: 4
## Negative :330
## Positive : 78
## NA's : 1
which(is.na(clinical.tnbc.aux[,1]))
## [1] 321
clinical.tnbc.aux <- clinical.tnbc.aux[-(which(is.na(clinical.tnbc.aux[,1]))),]
Finding the concordance between the HER2 variables measured.
Concordance between HER2 (IHC) level and HER2 (IHC) status:
table(as.factor(clinical.tnbc.aux[,3]),clinical.tnbc.aux[,4])
##
## Equivocal Indeterminate Negative Positive
## 176 7 6 241 41
## 0 0 0 0 60 0
## 1+ 0 4 1 255 11
## 2+ 0 166 4 1 27
## 3+ 1 1 1 2 85
Creating a vector of HER2 (IHC) status based on HER2 (ICH) level:
clinical.tnbc.aux$my_HER2_level <- as.character(clinical.tnbc.aux$HER2_level)
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '0'),6] <- 'Negative'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '1+'),6] <- 'Negative'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '2+'),6] <- 'Indeterminate'
clinical.tnbc.aux[which(clinical.tnbc.aux$HER2_level == '3+'),6] <- 'Positive'
Individuals for which HER2 (IHC) status different from myHER2 (IHC) status (based on HER2 IHC level)
HER2status.dif.HER2level <- clinical.tnbc.aux[c(rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,6] == 'Negative' & clinical.tnbc.aux[,4] == 'Positive'),]),rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,6] == 'Positive' & clinical.tnbc.aux[,4] == 'Negative'),])),]
# getting the individuals short name, as in the clinical data
clinical.tnbc.barcode.pattern <- '(TCGA-[A-Z0-9a-z]{2}-[a-zA-Z0-9]{4})-([0-9]{2}).+'
clinical.tnbc.shortname <- gsub(clinical.tnbc.barcode.pattern,'\\1',rownames(clinical.tnbc))
# individuals with discordant HER2 (IHC) status and HER2 (IHC) level
HER2status.dif.HER2level.names <- clinical.tnbc.shortname[which(clinical.tnbc.shortname %in% rownames(HER2status.dif.HER2level))]
length(HER2status.dif.HER2level.names)
## [1] 13
Concordance between HER2 (IHC) status and HER2 (FISH):
table(clinical.tnbc.aux[,5],clinical.tnbc.aux[,4])
##
## Equivocal Indeterminate Negative Positive
## 115 18 4 432 104
## Equivocal 0 3 0 0 2
## Indeterminate 0 0 0 3 1
## Negative 53 138 6 121 12
## Positive 9 19 2 3 45
Individuals for which HER2 (IHC) status different from HER2 (FISH):
HER2status.dif.FISH <- clinical.tnbc.aux[c(rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,4] == 'Negative' & clinical.tnbc.aux[,5] == 'Positive'),]),rownames(clinical.tnbc.aux[which(clinical.tnbc.aux[,4] == 'Positive' & clinical.tnbc.aux[,5] == 'Negative'),])),]
HER2status.dif.HER2FISH.names <- clinical.tnbc.shortname[which(clinical.tnbc.shortname %in% rownames( HER2status.dif.FISH))]
length(HER2status.dif.HER2FISH.names)
## [1] 15
# suspect individuals with discordante HER2 labels from different lab testing
suspect <- c(HER2status.dif.HER2level.names,HER2status.dif.HER2FISH.names)
length(suspect)
## [1] 28
Replacing HER2 (IHC) status by HER2 (FISH) (a more accurate test) whenever available:
clinical.tnbc.aux$HER2_status_plus_FISH <- clinical.tnbc.aux[,4]
clinical.tnbc.aux[clinical.tnbc.aux[,5] !='',7] <- clinical.tnbc.aux[clinical.tnbc.aux[,5] !='',5]
Concordance between HER2 (IHC) status and HER2 (IHC status + FISH):
table(clinical.tnbc.aux[,4],clinical.tnbc.aux[,7])
##
## Equivocal Indeterminate Negative Positive
## 115 0 0 53 9
## Equivocal 0 21 0 138 19
## Indeterminate 0 0 4 6 2
## Negative 0 0 3 553 3
## Positive 0 2 1 12 149
Building the final clinical TNBC dataset, composed of all individuals and gene expression status of ESR1, PGR and HER2:
clinical.tnbc <- clinical.tnbc.aux[,c(1,2,7)]
# replace all '' or NA with 'Indeterminate'
clinical.tnbc[is.na(clinical.tnbc) | clinical.tnbc == '' | clinical.tnbc == 'Equivocal'] <- 'Indeterminate'
# individuals with at least one 'Positive' and keep all are marked as not TNBC
tnbc.status.pos <- clinical.tnbc == 'Positive'
#tnbc.status.pos <- clinical.tnbc[,c(1,2,3)] == 'Positive'
not.tnbc.id <- rownames(tnbc.status.pos[which(sapply(seq(nrow(tnbc.status.pos)), function(ix) { any(tnbc.status.pos[ix,]) })),])
# individuals with at least one 'Indeterminate' and excluded from dataset
tnbc.status.ind <- clinical.tnbc[!(rownames(clinical.tnbc) %in% not.tnbc.id),] == 'Indeterminate'
not.tnbc.id <- rownames(tnbc.status.ind[which(sapply(seq(nrow(tnbc.status.ind)), function(ix) { any(tnbc.status.ind[ix,]) })),])
clinical.tnbc <- clinical.tnbc[!(rownames(clinical.tnbc) %in% not.tnbc.id), ]
tnbc.status.neg <- clinical.tnbc == 'Negative'
tnbc.ix <- sapply(seq(nrow(clinical.tnbc)), function(ix) { all(tnbc.status.neg[ix,]) })
# set cases with TNBC
clinical.tnbc$tnbc <- 'NO_TNBC'
clinical.tnbc[tnbc.ix, 'tnbc'] <- 'TNBC'
Building the Y (binary data, i.e., TNBC vs. nonTNBC) response vector:
# building the ydata (binary)
ydata <- data.frame(tnbc = clinical.tnbc$tnbc, row.names = rownames(clinical.tnbc))
ydata$tnbc <- factor(ydata$tnbc)
Protein coding genes catalog for building the X gene expression matrix
I. Install packages
source("http://bioconductor.org/biocLite.R")
## Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
biocLite(c('ensembldb', 'EnsDb.Hsapiens.v86'))
## BioC_mirror: https://bioconductor.org
## Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.2 (2017-09-28).
## Installing package(s) 'ensembldb', 'EnsDb.Hsapiens.v86'
## Old packages: 'biomaRt', 'chron', 'circlize', 'cowplot', 'dbplyr',
## 'digest', 'dnet', 'dotCall64', 'DRR', 'DT', 'edgeR', 'ellipse', 'fgsea',
## 'fpc', 'GenomicRanges', 'git2r', 'haven', 'hexbin', 'Hmisc', 'hms',
## 'htmlTable', 'htmlwidgets', 'irlba', 'knitr', 'lava', 'lhs', 'limma',
## 'littler', 'lme4', 'MASS', 'matrixStats', 'mgcv', 'microbenchmark',
## 'mvabund', 'pcaPP', 'psichomics', 'Rcpp', 'RCurl', 'recipes', 'rlang',
## 'rms', 'rpart', 'rprojroot', 'rtracklayer', 'shinyjs', 'sp', 'spam',
## 'SummarizedExperiment', 'TCGAbiolinks', 'tibble', 'TTR', 'viridis',
## 'withr', 'xml2', 'xts', 'zoo'
# Load libraries
suppressPackageStartupMessages({
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(futile.logger)
.Last.value <- flog.layout(layout.format('~m'))
})
Ensembl is a genome browser for vertebrate genomes that supports research in comparative genomics, evolution, sequence variation and transcriptional regulation. Ensembl annotate genes, computes multiple alignments, predicts regulatory function and collects disease data. Ensembl tools include BLAST, BLAT, BioMart and the Variant Effect Predictor (VEP) for all supported species.
edb <- EnsDb.Hsapiens.v86
# hasProteinData(edb) # from devel only, as of 3.4
ensembl.protein.coding <- genes(edb,
filter = list(GenebiotypeFilter('protein_coding')),
columns = c('gene_id', 'gene_name'))
{
flog.info(' Granges: %d', nrow(ensembl.protein.coding@elementMetadata))
flog.info('Metadata columns: %d', ncol(ensembl.protein.coding@elementMetadata))
}
## Granges: 22285
## Metadata columns: 3
The Consensus CDS (CCDS) project is a collaborative effort to identify a core set of human and mouse protein coding regions that are consistently annotated and of high quality. The long term goal is to support convergence towards a standard set of gene annotations.
ccds <- read.table(url('ftp://ftp.ncbi.nih.gov/pub/CCDS/current_human/CCDS.current.txt'),
sep = '\t',
header = T,
comment.char = "|", # necessary as header line has a # character
stringsAsFactors = FALSE)
flog.info('Size of ccds: %d x %d', nrow(ccds), ncol(ccds))
## Size of ccds: 34245 x 11
ensembl.genes <- sort(unique(ensembl.protein.coding@elementMetadata$gene_name))
ensembl.genes.ensg.id <- sort(ensembl.protein.coding@elementMetadata$gene_id)
ccds.genes <- sort(unique(ccds$gene))
ccds.extra.genes <- sort(ccds.genes[(!ccds.genes %in% ensembl.genes)])
ccds.extra.genes.ensg.id <- genes(edb, filter = list(GenenameFilter(ccds.extra.genes)),
columns = c('gene_id', 'gene_name'))
ensg.id <- sort(unique(c(ensembl.protein.coding@elementMetadata$gene_id, ensembl.genes.ensg.id)))
all.genes <- sort(unique(c(ensembl.genes, ccds.extra.genes)))
Building the X gene expression data matrix:
xdata <- t(fpkm.per.tissue$primary.solid.tumor[which(rownames(fpkm.per.tissue$primary.solid.tumor) %in% ensg.id),-which(duplicated(fpkm.per.tissue.barcode$primary.solid.tumor))])
common.rows <- getParticipantCode(rownames(xdata)) %in% rownames(ydata)
xdata <- xdata[common.rows,]
# removing variables with sd=0
xdata.sd <- sapply(seq(ncol(xdata)), function(ix) {sd(xdata[,ix])})
xdata <- xdata[,xdata.sd != 0]
# Gene expression data for ESR1, PGR and HER2 from the suspect (potential outlier) cases identified above
# Clinical and gene expression data of individuals with discordant HER2 level and HER2 status
xdata.shortname <- getParticipantCode(rownames(xdata))
xdata.plus.clinical.HER2status.dif.HER2level <- cbind(xdata[which(xdata.shortname %in% HER2status.dif.HER2level.names), c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[which(rownames(clinical.tnbc.aux) %in% HER2status.dif.HER2level.names),],ydata$tnbc[which(rownames(ydata) %in% HER2status.dif.HER2level.names)])
# clinical and gene expression data of individuals with discordant HER2 status and HER2 FISH
xdata.shortname <- getParticipantCode(rownames(xdata))
xdata.plus.clinical.HER2status.dif.HER2FISH <- cbind(xdata[which(xdata.shortname %in% HER2status.dif.HER2FISH.names), c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[which(rownames(clinical.tnbc.aux) %in% HER2status.dif.HER2FISH.names),],ydata$tnbc[which(rownames(ydata) %in% HER2status.dif.HER2FISH.names)])
Data transformation to be used in further analysis:
data.Y <- as.matrix(ydata)
data.Y[data.Y =="NO_TNBC"] <- 0
data.Y[data.Y =="TNBC"] <- 1
data.Y <- as.numeric(data.Y)
# log-transform xdata
data <- log2(xdata+1)
# normalizing xdata
data <- (data-matrix(apply(data,2,mean),nrow(data),ncol(data),byrow=TRUE))/matrix(apply(data,2,sd),nrow(data),ncol(data),byrow=TRUE)
Summary of TNBC biomarkers:
summary(xdata[which(data.Y=="1"),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
## ENSG00000091831 ENSG00000082175 ENSG00000141736
## Min. : 0.01938 Min. : 0.001169 Min. : 1.561
## 1st Qu.: 0.16063 1st Qu.: 0.037205 1st Qu.: 13.964
## Median : 0.35070 Median : 0.079403 Median : 19.776
## Mean : 1.52980 Mean : 0.711833 Mean : 21.991
## 3rd Qu.: 0.82760 3rd Qu.: 0.186198 3rd Qu.: 26.058
## Max. :29.97893 Max. :22.977867 Max. :103.689
summary(xdata[which(data.Y=="0"),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
## ENSG00000091831 ENSG00000082175 ENSG00000141736
## Min. : 0.01585 Min. : 0.0079 Min. : 0.6054
## 1st Qu.: 16.14359 1st Qu.: 0.6003 1st Qu.: 26.5798
## Median : 36.66739 Median : 4.2282 Median : 38.7319
## Mean : 47.88104 Mean : 12.0115 Mean : 99.7406
## 3rd Qu.: 69.64894 3rd Qu.: 15.3263 3rd Qu.: 58.8010
## Max. :272.20332 Max. :327.9133 Max. :1668.3526
# Building the training and test sets; keep in the training set observations ESR1- and PGR- that would have the opposite classification if HER2 (FISH) had not replaced the HER2 (IHC) status; and observations ESR1- and PGR- for which FISH was not measured and have no concordance between HER2 (IHC) status and HER2 (IHC) level (having a potential impact on the final TNBC/nonTNBC classification)
# taking individuals ID from the 'HER2status.dif.FISH' and 'HER2status.dif.HER2level' created above
HER2status.dif.FISH
## ESR1 PGR HER2_level HER2_status HER2_FISH
## TCGA-LL-A5YP Positive Negative 1+ Negative Positive
## TCGA-AO-A0JL Negative Negative 1+ Negative Positive
## TCGA-A2-A04U Negative Negative 1+ Negative Positive
## TCGA-A2-A0EQ Negative Negative 3+ Positive Negative
## TCGA-BH-A18T Negative Negative 2+ Positive Negative
## TCGA-AN-A0XV Positive Positive 2+ Positive Negative
## TCGA-AO-A12C Positive Positive 3+ Positive Negative
## TCGA-BH-A18Q Negative Negative Positive Negative
## TCGA-LL-A5YL Positive Negative 3+ Positive Negative
## TCGA-E2-A10A Positive Positive 2+ Positive Negative
## TCGA-AO-A03L Positive Positive 2+ Positive Negative
## TCGA-AO-A12G Positive Positive 2+ Positive Negative
## TCGA-BH-A1EX Positive Positive Positive Negative
## TCGA-BH-A0AU Positive Positive Positive Negative
## TCGA-AN-A0XW Positive Positive 2+ Positive Negative
## my_HER2_level
## TCGA-LL-A5YP Negative
## TCGA-AO-A0JL Negative
## TCGA-A2-A04U Negative
## TCGA-A2-A0EQ Positive
## TCGA-BH-A18T Indeterminate
## TCGA-AN-A0XV Indeterminate
## TCGA-AO-A12C Positive
## TCGA-BH-A18Q
## TCGA-LL-A5YL Positive
## TCGA-E2-A10A Indeterminate
## TCGA-AO-A03L Indeterminate
## TCGA-AO-A12G Indeterminate
## TCGA-BH-A1EX
## TCGA-BH-A0AU
## TCGA-AN-A0XW Indeterminate
HER2status.dif.HER2level
## ESR1 PGR HER2_level HER2_status HER2_FISH
## TCGA-AC-A8OS Positive Positive 1+ Positive
## TCGA-AN-A0FL Negative Negative 1+ Positive
## TCGA-AN-A0FX Negative Negative 1+ Positive
## TCGA-AN-A0FK Positive Positive 1+ Positive
## TCGA-E9-A295 Positive Positive 1+ Positive
## TCGA-AC-A3YI Positive Positive 1+ Positive
## TCGA-JL-A3YW Positive Positive 1+ Positive
## TCGA-AN-A0FS Positive Negative 1+ Positive
## TCGA-AN-A0FN Positive Positive 1+ Positive
## TCGA-AN-A0FJ Positive Negative 1+ Positive
## TCGA-AN-A03X Positive Positive 1+ Positive
## TCGA-LL-A73Y Negative Negative 3+ Negative
## TCGA-AC-A3W6 Positive Positive 3+ Negative
## my_HER2_level
## TCGA-AC-A8OS Negative
## TCGA-AN-A0FL Negative
## TCGA-AN-A0FX Negative
## TCGA-AN-A0FK Negative
## TCGA-E9-A295 Negative
## TCGA-AC-A3YI Negative
## TCGA-JL-A3YW Negative
## TCGA-AN-A0FS Negative
## TCGA-AN-A0FN Negative
## TCGA-AN-A0FJ Negative
## TCGA-AN-A03X Negative
## TCGA-LL-A73Y Positive
## TCGA-AC-A3W6 Positive
set.seed(2010)
test.id <- sample((1:nrow(data))[-which(getParticipantCode(rownames(data)) %in% c("TCGA-A2-A0EQ","TCGA-BH-A18T","TCGA-AO-A0JL","TCGA-BH-A18Q","TCGA-A2-A04U","TCGA-LL-A73Y","TCGA-AN-A0FL","TCGA-AN-A0FX"))],round(nrow(data)/4),replace=FALSE)
data.train <- data[-test.id,]
data.Y.train <- data.Y[-test.id]
data.test <- data[test.id,]
data.Y.test <- data.Y[test.id]
Fitting a logistic model based on the three biomarker variables (ESR1, PGR and HER2):
glm.data.train.3var <- data.frame(data.Y.train,data.train[,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
colnames(glm.data.train.3var)[1] <- 'data.y'
# without interaction
glm.fit.3var <- glm(data.y~.,family=binomial,data=glm.data.train.3var,control = glm.control(maxit = 1000))
summary(glm.fit.3var)
##
## Call:
## glm(formula = data.y ~ ., family = binomial, data = glm.data.train.3var,
## control = glm.control(maxit = 1000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.92289 -0.17395 -0.08124 -0.03853 3.15187
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.94274 0.35293 -11.171 < 2e-16 ***
## ENSG00000091831 -2.76484 0.35260 -7.841 4.46e-15 ***
## ENSG00000082175 -0.05407 0.37973 -0.142 0.887
## ENSG00000141736 -1.04076 0.18347 -5.673 1.40e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 667.69 on 763 degrees of freedom
## Residual deviance: 233.49 on 760 degrees of freedom
## AIC: 241.49
##
## Number of Fisher Scoring iterations: 7
# with interaction
glm.fit.3var <- glm(data.y~ENSG00000091831*ENSG00000082175*ENSG00000141736,family=binomial,data=glm.data.train.3var,control = glm.control(maxit = 1000))
summary(glm.fit.3var)
##
## Call:
## glm(formula = data.y ~ ENSG00000091831 * ENSG00000082175 * ENSG00000141736,
## family = binomial, data = glm.data.train.3var, control = glm.control(maxit = 1000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.93100 -0.17283 -0.09933 -0.04878 3.13543
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -4.10968 0.42940
## ENSG00000091831 -2.22048 0.62773
## ENSG00000082175 -0.18193 0.48687
## ENSG00000141736 -1.26152 0.50488
## ENSG00000091831:ENSG00000082175 0.58039 0.52550
## ENSG00000091831:ENSG00000141736 0.07795 0.75757
## ENSG00000082175:ENSG00000141736 -0.37518 0.56887
## ENSG00000091831:ENSG00000082175:ENSG00000141736 -0.01215 0.66538
## z value Pr(>|z|)
## (Intercept) -9.571 < 2e-16 ***
## ENSG00000091831 -3.537 0.000404 ***
## ENSG00000082175 -0.374 0.708646
## ENSG00000141736 -2.499 0.012466 *
## ENSG00000091831:ENSG00000082175 1.104 0.269396
## ENSG00000091831:ENSG00000141736 0.103 0.918045
## ENSG00000082175:ENSG00000141736 -0.660 0.509563
## ENSG00000091831:ENSG00000082175:ENSG00000141736 -0.018 0.985436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 667.69 on 763 degrees of freedom
## Residual deviance: 230.81 on 756 degrees of freedom
## AIC: 246.81
##
## Number of Fisher Scoring iterations: 8
# predictions for the train and test sets
pred.train.glm.3var <- glm.fit.3var$fitted.values
pred.test.glm.3var <- predict(glm.fit.3var,as.data.frame(data.test[,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")]),type="response")
Misclassifications in the training and test sets:
# misclassified
glm.3var.miscl.train <- rownames(data.train[abs(data.Y.train - round(pred.train.glm.3var)) == 1,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
length(glm.3var.miscl.train)
## [1] 45
glm.3var.miscl.test <- rownames(data.test[abs(data.Y.test - round(pred.test.glm.3var)) == 1,c("ENSG00000091831","ENSG00000082175","ENSG00000141736")])
length(glm.3var.miscl.test)
## [1] 12
45 misclassifications in the training set and 12 in the test set.
Dubious training cases which are misclassified:
# misclassified and suspect in the train set
glm.3var.train.miscl.suspect <- glm.3var.miscl.train[which(getParticipantCode(glm.3var.miscl.train) %in% suspect)]
glm.3var.train.miscl.suspect
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
## [3] "TCGA-AO-A0JL-01A-11R-A056-07" "TCGA-AN-A0FL-01A-11R-A034-07"
## [5] "TCGA-AN-A0FX-01A-11R-A034-07" "TCGA-BH-A18Q-01A-12R-A12D-07"
## [7] "TCGA-AN-A0FJ-01A-11R-A00Z-07" "TCGA-A2-A04U-01A-11R-A115-07"
# misclassified and suspect in the test set
glm.3var.test.miscl.suspect <- glm.3var.miscl.test[which(getParticipantCode(glm.3var.miscl.test) %in% suspect)]
glm.3var.test.miscl.suspect
## [1] "TCGA-JL-A3YW-01A-12R-A239-07"
Several suspect cases with discordant HER2 (IHC) status and HER2 (FISH), as well as discordant HER2 (IHC) level and status are indeed influent and misclassified, based on the three biomarkers.
Looking for possible confounding variables in the clinical data:
# categorical variables identified as potential interesting variables
clinical.interest <- c('age_at_initial_pathologic_diagnosis','race_list','history_of_neoadjuvant_treatment','gender','person_neoplasm_cancer_status','stage_event_pathologic_stage','stage_event_tnm_categories','menopause_status')
TNBC.glm.clinical <- clinical$primary.solid.tumor[, clinical.interest]
TNBC.glm.clinical <- data.frame(data.Y,TNBC.glm.clinical[rownames(clinical.tnbc), ])
# Univariate logistic models
TNBC.glm.fit <- glm(data.Y~age_at_initial_pathologic_diagnosis,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)# significant
##
## Call:
## glm(formula = data.Y ~ age_at_initial_pathologic_diagnosis, family = binomial,
## data = TNBC.glm.clinical, control = glm.control(maxit = 1000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8111 -0.6258 -0.5557 -0.4711 2.2501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.331167 0.391982 -0.845 0.398194
## age_at_initial_pathologic_diagnosis -0.023526 0.006829 -3.445 0.000571
##
## (Intercept)
## age_at_initial_pathologic_diagnosis ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 885.56 on 1017 degrees of freedom
## Residual deviance: 873.31 on 1016 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 877.31
##
## Number of Fisher Scoring iterations: 4
TNBC.glm.fit <- glm(data.Y~gender,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)
##
## Call:
## glm(formula = data.Y ~ gender, family = binomial, data = TNBC.glm.clinical,
## control = glm.control(maxit = 1000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5883 -0.5883 -0.5883 -0.5883 1.9181
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6665 0.0862 -19.333 <2e-16 ***
## genderMALE -13.8995 420.1371 -0.033 0.974
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 885.90 on 1018 degrees of freedom
## Residual deviance: 881.77 on 1017 degrees of freedom
## AIC: 885.77
##
## Number of Fisher Scoring iterations: 14
TNBC.glm.fit <- glm(data.Y~race_list,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit) # significant
##
## Call:
## glm(formula = data.Y ~ race_list, family = binomial, data = TNBC.glm.clinical,
## control = glm.control(maxit = 1000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8712 -0.5286 -0.5286 -0.5286 2.2792
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.5200 0.3929 -6.414
## race_listAMERICAN INDIAN OR ALASKA NATIVE -11.0461 535.4113 -0.021
## race_listASIAN 0.7493 0.5483 1.367
## race_listBLACK OR AFRICAN AMERICAN 1.7468 0.4259 4.101
## race_listWHITE 0.6223 0.4086 1.523
## Pr(>|z|)
## (Intercept) 1.42e-10 ***
## race_listAMERICAN INDIAN OR ALASKA NATIVE 0.984
## race_listASIAN 0.172
## race_listBLACK OR AFRICAN AMERICAN 4.11e-05 ***
## race_listWHITE 0.128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 885.90 on 1018 degrees of freedom
## Residual deviance: 849.12 on 1014 degrees of freedom
## AIC: 859.12
##
## Number of Fisher Scoring iterations: 12
TNBC.glm.fit <- glm(data.Y~history_of_neoadjuvant_treatment,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
summary(TNBC.glm.fit)
##
## Call:
## glm(formula = data.Y ~ history_of_neoadjuvant_treatment, family = binomial,
## data = TNBC.glm.clinical, control = glm.control(maxit = 1000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1774 -0.5872 -0.5872 -0.5872 1.9198
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.277e-15 1.414e+00 0.000 1.000
## history_of_neoadjuvant_treatmentNo -1.670e+00 1.417e+00 -1.179 0.238
## history_of_neoadjuvant_treatmentYes -1.557e+01 4.037e+02 -0.039 0.969
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 885.90 on 1018 degrees of freedom
## Residual deviance: 880.17 on 1016 degrees of freedom
## AIC: 886.17
##
## Number of Fisher Scoring iterations: 14
TNBC.glm.fit <- glm(data.Y~person_neoplasm_cancer_status,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
TNBC.glm.fit <- glm(data.Y~stage_event_pathologic_stage ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
TNBC.glm.fit <- glm(data.Y~stage_event_tnm_categories ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
TNBC.glm.fit <- glm(data.Y~menopause_status ,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
# building a multivariate glm model accouting for all potential interesting categorical variables
TNBC.glm.fit <- glm(data.Y~age_at_initial_pathologic_diagnosis+gender+race_list+history_of_neoadjuvant_treatment+menopause_status+person_neoplasm_cancer_status+stage_event_pathologic_stage,family=binomial,data=TNBC.glm.clinical,control = glm.control(maxit = 1000))
#summary(TNBC.glm.fit)
# Chi-squared test between response and race (significant)
chisq.test(table(TNBC.glm.clinical$data.Y,TNBC.glm.clinical$race_list))
##
## Pearson's Chi-squared test
##
## data: table(TNBC.glm.clinical$data.Y, TNBC.glm.clinical$race_list)
## X-squared = 41.392, df = 4, p-value = 2.229e-08
# Fisher test (more accurate - significant)
fisher.test(table(TNBC.glm.clinical$data.Y,TNBC.glm.clinical$race_list))
##
## Fisher's Exact Test for Count Data
##
## data: table(TNBC.glm.clinical$data.Y, TNBC.glm.clinical$race_list)
## p-value = 8.796e-08
## alternative hypothesis: two.sided
# building the final matrix (gene expression plus age and race)
data.plus.race.age <- data
names(TNBC.glm.clinical)
## [1] "data.Y"
## [2] "age_at_initial_pathologic_diagnosis"
## [3] "race_list"
## [4] "history_of_neoadjuvant_treatment"
## [5] "gender"
## [6] "person_neoplasm_cancer_status"
## [7] "stage_event_pathologic_stage"
## [8] "stage_event_tnm_categories"
## [9] "menopause_status"
age.vect <- TNBC.glm.clinical[,2]
race.vect <- as.character(TNBC.glm.clinical[,3])
race.vect[race.vect==""] <- ""
race.vect[race.vect=="AMERICAN INDIAN OR ALASKA NATIVE"] <- "0"
race.vect[race.vect=="WHITE"] <- "0"
race.vect[race.vect=="ASIAN"] <- "0"
race.vect[race.vect=="BLACK OR AFRICAN AMERICAN"] <- "1"
data.plus.race.age <- cbind(data.plus.race.age,as.factor(race.vect),age.vect,as.factor(data.Y))
dim(data.plus.race.age)
## [1] 1019 19691
# changing binary code for race ("1" is "african american or black")
data.plus.race.age <- data.plus.race.age[data.plus.race.age[,19689]!="1",]# eliminates individuals without race
data.plus.race.age[data.plus.race.age[,19689]=="2",19689] <- 0
data.plus.race.age[data.plus.race.age[,19689]=="3",19689] <- 1
# changing binary code for Y (response)
data.plus.race.age[data.plus.race.age[,19691]=="1",19691] <- 0
data.plus.race.age[data.plus.race.age[,19691]=="2",19691] <- 1
summary(data.plus.race.age[,19690])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 26.00 49.00 58.00 58.06 67.00 90.00 1
data.plus.race.age <- data.plus.race.age[-which(is.na(data.plus.race.age[,19690])),]
# eliminates individuals without age record
dim(data.plus.race.age)
## [1] 924 19691
data.plus.race.age.rank <- data.plus.race.age
# normalizing age
data.plus.race.age.rank[,19690] <- (data.plus.race.age.rank[,19690] - mean(data.plus.race.age.rank[,19690]))/ sd(data.plus.race.age.rank[,19690]) # normalized
data.Y.rank <- data.plus.race.age.rank[,19691]
length(data.Y.rank)
## [1] 924
data.plus.race.age.rank <- data.plus.race.age.rank[,-19691]
dim(data.plus.race.age.rank)
## [1] 924 19690
set.seed(2010)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## Loading required package: foreach
## Loaded glmnet 2.0-13
my.alpha <- seq(0.1,0.9,0.1)
## Optimizing alpha and lambda for logistic regression with Elastic net regularization
nvar.selected.EN <- matrix(0,1,length(my.alpha))
pred.EN <- matrix(0,dim(data.plus.race.age.rank)[1],length(my.alpha))
MSE.EN <- matrix(0,1,length(my.alpha))
# assigning samples to folds, to be used in cross-validation when tunning alpha
set.seed(2010)
foldid.tnbc <- sample(1:10,size=length(data.Y.rank),replace=TRUE)
for (j in 1:length(my.alpha)){
# Logistic model fitting with 10-fold cross-validation for glmnet:
fit.EN.cv <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[j])
var.selected.EN <- which(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))] != 0)
nvar.selected.EN[j] <- length(var.selected.EN)
# Predictions obtained by model i
pred.EN[,j] <- predict(fit.EN.cv,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")
# Mean squared error of prediction (MSE)
MSE.EN[j] <- mean((data.Y.rank-pred.EN)^2)
}
MSE.EN
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.1492157 0.1326005 0.1157746 0.09898047 0.08214026 0.06548295
## [,7] [,8] [,9]
## [1,] 0.04891806 0.03246128 0.0162588
nvar.selected.EN
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 438 282 230 197 175 152 143 126 107
my.alpha[which(MSE.EN == min(MSE.EN))]
## [1] 0.9
## Model ensemble
n.models <- 3
nvar.selected <- matrix(0,1,n.models)
MSE <- matrix(0,1,n.models)
Miscl <- matrix(0,dim(data.plus.race.age.rank)[1],n.models)
CookD <- matrix(0,dim(data.plus.race.age.rank)[1],n.models)
# logistic regression with Elastic net regularization (LOGIT-EN)
i <- 1
my.alpha.opt <- my.alpha[which(MSE.EN == min(MSE.EN))]
fit.EN.cv <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[which(MSE.EN == min(MSE.EN))])
var.selected.EN <- which(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))] != 0)
nvar.selected[i] <- length(var.selected.EN)
assign(paste("var.selected", i, sep = ""), var.selected.EN)
# Predictions obtained by model i:
pred.EN <- predict(fit.EN.cv,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")
# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.EN)^2)
# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.EN))
table(data.Y.rank,round(pred.EN))
##
## data.Y.rank 0 1
## 0 763 8
## 1 8 145
# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.EN*(1-pred.EN))))
H <- V %*% data.plus.race.age.rank[,var.selected.EN] %*% (solve(t(data.plus.race.age.rank[,var.selected.EN]) %*% V %*% data.plus.race.age.rank[,var.selected.EN])) %*% t(data.plus.race.age.rank[,var.selected.EN]) %*% V # the hat matrix
CookD[,i] <- (data.Y.rank - pred.EN)^2 * diag(H) / ((pred.EN*(-pred.EN+1))*((-diag(H)+1))^2)
# SPLSDA
i <- 2
library(spls)
## Sparse Partial Least Squares (SPLS) Regression and
## Classification (version 2.2-1)
# Optimizing K and eta by cross-validation
#fit.splsda.LOGIT.cv <- cv.splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank), fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), kappa=0.5, classifier="logistic", scale.x=FALSE,n.core=10)
#fit.splsda.LOGIT.cv$K.opt
#fit.splsda.LOGIT.cv$eta.opt
# fixing the optimum K and eta obtained by cross-validation (for reproducibility)
fit.splsda.LOGIT <- splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank), K = 4, eta=0.8, classifier="logistic", scale.x=FALSE)
var.selected.splsda.LOGIT <- fit.splsda.LOGIT$A
nvar.selected[i] <- length(var.selected.splsda.LOGIT)
assign(paste("var.selected", i, sep = ""), var.selected.splsda.LOGIT)
# Predictions obtained by model i:
pred.splsda.LOGIT <- predict(fit.splsda.LOGIT, as.matrix(data.plus.race.age.rank),type = "fit","coefficient", fit.type = "response")
table(data.Y.rank,round(pred.splsda.LOGIT))
##
## data.Y.rank 0 1
## 0 756 15
## 1 14 139
# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.splsda.LOGIT)^2)
# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.splsda.LOGIT))
# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.splsda.LOGIT*(1-pred.splsda.LOGIT))))
H <- V %*% (fit.splsda.LOGIT$T) %*% (solve(t(fit.splsda.LOGIT$T) %*% V %*% fit.splsda.LOGIT$T)) %*% t(fit.splsda.LOGIT$T) %*% V # the hat matrix
CookD[,i] <- (data.Y.rank - pred.splsda.LOGIT)^2 * diag(H) / ((pred.splsda.LOGIT*(-pred.splsda.LOGIT+1))*((-diag(H)+1))^2)
# SGPLS
i <- 3
library(spls)
library(parallel)
# Optimizing K and eta by cross-validation
#fit.sgpls.LOGIT.cv <- cv.sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank, fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), scale.x=FALSE,n.core=10)
# fixing the optimum K and eta obtained by cross-validation (for reproducibility)
fit.sgpls.LOGIT <- sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank, K = 4, eta=0.7, scale.x=FALSE)
var.selected.sgpls.LOGIT <- fit.sgpls.LOGIT$A
nvar.selected[i] <- length(var.selected.sgpls.LOGIT)
assign(paste("var.selected", i, sep = ""), var.selected.sgpls.LOGIT)
# Predictions obtained by model i:
pred.sgpls.LOGIT <- predict(fit.sgpls.LOGIT, as.matrix(data.plus.race.age.rank),type="fit",fit.type="response")
table(data.Y.rank,round(pred.sgpls.LOGIT))
##
## data.Y.rank 0 1
## 0 763 8
## 1 15 138
# Mean squared error of prediction (MSE):
MSE[i] <- mean((data.Y.rank-pred.sgpls.LOGIT)^2)
# Misclassified individuals based on model i:
Miscl[,i] <- abs(data.Y.rank - round(pred.sgpls.LOGIT))
# The Cook’s distance for each individual based on model i:
V <- diag(as.vector(sqrt(pred.sgpls.LOGIT*(1-pred.sgpls.LOGIT))))
H <- V %*% (data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% (solve(t(data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% V %*% data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W)) %*% t(data.plus.race.age.rank[,fit.sgpls.LOGIT$A]%*%fit.sgpls.LOGIT$W) %*% V
CookD[,i] <- (data.Y.rank - pred.sgpls.LOGIT)^2 * diag(H) / ((pred.sgpls.LOGIT*(-pred.sgpls.LOGIT+1))*((-diag(H)+1))^2)
# Mean squared error for the three models:
MSE
## [,1] [,2] [,3]
## [1,] 0.01976205 0.02500814 0.08362941
# Misclassifications for the three models:
apply(Miscl,2,sum)
## [1] 16 29 23
# Number of variables selected by the three models:
nvar.selected
## [,1] [,2] [,3]
## [1,] 107 2945 551
# Variables selected by the three models:
var.selected=list(var.selected1,var.selected2,var.selected3)
# The rank product:
rank.matrix <- apply(CookD,2, function(CookD) rank(-(CookD),ties.method = "first"))
rank.product <- apply(rank.matrix, 1, prod)
rho <-rank.product
n <- dim(data.plus.race.age.rank)[1]
k <- dim(rank.matrix)[2]
source("~/Heskes_pvalues.R")
# The p-values:
pvalues <- as.vector(rankprodbounds(rho,n,k,Delta ='geometric'))
# The q-values:
library(qvalue)
qobj <- qvalue(pvalues)
qvalues <- qobj$qvalues
# Misclassifications (%) by the three models
Miscl.percent <- apply(Miscl,1, function(Miscl) round((sum(Miscl)/k)*100))
# suspect individuals within the misclassified by model
Miscl.suspect.LOGIT.EN <- rownames(data.plus.race.age.rank[which(Miscl[,1]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,1]==1),])) %in% suspect)]
Miscl.suspect.LOGIT.EN
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07"
Miscl.suspect.SPLSDA <- rownames(data.plus.race.age.rank[which(Miscl[,2]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,2]==1),])) %in% suspect)]
Miscl.suspect.SPLSDA
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
Miscl.suspect.SGPLS <- rownames(data.plus.race.age.rank[which(Miscl[,3]==1),])[which(getParticipantCode(rownames(data.plus.race.age.rank[which(Miscl[,3]==1),])) %in% suspect)]
Miscl.suspect.SGPLS
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"
## [3] "TCGA-AN-A0FX-01A-11R-A034-07"
# Building the rank matrix
id <- as.vector(seq(1:dim(data.plus.race.age.rank)[1]))
outliers.rank <- as.data.frame(cbind(id, rank.matrix,rank.product,pvalues,qvalues,Miscl.percent))
# suspect in the top 25 logistic with EN model rank
rank.LOGIT.EN <- outliers.rank[order(outliers.rank[,2]),]
rownames(data.plus.race.age.rank[rank.LOGIT.EN[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.LOGIT.EN[1:25,1],])) %in% suspect)]
## [1] "TCGA-A2-A0EQ-01A-11R-A034-07"
# suspect in the top 25 SPLSDA model rank
rank.SPLSDA <- outliers.rank[order(outliers.rank[,3]),]
rownames(data.plus.race.age.rank[rank.SPLSDA[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.SPLSDA[1:25,1],])) %in% suspect)]
## [1] "TCGA-A2-A0EQ-01A-11R-A034-07" "TCGA-LL-A5YP-01A-21R-A28M-07"
# suspect in the top 25 SGPLS model rank
rank.SGPLS <- outliers.rank[order(outliers.rank[,4]),]
rownames(data.plus.race.age.rank[rank.SGPLS[1:25,1],])[which(getParticipantCode(rownames(data.plus.race.age.rank[rank.SGPLS[1:25,1],])) %in% suspect)]
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-AN-A0FJ-01A-11R-A00Z-07"
outliers.rank <- outliers.rank[order(qvalues),]
# influent patients
TNBC.influent <- outliers.rank[which(outliers.rank[,7] < 0.05),1]
length(TNBC.influent)
## [1] 24
# brca type of the influent patients
data.Y.rank[TNBC.influent]
## TCGA-AC-A2QJ-01A-12R-A19W-07 TCGA-E9-A22G-01A-11R-A157-07
## 1 0
## TCGA-AR-A251-01A-12R-A169-07 TCGA-AR-A1AJ-01A-21R-A12P-07
## 0 0
## TCGA-A2-A3Y0-01A-11R-A239-07 TCGA-A2-A3XV-01A-21R-A239-07
## 0 0
## TCGA-E9-A1ND-01A-11R-A144-07 TCGA-EW-A1P1-01A-31R-A14D-07
## 0 1
## TCGA-E2-A1II-01A-11R-A144-07 TCGA-C8-A3M7-01A-12R-A21T-07
## 0 1
## TCGA-D8-A1JF-01A-11R-A13Q-07 TCGA-BH-A42U-01A-12R-A24H-07
## 1 1
## TCGA-LL-A740-01A-21R-A32P-07 TCGA-A2-A1G6-01A-11R-A13Q-07
## 1 1
## TCGA-OL-A5S0-01A-11R-A28M-07 TCGA-A2-A0EQ-01A-11R-A034-07
## 0 1
## TCGA-A2-A0YJ-01A-11R-A109-07 TCGA-AR-A1AH-01A-11R-A12D-07
## 0 0
## TCGA-AC-A62X-01A-11R-A29R-07 TCGA-E2-A1LB-01A-11R-A144-07
## 0 0
## TCGA-OL-A97C-01A-32R-A41B-07 TCGA-LL-A5YP-01A-21R-A28M-07
## 1 0
## TCGA-C8-A26X-01A-31R-A16F-07 TCGA-D8-A1XW-01A-11R-A14M-07
## 1 0
sum(data.Y.rank[TNBC.influent]) # 10 TNBC and 14 non-TNBC patients
## [1] 10
# influent patients which are suspect
TNBC.influent.suspect <- rownames(data.plus.race.age.rank[TNBC.influent,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])) %in% suspect)]
# brca type of the influent, suspect patients
data.Y.rank[TNBC.influent][which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])) %in% suspect)]
## TCGA-A2-A0EQ-01A-11R-A034-07 TCGA-LL-A5YP-01A-21R-A28M-07
## 1 0
# patients which are always misclassified
TNBC.alw.miscl <- which(apply(Miscl,1,sum)==3)
length(TNBC.alw.miscl)
## [1] 12
# which are suspect
TNBC.miscl.suspect <- rownames(data.plus.race.age.rank[TNBC.alw.miscl,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.alw.miscl,])) %in% suspect)]
length(TNBC.miscl.suspect)
## [1] 1
# TNBC-associated variables (ER, PG and HER2) selected by the three models:
which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected1
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected2
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000091831") %in% var.selected3
## [1] TRUE
# ER selected by the three models
which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected1
## [1] FALSE
which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected2
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000082175") %in% var.selected3
## [1] FALSE
# PR only selected by SPLSDA
which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected1
## [1] FALSE
which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected2
## [1] TRUE
which(colnames(data.plus.race.age.rank)=="ENSG00000141736") %in% var.selected3
## [1] TRUE
# HER2 selected by SPLSDA and SGPLS
# Table 8 (Lopes et al.)
cbind(xdata[rownames(data.plus.race.age.rank[TNBC.influent,]),c("ENSG00000091831","ENSG00000082175","ENSG00000141736")],clinical.tnbc.aux[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])),c(1:5)],clinical.tnbc[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,])),4],outliers.rank[1:length(TNBC.influent),2:8])
## ENSG00000091831 ENSG00000082175
## TCGA-AC-A2QJ-01A-12R-A19W-07 0.09981546 0.001169463
## TCGA-E9-A22G-01A-11R-A157-07 0.44347936 0.019834637
## TCGA-AR-A251-01A-12R-A169-07 1.56609545 0.098195977
## TCGA-AR-A1AJ-01A-21R-A12P-07 1.46653566 0.066805316
## TCGA-A2-A3Y0-01A-11R-A239-07 2.18089099 0.027241671
## TCGA-A2-A3XV-01A-21R-A239-07 0.01584832 0.028223807
## TCGA-E9-A1ND-01A-11R-A144-07 1.43733608 0.052120773
## TCGA-EW-A1P1-01A-31R-A14D-07 3.01820732 2.558892237
## TCGA-E2-A1II-01A-11R-A144-07 0.14270408 0.194132777
## TCGA-C8-A3M7-01A-12R-A21T-07 4.26609694 0.757809876
## TCGA-D8-A1JF-01A-11R-A13Q-07 1.25673293 0.110233707
## TCGA-BH-A42U-01A-12R-A24H-07 9.18726081 1.827635514
## TCGA-LL-A740-01A-21R-A32P-07 0.30222325 0.115332941
## TCGA-A2-A1G6-01A-11R-A13Q-07 23.89538282 21.452681110
## TCGA-OL-A5S0-01A-11R-A28M-07 0.09174435 0.063196003
## TCGA-A2-A0EQ-01A-11R-A034-07 2.12986768 0.039939245
## TCGA-A2-A0YJ-01A-11R-A109-07 0.08534338 0.031144565
## TCGA-AR-A1AH-01A-11R-A12D-07 0.03165734 0.030357178
## TCGA-AC-A62X-01A-11R-A29R-07 0.18607614 0.021666988
## TCGA-E2-A1LB-01A-11R-A144-07 0.41506846 0.091132159
## TCGA-OL-A97C-01A-32R-A41B-07 16.24660389 8.561992129
## TCGA-LL-A5YP-01A-21R-A28M-07 0.15598544 0.051391118
## TCGA-C8-A26X-01A-31R-A16F-07 0.41547024 0.127914672
## TCGA-D8-A1XW-01A-11R-A14M-07 0.31874784 0.111179573
## ENSG00000141736 ESR1 PGR HER2_level
## TCGA-AC-A2QJ-01A-12R-A19W-07 7.344798 Negative Negative
## TCGA-E9-A22G-01A-11R-A157-07 15.317028 Negative Negative
## TCGA-AR-A251-01A-12R-A169-07 14.019308 Positive Negative 2+
## TCGA-AR-A1AJ-01A-21R-A12P-07 9.742704 Positive Negative
## TCGA-A2-A3Y0-01A-11R-A239-07 11.336328 Positive Negative 1+
## TCGA-A2-A3XV-01A-21R-A239-07 137.937244 Positive Negative 2+
## TCGA-E9-A1ND-01A-11R-A144-07 13.054248 Negative Negative
## TCGA-EW-A1P1-01A-31R-A14D-07 23.641926 Negative Negative 2+
## TCGA-E2-A1II-01A-11R-A144-07 10.728130 Negative Positive 1+
## TCGA-C8-A3M7-01A-12R-A21T-07 25.470280 Negative Negative
## TCGA-D8-A1JF-01A-11R-A13Q-07 32.930532 Negative Negative 1+
## TCGA-BH-A42U-01A-12R-A24H-07 38.367228 Negative Negative
## TCGA-LL-A740-01A-21R-A32P-07 68.557840 Negative Negative 2+
## TCGA-A2-A1G6-01A-11R-A13Q-07 29.740007 Negative Negative 1+
## TCGA-OL-A5S0-01A-11R-A28M-07 31.919459 Positive Negative
## TCGA-A2-A0EQ-01A-11R-A034-07 30.146140 Negative Negative 3+
## TCGA-A2-A0YJ-01A-11R-A109-07 240.243961 Positive Negative 0
## TCGA-AR-A1AH-01A-11R-A12D-07 34.123225 Positive Negative
## TCGA-AC-A62X-01A-11R-A29R-07 28.531373 Positive Negative
## TCGA-E2-A1LB-01A-11R-A144-07 1129.864921 Negative Negative 3+
## TCGA-OL-A97C-01A-32R-A41B-07 24.037033 Negative Negative
## TCGA-LL-A5YP-01A-21R-A28M-07 15.096317 Positive Negative 1+
## TCGA-C8-A26X-01A-31R-A16F-07 60.117188 Negative Negative 1+
## TCGA-D8-A1XW-01A-11R-A14M-07 21.029425 Negative Positive 1+
## HER2_status HER2_FISH
## TCGA-AC-A2QJ-01A-12R-A19W-07 Negative
## TCGA-E9-A22G-01A-11R-A157-07 Positive
## TCGA-AR-A251-01A-12R-A169-07 Equivocal Negative
## TCGA-AR-A1AJ-01A-21R-A12P-07 Negative
## TCGA-A2-A3Y0-01A-11R-A239-07 Negative
## TCGA-A2-A3XV-01A-21R-A239-07 Equivocal Positive
## TCGA-E9-A1ND-01A-11R-A144-07 Positive
## TCGA-EW-A1P1-01A-31R-A14D-07 Equivocal Negative
## TCGA-E2-A1II-01A-11R-A144-07 Negative
## TCGA-C8-A3M7-01A-12R-A21T-07 Negative
## TCGA-D8-A1JF-01A-11R-A13Q-07 Negative
## TCGA-BH-A42U-01A-12R-A24H-07 Negative
## TCGA-LL-A740-01A-21R-A32P-07 Equivocal Negative
## TCGA-A2-A1G6-01A-11R-A13Q-07 Negative
## TCGA-OL-A5S0-01A-11R-A28M-07 Positive
## TCGA-A2-A0EQ-01A-11R-A034-07 Positive Negative
## TCGA-A2-A0YJ-01A-11R-A109-07 Negative
## TCGA-AR-A1AH-01A-11R-A12D-07 Negative
## TCGA-AC-A62X-01A-11R-A29R-07
## TCGA-E2-A1LB-01A-11R-A144-07 Positive
## TCGA-OL-A97C-01A-32R-A41B-07 Negative
## TCGA-LL-A5YP-01A-21R-A28M-07 Negative Positive
## TCGA-C8-A26X-01A-31R-A16F-07 Negative
## TCGA-D8-A1XW-01A-11R-A14M-07 Negative
## clinical.tnbc[getParticipantCode(rownames(data.plus.race.age.rank[TNBC.influent,
## TCGA-AC-A2QJ-01A-12R-A19W-07 TNBC
## TCGA-E9-A22G-01A-11R-A157-07 NO_TNBC
## TCGA-AR-A251-01A-12R-A169-07 NO_TNBC
## TCGA-AR-A1AJ-01A-21R-A12P-07 NO_TNBC
## TCGA-A2-A3Y0-01A-11R-A239-07 NO_TNBC
## TCGA-A2-A3XV-01A-21R-A239-07 NO_TNBC
## TCGA-E9-A1ND-01A-11R-A144-07 NO_TNBC
## TCGA-EW-A1P1-01A-31R-A14D-07 TNBC
## TCGA-E2-A1II-01A-11R-A144-07 NO_TNBC
## TCGA-C8-A3M7-01A-12R-A21T-07 TNBC
## TCGA-D8-A1JF-01A-11R-A13Q-07 TNBC
## TCGA-BH-A42U-01A-12R-A24H-07 TNBC
## TCGA-LL-A740-01A-21R-A32P-07 TNBC
## TCGA-A2-A1G6-01A-11R-A13Q-07 TNBC
## TCGA-OL-A5S0-01A-11R-A28M-07 NO_TNBC
## TCGA-A2-A0EQ-01A-11R-A034-07 TNBC
## TCGA-A2-A0YJ-01A-11R-A109-07 NO_TNBC
## TCGA-AR-A1AH-01A-11R-A12D-07 NO_TNBC
## TCGA-AC-A62X-01A-11R-A29R-07 NO_TNBC
## TCGA-E2-A1LB-01A-11R-A144-07 NO_TNBC
## TCGA-OL-A97C-01A-32R-A41B-07 TNBC
## TCGA-LL-A5YP-01A-21R-A28M-07 NO_TNBC
## TCGA-C8-A26X-01A-31R-A16F-07 TNBC
## TCGA-D8-A1XW-01A-11R-A14M-07 NO_TNBC
## V2 V3 V4 rank.product pvalues
## TCGA-AC-A2QJ-01A-12R-A19W-07 13 1 34 442 1.395626e-05
## TCGA-E9-A22G-01A-11R-A157-07 6 13 7 546 1.828758e-05
## TCGA-AR-A251-01A-12R-A169-07 11 5 9 495 1.613518e-05
## TCGA-AR-A1AJ-01A-21R-A12P-07 2 10 44 880 3.347280e-05
## TCGA-A2-A3Y0-01A-11R-A239-07 8 7 22 1232 5.007570e-05
## TCGA-A2-A3XV-01A-21R-A239-07 166 2 5 1660 7.091672e-05
## TCGA-E9-A1ND-01A-11R-A144-07 5 11 48 2640 1.204479e-04
## TCGA-EW-A1P1-01A-31R-A14D-07 16 24 8 3072 1.427763e-04
## TCGA-E2-A1II-01A-11R-A144-07 9 21 20 3780 1.797824e-04
## TCGA-C8-A3M7-01A-12R-A21T-07 1 28 153 4284 2.063567e-04
## TCGA-D8-A1JF-01A-11R-A13Q-07 19 109 2 4142 1.988535e-04
## TCGA-BH-A42U-01A-12R-A24H-07 7 3 223 4683 2.274980e-04
## TCGA-LL-A740-01A-21R-A32P-07 47 125 1 5875 2.910510e-04
## TCGA-A2-A1G6-01A-11R-A13Q-07 10 4 204 8160 4.138327e-04
## TCGA-OL-A5S0-01A-11R-A28M-07 46 8 21 7728 3.905611e-04
## TCGA-A2-A0EQ-01A-11R-A034-07 4 12 216 10368 5.329668e-04
## TCGA-A2-A0YJ-01A-11R-A109-07 27 34 12 11016 5.679550e-04
## TCGA-AR-A1AH-01A-11R-A12D-07 21 16 38 12768 6.625311e-04
## TCGA-AC-A62X-01A-11R-A29R-07 22 35 19 14630 7.629362e-04
## TCGA-E2-A1LB-01A-11R-A144-07 230 6 11 15180 7.925618e-04
## TCGA-OL-A97C-01A-32R-A41B-07 14 41 30 17220 9.022870e-04
## TCGA-LL-A5YP-01A-21R-A28M-07 50 22 18 19800 1.040644e-03
## TCGA-C8-A26X-01A-31R-A16F-07 53 64 6 20352 1.070180e-03
## TCGA-D8-A1XW-01A-11R-A14M-07 23 14 68 21896 1.152666e-03
## qvalues Miscl.percent
## TCGA-AC-A2QJ-01A-12R-A19W-07 0.005632575 67
## TCGA-E9-A22G-01A-11R-A157-07 0.005632575 33
## TCGA-AR-A251-01A-12R-A169-07 0.005632575 67
## TCGA-AR-A1AJ-01A-21R-A12P-07 0.007732217 100
## TCGA-A2-A3Y0-01A-11R-A239-07 0.009253989 33
## TCGA-A2-A3XV-01A-21R-A239-07 0.010921174 33
## TCGA-E9-A1ND-01A-11R-A144-07 0.015899128 100
## TCGA-EW-A1P1-01A-31R-A14D-07 0.016490665 100
## TCGA-E2-A1II-01A-11R-A144-07 0.017333964 33
## TCGA-C8-A3M7-01A-12R-A21T-07 0.017333964 100
## TCGA-D8-A1JF-01A-11R-A13Q-07 0.017333964 0
## TCGA-BH-A42U-01A-12R-A24H-07 0.017517346 100
## TCGA-LL-A740-01A-21R-A32P-07 0.020687007 0
## TCGA-A2-A1G6-01A-11R-A13Q-07 0.025492092 100
## TCGA-OL-A5S0-01A-11R-A28M-07 0.025492092 67
## TCGA-A2-A0EQ-01A-11R-A034-07 0.030778830 67
## TCGA-A2-A0YJ-01A-11R-A109-07 0.030870023 0
## TCGA-AR-A1AH-01A-11R-A12D-07 0.034009932 100
## TCGA-AC-A62X-01A-11R-A29R-07 0.036616355 0
## TCGA-E2-A1LB-01A-11R-A144-07 0.036616355 33
## TCGA-OL-A97C-01A-32R-A41B-07 0.039700630 33
## TCGA-LL-A5YP-01A-21R-A28M-07 0.042993316 100
## TCGA-C8-A26X-01A-31R-A16F-07 0.042993316 33
## TCGA-D8-A1XW-01A-11R-A14M-07 0.044377638 33
# plot SPLSDA-scores (Lopes et al.)
TNBC.matrix.splsda12.plot <- data.frame(LV1 = fit.splsda.LOGIT$T[,1],LV2 = fit.splsda.LOGIT$T[,2], Class = as.character(data.Y.rank))
TNBC.influent.matrix.splsda12.plot <- data.frame(LV1 = fit.splsda.LOGIT$T[TNBC.influent,1],LV2 = fit.splsda.LOGIT$T[TNBC.influent,2], Class = as.character(data.Y.rank[TNBC.influent]))
TNBC.suspect.matrix.splsda12.plot <- data.frame(LV1 = fit.splsda.LOGIT$T[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),1],LV2 = fit.splsda.LOGIT$T[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),2], Class = as.character(TNBC.matrix.splsda12.plot[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect),3]))
library(ggplot2)
#postscript("splsda.ps")
#tiff("splsda.tiff")
ggplot(TNBC.matrix.splsda12.plot, aes(LV1, LV2)) + geom_point(aes(color = Class,shape = Class),size=3) + scale_shape_manual(values=c(21,24)) + scale_colour_manual(values=c("black", "black","black","black")) + theme(legend.text=element_text(size=2.5)) + theme_minimal() + theme(axis.text=element_text(size=14)) + theme(axis.title.y = element_text(size = 20, angle = 90)) + theme(axis.title.x = element_text(size = 20, angle = 0)) +
geom_point(data=TNBC.influent.matrix.splsda12.plot,aes(x=LV1, y=LV2,shape = Class,col="black",bg="red"),size=3) +
geom_point(data=TNBC.suspect.matrix.splsda12.plot,aes(x=LV1, y=LV2,shape = Class,col="black",bg="blue"),size=3) + guides(colour=FALSE)
#dev.off()
# PCA on TNBC data
ytnbc.col <- data.Y.rank
ytnbc.col[ytnbc.col==1] <- "black"
ytnbc.col[ytnbc.col==0] <- "black"
ytnbc.col[TNBC.influent] <- "blue"
ytnbc.col[which(rownames(data.plus.race.age.rank)==TNBC.influent.suspect)] <- "red"
ytnbc.shape <- data.Y.rank
ytnbc.shape[ytnbc.shape==1] <- 2
ytnbc.shape[ytnbc.shape==0] <- 1
ytnbc.shape <- as.factor(ytnbc.shape)
ytnbc.col <- as.factor(ytnbc.col)
tnbc.pca.matrix <- as.data.frame(cbind(data.plus.race.age.rank,ytnbc.shape,ytnbc.col))
tnbc.pca <- prcomp(tnbc.pca.matrix[,1:19690], center=FALSE,scale=FALSE)
library(ggfortify)
#postscript("pca.ps")
#tiff("pca.tiff")
autoplot(tnbc.pca, data = tnbc.pca.matrix, colour = ytnbc.col, shape= ytnbc.shape, size=3) + theme_minimal()
#dev.off()
Evaluating the ensemble performance for synthetic datasets built based on mean and covariance matrices from the TNBC and non-TNBC classes
n.synthetic <- 1000
# taking the covariance from the original TNBC data, to make simulation more realistic
xdata.synthetic.cov <- cov(data.plus.race.age.rank)
xdata.synthetic.cov.event <- cov(data.plus.race.age.rank[which(data.Y.rank==1),])
xdata.synthetic.cov.noevent <- cov(data.plus.race.age.rank[which(data.Y.rank==0),])
# for p = 5000 randomly chosen variables
p.synthetic <- 5000
set.seed(7)
cov.vars <- sample(c(1:dim(xdata.synthetic.cov)[2]),p.synthetic)
xdata.synthetic.cov.event5000 <- xdata.synthetic.cov.event[cov.vars,cov.vars]
xdata.synthetic.cov.noevent5000 <- xdata.synthetic.cov.noevent[cov.vars,cov.vars]
# for p = full TNBC dimensionality
p.synthetic <- dim(data.plus.race.age.rank[,1:19688])[2]
xdata.synthetic.cov.eventfull <- xdata.synthetic.cov.event[1:p.synthetic,1:p.synthetic]
xdata.synthetic.cov.noeventfull <- xdata.synthetic.cov.noevent[1:p.synthetic,1:p.synthetic]
# taking the covariance from the original TNBC data, to make simulation more realistic
# for p = 5000 randomly chosen variables
mean.synthetic.event5000 <- apply(data.plus.race.age.rank[which(data.Y.rank==1),],2,mean)[cov.vars]
mean.synthetic.noevent5000 <- apply(data.plus.race.age.rank[which(data.Y.rank==0),],2,mean)[cov.vars]
# for p = full TNBC dimensionality
mean.synthetic.eventfull <- apply(data.plus.race.age.rank[which(data.Y.rank==1),],2,mean)[1:p.synthetic]
mean.synthetic.noeventfull <- apply(data.plus.race.age.rank[which(data.Y.rank==0),],2,mean)[1:p.synthetic]
# generate a random X
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ensembldb':
##
## select
## The following object is masked from 'package:AnnotationDbi':
##
## select
set.seed(8)
xdata.synthetic.event5000 <- mvrnorm(n = 500, mean.synthetic.event5000, xdata.synthetic.cov.event5000, tol = 1e-6, empirical = FALSE)
set.seed(9)
xdata.synthetic.noevent5000 <- mvrnorm(n = 500, mean.synthetic.noevent5000, xdata.synthetic.cov.noevent5000, tol = 1e-6, empirical = FALSE)
xdata.synthetic <- rbind(xdata.synthetic.event5000,xdata.synthetic.noevent5000)
ydata.synthetic <- c(rep(1,500), rep(0,500))
data.Y.synthetic.out <- ydata.synthetic
set.seed(9)
data.synthetic.outliers <- sample(c(1:1000),25, replace=FALSE) # simulating 25 outliers
data.synthetic.outliers
## [1] 222 25 207 216 442 134 389 367 664 984 117 9 873 298 486 494 396
## [18] 961 352 483 882 21 313 111 516
data.Y.synthetic.out[data.synthetic.outliers] <- abs(1-data.Y.synthetic.out[data.synthetic.outliers])
data.synthetic <- xdata.synthetic
# normalized X data
data.synthetic <- (data.synthetic-matrix(apply(data.synthetic,2,mean),nrow(data.synthetic),ncol(data.synthetic),byrow=TRUE))/matrix(apply(data.synthetic,2,sd),nrow(data.synthetic),ncol(data.synthetic),byrow=TRUE)
# Plots
#postscript("Fig_synthetic_outliers_PCA_n1000_p5000_muTNBC_covTNBC_out25.ps")
#tiff("Fig_synthetic_outliers_PCA_n1000_p5000_muTNBC_covTNBC_out25.tiff")
ysynthetic.col <- data.Y.synthetic.out
ysynthetic.col[ysynthetic.col==1] <- "black"
ysynthetic.col[ysynthetic.col==0] <- "gray"
ysynthetic.col[data.synthetic.outliers] <- "red"
synthetic.pca.matrix <- as.data.frame(data.synthetic)
synthetic.pca.matrix.groups <- factor(ysynthetic.col)
synthetic.pca.matrix <- as.matrix(synthetic.pca.matrix)
synthetic.pca <- prcomp(synthetic.pca.matrix, center=FALSE,scale=FALSE)
library(ggfortify)
autoplot(synthetic.pca, data = synthetic.pca.matrix, colour = synthetic.pca.matrix.groups)
#dev.off()
# Ensemble outlier detection
# e.g., for p=5000 and % of outliers = 2.5
set.seed(2010)
my.alpha.synthetic <- seq(0.1,0.9,0.1)
# Optimizing alpha and lambda for logistic regression with Elastic net regularization
nvar.selected.EN.synthetic <- matrix(0,1,length(my.alpha.synthetic) )
MSE.EN.synthetic <- matrix(0,1,length(my.alpha.synthetic))
MSE.EN.synthetic <- matrix(0,1,length(my.alpha.synthetic))
# assigning samples to folds, to be used in cross-validation when tunning alpha
set.seed(2010)
foldid.synthetic <-sample(1:10,size=length(data.Y.synthetic.out),replace=TRUE)
library(glmnet)
for (j in 1:length(my.alpha.synthetic)){
# Logistic model fitting with 10-fold cross-validation for glmnet:
fit.EN.cv.synthetic <- cv.glmnet(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out),family="binomial",foldid=foldid.synthetic,alpha=my.alpha.synthetic[j])
var.selected.EN.synthetic <- which(fit.EN.cv.synthetic$glmnet.fit$beta[,which(fit.EN.cv.synthetic$cvm == min(fit.EN.cv.synthetic$cvm))] != 0)
nvar.selected.EN.synthetic[j] <- length(var.selected.EN.synthetic)
# Predictions obtained by model i:
pred.EN.cv.synthetic <- predict(fit.EN.cv.synthetic,as.matrix(data.synthetic),s="lambda.min",type="response")
# Mean squared error of prediction (MSE):
MSE.EN.synthetic[j] <- mean((data.Y.synthetic.out-pred.EN.cv.synthetic)^2)
}
MSE.EN.synthetic
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.01742921 0.01800071 0.01827128 0.01767633 0.01793559 0.01767032
## [,7] [,8] [,9]
## [1,] 0.01886815 0.01874049 0.01934937
nvar.selected.EN.synthetic
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 557 328 245 209 183 159 142 135 131
my.alpha.synthetic[which(MSE.EN.synthetic == min(MSE.EN.synthetic))]
## [1] 0.1
## Model ensemble
n.models.synthetic <- 3
nvar.selected.synthetic <- matrix(0,1,n.models.synthetic)
MSE.synthetic <- matrix(0,1,n.models.synthetic)
Miscl.synthetic <- matrix(0,dim(data.synthetic)[1],n.models.synthetic)
CookD.synthetic <- matrix(0,dim(data.synthetic)[1],n.models.synthetic)
# logistic regression with Elastic net regularization (LOGIT-EN)
i <- 1
my.alpha.opt.synthetic <- my.alpha.synthetic[which(MSE.EN.synthetic == min(MSE.EN.synthetic))]
fit.EN.cv.synthetic <- cv.glmnet(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out),family="binomial",foldid=foldid.synthetic,alpha=my.alpha.opt.synthetic)
var.selected.EN.synthetic <- which(fit.EN.cv.synthetic$glmnet.fit$beta[,which(fit.EN.cv.synthetic$cvm == min(fit.EN.cv.synthetic$cvm))] != 0)
nvar.selected.synthetic[i] <- length(var.selected.EN.synthetic)
assign(paste("var.selected.synthetic", i, sep = ""), which(fit.EN.cv.synthetic$glmnet.fit$beta[,which(fit.EN.cv.synthetic$cvm == min(fit.EN.cv.synthetic$cvm))] != 0))
# Predictions obtained by model i:
pred.EN.cv.synthetic <- predict(fit.EN.cv.synthetic,as.matrix(data.synthetic),s="lambda.min",type="response")
# Mean squared error of prediction (MSE)
MSE.synthetic[i] <- mean((data.Y.synthetic.out-pred.EN.cv.synthetic)^2)
# Misclassified individuals (‘1’) based on method i:
Miscl.synthetic[,i] <- abs(data.Y.synthetic.out - round(pred.EN.cv.synthetic))
table(data.Y.synthetic.out,round(pred.EN.cv.synthetic))
##
## data.Y.synthetic.out 0 1
## 0 496 17
## 1 5 482
# The Cook’s distance for each individual based on method i:
V <- diag(as.vector(sqrt(pred.EN.cv.synthetic*(1-pred.EN.cv.synthetic))))
H <- V %*% data.synthetic[,var.selected.EN.synthetic] %*% (solve(t(data.synthetic[,var.selected.EN.synthetic]) %*% V %*% data.synthetic[,var.selected.EN.synthetic])) %*% t(data.synthetic[,var.selected.EN.synthetic]) %*% V
CookD.synthetic[,i] <- (data.Y.synthetic.out - pred.EN.cv.synthetic)^2 * diag(H) / ((pred.EN.cv.synthetic*(-pred.EN.cv.synthetic+1))*((-diag(H)+1))^2)
which(Miscl.synthetic[,i]==1)
## [1] 9 21 111 117 134 207 216 222 313 352 367 389 396 442 483 486 494
## [18] 516 664 873 882 961
which(data.synthetic.outliers %in% which(Miscl.synthetic[,i]==1))
## [1] 1 3 4 5 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22 23 24 25
# SPLS-DA
library(spls)
i <- 2
# Optimizing the parameters by cross-validation
#fit.splsda.synthetic.cv <- cv.splsda(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out), fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), kappa=0.5, classifier="logistic", scale.x=FALSE)
# using the parameters optimized above
fit.splsda.synthetic <- splsda(as.matrix(data.synthetic),as.factor(data.Y.synthetic.out), K = 5, eta=0.7, classifier="logistic", scale.x=FALSE)
var.selected.splsda.synthetic <- fit.splsda.synthetic$A
nvar.selected.synthetic[i] <- length(var.selected.splsda.synthetic)
assign(paste("var.selected.synthetic", i, sep = ""), var.selected.splsda.synthetic)
# Predictions obtained by model i:
pred.splsda.synthetic<- predict(fit.splsda.synthetic, as.matrix(data.synthetic),type = "fit","coefficient", fit.type = "response")
# Mean squared error of prediction (MSE)
MSE.synthetic[i] <- mean((data.Y.synthetic.out-pred.splsda.synthetic)^2)
# Misclassified individuals (‘1’) based on method i:
Miscl.synthetic[,i] <- abs(data.Y.synthetic.out - round(pred.splsda.synthetic))
table(data.Y.synthetic.out,round(pred.splsda.synthetic))
##
## data.Y.synthetic.out 0 1
## 0 493 20
## 1 8 479
# The Cook’s distance for each individual based on method i:
V <- diag(as.vector(sqrt(pred.splsda.synthetic*(1-pred.splsda.synthetic))))
H <- V %*% (fit.splsda.synthetic$T) %*% (solve(t(fit.splsda.synthetic$T) %*% V %*% fit.splsda.synthetic$T)) %*% t(fit.splsda.synthetic$T) %*% V # the hat matrix
CookD.synthetic[,i] <- (data.Y.synthetic.out - pred.splsda.synthetic)^2 * diag(H) / ((pred.splsda.synthetic*(-pred.splsda.synthetic+1))*((-diag(H)+1))^2)
which(Miscl.synthetic[,i]==1)
## [1] 9 21 25 66 111 117 130 134 207 216 222 298 313 352 367 389 396
## [18] 442 486 494 516 575 664 873 882 961 984 995
which(data.synthetic.outliers %in% which(Miscl.synthetic[,i]==1))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22 23 24
## [24] 25
# SGPLS
i <- 3
# Optimizing the parameters by cross-validation
#fit.sgpls.synthetic.cv <- cv.sgpls(as.matrix(data.synthetic),data.Y.synthetic.out, fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), scale.x=FALSE)
# using the parameters optimized above
fit.sgpls.synthetic <- sgpls(as.matrix(data.synthetic),data.Y.synthetic.out, K = 5, eta=0.7, scale.x=FALSE)
var.selected.sgpls.synthetic <- fit.sgpls.synthetic$A
nvar.selected.synthetic[i] <- length(var.selected.sgpls.synthetic)
assign(paste("var.selected.synthetic", i, sep = ""), var.selected.sgpls.synthetic)
# Predictions obtained for the train set:
pred.sgpls.synthetic <- predict(fit.sgpls.synthetic,as.matrix(data.synthetic),type="fit",fit.type="response")
# Mean squared error of prediction (MSE)
MSE.synthetic[i] <- mean((data.Y.synthetic.out-pred.sgpls.synthetic)^2)
# Misclassified individuals (‘1’) based on method i:
Miscl.synthetic[,i] <- abs(data.Y.synthetic.out - round(pred.sgpls.synthetic))
table(data.Y.synthetic.out,round(pred.sgpls.synthetic))
##
## data.Y.synthetic.out 0 1
## 0 495 18
## 1 4 483
# The Cook’s distance for each individual based on method i:
V <- diag(as.vector(sqrt(pred.sgpls.synthetic*(1-pred.sgpls.synthetic))))
H <- V %*% (data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W) %*% (solve(t(data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W) %*% V %*% data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W)) %*% t(data.synthetic[,var.selected.sgpls.synthetic]%*%fit.sgpls.synthetic$W) %*% V # the hat matrix
CookD.synthetic[,i] <- (data.Y.synthetic.out - pred.sgpls.synthetic)^2 * diag(H) / ((pred.sgpls.synthetic*(-pred.sgpls.synthetic+1))*((-diag(H)+1))^2)
which(Miscl.synthetic[,i]==1)
## [1] 21 25 111 117 134 207 216 222 298 313 352 367 389 396 486 494 516
## [18] 559 664 815 961 984
which(data.synthetic.outliers %in% which(Miscl.synthetic[,i]==1))
## [1] 1 2 3 4 6 7 8 9 10 11 14 15 16 17 18 19 22 23 24 25
nvar.selected.synthetic
## [,1] [,2] [,3]
## [1,] 557 2897 249
var.selected.synthetic=list(var.selected.synthetic1,var.selected.synthetic2,var.selected.synthetic3)
source("~/Heskes_pvalues.R")
# The rank product:
rank.matrix.synthetic <- apply(CookD.synthetic,2, function(CookD.synthetic) rank(-(CookD.synthetic),ties.method = "first"))
rank.product.synthetic <- apply(rank.matrix.synthetic, 1, prod)
rho.synthetic <-rank.product.synthetic
n.synthetic <- dim(data.synthetic)[1]
k.synthetic <- dim(rank.matrix.synthetic)[2]
# The p-values:
pvalues.synthetic <- as.vector(rankprodbounds(rho.synthetic,n.synthetic,k.synthetic,Delta ='geometric'))
library(qvalue)
# The q-values:
qobj.synthetic <- qvalue(pvalues.synthetic)
qvalues.synthetic <- qobj.synthetic$qvalues
# Misclassifications
Miscl.train.percent.synthetic <- apply(Miscl.synthetic,1, function(Miscl.synthetic) round((sum(Miscl.synthetic)/k.synthetic)*100))
Miscl.train.by.method.synthetic <- apply(Miscl.synthetic,2, sum)
# Building the rank matrix
id.synthetic <- as.vector(seq(1:n.synthetic))
outliers.rank.synthetic <-as.data.frame(cbind(id.synthetic, rank.matrix.synthetic,rank.product.synthetic,pvalues.synthetic,qvalues.synthetic,Miscl.train.percent.synthetic))
outliers.rank.synthetic <- outliers.rank.synthetic[order(qvalues.synthetic),]
outliers.rank.synthetic[1:25,]
## id.synthetic V2 V3 V4 rank.product.synthetic pvalues.synthetic
## 9 9 15 1 2 30 2.880181e-07
## 367 367 18 2 1 36 3.745618e-07
## 961 961 1 8 10 80 1.146239e-06
## 664 664 5 7 4 140 2.445232e-06
## 494 494 9 6 5 270 5.819688e-06
## 117 117 7 5 16 560 1.490003e-05
## 389 389 16 13 3 624 1.709914e-05
## 216 216 17 3 14 714 2.028412e-05
## 207 207 6 11 13 858 2.557809e-05
## 873 873 2 19 23 874 2.617996e-05
## 442 442 20 9 6 1080 3.398691e-05
## 111 111 14 12 8 1344 4.402013e-05
## 486 486 10 14 15 2100 7.389366e-05
## 21 21 11 18 18 3564 1.342893e-04
## 882 882 4 22 48 4224 1.621111e-04
## 222 222 24 17 11 4488 1.733067e-04
## 352 352 23 10 31 7130 2.866408e-04
## 483 483 21 27 17 9639 3.953695e-04
## 995 995 36 21 19 14364 6.007910e-04
## 516 516 3 24 221 15912 6.680386e-04
## 313 313 8 20 145 23200 9.833004e-04
## 298 298 25 4 268 26800 1.137984e-03
## 134 134 13 15 197 38415 1.631746e-03
## 396 396 27 25 65 43875 1.861032e-03
## 38 38 38 33 49 61446 2.587232e-03
## qvalues.synthetic Miscl.train.percent.synthetic
## 9 0.0001872809 67
## 367 0.0001872809 100
## 961 0.0003820797 100
## 664 0.0006113080 100
## 494 0.0011639376 100
## 117 0.0024427342 100
## 389 0.0024427342 100
## 216 0.0025355148 100
## 207 0.0026179960 100
## 873 0.0026179960 67
## 442 0.0030897192 67
## 111 0.0036683444 100
## 486 0.0056841275 100
## 21 0.0095920929 100
## 882 0.0108074080 67
## 222 0.0108316700 100
## 352 0.0168612245 100
## 483 0.0219649745 33
## 995 0.0316205789 33
## 516 0.0334019320 100
## 313 0.0468238287 100
## 298 0.0517265592 67
## 134 0.0709454757 100
## 396 0.0775430185 100
## 38 0.0997775327 0
synthetic.influent <- outliers.rank.synthetic[which(outliers.rank.synthetic[,7] < 0.05),1]
which(data.synthetic.outliers %in% synthetic.influent)
## [1] 1 3 4 5 7 8 9 11 12 13 15 16 18 19 20 21 22 23 24 25
length(which(data.synthetic.outliers %in% synthetic.influent))
## [1] 20
length(which(outliers.rank.synthetic[1:length(data.synthetic.outliers),1] %in% data.synthetic.outliers))
## [1] 23
length(which(outliers.rank.synthetic[which(outliers.rank.synthetic[,2]<=length(data.synthetic.outliers)),1] %in% data.synthetic.outliers))
## [1] 23
length(which(outliers.rank.synthetic[which(outliers.rank.synthetic[,3]<=length(data.synthetic.outliers)),1] %in% data.synthetic.outliers))
## [1] 23
length(which(outliers.rank.synthetic[which(outliers.rank.synthetic[,4]<=length(data.synthetic.outliers)),1] %in% data.synthetic.outliers))
## [1] 16
Genes selected by LOGIT-EN, SPLS-DA and SGPLS
# variables in common between EN and sPLSDA and sgPLS
TNBC.vars.in.common <- as.data.frame(table(c(var.selected1,var.selected2,var.selected3)))
TNBC.vars.in.common <- as.numeric(as.character(TNBC.vars.in.common[which(TNBC.vars.in.common[,2] == 3),1]))
length(TNBC.vars.in.common)
## [1] 26
# 26 vars in common
TNBC.vars.in.common.code <- as.matrix(mcols(ensembl.protein.coding[colnames(data.plus.race.age.rank[,TNBC.vars.in.common]),2]))
TNBC.vars.in.common.code
## gene_name
## [1,] "CA12"
## [2,] "ESR1"
## [3,] "BPI"
## [4,] "TAF7L"
## [5,] "ASPN"
## [6,] "KCNS1"
## [7,] "ACE2"
## [8,] "CLDN10"
## [9,] "PDX1"
## [10,] "PIF1"
## [11,] "HAPLN3"
## [12,] "MFGE8"
## [13,] "DMRTA2"
## [14,] "LYPD1"
## [15,] "ZIC1"
## [16,] "UBASH3B"
## [17,] "SRSF12"
## [18,] "POM121L2"
## [19,] "FTCD"
## [20,] "PGAP3"
## [21,] "VSNL1"
## [22,] "KLK6"
## [23,] "NRG4"
## [24,] "CXXC5"
## [25,] "KIR2DL4"
## [26,] "PPP1R14C"
# summary of gene expression in TNBC and non-TNBC for the 26 genes selected
TNBC.vars.in.common.expression.TNBC <- xdata[ydata[,1]=="TNBC",TNBC.vars.in.common]
colnames(TNBC.vars.in.common.expression.TNBC) <- as.vector(TNBC.vars.in.common.code)
summary(TNBC.vars.in.common.expression.TNBC)
## CA12 ESR1 BPI
## Min. : 0.1188 Min. : 0.01938 Min. : 0.00000
## 1st Qu.: 0.9628 1st Qu.: 0.16063 1st Qu.: 0.08627
## Median : 2.2454 Median : 0.35070 Median : 0.57210
## Mean : 7.1957 Mean : 1.52980 Mean : 1.68510
## 3rd Qu.: 4.7820 3rd Qu.: 0.82760 3rd Qu.: 1.63253
## Max. :198.8447 Max. :29.97893 Max. :28.90646
## TAF7L ASPN KCNS1
## Min. :0.00979 Min. : 0.1891 Min. : 0.006663
## 1st Qu.:0.08815 1st Qu.: 3.7304 1st Qu.: 0.094688
## Median :0.14208 Median : 9.5310 Median : 0.332931
## Mean :0.35790 Mean : 17.7950 Mean : 1.040426
## 3rd Qu.:0.28278 3rd Qu.: 20.6949 3rd Qu.: 0.863642
## Max. :7.30063 Max. :136.1170 Max. :16.869534
## ACE2 CLDN10 PDX1
## Min. : 0.0000 Min. : 0.00000 Min. :0.00000
## 1st Qu.: 0.1258 1st Qu.: 0.02278 1st Qu.:0.00000
## Median : 0.4727 Median : 0.05433 Median :0.01086
## Mean : 4.1248 Mean : 2.65009 Mean :0.22273
## 3rd Qu.: 1.4384 3rd Qu.: 0.46768 3rd Qu.:0.15051
## Max. :149.4913 Max. :91.48534 Max. :3.92918
## PIF1 HAPLN3 MFGE8
## Min. : 0.07712 Min. : 0.919 Min. : 5.714
## 1st Qu.: 0.82028 1st Qu.: 8.362 1st Qu.: 20.410
## Median : 1.34009 Median : 19.276 Median : 36.383
## Mean : 1.84860 Mean : 26.925 Mean : 81.941
## 3rd Qu.: 2.25936 3rd Qu.: 38.167 3rd Qu.: 93.235
## Max. :13.11623 Max. :118.904 Max. :1024.815
## DMRTA2 LYPD1 ZIC1 UBASH3B
## Min. :0.000000 Min. : 0.07908 Min. : 0.0000 Min. : 0.1274
## 1st Qu.:0.009053 1st Qu.: 0.44511 1st Qu.: 0.5488 1st Qu.: 0.9382
## Median :0.030162 Median : 0.78513 Median : 1.8950 Median : 1.8369
## Mean :0.353153 Mean : 1.91865 Mean : 2.3420 Mean : 2.4892
## 3rd Qu.:0.213414 3rd Qu.: 1.82027 3rd Qu.: 3.3878 3rd Qu.: 3.2079
## Max. :6.013674 Max. :36.65243 Max. :15.0472 Max. :15.3149
## SRSF12 POM121L2 FTCD PGAP3
## Min. :0.04384 Min. :0.00000 Min. :0.00000 Min. : 1.250
## 1st Qu.:0.38949 1st Qu.:0.00000 1st Qu.:0.07846 1st Qu.: 5.600
## Median :0.92863 Median :0.01160 Median :0.17386 Median : 7.402
## Mean :1.32648 Mean :0.03602 Mean :0.60743 Mean : 8.647
## 3rd Qu.:1.91600 3rd Qu.:0.03978 3rd Qu.:0.51036 3rd Qu.:10.509
## Max. :8.57894 Max. :0.42138 Max. :8.20286 Max. :48.840
## VSNL1 KLK6 NRG4 CXXC5
## Min. : 0.05713 Min. : 0.000 Min. :0.009831 Min. : 0.9359
## 1st Qu.: 1.02624 1st Qu.: 1.118 1st Qu.:0.059571 1st Qu.: 5.3429
## Median : 2.36642 Median : 11.205 Median :0.103041 Median : 8.3877
## Mean : 4.77549 Mean : 42.474 Mean :0.239635 Mean :10.3194
## 3rd Qu.: 4.58127 3rd Qu.: 49.892 3rd Qu.:0.179789 3rd Qu.:13.0758
## Max. :41.35774 Max. :596.303 Max. :4.219244 Max. :60.0577
## KIR2DL4 PPP1R14C
## Min. :0.00000 Min. : 0.0361
## 1st Qu.:0.04302 1st Qu.: 4.4508
## Median :0.14618 Median : 11.2454
## Mean :0.40262 Mean : 18.4007
## 3rd Qu.:0.45270 3rd Qu.: 27.1789
## Max. :4.30483 Max. :107.9462
TNBC.vars.in.common.expression.nonTNBC <- xdata[ydata[,1]=="NO_TNBC",TNBC.vars.in.common]
colnames(TNBC.vars.in.common.expression.nonTNBC) <- as.vector(TNBC.vars.in.common.code)
summary(TNBC.vars.in.common.expression.nonTNBC)
## CA12 ESR1 BPI
## Min. : 0.0322 Min. : 0.01585 Min. :0.00000
## 1st Qu.: 34.4695 1st Qu.: 16.14359 1st Qu.:0.01528
## Median : 65.3680 Median : 36.66739 Median :0.03789
## Mean : 78.1013 Mean : 47.88104 Mean :0.12478
## 3rd Qu.:107.0139 3rd Qu.: 69.64894 3rd Qu.:0.08706
## Max. :706.3150 Max. :272.20332 Max. :8.19235
## TAF7L ASPN KCNS1
## Min. :0.00000 Min. : 0.231 Min. : 0.00000
## 1st Qu.:0.03541 1st Qu.: 12.960 1st Qu.: 0.01456
## Median :0.06448 Median : 33.650 Median : 0.03252
## Mean :0.09564 Mean : 56.929 Mean : 0.11886
## 3rd Qu.:0.11493 3rd Qu.: 73.519 3rd Qu.: 0.07619
## Max. :1.31195 Max. :541.074 Max. :16.74360
## ACE2 CLDN10 PDX1 PIF1
## Min. : 0.00000 Min. :0.00000 Min. :0.00000 Min. :0.01165
## 1st Qu.: 0.01594 1st Qu.:0.01306 1st Qu.:0.00000 1st Qu.:0.21551
## Median : 0.04167 Median :0.03601 Median :0.00000 Median :0.38055
## Mean : 0.50085 Mean :0.11311 Mean :0.01332 Mean :0.55070
## 3rd Qu.: 0.09604 3rd Qu.:0.09566 3rd Qu.:0.00000 3rd Qu.:0.71488
## Max. :114.41918 Max. :6.81840 Max. :3.05159 Max. :5.43608
## HAPLN3 MFGE8 DMRTA2 LYPD1
## Min. : 0.1285 Min. : 1.203 Min. :0.000000 Min. :0.01422
## 1st Qu.: 1.9119 1st Qu.: 8.542 1st Qu.:0.000000 1st Qu.:0.20975
## Median : 3.2393 Median : 13.611 Median :0.004165 Median :0.33774
## Mean : 4.7742 Mean : 17.810 Mean :0.018961 Mean :0.45644
## 3rd Qu.: 5.3090 3rd Qu.: 20.190 3rd Qu.:0.010962 3rd Qu.:0.52752
## Max. :92.6516 Max. :511.829 Max. :0.895248 Max. :4.31328
## ZIC1 UBASH3B SRSF12
## Min. : 0.000000 Min. : 0.05978 Min. :0.00000
## 1st Qu.: 0.006696 1st Qu.: 0.32953 1st Qu.:0.05075
## Median : 0.021372 Median : 0.50439 Median :0.08674
## Mean : 0.319268 Mean : 0.67541 Mean :0.15722
## 3rd Qu.: 0.070787 3rd Qu.: 0.73772 3rd Qu.:0.15102
## Max. :10.116302 Max. :11.28874 Max. :4.45669
## POM121L2 FTCD PGAP3
## Min. :0.000000 Min. :0.00000 Min. : 2.909
## 1st Qu.:0.000000 1st Qu.:0.02111 1st Qu.: 10.087
## Median :0.000000 Median :0.04947 Median : 14.315
## Mean :0.005048 Mean :0.10523 Mean : 29.555
## 3rd Qu.:0.006027 3rd Qu.:0.11361 3rd Qu.: 20.498
## Max. :0.188325 Max. :3.98651 Max. :579.352
## VSNL1 KLK6 NRG4
## Min. : 0.005134 Min. : 0.0000 Min. :0.00000
## 1st Qu.: 0.364670 1st Qu.: 0.0829 1st Qu.:0.03939
## Median : 0.697485 Median : 0.3741 Median :0.06092
## Mean : 1.062458 Mean : 3.6199 Mean :0.08119
## 3rd Qu.: 1.262764 3rd Qu.: 1.3324 3rd Qu.:0.09877
## Max. :14.256126 Max. :420.5274 Max. :0.97459
## CXXC5 KIR2DL4 PPP1R14C
## Min. : 1.666 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 22.886 1st Qu.:0.01187 1st Qu.: 0.1573
## Median : 31.383 Median :0.03795 Median : 0.4416
## Mean : 34.598 Mean :0.10067 Mean : 1.9582
## 3rd Qu.: 42.703 3rd Qu.:0.09911 3rd Qu.: 1.1118
## Max. :135.424 Max. :2.17800 Max. :152.0083
# ranking coefficients of the variables selected in common by the three methods
EN.coef <- abs(fit.EN.cv$glmnet.fit$beta[,which(fit.EN.cv$cvm == min(fit.EN.cv$cvm))])
EN.coef.matrix <- cbind(c(1:dim(data.plus.race.age.rank)[2]), EN.coef)
rank.EN.coef <- cbind(EN.coef.matrix[order(EN.coef.matrix[,2], decreasing = TRUE),],c(1:dim(data.plus.race.age.rank)[2]))
rank.EN.coef <- rank.EN.coef[which(rank.EN.coef[,1] %in% TNBC.vars.in.common),]
splsda.coef <- abs(coef(fit.splsda.LOGIT))[-1]
splsda.coef.matrix <- cbind(c(1: dim(data.plus.race.age.rank)[2]), splsda.coef)
rank.splsda.coef <- cbind(splsda.coef.matrix[order(splsda.coef.matrix[,2], decreasing = TRUE),],c(1:dim(data.plus.race.age.rank)[2]))
rank.splsda.coef <- rank.splsda.coef[which(rank.splsda.coef[,1] %in% TNBC.vars.in.common),]
sgpls.coef <- abs(coef(fit.sgpls.LOGIT))[-1]
sgpls.coef.matrix <- cbind(c(1: dim(data.plus.race.age.rank)[2]), sgpls.coef)
rank.sgpls.coef <- cbind(sgpls.coef.matrix[order(sgpls.coef.matrix[,2] , decreasing = TRUE),],c(1:dim(data.plus.race.age.rank)[2]))
rank.sgpls.coef <- rank.sgpls.coef[which(rank.sgpls.coef[,1] %in% TNBC.vars.in.common),]
variables.rank.matrix <- cbind(sort(rank.EN.coef[,1]),rank.EN.coef[order(rank.EN.coef[,1]),3], rank.splsda.coef[order(rank.splsda.coef[,1]),3], rank.sgpls.coef[order(rank.sgpls.coef[,1]),3])
colnames(variables.rank.matrix) <- c("variable","LOGIT-EN","SPLSDA","SGPLS")
#variables.rank.matrix <- cbind(as.matrix(mcols(ensembl.protein.coding[rownames(variables.rank.matrix),2])),variables.rank.matrix)
combined.variables.rank.matrix <- cbind(TNBC.vars.in.common.code,apply(variables.rank.matrix[,2:4],1,prod))
combined.variables.rank.matrix
## gene_name
## ENSG00000074410 "CA12" "1669570"
## ENSG00000091831 "ESR1" "38889"
## ENSG00000101425 "BPI" "964070"
## ENSG00000102387 "TAF7L" "33626"
## ENSG00000106819 "ASPN" "281637"
## ENSG00000124134 "KCNS1" "672"
## ENSG00000130234 "ACE2" "3312"
## ENSG00000134873 "CLDN10" "1416"
## ENSG00000139515 "PDX1" "85436"
## ENSG00000140451 "PIF1" "153594"
## ENSG00000140511 "HAPLN3" "622544"
## ENSG00000140545 "MFGE8" "873828"
## ENSG00000142700 "DMRTA2" "12155"
## ENSG00000150551 "LYPD1" "72"
## ENSG00000152977 "ZIC1" "35200"
## ENSG00000154127 "UBASH3B" "264740"
## ENSG00000154548 "SRSF12" "600210"
## ENSG00000158553 "POM121L2" "1521"
## ENSG00000160282 "FTCD" "9900"
## ENSG00000161395 "PGAP3" "100"
## ENSG00000163032 "VSNL1" "25380"
## ENSG00000167755 "KLK6" "2092960"
## ENSG00000169752 "NRG4" "23780"
## ENSG00000171604 "CXXC5" "55626"
## ENSG00000189013 "KIR2DL4" "22344"
## ENSG00000198729 "PPP1R14C" "454230"
combined.variables.rank.matrix[order(as.numeric(combined.variables.rank.matrix[,2])),1]
## ENSG00000150551 ENSG00000161395 ENSG00000124134 ENSG00000134873
## "LYPD1" "PGAP3" "KCNS1" "CLDN10"
## ENSG00000158553 ENSG00000130234 ENSG00000160282 ENSG00000142700
## "POM121L2" "ACE2" "FTCD" "DMRTA2"
## ENSG00000189013 ENSG00000169752 ENSG00000163032 ENSG00000102387
## "KIR2DL4" "NRG4" "VSNL1" "TAF7L"
## ENSG00000152977 ENSG00000091831 ENSG00000171604 ENSG00000139515
## "ZIC1" "ESR1" "CXXC5" "PDX1"
## ENSG00000140451 ENSG00000154127 ENSG00000106819 ENSG00000198729
## "PIF1" "UBASH3B" "ASPN" "PPP1R14C"
## ENSG00000154548 ENSG00000140511 ENSG00000140545 ENSG00000101425
## "SRSF12" "HAPLN3" "MFGE8" "BPI"
## ENSG00000074410 ENSG00000167755
## "CA12" "KLK6"
Genes selected by LOGIT-EN, SPLS-DA and SGPLS when changing the label of the influential observations misclassified in more than 2 methods
data.Y.rank.change <- data.Y.rank
data.Y.rank.change[outliers.rank[which(outliers.rank[1:24,8] >= 67),1]
] <- 1-data.Y.rank.change[outliers.rank[which(outliers.rank[1:24,8] >= 67),1]]
# LOGIT-EN
my.alpha <- seq(0.1,0.9,0.1)
## Optimizing alpha and lambda for logistic regression with Elastic net regularization
nvar.selected.EN.outliers.change <- matrix(0,1,length(my.alpha))
pred.EN.outliers.change <- matrix(0,dim(data.plus.race.age.rank)[1],length(my.alpha))
MSE.EN.outliers.change <- matrix(0,1,length(my.alpha))
set.seed(2010)
for (j in 1:length(my.alpha)){
# Logistic model fitting with 10-fold cross-validation for glmnet:
fit.EN.cv.outliers.change <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change),family="binomial",foldid=foldid.tnbc,alpha=my.alpha[j])
var.selected.EN.outliers.change <- which(fit.EN.cv.outliers.change$glmnet.fit$beta[,which(fit.EN.cv.outliers.change$cvm == min(fit.EN.cv.outliers.change$cvm))] != 0)
nvar.selected.EN.outliers.change[j] <- length(var.selected.EN.outliers.change)
# Predictions obtained by model i
pred.EN.outliers.change[,j] <- predict(fit.EN.cv.outliers.change,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")
# Mean squared error of prediction (MSE)
MSE.EN.outliers.change[j] <- mean((data.Y.rank.change-pred.EN.outliers.change)^2)
}
MSE.EN.outliers.change
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.1482493 0.1311899 0.1141429 0.09705398 0.08017282 0.06327089
## [,7] [,8] [,9]
## [1,] 0.04627029 0.02925698 0.01223522
nvar.selected.EN.outliers.change
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 495 256 202 170 137 122 112 104 94
my.alpha[which(MSE.EN.outliers.change == min(MSE.EN.outliers.change))]
## [1] 0.9
my.alpha.opt <- my.alpha[which(MSE.EN.outliers.change == min(MSE.EN.outliers.change))]
fit.EN.cv.outliers.change <- cv.glmnet(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change),family="binomial",foldid=foldid.tnbc,alpha=my.alpha.opt)
var.selected.EN.outliers.change <- which(fit.EN.cv.outliers.change$glmnet.fit$beta[,which(fit.EN.cv.outliers.change$cvm == min(fit.EN.cv.outliers.change$cvm))] != 0)
# Predictions
pred.EN.outliers.change <- predict(fit.EN.cv.outliers.change,as.matrix(data.plus.race.age.rank),s="lambda.min",type="response")
# Mean squared error of prediction (MSE):
MSE.EN.outliers.change <- mean((data.Y.rank.change-pred.EN.outliers.change)^2)
MSE.EN.outliers.change
## [1] 0.01238853
# SPLSDA
#fit.splsda.LOGIT.cv.outliers.change <- cv.splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change), fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), kappa=0.5, classifier="logistic", scale.x=FALSE,n.core=10)
fit.splsda.LOGIT.outliers.change <- splsda(as.matrix(data.plus.race.age.rank),as.factor(data.Y.rank.change), K = 5, eta=0.7, classifier="logistic", scale.x=FALSE) # parameters K and eta obtained by cross-validation above
var.selected.splsda.LOGIT.outliers.change <- fit.splsda.LOGIT.outliers.change$A
# Predictions
pred.splsda.LOGIT.outliers.change <- predict(fit.splsda.LOGIT.outliers.change, as.matrix(data.plus.race.age.rank),type = "fit","coefficient", fit.type = "response")
# Mean squared error of prediction (MSE):
MSE.splsda.LOGIT.outliers.change <- mean((data.Y.rank.change-pred.splsda.LOGIT.outliers.change)^2)
MSE.splsda.LOGIT.outliers.change
## [1] 0.009926286
# SGPLS
#fit.sgpls.LOGIT.cv.outliers.out <- cv.sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank.change, fold=10, K = c(1:5), eta = c(0.9,0.8,0.7), scale.x=FALSE,n.core=10)
fit.sgpls.LOGIT.outliers.change <- sgpls(as.matrix(data.plus.race.age.rank),data.Y.rank.change, K = 5, eta=0.8, scale.x=FALSE)
var.selected.sgpls.LOGIT.outliers.change <- fit.sgpls.LOGIT.outliers.change$A
# Predictions
pred.sgpls.LOGIT.outliers.change <- predict(fit.sgpls.LOGIT.outliers.change, as.matrix(data.plus.race.age.rank),type="fit",fit.type="response")
# Mean squared error of prediction (MSE):
MSE.sgpls.LOGIT.outliers.change <- mean((data.Y.rank.change-pred.sgpls.LOGIT.outliers.change)^2)
MSE.sgpls.LOGIT.outliers.change
## [1] 0.07822628
# variables in common between EN and SPLSDA and SGPLS
outliers.change.vars.in.common <- as.data.frame(table(c(var.selected.EN.outliers.change,var.selected.splsda.LOGIT.outliers.change,var.selected.sgpls.LOGIT.outliers.change)))
outliers.change.vars.in.common <- as.numeric(as.character(outliers.change.vars.in.common[which(outliers.change.vars.in.common[,2] == 3),1]))
length(outliers.change.vars.in.common)
## [1] 16
outliers.change.vars.in.common.code <- as.matrix(mcols(ensembl.protein.coding[colnames(data.plus.race.age.rank[,outliers.change.vars.in.common]),2]))
outliers.change.vars.in.common.code
## gene_name
## [1,] "ESR1"
## [2,] "TAF7L"
## [3,] "VSX2"
## [4,] "ACE2"
## [5,] "SLCO1B1"
## [6,] "CLDN10"
## [7,] "TTLL4"
## [8,] "HAPLN3"
## [9,] "DMRTA2"
## [10,] "LYPD1"
## [11,] "SRSF12"
## [12,] "PGAP3"
## [13,] "CXXC5"
## [14,] "PPP1R14C"
## [15,] "CT83"
## [16,] "NOTO"
# variables in common with the previous 26 relevant selected by the three methods
outliers.change.vars.in.common.TNBC <- outliers.change.vars.in.common[which(outliers.change.vars.in.common %in% TNBC.vars.in.common)]
outliers.change.vars.in.common.TNBC.code <- as.matrix(mcols(ensembl.protein.coding[colnames(data.plus.race.age.rank[,outliers.change.vars.in.common.TNBC]),2]))
outliers.change.vars.in.common.TNBC.code
## gene_name
## [1,] "ESR1"
## [2,] "TAF7L"
## [3,] "ACE2"
## [4,] "CLDN10"
## [5,] "HAPLN3"
## [6,] "DMRTA2"
## [7,] "LYPD1"
## [8,] "SRSF12"
## [9,] "PGAP3"
## [10,] "CXXC5"
## [11,] "PPP1R14C"
# summary of gene expression in TNBC and non-TNBC for the 10 genes selected
dim(xdata)
## [1] 1019 19688
outliers.change.vars.in.common.name <- colnames(xdata[,outliers.change.vars.in.common])
outliers.change.vars.in.common.expression.TNBC <- xdata[data.Y=="1",outliers.change.vars.in.common.name]
colnames(outliers.change.vars.in.common.expression.TNBC) <- as.vector(outliers.change.vars.in.common.code)
summary(outliers.change.vars.in.common.expression.TNBC)
## ESR1 TAF7L VSX2 ACE2
## Min. : 0.01938 Min. :0.00979 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 0.16063 1st Qu.:0.08815 1st Qu.:0.00000 1st Qu.: 0.1258
## Median : 0.35070 Median :0.14208 Median :0.01006 Median : 0.4727
## Mean : 1.52980 Mean :0.35790 Mean :0.08516 Mean : 4.1248
## 3rd Qu.: 0.82760 3rd Qu.:0.28278 3rd Qu.:0.04406 3rd Qu.: 1.4384
## Max. :29.97893 Max. :7.30063 Max. :2.47960 Max. :149.4913
## SLCO1B1 CLDN10 TTLL4 HAPLN3
## Min. :0.00000 Min. : 0.00000 Min. : 1.614 Min. : 0.919
## 1st Qu.:0.00000 1st Qu.: 0.02278 1st Qu.: 5.727 1st Qu.: 8.362
## Median :0.00000 Median : 0.05433 Median : 9.034 Median : 19.276
## Mean :0.06594 Mean : 2.65009 Mean :10.526 Mean : 26.925
## 3rd Qu.:0.01368 3rd Qu.: 0.46768 3rd Qu.:14.144 3rd Qu.: 38.167
## Max. :2.23565 Max. :91.48534 Max. :42.183 Max. :118.904
## DMRTA2 LYPD1 SRSF12 PGAP3
## Min. :0.000000 Min. : 0.07908 Min. :0.04384 Min. : 1.250
## 1st Qu.:0.009053 1st Qu.: 0.44511 1st Qu.:0.38949 1st Qu.: 5.600
## Median :0.030162 Median : 0.78513 Median :0.92863 Median : 7.402
## Mean :0.353153 Mean : 1.91865 Mean :1.32648 Mean : 8.647
## 3rd Qu.:0.213414 3rd Qu.: 1.82027 3rd Qu.:1.91600 3rd Qu.:10.509
## Max. :6.013674 Max. :36.65243 Max. :8.57894 Max. :48.840
## CXXC5 PPP1R14C CT83 NOTO
## Min. : 0.9359 Min. : 0.0361 Min. : 0.000 Min. :0.000000
## 1st Qu.: 5.3429 1st Qu.: 4.4508 1st Qu.: 0.000 1st Qu.:0.000000
## Median : 8.3877 Median : 11.2454 Median : 7.716 Median :0.004625
## Mean :10.3194 Mean : 18.4007 Mean :10.697 Mean :0.042799
## 3rd Qu.:13.0758 3rd Qu.: 27.1789 3rd Qu.:16.792 3rd Qu.:0.009891
## Max. :60.0577 Max. :107.9462 Max. :67.889 Max. :2.293503
outliers.change.vars.in.common.expression.nonTNBC <- xdata[data.Y=="0",outliers.change.vars.in.common.name]
colnames(outliers.change.vars.in.common.expression.nonTNBC) <- as.vector(outliers.change.vars.in.common.code)
summary(outliers.change.vars.in.common.expression.nonTNBC)
## ESR1 TAF7L VSX2
## Min. : 0.01585 Min. :0.00000 Min. :0.000000
## 1st Qu.: 16.14359 1st Qu.:0.03541 1st Qu.:0.000000
## Median : 36.66739 Median :0.06448 Median :0.006088
## Mean : 47.88104 Mean :0.09564 Mean :0.018380
## 3rd Qu.: 69.64894 3rd Qu.:0.11493 3rd Qu.:0.015692
## Max. :272.20332 Max. :1.31195 Max. :0.922559
## ACE2 SLCO1B1 CLDN10 TTLL4
## Min. : 0.00000 Min. :0.00000 Min. :0.00000 Min. : 0.7379
## 1st Qu.: 0.01594 1st Qu.:0.00000 1st Qu.:0.01306 1st Qu.: 2.1871
## Median : 0.04167 Median :0.00000 Median :0.03601 Median : 2.9417
## Mean : 0.50085 Mean :0.01287 Mean :0.11311 Mean : 3.4643
## 3rd Qu.: 0.09604 3rd Qu.:0.00000 3rd Qu.:0.09566 3rd Qu.: 3.9226
## Max. :114.41918 Max. :2.05987 Max. :6.81840 Max. :32.5132
## HAPLN3 DMRTA2 LYPD1 SRSF12
## Min. : 0.1285 Min. :0.000000 Min. :0.01422 Min. :0.00000
## 1st Qu.: 1.9119 1st Qu.:0.000000 1st Qu.:0.20975 1st Qu.:0.05075
## Median : 3.2393 Median :0.004165 Median :0.33774 Median :0.08674
## Mean : 4.7742 Mean :0.018961 Mean :0.45644 Mean :0.15722
## 3rd Qu.: 5.3090 3rd Qu.:0.010962 3rd Qu.:0.52752 3rd Qu.:0.15102
## Max. :92.6516 Max. :0.895248 Max. :4.31328 Max. :4.45669
## PGAP3 CXXC5 PPP1R14C CT83
## Min. : 2.909 Min. : 1.666 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 10.087 1st Qu.: 22.886 1st Qu.: 0.1573 1st Qu.: 0.0000
## Median : 14.315 Median : 31.383 Median : 0.4416 Median : 0.0000
## Mean : 29.555 Mean : 34.598 Mean : 1.9582 Mean : 0.7144
## 3rd Qu.: 20.498 3rd Qu.: 42.703 3rd Qu.: 1.1118 3rd Qu.: 0.0000
## Max. :579.352 Max. :135.424 Max. :152.0083 Max. :60.3399
## NOTO
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.010222
## 3rd Qu.:0.007316
## Max. :0.713836
The different models used for ensemble analysis are sparse logistic model with alpha = 0.7 (EN) based on randonly selected 80 % of patients.
source("~/ensembl.rand.patients.R")
library('qvalue')
set.seed(2010)
n.models.rand <- 100
outliers.rand.patients.EN <- ensembl.rand.patients(data.plus.race.age.rank,data.Y.rank,n.models.rand)
# mean number of variables selected in the 100 runs
mean(outliers.rand.patients.EN$nvariables.selected)
## [1] 81.97
# mean MSE for the 100 runs
mean(outliers.rand.patients.EN$MSE)
## [1] 0.03168607
# mean number of misclassification in the 100 runs
mean(outliers.rand.patients.EN$Misclassifications.by.method)
## [1] 34.81
# influent patients
TNBC.rand.patient.influent <- outliers.rand.patients.EN$outliers.rank[which(outliers.rand.patients.EN$outliers.rank[,n.models.rand+4] < 0.05),1]
length(TNBC.rand.patient.influent)
## [1] 40
# influent patients which are suspect
rownames(data.plus.race.age.rank[TNBC.rand.patient.influent,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.patient.influent,])) %in% suspect)]
## [1] "TCGA-A2-A0EQ-01A-11R-A034-07" "TCGA-A2-A04U-01A-11R-A115-07"
## [3] "TCGA-AN-A0FJ-01A-11R-A00Z-07" "TCGA-LL-A5YP-01A-21R-A28M-07"
## [5] "TCGA-AN-A0FX-01A-11R-A034-07" "TCGA-AO-A0JL-01A-11R-A056-07"
TNBC.rand.patient.var.select <- as.data.frame(table(outliers.rand.patients.EN$var.selected))
TNBC.rand.patient.var.select$Var1 <- as.numeric(as.character(TNBC.rand.patient.var.select[,1]))
TNBC.rand.patient.var.select$var.name <- colnames(data.plus.race.age.rank[,TNBC.rand.patient.var.select[,1]])
# variabes selected in more that 75% of the runs
TNBC.rand.patient.var.select.more75 <- TNBC.rand.patient.var.select[which(TNBC.rand.patient.var.select[,2]>75),3]
length(TNBC.rand.patient.var.select.more75)
## [1] 23
# variables selected in common by the ensembles for the original TNBC and for the 'random patient' strategy
TNBC.rand.patient.var.select.common.originalTNBC <- TNBC.rand.patient.var.select.more75[which(TNBC.rand.patient.var.select.more75 %in% colnames(data.plus.race.age.rank[,TNBC.vars.in.common]))]
length(TNBC.rand.patient.var.select.common.originalTNBC)
## [1] 15
# variables code
ensembl.protein.coding[TNBC.rand.patient.var.select.common.originalTNBC,2][,1]
## GRanges object with 15 ranges and 1 metadata column:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000074410 15 [ 63321378, 63382161] - | CA12
## ENSG00000091831 6 [151656691, 152129619] + | ESR1
## ENSG00000124134 20 [ 45092310, 45101112] - | KCNS1
## ENSG00000130234 X [ 15561033, 15602148] - | ACE2
## ENSG00000140451 15 [ 64815632, 64825668] - | PIF1
## ... ... ... ... . ...
## ENSG00000160282 21 [ 46136262, 46155567] - | FTCD
## ENSG00000161395 17 [ 39671122, 39696797] - | PGAP3
## ENSG00000171604 5 [139647299, 139683882] + | CXXC5
## ENSG00000189013 19 [ 54803535, 54814517] + | KIR2DL4
## ENSG00000198729 6 [150143076, 150250357] + | PPP1R14C
## -------
## seqinfo: 287 sequences from GRCh38 genome
as.matrix(mcols(ensembl.protein.coding[TNBC.rand.patient.var.select.common.originalTNBC,2]))
## gene_name
## [1,] "CA12"
## [2,] "ESR1"
## [3,] "KCNS1"
## [4,] "ACE2"
## [5,] "PIF1"
## [6,] "HAPLN3"
## [7,] "LYPD1"
## [8,] "ZIC1"
## [9,] "UBASH3B"
## [10,] "POM121L2"
## [11,] "FTCD"
## [12,] "PGAP3"
## [13,] "CXXC5"
## [14,] "KIR2DL4"
## [15,] "PPP1R14C"
The different models used for ensemble analysis are sparse logistic model with alpha = 0.7 (EN), departing from a set of 1000 randomly selected variables.
source("~/ensembl.rand.variables.R")
times <- 100
vars.rand <- 1000
outliers.rand.variables.EN <- ensembl.rand.variables(data.plus.race.age.rank,data.Y.rank, times, vars.rand)
# mean number of variables selected in the 100 runs
mean(outliers.rand.variables.EN$nvariables.selected)
## [1] 64.52
# mean MSE for the 100 runs
mean(outliers.rand.variables.EN$MSE)
## [1] 0.0347633
# mean number of misclassification in the 100 runs
mean(outliers.rand.variables.EN$Misclassifications.by.method)
## [1] 41.1
# influent patients
TNBC.rand.variables.influent <- outliers.rand.variables.EN$outliers.rank[which(outliers.rand.variables.EN$outliers.rank[,times+4] < 0.05),1]
length(TNBC.rand.variables.influent)
## [1] 37
# influent patients which are suspect
rownames(data.plus.race.age.rank[TNBC.rand.variables.influent,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.variables.influent,])) %in% suspect)]
## [1] "TCGA-A2-A04U-01A-11R-A115-07" "TCGA-AN-A0FJ-01A-11R-A00Z-07"
## [3] "TCGA-A2-A0EQ-01A-11R-A034-07" "TCGA-LL-A5YP-01A-21R-A28M-07"
## [5] "TCGA-AO-A0JL-01A-11R-A056-07" "TCGA-AN-A0FX-01A-11R-A034-07"
# which influent patients in common with the ensemble for the original TNBC data and by the 'random patient' and 'random variable' strategies
TNBC.rand.outliers.in.common <- as.data.frame(table(c(TNBC.rand.variables.influent,TNBC.rand.patient.influent,TNBC.influent)))
TNBC.rand.outliers.in.common <- as.numeric(as.character(TNBC.rand.outliers.in.common[which(TNBC.rand.outliers.in.common[,2] == 3),1]))
length(TNBC.rand.outliers.in.common)
## [1] 18
getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.outliers.in.common,]))
## [1] "TCGA-AC-A2QJ" "TCGA-OL-A97C" "TCGA-AC-A62X" "TCGA-A2-A0YJ"
## [5] "TCGA-LL-A5YP" "TCGA-E2-A1II" "TCGA-A2-A0EQ" "TCGA-AR-A1AH"
## [9] "TCGA-LL-A740" "TCGA-A2-A3Y0" "TCGA-C8-A3M7" "TCGA-E9-A1ND"
## [13] "TCGA-A2-A1G6" "TCGA-D8-A1JF" "TCGA-OL-A5S0" "TCGA-E9-A22G"
## [17] "TCGA-AR-A251" "TCGA-AR-A1AJ"
# 18 patients in common (TNBC original + rand patients + rand variables) detected as outliers
# influent patients in common for the three emsemble strategies which are suspect
rownames(data.plus.race.age.rank[TNBC.rand.outliers.in.common,])[which(getParticipantCode(rownames(data.plus.race.age.rank[TNBC.rand.outliers.in.common,])) %in% suspect)]
## [1] "TCGA-LL-A5YP-01A-21R-A28M-07" "TCGA-A2-A0EQ-01A-11R-A034-07"