【R>>MOVICS】多组学+亚型秘密武器
MOVICS包整合多组学数据进行分析,其安装的过程太让人心酸,大家安装的过程慢慢体会,下面就来学习下这个由“大鱼海棠”老师开发的R包吧。
1.示例数据
-
brca.tcga.RData
:MOVICS以TCGA乳腺癌的数据为例,包含643例样本的4种完整的组学信息,作为训练集。 -
brca.yau.RData
: 来自BRCA-YAU队列,包含682例样本的信息。
library(MOVICS)
load(system.file("extdata","brca.tcga.RData",package = "MOVICS",mustWork = T))
load(system.file("extdata","brca.yau.RData",package = "MOVICS",mustWork = T))
因多组学聚类过程中相对耗费时间,因此预处理的时候提取mRNAs, lncRNA的前500,promoter CGI的前1000 高变探针或基因,前30个在至少3%的队列中变化的基因。
2.工作流程
2.1 流程概述
MOVICS工作流程分为3个部分:
-
GET module: get subtypes
-
COMP module: compare subtypes
-
RUN module: run marker identification and verify subtypes
2.2 详细步骤
2.2.1 分型(GET Module)
names(brca.tcga)
[1] "mRNA.expr" "lncRNA.expr" "meth.beta" "mut.status" "count"
[6] "fpkm" "maf" "segment" "clin.info"
names(brca.yau)
[1] "mRNA.expr" "clin.info"
# 提取多组学数据
mo.data <- brca.tcga[1:4]
count <- brca.tcga$count
fpkm <- brca.tcga$fpkm
maf <- brca.tcga$maf
segment <- brca.tcga$segment #CNV数据
surv.info <- brca.tcga$clin.info
输入数据过滤
核心函数:getElites()
→提供5种方法,包括mad, sd, cox, freq和na.action。
tmp <- brca.tcga$mRNA.expr
dim(tmp)
[1] 500 643
#假设存在NA的值
tmp[1,1] <- tmp[2,2] <- NA
tmp[1:3,1:3]
BRCA-A03L-01A BRCA-A04R-01A BRCA-A075-01A
SCGB2A2 NA 1.42 7.24
SCGB1D2 10.11 NA 5.88
PIP 4.54 2.59 4.35
elite.tmp <- getElites(dat=tmp,
method="mad",
na.action="rm",
elite.pct=1)
dim(elite.tmp$elite.dat) #将这两个含有NA的值去掉了
[1] 498 643
elite.tmp <- getElites(dat=tmp,
method = "mad",
na.action = "impute",
elite.pct = 1)
dim(elite.tmp$elite.dat) # impute插补
[1] 500 643
#> 用mad/sd筛选
tmp <- brca.tcga$mRNA.expr
elite.tmp <- getElites(dat=tmp,
method = "mad",
elite.pct = 0.1)
dim(elite.tmp$elite.dat) #选取前10%
[1] 50 643
elite.tmp <- getElites(dat = tmp,
method = "sd",
elite.num = 100,
elite.pct = 0.1)
dim(elite.tmp$elite.dat)
[1] 100 643
# pca方法筛选
tmp <- brca.tcga$mRNA.expr # get expression data with 500 features
elite.tmp <- getElites(dat = tmp,
method = "pca",
pca.ratio = 0.95)
dim(elite.tmp$elite.dat)
[1] 204 643
# cox方法筛选
tmp <- brca.tcga$mRNA.expr
elite.tmp <- getElites(dat=tmp,
method = "cox",
surv.info = surv.info,
p.cutoff = 0.05,
elite.num = 100)
5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75% 80% 85% 90% 95% 100%
dim(elite.tmp$elite.dat)
[1] 125 643
table(elite.tmp$unicox.res$pvalue<0.05)
FALSE TRUE
375 125
tmp <- brca.tcga$mut.status
elite.tmp <- getElites(dat=tmp,
method = "cox",
surv.info = surv.info,
p.cutoff = 0.05,
elite.num = 100)
7% 13% 20% 27% 33% 40% 47% 53% 60% 67% 73% 80% 87% 93% 100%
dim(elite.tmp$elite.dat)
[1] 3 643
table(elite.tmp$unicox.res$pvalue<0.05)
FALSE TRUE
27 3
#> 突变频率筛选
tmp <- brca.tcga$mut.status
rowSums(tmp)
PIK3CA TP53 TTN CDH1 GATA3 MLL3 MUC16 MAP3K1 SYNE1 MUC12 DMD
208 186 111 83 58 49 48 38 33 32 31
NCOR1 FLG PTEN RYR2 USH2A SPTA1 MAP2K4 MUC5B NEB SPEN MACF1
31 30 29 27 27 25 25 24 24 23 23
RYR3 DST HUWE1 HMCN1 CSMD1 OBSCN APOB SYNE2
23 22 22 22 21 21 21 21
elite.tmp <- getElites(dat=tmp,
method = "freq",
elite.num = 80,
elite.pct = 0.1)
rowSums(elite.tmp$elite.dat)
PIK3CA TP53 TTN CDH1
208 186 111 83
elite.tmp <- getElites(dat = tmp,
method = "freq",
elite.pct = 0.2)
rowSums(elite.tmp$elite.dat)
PIK3CA TP53
208 186
获取最佳分类
optk.brca <- getClustNum(data = mo.data,
is.binary = c(F,F,F,T), #四种组学数据中只有somatic mutation是二分类矩阵
try.N.clust = 2:8,
fig.name = "CLUSTER NUMBER OF TCGA-BRCA")
Figure.1 确定最佳聚类
根据上图直接选取N.clust=5作为最佳分类,然后用多种方法进行聚类。
moic.res.list <- getMOIC(data=mo.data,
methodslist = list("SNF", "PINSPlus", "NEMO", "COCA", "LRAcluster", "ConsensusClustering", "IntNMF", "CIMLR", "MoCluster"),
N.clust = 5,
type = c("gaussian", "gaussian", "gaussian", "binomial"))
iClusterBayes.res <- getMOIC(data = mo.data,
N.clust = 5,
methodslist = "iClusterBayes", # specify only ONE algorithm here
type = c("gaussian","gaussian","gaussian","binomial"), # data type corresponding to the list
n.burnin = 1800,
n.draw = 1200,
prior.gamma = c(0.5, 0.5, 0.5, 0.5),
sdev = 0.05,
thin = 3)
moic.res.list <- append(moic.res.list,
list("iClusterBayes" = iClusterBayes.res))
save(moic.res.list,file = "moic.res.list.rda")
save(iClusterBayes.res,file = "iClusterBayes.res.rda")
获取聚类矩阵
load("moic.res.list.rda")
cmoic.brca <- getConsensusMOIC(moic.res.list = moic.res.list,
fig.name = "CONSENSUS HEATMAP",
distance = "euclidean",
linkage = "average")
Figure 2.层次聚类热图
silhoutte进行相似性评价
getSilhouette(sil=cmoic.brca$sil,
fig.path = getwd(),
fig.name = "SILHOUETTE",
height = 5.5,
width = 5)
Figure 3. 相似性评价轮廓图
png
2
聚类热图
indata <- mo.data
indata$meth.beta <- log2(indata$meth.beta/(1-indata$meth.beta))
plotdata <- getStdiz(data=indata,
halfwidth = c(2,2,2,NA),
centerFlag = c(T,T,T,F),
scaleFlag = c(T,T,T,F))
值得注意的是iClusterBayes.res$feat.res已经按照每个组学数据特征的后延概率排序,我们可以选取前10个feature,并生成一个feature list,命名为annRow,对热图进行注释。
load("iClusterBayes.res.rda")
feat <- iClusterBayes.res$feat.res
feat1 <- feat[which(feat$dataset == "mRNA.expr"),][1:10,"feature"]
feat2 <- feat[which(feat$dataset == "lncRNA.expr"),][1:10,"feature"]
feat3 <- feat[which(feat$dataset == "meth.beta"),][1:10,"feature"]
feat4 <- feat[which(feat$dataset == "mut.status"),][1:10,"feature"]
annRow <- list(feat1,feat2,feat3,feat4)
# if no color list specified all subheatmaps will be unified to green and red color pattern
mRNA.col <- c("#00FF00", "#008000", "#000000", "#800000", "#FF0000")
lncRNA.col <- c("#6699CC", "white" , "#FF3C38")
meth.col <- c("#0074FE", "#96EBF9", "#FEE900", "#F00003")
mut.col <- c("grey90" , "black")
col.list <- list(mRNA.col, lncRNA.col, meth.col, mut.col)
生成羡慕已久的MoHeatmap
getMoHeatmap(data = plotdata,
row.title = c("mRNA","lncRNA","Methylation","Mutation"),
is.binary = c(F,F,F,T), # the 4th data is mutation which is binary
legend.name = c("mRNA.FPKM","lncRNA.FPKM","M value","Mutated"),
clust.res = iClusterBayes.res$clust.res, # cluster results
clust.dend = NULL, # no dendrogram
show.rownames = c(F,F,F,F), # specify for each omics data
show.colnames = FALSE, # show no sample names
annRow = annRow, # mark selected features
color = col.list,
annCol = NULL, # no annotation for samples
annColors = NULL, # no annotation color
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF ICLUSTERBAYES")
Figure 4.热图
为热图添加列注释信息
annCol<- surv.info[,c("PAM50", "pstage", "age"), drop = FALSE]
annColors <- list(age = circlize::colorRamp2(breaks = c(min(annCol$age),
median(annCol$age),
max(annCol$age)),
colors = c("#0000AA", "#555555", "#AAAA00")),
PAM50 = c("Basal" = "blue",
"Her2" = "red",
"LumA" = "yellow",
"LumB" = "green",
"Normal" = "black"),
pstage = c("T1" = "green",
"T2" = "blue",
"T3" = "red",
"T4" = "yellow",
"TX" = "black"))
getMoHeatmap(data = plotdata,
row.title = c("mRNA","lncRNA","Methylation","Mutation"),
is.binary = c(F,F,F,T), # the 4th data is mutation which is binary
legend.name = c("mRNA.FPKM","lncRNA.FPKM","M value","Mutated"),
clust.res = cmoic.brca$clust.res, # consensusMOIC results
clust.dend = NULL, # show no dendrogram for samples
show.rownames = c(F,F,F,F), # specify for each omics data
show.colnames = FALSE, # show no sample names
show.row.dend = c(F,F,F,F), # show no dendrogram for features
annRow = NULL, # no selected features
color = col.list,
annCol = annCol, # annotation for samples
annColors = annColors, # annotation color
width = 10, # width of each subheatmap
height = 5, # height of each subheatmap
fig.name = "COMPREHENSIVE HEATMAP OF CONSENSUSMOIC")
Figure 5.带注释信息的热图
2.2.2 亚型比较(COMP Module)
在确定亚型后,往往需要的对亚型的特征进行描述,因此COMICS提供了常见的亚型特征分析。
(1)预后分析
surv.brca <- compSurv(moic.res = cmoic.brca,
surv.info = surv.info,
convt.time = "m",
surv.median.line = "h",
xyrs.est = c(5,10),
fig.name = "KAPLAN-MEIER CURVE OF CONSENSUSMOIC")
Figure 6.预后分析
(2)基线资料
clin.brca <- compClinvar(moic.res = cmoic.brca,
var2comp = surv.info,
strata = "Subtype",
factorVars = c("PAM50","pstage","fustat"),
nonnormalVars = "futime",
exactVars="pstage",
doWord=T,
tab.name="SUMMARIZATION OF CLINICAL FEATURES")
print(clin.brca$compTab)
level CS1 CS2
1 n 170 52
2 fustat (%) 0 144 (84.7) 39 (75.0)
3 1 26 (15.3) 13 (25.0)
4 futime (median [IQR]) 747.50 [461.50, 1392.50] 700.50 [414.75, 1281.25]
5 PAM50 (%) Basal 1 ( 0.6) 2 ( 3.8)
6 Her2 4 ( 2.4) 34 (65.4)
7 LumA 81 (47.6) 2 ( 3.8)
8 LumB 84 (49.4) 9 (17.3)
CS3 CS4 CS5
1 173 137 111
2 163 (94.2) 126 (92.0) 95 (85.6)
3 10 ( 5.8) 11 ( 8.0) 16 (14.4)
4 867.00 [473.00, 1550.00] 680.50 [407.25, 1205.25] 785.00 [479.00, 1628.00]
5 0 ( 0.0) 0 ( 0.0) 108 (97.3)
6 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
7 154 (89.0) 107 (78.1) 0 ( 0.0)
8 1 ( 0.6) 30 (21.9) 0 ( 0.0)
p test
1
2 0.001
3
4 0.321 nonnorm
5 <0.001
6
7
8
[ reached 'max' / getOption("max.print") -- omitted 7 rows ]
(3)突变频率
mut.brca <- compMut(moic.res = cmoic.brca,
mut.matrix = brca.tcga$mut.status, # binary somatic mutation matrix
doWord = TRUE, # generate table in .docx format
doPlot = TRUE, # draw OncoPrint
freq.cutoff = 0.05, # keep those genes that mutated in at least 5% of samples
p.adj.cutoff = 0.05, # keep those genes with adjusted p value < 0.05 to draw OncoPrint
innerclust = TRUE, # perform clustering within each subtype
annCol = annCol, # same annotation for heatmap
annColors = annColors, # same annotation color for heatmap
width = 6,
height = 2,
fig.name = "ONCOPRINT FOR SIGNIFICANT MUTATIONS",
tab.name = "INDEPENDENT TEST BETWEEN SUBTYPE AND MUTATION")
Figure 7.突变瀑布图
print(mut.brca)
Gene (Mutated) TMB CS1 CS2 CS3 CS4
1 PIK3CA 208 (32%) 44 (25.9%) 17 (32.7%) 80 (46.2%) 65 (47.4%)
2 TP53 186 (29%) 46 (27.1%) 39 (75.0%) 11 ( 6.4%) 8 ( 5.8%)
3 TTN 111 (17%) 31 (18.2%) 20 (38.5%) 25 (14.5%) 15 (10.9%)
4 CDH1 83 (13%) 7 ( 4.1%) 3 ( 5.8%) 57 (32.9%) 15 (10.9%)
5 GATA3 58 ( 9%) 19 (11.2%) 1 ( 1.9%) 16 ( 9.2%) 22 (16.1%)
6 MLL3 49 ( 8%) 14 ( 8.2%) 5 ( 9.6%) 11 ( 6.4%) 14 (10.2%)
7 MUC16 48 ( 8%) 10 ( 5.9%) 9 (17.3%) 14 ( 8.1%) 7 ( 5.1%)
8 MAP3K1 38 ( 6%) 7 (4.1%) 1 (1.9%) 15 (8.7%) 13 (9.5%)
CS5 pvalue padj
1 2 ( 1.8%) 1.30e-20 5.85e-20
2 82 (73.9%) 2.26e-52 2.03e-51
3 20 (18.0%) 8.45e-04 1.52e-03
4 1 ( 0.9%) 4.67e-18 1.40e-17
5 0 ( 0.0%) 5.59e-06 1.26e-05
6 5 ( 4.5%) 4.39e-01 4.39e-01
7 8 ( 7.2%) 9.43e-02 1.06e-01
8 2 (1.8%) 2.20e-02 3.30e-02
[ reached 'max' / getOption("max.print") -- omitted 1 rows ]
(4)TMB
tmb.brca <- compTMB(moic.res = cmoic.brca,
maf = maf,
rmDup = T,
rmFLAGS = F,
exome.size = 38,
test.method = "nonparametric",
fig.name = "DISTRIBUTION OF TMB AND TITV")
-Validating
-Silent variants: 24329
-Summarizing
--Possible FLAGS among top ten genes:
TTN
MUC16
-Processing clinical data
--Missing clinical data
-Finished in 3.340s elapsed (3.170s cpu)
Kruskal-Wallis rank sum test p value = 1.74e-23
post-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:
CS1 CS2 CS3 CS4
CS2 "2.01e-03" " NA" " NA" " NA"
CS3 "1.97e-06" "8.42e-09" " NA" " NA"
CS4 "3.61e-08" "3.61e-11" "8.41e-01" " NA"
CS5 "9.20e-05" "8.81e-01" "1.54e-12" "1.03e-15"
Figure 8.肿瘤突变负荷
head(tmb.brca$TMB.dat)
samID variants TMB log10TMB Subtype
570 BRCA-A1EW-01A 9 0.2368421 0.09231426 CS1
564 BRCA-A0XT-01A 10 0.2631579 0.10145764 CS1
548 BRCA-A1ES-01A 13 0.3421053 0.12778658 CS1
541 BRCA-A1NG-01A 14 0.3684211 0.13621975 CS1
546 BRCA-A273-01A 14 0.3684211 0.13621975 CS1
537 BRCA-A1KP-01A 15 0.3947368 0.14449227 CS1
(5)FGA
FGA: fraction genome altered
colnames(segment) <- c("sample","chrom","start","end","value")
head(segment)
sample chrom start end value
1 BRCA-A090-01A 1 3301765 54730235 -0.1271
2 BRCA-A090-01A 1 54730247 57443819 -0.0899
3 BRCA-A090-01A 1 57448465 57448876 -1.1956
4 BRCA-A090-01A 1 57448951 64426102 -0.1009
5 BRCA-A090-01A 1 64426648 106657734 -0.1252
6 BRCA-A090-01A 1 106657854 106835667 0.1371
fga.brca <- compFGA(moic.res = cmoic.brca,
segment = segment,
iscopynumber = F, #this is a segmented copy number file
cnathreshold = 0.2,
test.method = "nonparametric",
fig.name = "BARPLOT OF FGA")
5% 10% 15% 21% 26% 31% 36% 41% 46% 51% 57% 62% 67% 72% 77% 82% 88% 93% 98%
Figure 9.FGA
head(fga.brca$summary)
samID FGA FGG FGL Subtype
1 BRCA-A03L-01A 0.6217991 0.30867268 0.31312638 CS1
2 BRCA-A04R-01A 0.2531019 0.09132014 0.16178176 CS1
3 BRCA-A075-01A 0.7007067 0.41444237 0.28626433 CS2
4 BRCA-A08O-01A 0.6501287 0.45648145 0.19364725 CS1
5 BRCA-A0A6-01A 0.1468893 0.06356488 0.08332444 CS3
6 BRCA-A0AD-01A 0.1722214 0.03864521 0.13357618 CS4
(6)药物敏感性
drug.brca <- compDrugsen(moic.res = cmoic.brca,
norm.expr = fpkm[,cmoic.brca$clust.res$samID],
drugs = c("Cisplatin", "Paclitaxel"),
tissueType = "breast",
test.method = "nonparametric",
prefix = "BOXVIOLIN OF ESTIMATED IC50")
Cisplatin: Kruskal-Wallis rank sum test p value = 6.61e-61
post-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:
CS1 CS2 CS3 CS4
CS2 "2.18e-19" " NA" " NA" " NA"
CS3 "1.07e-18" "6.45e-08" " NA" " NA"
CS4 "6.93e-01" "2.18e-19" "5.76e-17" " NA"
CS5 "2.92e-33" "1.50e-01" "2.74e-16" "4.45e-32"
Paclitaxel: Kruskal-Wallis rank sum test p value = 3.59e-17
post-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:
CS1 CS2 CS3 CS4
CS2 "2.54e-11" " NA" " NA" " NA"
CS3 "5.42e-04" "3.65e-15" " NA" " NA"
CS4 "7.08e-03" "3.18e-14" "6.85e-01" " NA"
CS5 "7.61e-01" "2.06e-09" "8.87e-03" "3.29e-02"
Figure 10.药物敏感性
(7)与已有亚型对比
surv.info$pstage <- factor(surv.info$pstage,levels = c("TX","T1","T2","T3","T4"))
agree.brca <- compAgree(moic.res = cmoic.brca,
subt2comp = surv.info[,c("PAM50","pstage")],
doPlot = T,
box.width = 0.2,
fig.name = "AGREEMENT OF CONSENSUSMOIC WITH PAM50 AND PSTAGE")
Figure 11.与已知亚型对比
2.2.3 运行模块(RUN module)
前面已进行亚型分析,并确定各个亚型的特征,下面来确定各个亚型潜在的预测标志物和相应通路。
(1)差异分析
dea.method可以选择“edger”,“edseq2”和“limma”方法,当请注意deger和edseq2需要count数据,而 limma需要log2处理的TPM/FPKM数据。
# DEA with edgeR
runDEA(dea.method = "edger",
expr = count,
moic.res = cmoic.brca,
prefix = "TCGA-BRCA")
(2)Biomarker
默认选取每个亚型的前200个差异基因(注意:次差异基因不与其他部分的特征基因重合)。
marker.up <- runMarker(moic.res = cmoic.brca,
dea.method = "edger", # name of DEA method
prefix = "TCGA-BRCA", # MUST be the same of argument in runDEA()
dat.path = getwd(), # path of DEA files
res.path = getwd(), # path to save marker files
p.cutoff = 0.05, # p cutoff to identify significant DEGs
p.adj.cutoff = 0.05, # padj cutoff to identify significant DEGs
dirct = "up", # direction of dysregulation in expression
n.marker = 100, # number of biomarkers for each subtype
doplot = TRUE, # generate diagonal heatmap
norm.expr = fpkm, # use normalized expression as heatmap input
annCol = annCol, # sample annotation in heatmap
annColors = annColors, # colors for sample annotation
show_rownames = FALSE, # show no rownames (biomarker name)
fig.name = "UPREGULATED BIOMARKER HEATMAP")
Figure 12.marker基因
head(marker.up$templates)
probe class dirct
1 CACNG6 CS1 up
2 VSTM2A CS1 up
3 SPDYC CS1 up
4 ARHGAP36 CS1 up
5 CST9 CS1 up
6 ASCL1 CS1 up
(3)GSEA分析
MSIGDB.FILE <- system.file("extdata", "c5.bp.v7.1.symbols.xls", package = "MOVICS", mustWork = TRUE)
gsea.up <- runGSEA(moic.res = cmoic.brca,
dea.method = "edger", # name of DEA method
prefix = "TCGA-BRCA", # MUST be the same of argument in runDEA()
dat.path = getwd(), # path of DEA files
res.path = getwd(), # path to save GSEA files
msigdb.path = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
norm.expr = fpkm, # use normalized expression to calculate enrichment score
dirct = "up", # direction of dysregulation in pathway
p.cutoff = 0.05, # p cutoff to identify significant pathways
p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
gsva.method = "gsva", # method to calculate single sample enrichment score
norm.method = "mean", # normalization method to calculate subtype-specific enrichment score
fig.name = "UPREGULATED PATHWAY HEATMAP")
Estimating GSEA scores for 50 gene sets.
Estimating ECDFs with Gaussian kernels
Figure 13.GSEA
(4)GSVA分析
GSET.FILE <-
system.file("extdata", "gene sets of interest.gmt", package = "MOVICS", mustWork = TRUE)
gsva.res <-
runGSVA(moic.res = cmoic.brca,
norm.expr = fpkm,
gset.gmt.path = GSET.FILE, # ABSOLUTE path of gene set file
gsva.method = "gsva", # method to calculate single sample enrichment score
annCol = annCol,
annColors = annColors,
fig.path = getwd(),
fig.name = "GENE SETS OF INTEREST HEATMAP",
height = 5,
width = 8)
Estimating GSVA scores for 21 gene sets.
Estimating ECDFs with Gaussian kernels
Figure 14.GSVA
print(gsva.res$raw.es[1:3,1:3])
BRCA-A03L-01A BRCA-A04R-01A BRCA-A075-01A
Adhesion 0.09351280 -0.2125523 0.10362446
Antigen_Processing 0.05451682 -0.3909617 0.05508321
B-Cell_Functions -0.01931423 -0.6120864 0.18301567
print(gsva.res$scaled.es[1:3,1:3])
BRCA-A03L-01A BRCA-A04R-01A BRCA-A075-01A
Adhesion 0.28466782 -0.6600776 0.3158800
Antigen_Processing 0.12510682 -1.0000000 0.1267097
B-Cell_Functions -0.04056456 -1.0000000 0.4561039
(5)NTP验证外部数据集
yau.ntp.pred <- runNTP(expr = brca.yau$mRNA.expr,
templates = marker.up$templates,
scaleFlag = T,
centerFlag = T,
doPlot = T,
fig.name = "NTP HEATMAP FOR YAU")
Figure 15.NTP验证外部数据集
CS1 CS2 CS3 CS4 CS5 <NA>
114 84 116 92 141 135
head(yau.ntp.pred$ntp.res)
prediction d.CS1 d.CS2 d.CS3 d.CS4 d.CS5 p.value FDR
107 CS4 0.7102 0.7160 0.7407 0.5794 0.7477 0.0010 0.0019
109 CS2 0.7486 0.5743 0.7140 0.7579 0.7484 0.0010 0.0019
11 CS1 0.6832 0.6998 0.7424 0.7186 0.7678 0.1399 0.1564
110 CS1 0.5051 0.7696 0.7521 0.7140 0.7529 0.0010 0.0019
111 CS1 0.6704 0.6884 0.7781 0.7115 0.7891 0.0060 0.0092
113 CS2 0.7386 0.6364 0.6533 0.7302 0.7271 0.0180 0.0243
验证集的预后分析
surv.yau <- compSurv(moic.res = yau.ntp.pred,
surv.info = brca.yau$clin.info,
convt.time = "m",
surv.median.line = "hv",
fig.name = "KAPLAN-MEIER CURVE OF NTP FOR YAU")
Figure 16.test数据集生存曲线
检测下与PAM50分型是否一致
agree.yau <- compAgree(moic.res = yau.ntp.pred,
subt2comp = brca.yau$clin.info[,"PAM50",drop=F],
doPlot = T,
fig.name = "YAU PREDICTEDMOIC WITH PAM50")
Figure 17.test数据集与已知亚型关系
print(agree.yau)
current.subtype other.subtype RI AMI JI FM
1 Subtype PAM50 0.7819319 0.369929 0.3286758 0.4959205
Kappa一致性评价
yau.pam.pred <- runPAM(train.expr = fpkm,
moic.res = cmoic.brca,
test.expr = brca.yau$mRNA.expr)
tcga.ntp.pred <- runNTP(expr = fpkm,
templates = marker.up$templates,
doPlot = FALSE)
CS1 CS2 CS3 CS4 CS5 <NA>
139 79 162 105 108 50
tcga.pam.pred <- runPAM(train.expr = fpkm,
moic.res = cmoic.brca,
test.expr = fpkm)
runKappa(subt1 = cmoic.brca$clust.res$clust,
subt2 = tcga.ntp.pred$clust.res$clust,
subt1.lab = "CMOIC",
subt2.lab = "NTP",
fig.name = "CONSISTENCY HEATMAP FOR TCGA between CMOIC and NTP")
Figure 18.一致性检验NTPvsCMOIC
runKappa(subt1 = cmoic.brca$clust.res$clust,
subt2 = tcga.pam.pred$clust.res$clust,
subt1.lab = "CMOIC",
subt2.lab = "PAM",
fig.name = "CONSISTENCY HEATMAP FOR TCGA between CMOIC and PAM")
Figure 19.一致性检验PAMvsCMOIC
runKappa(subt1 = yau.ntp.pred$clust.res$clust,
subt2 = yau.pam.pred$clust.res$clust,
subt1.lab = "NTP",
subt2.lab = "PAM",
fig.name = "CONSISTENCY HEATMAP FOR YAU")
Figure 20.一致性检验PAMvsNTP
3.彩蛋
通过整套流程,你会发现moic.res
几乎贯穿整个下游分析,甚至可以说如果无法通过GET module获取合适的moic.res
对象,下游分析就凉凉啦。事实上,moic.res
的核心信息是clust.res
。 clust.res
是一个data.frame,有samID列与其他数据匹配即可。另一个参数是mo.method
,作为输出数据的前缀。 例如,需要检测下PAM50分型的预测价值,仅仅需要设置pseudo.moic.res
对象和自定义的mo.method
即可。这几乎为任何亚型分析,打开了洪荒之力的大门。惊叹!!!
pseudo.moic.res <- list("clust.res"=surv.info,
"mo.method"="PAM50")
pseudo.moic.res$clust.res$samID <- rownames(pseudo.moic.res$clust.res)
pseudo.moic.res$clust.res$clust <- sapply(pseudo.moic.res$clust.res$PAM50,
switch,
"Basal" = 1, # relabel Basal as 1
"Her2" = 2, # relabel Her2 as 2
"LumA" = 3, # relabel LumA as 3
"LumB" = 4, # relabel LumnB as 4
"Normal" = 5) # relabel Normal as 5
head(pseudo.moic.res$clust.res)
fustat futime PAM50 pstage age samID clust
BRCA-A03L-01A 0 2442 LumA T3 34 BRCA-A03L-01A 3
BRCA-A04R-01A 0 2499 LumB T1 36 BRCA-A04R-01A 4
BRCA-A075-01A 0 518 LumB T2 42 BRCA-A075-01A 4
BRCA-A08O-01A 0 943 LumA T2 45 BRCA-A08O-01A 3
BRCA-A0A6-01A 0 640 LumA T2 64 BRCA-A0A6-01A 3
BRCA-A0AD-01A 0 1157 LumA T1 83 BRCA-A0AD-01A 3
pam50.brca <- compSurv(moic.res = pseudo.moic.res,
surv.info = surv.info,
convt.time = "y", # convert day unit to year
surv.median.line = "h", # draw horizontal line at median survival
fig.name = "KAPLAN-MEIER CURVE OF PAM50 BY PSEUDO")
Figure 21.pseudo.moic.res
Secssion Information
sessionInfo()
R version 4.1.0 (2021-05-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)
Matrix products: default
locale:
[1] LC_COLLATE=Chinese (Simplified)_China.936
[2] LC_CTYPE=Chinese (Simplified)_China.936
[3] LC_MONETARY=Chinese (Simplified)_China.936
[4] LC_NUMERIC=C
[5] LC_TIME=Chinese (Simplified)_China.936
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] MOVICS_0.99.16 Biobase_2.52.0 BiocGenerics_0.38.0
[4] rmdformats_1.0.2 knitr_1.33 pacman_0.5.1
[7] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.6
[10] purrr_0.3.4 readr_1.4.0 tidyr_1.1.3
[13] tibble_3.1.2 ggplot2_3.3.3 tidyverse_1.3.1
loaded via a namespace (and not attached):
[1] rsvd_1.0.5 corpcor_1.6.9
[3] aricode_1.0.0 class_7.3-19
[5] foreach_1.5.1 crayon_1.4.1
[7] MASS_7.3-54 rhdf5filters_1.4.0
[9] nlme_3.1-152 backports_1.2.1
[11] reprex_2.0.0 sva_3.40.0
[13] impute_1.66.0 GOSemSim_2.18.0
[15] rlang_0.4.11 svd_0.5
[17] XVector_0.32.0 readxl_1.3.1
[19] heatmap.plus_1.3 irlba_2.3.3
[21] nloptr_1.2.2.2 limma_3.48.0
[23] BiocParallel_1.26.0 rjson_0.2.20
[25] bit64_4.0.5 glue_1.4.2
[27] rngtools_1.5 SNFtool_2.2
[29] AnnotationDbi_1.54.0 oompaData_3.1.1
[31] DOSE_3.18.0 haven_2.4.1
[33] tidyselect_1.1.1 SummarizedExperiment_1.22.0
[35] km.ci_0.5-2 rio_0.5.26
[37] XML_3.99-0.6 zoo_1.8-9
[39] ggpubr_0.4.0 ridge_2.9
[41] maftools_2.8.0 xtable_1.8-4
[43] magrittr_2.0.1 evaluate_0.14
[45] cli_2.5.0 zlibbioc_1.38.0
[47] rstudioapi_0.13 fastmatch_1.1-0
[49] ClassDiscovery_3.3.13 InterSIM_2.2.0
[51] treeio_1.16.1 GSVA_1.40.0
[53] BiocSingular_1.8.0 xfun_0.23
[55] coca_1.1.0 clue_0.3-59
[57] cluster_2.1.2 caTools_1.18.2
[59] tidygraph_1.2.0 ggtext_0.1.1
[61] KEGGREST_1.32.0 flexclust_1.4-0
[63] ggrepel_0.9.1 ape_5.5
[65] permute_0.9-5 Biostrings_2.60.0
[67] png_0.1-7 withr_2.4.2
[69] bitops_1.0-7 ggforce_0.3.3
[71] plyr_1.8.6 cellranger_1.1.0
[73] GSEABase_1.54.0 e1071_1.7-7
[75] survey_4.0
[ reached getOption("max.print") -- omitted 167 entries ]
参考资料:
-
MOVICS:Multi-Omics integration and VIsualization in Cancer Subtyping(MOVICS VIGNETTE)
-
Lu, X., Meng, J., Zhou, Y., Jiang, L., and Yan, F. (2020). MOVICS: an R package for multi-omics integration and visualization in cancer subtyping. bioRxiv, 2020.2009.2015.297820. [doi.org/10.1101/2020.09.15.297820]