【R>>MOVICS】多组学+亚型秘密武器

2021-06-09 本文已影响0人高大石头

MOVICS包整合多组学数据进行分析，其安装的过程太让人心酸，大家安装的过程慢慢体会，下面就来学习下这个由“大鱼海棠”老师开发的R包吧。

1.示例数据

brca.tcga.RData：MOVICS以TCGA乳腺癌的数据为例，包含643例样本的4种完整的组学信息,作为训练集。
brca.yau.RData: 来自BRCA-YAU队列，包含682例样本的信息。

library(MOVICS)
load(system.file("extdata","brca.tcga.RData",package = "MOVICS",mustWork = T))
load(system.file("extdata","brca.yau.RData",package = "MOVICS",mustWork = T))

因多组学聚类过程中相对耗费时间，因此预处理的时候提取mRNAs, lncRNA的前500，promoter CGI的前1000 高变探针或基因，前30个在至少3%的队列中变化的基因。

2.工作流程

2.1 流程概述

MOVICS工作流程分为3个部分：

GET module: get subtypes
COMP module: compare subtypes
RUN module: run marker identification and verify subtypes

2.2 详细步骤

2.2.1 分型(GET Module)

names(brca.tcga)

[1] "mRNA.expr"   "lncRNA.expr" "meth.beta"   "mut.status"  "count"      
[6] "fpkm"        "maf"         "segment"     "clin.info"

names(brca.yau)

[1] "mRNA.expr" "clin.info"

# 提取多组学数据
mo.data <- brca.tcga[1:4]
count <- brca.tcga$count
fpkm <- brca.tcga$fpkm
maf <- brca.tcga$maf
segment <- brca.tcga$segment #CNV数据
surv.info <- brca.tcga$clin.info

输入数据过滤

核心函数：getElites()→提供5种方法，包括mad, sd, cox, freq和na.action。

tmp <- brca.tcga$mRNA.expr
dim(tmp)

[1] 500 643

#假设存在NA的值
tmp[1,1] <- tmp[2,2] <- NA
tmp[1:3,1:3]

        BRCA-A03L-01A BRCA-A04R-01A BRCA-A075-01A
SCGB2A2            NA          1.42          7.24
SCGB1D2         10.11            NA          5.88
PIP              4.54          2.59          4.35

elite.tmp <- getElites(dat=tmp,
                       method="mad",
                       na.action="rm",
                       elite.pct=1)
dim(elite.tmp$elite.dat) #将这两个含有NA的值去掉了

[1] 498 643

elite.tmp <- getElites(dat=tmp,
                       method = "mad",
                       na.action = "impute",
                       elite.pct = 1)
dim(elite.tmp$elite.dat) # impute插补

[1] 500 643

#> 用mad/sd筛选
tmp <- brca.tcga$mRNA.expr
elite.tmp <- getElites(dat=tmp,
                       method = "mad",
                       elite.pct = 0.1)
dim(elite.tmp$elite.dat) #选取前10%

[1]  50 643

elite.tmp <- getElites(dat       = tmp,
                       method    = "sd",
                       elite.num = 100,
                       elite.pct = 0.1)

dim(elite.tmp$elite.dat)

[1] 100 643

# pca方法筛选
tmp       <- brca.tcga$mRNA.expr # get expression data with 500 features
elite.tmp <- getElites(dat       = tmp,
                       method    = "pca",
                       pca.ratio = 0.95)
dim(elite.tmp$elite.dat)

[1] 204 643

# cox方法筛选
tmp <- brca.tcga$mRNA.expr
elite.tmp <- getElites(dat=tmp,
                       method = "cox",
                       surv.info = surv.info,
                       p.cutoff = 0.05,
                       elite.num = 100)

5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75% 80% 85% 90% 95% 100%

dim(elite.tmp$elite.dat)

[1] 125 643

table(elite.tmp$unicox.res$pvalue<0.05)


FALSE  TRUE 
  375   125

tmp <- brca.tcga$mut.status
elite.tmp <- getElites(dat=tmp,
                       method = "cox",
                       surv.info = surv.info,
                       p.cutoff = 0.05,
                       elite.num = 100)

7% 13% 20% 27% 33% 40% 47% 53% 60% 67% 73% 80% 87% 93% 100%

dim(elite.tmp$elite.dat)

[1]   3 643

table(elite.tmp$unicox.res$pvalue<0.05)


FALSE  TRUE 
   27     3

#> 突变频率筛选
tmp <- brca.tcga$mut.status
rowSums(tmp)

PIK3CA   TP53    TTN   CDH1  GATA3   MLL3  MUC16 MAP3K1  SYNE1  MUC12    DMD 
   208    186    111     83     58     49     48     38     33     32     31 
 NCOR1    FLG   PTEN   RYR2  USH2A  SPTA1 MAP2K4  MUC5B    NEB   SPEN  MACF1 
    31     30     29     27     27     25     25     24     24     23     23 
  RYR3    DST  HUWE1  HMCN1  CSMD1  OBSCN   APOB  SYNE2 
    23     22     22     22     21     21     21     21

elite.tmp <- getElites(dat=tmp,
                       method = "freq",
                       elite.num = 80,
                       elite.pct = 0.1)
rowSums(elite.tmp$elite.dat)

PIK3CA   TP53    TTN   CDH1 
   208    186    111     83

elite.tmp <- getElites(dat = tmp,
                       method = "freq",
                       elite.pct = 0.2)
rowSums(elite.tmp$elite.dat)

PIK3CA   TP53 
   208    186

获取最佳分类

optk.brca <- getClustNum(data = mo.data,
                         is.binary = c(F,F,F,T), #四种组学数据中只有somatic mutation是二分类矩阵
                         try.N.clust = 2:8,
                         fig.name = "CLUSTER NUMBER OF TCGA-BRCA")

Figure.1 确定最佳聚类

根据上图直接选取N.clust=5作为最佳分类，然后用多种方法进行聚类。

moic.res.list <- getMOIC(data=mo.data,
                         methodslist = list("SNF", "PINSPlus", "NEMO", "COCA", "LRAcluster", "ConsensusClustering", "IntNMF", "CIMLR", "MoCluster"),
                         N.clust = 5,
                         type = c("gaussian", "gaussian", "gaussian", "binomial"))
iClusterBayes.res <- getMOIC(data        = mo.data,
                             N.clust     = 5,
                             methodslist = "iClusterBayes", # specify only ONE algorithm here
                             type        = c("gaussian","gaussian","gaussian","binomial"), # data type corresponding to the list
                             n.burnin    = 1800,
                             n.draw      = 1200,
                             prior.gamma = c(0.5, 0.5, 0.5, 0.5),
                             sdev        = 0.05,
                             thin        = 3)
moic.res.list <- append(moic.res.list,
                        list("iClusterBayes" = iClusterBayes.res))
save(moic.res.list,file = "moic.res.list.rda")
save(iClusterBayes.res,file = "iClusterBayes.res.rda")

获取聚类矩阵

load("moic.res.list.rda")
cmoic.brca <- getConsensusMOIC(moic.res.list = moic.res.list,
                               fig.name = "CONSENSUS HEATMAP",
                               distance = "euclidean",
                               linkage  = "average")

Figure 2.层次聚类热图

silhoutte进行相似性评价

getSilhouette(sil=cmoic.brca$sil,
              fig.path = getwd(),
              fig.name = "SILHOUETTE",
              height = 5.5,
              width = 5)

Figure 3. 相似性评价轮廓图

png 
  2

聚类热图

indata <- mo.data
indata$meth.beta <- log2(indata$meth.beta/(1-indata$meth.beta))
plotdata <- getStdiz(data=indata,
                     halfwidth = c(2,2,2,NA),
                     centerFlag = c(T,T,T,F),
                     scaleFlag = c(T,T,T,F))

值得注意的是iClusterBayes.res$feat.res已经按照每个组学数据特征的后延概率排序，我们可以选取前10个feature，并生成一个feature list，命名为annRow，对热图进行注释。

load("iClusterBayes.res.rda")
feat <- iClusterBayes.res$feat.res
feat1  <- feat[which(feat$dataset == "mRNA.expr"),][1:10,"feature"] 
feat2  <- feat[which(feat$dataset == "lncRNA.expr"),][1:10,"feature"]
feat3  <- feat[which(feat$dataset == "meth.beta"),][1:10,"feature"]
feat4  <- feat[which(feat$dataset == "mut.status"),][1:10,"feature"]
annRow <- list(feat1,feat2,feat3,feat4)

# if no color list specified all subheatmaps will be unified to green and red color pattern
mRNA.col   <- c("#00FF00", "#008000", "#000000", "#800000", "#FF0000")
lncRNA.col <- c("#6699CC", "white"  , "#FF3C38")
meth.col   <- c("#0074FE", "#96EBF9", "#FEE900", "#F00003")
mut.col    <- c("grey90" , "black")
col.list   <- list(mRNA.col, lncRNA.col, meth.col, mut.col)

生成羡慕已久的MoHeatmap

getMoHeatmap(data          = plotdata,
             row.title     = c("mRNA","lncRNA","Methylation","Mutation"),
             is.binary     = c(F,F,F,T), # the 4th data is mutation which is binary
             legend.name   = c("mRNA.FPKM","lncRNA.FPKM","M value","Mutated"),
             clust.res     = iClusterBayes.res$clust.res, # cluster results
             clust.dend    = NULL, # no dendrogram
             show.rownames = c(F,F,F,F), # specify for each omics data
             show.colnames = FALSE, # show no sample names
             annRow        = annRow, # mark selected features
             color         = col.list,
             annCol        = NULL, # no annotation for samples
             annColors     = NULL, # no annotation color
             width         = 10, # width of each subheatmap
             height        = 5, # height of each subheatmap
             fig.name      = "COMPREHENSIVE HEATMAP OF ICLUSTERBAYES")

Figure 4.热图

为热图添加列注释信息

annCol<- surv.info[,c("PAM50", "pstage", "age"), drop = FALSE]
annColors <- list(age    = circlize::colorRamp2(breaks = c(min(annCol$age),
                                                           median(annCol$age),
                                                           max(annCol$age)), 
                                                colors = c("#0000AA", "#555555", "#AAAA00")),
                  PAM50  = c("Basal" = "blue",
                            "Her2"   = "red",
                            "LumA"   = "yellow",
                            "LumB"   = "green",
                            "Normal" = "black"),
                  pstage = c("T1"    = "green",
                             "T2"    = "blue",
                             "T3"    = "red",
                             "T4"    = "yellow", 
                             "TX"    = "black"))

getMoHeatmap(data          = plotdata,
             row.title     = c("mRNA","lncRNA","Methylation","Mutation"),
             is.binary     = c(F,F,F,T), # the 4th data is mutation which is binary
             legend.name   = c("mRNA.FPKM","lncRNA.FPKM","M value","Mutated"),
             clust.res     = cmoic.brca$clust.res, # consensusMOIC results
             clust.dend    = NULL, # show no dendrogram for samples
             show.rownames = c(F,F,F,F), # specify for each omics data
             show.colnames = FALSE, # show no sample names
             show.row.dend = c(F,F,F,F), # show no dendrogram for features
             annRow        = NULL, # no selected features
             color         = col.list,
             annCol        = annCol, # annotation for samples
             annColors     = annColors, # annotation color
             width         = 10, # width of each subheatmap
             height        = 5, # height of each subheatmap
             fig.name      = "COMPREHENSIVE HEATMAP OF CONSENSUSMOIC")

Figure 5.带注释信息的热图

2.2.2 亚型比较(COMP Module)

在确定亚型后，往往需要的对亚型的特征进行描述，因此COMICS提供了常见的亚型特征分析。

(1)预后分析

surv.brca <- compSurv(moic.res = cmoic.brca,
                      surv.info = surv.info,
                      convt.time = "m",
                      surv.median.line = "h",
                      xyrs.est = c(5,10),
                      fig.name = "KAPLAN-MEIER CURVE OF CONSENSUSMOIC")

Figure 6.预后分析

(2)基线资料

clin.brca <- compClinvar(moic.res = cmoic.brca,
                         var2comp = surv.info,
                         strata = "Subtype",
                         factorVars = c("PAM50","pstage","fustat"),
                         nonnormalVars = "futime",
                         exactVars="pstage",
                         doWord=T,
                         tab.name="SUMMARIZATION OF CLINICAL FEATURES")
print(clin.brca$compTab)

                        level                      CS1                      CS2
1                     n                            170                       52
2            fustat (%)     0              144 (84.7)                39 (75.0) 
3                           1               26 (15.3)                13 (25.0) 
4 futime (median [IQR])       747.50 [461.50, 1392.50] 700.50 [414.75, 1281.25]
5             PAM50 (%) Basal                1 ( 0.6)                 2 ( 3.8) 
6                        Her2                4 ( 2.4)                34 (65.4) 
7                        LumA               81 (47.6)                 2 ( 3.8) 
8                        LumB               84 (49.4)                 9 (17.3) 
                       CS3                      CS4                      CS5
1                      173                      137                      111
2              163 (94.2)               126 (92.0)                95 (85.6) 
3               10 ( 5.8)                11 ( 8.0)                16 (14.4) 
4 867.00 [473.00, 1550.00] 680.50 [407.25, 1205.25] 785.00 [479.00, 1628.00]
5                0 ( 0.0)                 0 ( 0.0)               108 (97.3) 
6                0 ( 0.0)                 0 ( 0.0)                 0 ( 0.0) 
7              154 (89.0)               107 (78.1)                 0 ( 0.0) 
8                1 ( 0.6)                30 (21.9)                 0 ( 0.0) 
       p    test
1               
2  0.001        
3               
4  0.321 nonnorm
5 <0.001        
6               
7               
8               
 [ reached 'max' / getOption("max.print") -- omitted 7 rows ]

(3)突变频率

mut.brca <- compMut(moic.res     = cmoic.brca,
                    mut.matrix   = brca.tcga$mut.status, # binary somatic mutation matrix
                    doWord       = TRUE, # generate table in .docx format
                    doPlot       = TRUE, # draw OncoPrint
                    freq.cutoff  = 0.05, # keep those genes that mutated in at least 5% of samples
                    p.adj.cutoff = 0.05, # keep those genes with adjusted p value < 0.05 to draw OncoPrint
                    innerclust   = TRUE, # perform clustering within each subtype
                    annCol       = annCol, # same annotation for heatmap
                    annColors    = annColors, # same annotation color for heatmap
                    width        = 6, 
                    height       = 2,
                    fig.name     = "ONCOPRINT FOR SIGNIFICANT MUTATIONS",
                    tab.name     = "INDEPENDENT TEST BETWEEN SUBTYPE AND MUTATION")

Figure 7.突变瀑布图

print(mut.brca)

  Gene (Mutated)       TMB        CS1        CS2        CS3        CS4
1         PIK3CA 208 (32%) 44 (25.9%) 17 (32.7%) 80 (46.2%) 65 (47.4%)
2           TP53 186 (29%) 46 (27.1%) 39 (75.0%) 11 ( 6.4%)  8 ( 5.8%)
3            TTN 111 (17%) 31 (18.2%) 20 (38.5%) 25 (14.5%) 15 (10.9%)
4           CDH1  83 (13%)  7 ( 4.1%)  3 ( 5.8%) 57 (32.9%) 15 (10.9%)
5          GATA3  58 ( 9%) 19 (11.2%)  1 ( 1.9%) 16 ( 9.2%) 22 (16.1%)
6           MLL3  49 ( 8%) 14 ( 8.2%)  5 ( 9.6%) 11 ( 6.4%) 14 (10.2%)
7          MUC16  48 ( 8%) 10 ( 5.9%)  9 (17.3%) 14 ( 8.1%)  7 ( 5.1%)
8         MAP3K1  38 ( 6%)   7 (4.1%)   1 (1.9%)  15 (8.7%)  13 (9.5%)
         CS5   pvalue     padj
1  2 ( 1.8%) 1.30e-20 5.85e-20
2 82 (73.9%) 2.26e-52 2.03e-51
3 20 (18.0%) 8.45e-04 1.52e-03
4  1 ( 0.9%) 4.67e-18 1.40e-17
5  0 ( 0.0%) 5.59e-06 1.26e-05
6  5 ( 4.5%) 4.39e-01 4.39e-01
7  8 ( 7.2%) 9.43e-02 1.06e-01
8   2 (1.8%) 2.20e-02 3.30e-02
 [ reached 'max' / getOption("max.print") -- omitted 1 rows ]

(4)TMB

tmb.brca <- compTMB(moic.res = cmoic.brca,
                    maf = maf,
                    rmDup = T,
                    rmFLAGS = F,
                    exome.size = 38,
                    test.method = "nonparametric",
                    fig.name     = "DISTRIBUTION OF TMB AND TITV")

-Validating
-Silent variants: 24329 
-Summarizing
--Possible FLAGS among top ten genes:
  TTN
  MUC16
-Processing clinical data
--Missing clinical data
-Finished in 3.340s elapsed (3.170s cpu) 
Kruskal-Wallis rank sum test p value = 1.74e-23
post-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:
    CS1        CS2        CS3        CS4       
CS2 "2.01e-03" " NA"      " NA"      " NA"     
CS3 "1.97e-06" "8.42e-09" " NA"      " NA"     
CS4 "3.61e-08" "3.61e-11" "8.41e-01" " NA"     
CS5 "9.20e-05" "8.81e-01" "1.54e-12" "1.03e-15"

Figure 8.肿瘤突变负荷

head(tmb.brca$TMB.dat)

            samID variants       TMB   log10TMB Subtype
570 BRCA-A1EW-01A        9 0.2368421 0.09231426     CS1
564 BRCA-A0XT-01A       10 0.2631579 0.10145764     CS1
548 BRCA-A1ES-01A       13 0.3421053 0.12778658     CS1
541 BRCA-A1NG-01A       14 0.3684211 0.13621975     CS1
546 BRCA-A273-01A       14 0.3684211 0.13621975     CS1
537 BRCA-A1KP-01A       15 0.3947368 0.14449227     CS1

(5)FGA

FGA: fraction genome altered

colnames(segment) <- c("sample","chrom","start","end","value")
head(segment)

         sample chrom     start       end   value
1 BRCA-A090-01A     1   3301765  54730235 -0.1271
2 BRCA-A090-01A     1  54730247  57443819 -0.0899
3 BRCA-A090-01A     1  57448465  57448876 -1.1956
4 BRCA-A090-01A     1  57448951  64426102 -0.1009
5 BRCA-A090-01A     1  64426648 106657734 -0.1252
6 BRCA-A090-01A     1 106657854 106835667  0.1371

fga.brca <- compFGA(moic.res = cmoic.brca,
                    segment = segment,
                    iscopynumber = F, #this is a segmented copy number file
                    cnathreshold = 0.2,
                    test.method = "nonparametric",
                    fig.name = "BARPLOT OF FGA")

5% 10% 15% 21% 26% 31% 36% 41% 46% 51% 57% 62% 67% 72% 77% 82% 88% 93% 98%

Figure 9.FGA

head(fga.brca$summary)

          samID       FGA        FGG        FGL Subtype
1 BRCA-A03L-01A 0.6217991 0.30867268 0.31312638     CS1
2 BRCA-A04R-01A 0.2531019 0.09132014 0.16178176     CS1
3 BRCA-A075-01A 0.7007067 0.41444237 0.28626433     CS2
4 BRCA-A08O-01A 0.6501287 0.45648145 0.19364725     CS1
5 BRCA-A0A6-01A 0.1468893 0.06356488 0.08332444     CS3
6 BRCA-A0AD-01A 0.1722214 0.03864521 0.13357618     CS4

(6)药物敏感性

drug.brca <- compDrugsen(moic.res = cmoic.brca,
                         norm.expr = fpkm[,cmoic.brca$clust.res$samID],
                         drugs = c("Cisplatin", "Paclitaxel"),
                         tissueType = "breast",
                         test.method = "nonparametric",
                         prefix = "BOXVIOLIN OF ESTIMATED IC50")

Cisplatin: Kruskal-Wallis rank sum test p value = 6.61e-61
post-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:
    CS1        CS2        CS3        CS4       
CS2 "2.18e-19" " NA"      " NA"      " NA"     
CS3 "1.07e-18" "6.45e-08" " NA"      " NA"     
CS4 "6.93e-01" "2.18e-19" "5.76e-17" " NA"     
CS5 "2.92e-33" "1.50e-01" "2.74e-16" "4.45e-32"

Paclitaxel: Kruskal-Wallis rank sum test p value = 3.59e-17
post-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:
    CS1        CS2        CS3        CS4       
CS2 "2.54e-11" " NA"      " NA"      " NA"     
CS3 "5.42e-04" "3.65e-15" " NA"      " NA"     
CS4 "7.08e-03" "3.18e-14" "6.85e-01" " NA"     
CS5 "7.61e-01" "2.06e-09" "8.87e-03" "3.29e-02"

Figure 10.药物敏感性

(7)与已有亚型对比

surv.info$pstage <- factor(surv.info$pstage,levels = c("TX","T1","T2","T3","T4"))
agree.brca <- compAgree(moic.res = cmoic.brca,
                        subt2comp = surv.info[,c("PAM50","pstage")],
                        doPlot = T,
                        box.width = 0.2,
                        fig.name  = "AGREEMENT OF CONSENSUSMOIC WITH PAM50 AND PSTAGE")

Figure 11.与已知亚型对比

2.2.3 运行模块（RUN module）

前面已进行亚型分析，并确定各个亚型的特征，下面来确定各个亚型潜在的预测标志物和相应通路。

(1)差异分析

dea.method可以选择“edger”,“edseq2”和“limma”方法，当请注意deger和edseq2需要count数据，而 limma需要log2处理的TPM/FPKM数据。

# DEA with edgeR
runDEA(dea.method = "edger",
       expr = count,
       moic.res = cmoic.brca,
       prefix = "TCGA-BRCA")

(2)Biomarker

默认选取每个亚型的前200个差异基因（注意：次差异基因不与其他部分的特征基因重合）。

marker.up <- runMarker(moic.res      = cmoic.brca,
                       dea.method    = "edger", # name of DEA method
                       prefix        = "TCGA-BRCA", # MUST be the same of argument in runDEA()
                       dat.path      = getwd(), # path of DEA files
                       res.path      = getwd(), # path to save marker files
                       p.cutoff      = 0.05, # p cutoff to identify significant DEGs
                       p.adj.cutoff  = 0.05, # padj cutoff to identify significant DEGs
                       dirct         = "up", # direction of dysregulation in expression
                       n.marker      = 100, # number of biomarkers for each subtype
                       doplot        = TRUE, # generate diagonal heatmap
                       norm.expr     = fpkm, # use normalized expression as heatmap input
                       annCol        = annCol, # sample annotation in heatmap
                       annColors     = annColors, # colors for sample annotation
                       show_rownames = FALSE, # show no rownames (biomarker name)
                       fig.name      = "UPREGULATED BIOMARKER HEATMAP")

Figure 12.marker基因

head(marker.up$templates)

     probe class dirct
1   CACNG6   CS1    up
2   VSTM2A   CS1    up
3    SPDYC   CS1    up
4 ARHGAP36   CS1    up
5     CST9   CS1    up
6    ASCL1   CS1    up

(3)GSEA分析

MSIGDB.FILE <- system.file("extdata", "c5.bp.v7.1.symbols.xls", package = "MOVICS", mustWork = TRUE)

gsea.up <- runGSEA(moic.res     = cmoic.brca,
                   dea.method   = "edger", # name of DEA method
                   prefix       = "TCGA-BRCA", # MUST be the same of argument in runDEA()
                   dat.path     = getwd(), # path of DEA files
                   res.path     = getwd(), # path to save GSEA files
                   msigdb.path  = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
                   norm.expr    = fpkm, # use normalized expression to calculate enrichment score
                   dirct        = "up", # direction of dysregulation in pathway
                   p.cutoff     = 0.05, # p cutoff to identify significant pathways
                   p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
                   gsva.method  = "gsva", # method to calculate single sample enrichment score
                   norm.method  = "mean", # normalization method to calculate subtype-specific enrichment score
                   fig.name     = "UPREGULATED PATHWAY HEATMAP")

Estimating GSEA scores for 50 gene sets.
Estimating ECDFs with Gaussian kernels

Figure 13.GSEA

(4)GSVA分析

GSET.FILE <- 
  system.file("extdata", "gene sets of interest.gmt", package = "MOVICS", mustWork = TRUE)
gsva.res <- 
  runGSVA(moic.res      = cmoic.brca,
          norm.expr     = fpkm,
          gset.gmt.path = GSET.FILE, # ABSOLUTE path of gene set file
          gsva.method   = "gsva", # method to calculate single sample enrichment score
          annCol        = annCol,
          annColors     = annColors,
          fig.path      = getwd(),
          fig.name      = "GENE SETS OF INTEREST HEATMAP",
          height        = 5,
          width         = 8)

Estimating GSVA scores for 21 gene sets.
Estimating ECDFs with Gaussian kernels

Figure 14.GSVA

print(gsva.res$raw.es[1:3,1:3])

                   BRCA-A03L-01A BRCA-A04R-01A BRCA-A075-01A
Adhesion              0.09351280    -0.2125523    0.10362446
Antigen_Processing    0.05451682    -0.3909617    0.05508321
B-Cell_Functions     -0.01931423    -0.6120864    0.18301567

print(gsva.res$scaled.es[1:3,1:3])

                   BRCA-A03L-01A BRCA-A04R-01A BRCA-A075-01A
Adhesion              0.28466782    -0.6600776     0.3158800
Antigen_Processing    0.12510682    -1.0000000     0.1267097
B-Cell_Functions     -0.04056456    -1.0000000     0.4561039

(5)NTP验证外部数据集

yau.ntp.pred <- runNTP(expr = brca.yau$mRNA.expr,
                       templates = marker.up$templates,
                       scaleFlag = T,
                       centerFlag = T,
                       doPlot = T,
                       fig.name = "NTP HEATMAP FOR YAU")

Figure 15.NTP验证外部数据集


 CS1  CS2  CS3  CS4  CS5 <NA> 
 114   84  116   92  141  135

head(yau.ntp.pred$ntp.res)

    prediction  d.CS1  d.CS2  d.CS3  d.CS4  d.CS5 p.value    FDR
107        CS4 0.7102 0.7160 0.7407 0.5794 0.7477  0.0010 0.0019
109        CS2 0.7486 0.5743 0.7140 0.7579 0.7484  0.0010 0.0019
11         CS1 0.6832 0.6998 0.7424 0.7186 0.7678  0.1399 0.1564
110        CS1 0.5051 0.7696 0.7521 0.7140 0.7529  0.0010 0.0019
111        CS1 0.6704 0.6884 0.7781 0.7115 0.7891  0.0060 0.0092
113        CS2 0.7386 0.6364 0.6533 0.7302 0.7271  0.0180 0.0243

验证集的预后分析

surv.yau <- compSurv(moic.res = yau.ntp.pred,
                     surv.info = brca.yau$clin.info,
                     convt.time = "m",
                     surv.median.line = "hv",
                     fig.name         = "KAPLAN-MEIER CURVE OF NTP FOR YAU")

Figure 16.test数据集生存曲线

检测下与PAM50分型是否一致

agree.yau <- compAgree(moic.res = yau.ntp.pred,
                       subt2comp = brca.yau$clin.info[,"PAM50",drop=F],
                       doPlot = T,
                       fig.name = "YAU PREDICTEDMOIC WITH PAM50")

Figure 17.test数据集与已知亚型关系

print(agree.yau)

  current.subtype other.subtype        RI      AMI        JI        FM
1         Subtype         PAM50 0.7819319 0.369929 0.3286758 0.4959205

Kappa一致性评价

yau.pam.pred <- runPAM(train.expr  = fpkm,
                       moic.res    = cmoic.brca,
                       test.expr   = brca.yau$mRNA.expr)
tcga.ntp.pred <- runNTP(expr      = fpkm,
                        templates = marker.up$templates,
                        doPlot    = FALSE)


 CS1  CS2  CS3  CS4  CS5 <NA> 
 139   79  162  105  108   50

tcga.pam.pred <- runPAM(train.expr  = fpkm,
                        moic.res    = cmoic.brca,
                        test.expr   = fpkm)
runKappa(subt1     = cmoic.brca$clust.res$clust,
         subt2     = tcga.ntp.pred$clust.res$clust,
         subt1.lab = "CMOIC",
         subt2.lab = "NTP",
         fig.name  = "CONSISTENCY HEATMAP FOR TCGA between CMOIC and NTP")

Figure 18.一致性检验NTPvsCMOIC

runKappa(subt1     = cmoic.brca$clust.res$clust,
         subt2     = tcga.pam.pred$clust.res$clust,
         subt1.lab = "CMOIC",
         subt2.lab = "PAM",
         fig.name  = "CONSISTENCY HEATMAP FOR TCGA between CMOIC and PAM")

Figure 19.一致性检验PAMvsCMOIC

runKappa(subt1     = yau.ntp.pred$clust.res$clust,
         subt2     = yau.pam.pred$clust.res$clust,
         subt1.lab = "NTP",
         subt2.lab = "PAM",
         fig.name  = "CONSISTENCY HEATMAP FOR YAU")

Figure 20.一致性检验PAMvsNTP

3.彩蛋

通过整套流程，你会发现moic.res几乎贯穿整个下游分析，甚至可以说如果无法通过GET module获取合适的moic.res对象，下游分析就凉凉啦。事实上，moic.res的核心信息是clust.res。 clust.res是一个data.frame，有samID列与其他数据匹配即可。另一个参数是mo.method，作为输出数据的前缀。例如，需要检测下PAM50分型的预测价值，仅仅需要设置pseudo.moic.res对象和自定义的mo.method即可。这几乎为任何亚型分析，打开了洪荒之力的大门。惊叹！！！

pseudo.moic.res <- list("clust.res"=surv.info,
                        "mo.method"="PAM50")
pseudo.moic.res$clust.res$samID <- rownames(pseudo.moic.res$clust.res)
pseudo.moic.res$clust.res$clust <- sapply(pseudo.moic.res$clust.res$PAM50,
                                          switch,
                                          "Basal"   = 1, # relabel Basal as 1
                                          "Her2"    = 2, # relabel Her2 as 2
                                          "LumA"    = 3, # relabel LumA as 3
                                          "LumB"    = 4, # relabel LumnB as 4
                                          "Normal"  = 5) # relabel Normal as 5
head(pseudo.moic.res$clust.res)

              fustat futime PAM50 pstage age         samID clust
BRCA-A03L-01A      0   2442  LumA     T3  34 BRCA-A03L-01A     3
BRCA-A04R-01A      0   2499  LumB     T1  36 BRCA-A04R-01A     4
BRCA-A075-01A      0    518  LumB     T2  42 BRCA-A075-01A     4
BRCA-A08O-01A      0    943  LumA     T2  45 BRCA-A08O-01A     3
BRCA-A0A6-01A      0    640  LumA     T2  64 BRCA-A0A6-01A     3
BRCA-A0AD-01A      0   1157  LumA     T1  83 BRCA-A0AD-01A     3

pam50.brca <- compSurv(moic.res         = pseudo.moic.res,
                       surv.info        = surv.info,
                       convt.time       = "y", # convert day unit to year
                       surv.median.line = "h", # draw horizontal line at median survival
                       fig.name         = "KAPLAN-MEIER CURVE OF PAM50 BY PSEUDO")

Figure 21.pseudo.moic.res

Secssion Information

sessionInfo()

R version 4.1.0 (2021-05-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)

Matrix products: default

locale:
[1] LC_COLLATE=Chinese (Simplified)_China.936 
[2] LC_CTYPE=Chinese (Simplified)_China.936   
[3] LC_MONETARY=Chinese (Simplified)_China.936
[4] LC_NUMERIC=C                              
[5] LC_TIME=Chinese (Simplified)_China.936    

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] MOVICS_0.99.16      Biobase_2.52.0      BiocGenerics_0.38.0
 [4] rmdformats_1.0.2    knitr_1.33          pacman_0.5.1       
 [7] forcats_0.5.1       stringr_1.4.0       dplyr_1.0.6        
[10] purrr_0.3.4         readr_1.4.0         tidyr_1.1.3        
[13] tibble_3.1.2        ggplot2_3.3.3       tidyverse_1.3.1    

loaded via a namespace (and not attached):
 [1] rsvd_1.0.5                  corpcor_1.6.9              
 [3] aricode_1.0.0               class_7.3-19               
 [5] foreach_1.5.1               crayon_1.4.1               
 [7] MASS_7.3-54                 rhdf5filters_1.4.0         
 [9] nlme_3.1-152                backports_1.2.1            
[11] reprex_2.0.0                sva_3.40.0                 
[13] impute_1.66.0               GOSemSim_2.18.0            
[15] rlang_0.4.11                svd_0.5                    
[17] XVector_0.32.0              readxl_1.3.1               
[19] heatmap.plus_1.3            irlba_2.3.3                
[21] nloptr_1.2.2.2              limma_3.48.0               
[23] BiocParallel_1.26.0         rjson_0.2.20               
[25] bit64_4.0.5                 glue_1.4.2                 
[27] rngtools_1.5                SNFtool_2.2                
[29] AnnotationDbi_1.54.0        oompaData_3.1.1            
[31] DOSE_3.18.0                 haven_2.4.1                
[33] tidyselect_1.1.1            SummarizedExperiment_1.22.0
[35] km.ci_0.5-2                 rio_0.5.26                 
[37] XML_3.99-0.6                zoo_1.8-9                  
[39] ggpubr_0.4.0                ridge_2.9                  
[41] maftools_2.8.0              xtable_1.8-4               
[43] magrittr_2.0.1              evaluate_0.14              
[45] cli_2.5.0                   zlibbioc_1.38.0            
[47] rstudioapi_0.13             fastmatch_1.1-0            
[49] ClassDiscovery_3.3.13       InterSIM_2.2.0             
[51] treeio_1.16.1               GSVA_1.40.0                
[53] BiocSingular_1.8.0          xfun_0.23                  
[55] coca_1.1.0                  clue_0.3-59                
[57] cluster_2.1.2               caTools_1.18.2             
[59] tidygraph_1.2.0             ggtext_0.1.1               
[61] KEGGREST_1.32.0             flexclust_1.4-0            
[63] ggrepel_0.9.1               ape_5.5                    
[65] permute_0.9-5               Biostrings_2.60.0          
[67] png_0.1-7                   withr_2.4.2                
[69] bitops_1.0-7                ggforce_0.3.3              
[71] plyr_1.8.6                  cellranger_1.1.0           
[73] GSEABase_1.54.0             e1071_1.7-7                
[75] survey_4.0                 
 [ reached getOption("max.print") -- omitted 167 entries ]

参考资料：

MOVICS:Multi-Omics integration and VIsualization in Cancer Subtyping(MOVICS VIGNETTE)
Lu, X., Meng, J., Zhou, Y., Jiang, L., and Yan, F. (2020). MOVICS: an R package for multi-omics integration and visualization in cancer subtyping. bioRxiv, 2020.2009.2015.297820. [doi.org/10.1101/2020.09.15.297820]