差异分析转录组R生信相关

GSVA&ssGSEA:单样本通路富集分析流程

2021-08-13  本文已影响0人  小贝学生信
  • ORA富集分析只需要提供差异基因列表,GSEA富集分析还需要提供对应差异倍数信息。但二者都是基于差异分析得到的结果。
  • Gene set variation analysis (GSVA)与Single sample GSEA (ssGSEA)这两种方法是都基于单样本的基因表达信息计算每个通路的相对表达活性,然后基于此可计算样本间的通路表达活性的差异分析。由于这两种算法相似,就不做过多的区分。
  • 相当于把基因表达矩阵(列:样本,行:基因 )变为一个通路/基因集活性表达矩阵(列:样本,行:通路/基因集)
  • 可使用GSVA包进行分析,以下的学习代码最主要来自该包的官方文档:https://www.bioconductor.org/packages/release/bioc/vignettes/GSVA/inst/doc/GSVA.html

1、gsva()的两个核心参数

#安装加载包
BiocManager::install("GSVA")
library(GSVA)

#示例数据
BiocManager::install("GSVAdata")
library(GSVAdata)

形如gsva(expr, gset.idx.list),进行GSVA分析需要两个必要参数

(1)expr表达矩阵

(2)gset.idx.list通路基因集

c2.cp.kegg = clusterProfiler::read.gmt("c2.cp.kegg.v7.4.entrez.gmt")
c2.cp.reactome = clusterProfiler::read.gmt("c2.cp.reactome.v7.4.entrez.gmt")
genesets = rbind(c2.cp.kegg,c2.cp.reactome)
head(genesets)
#                         term   gene
# 1 KEGG_N_GLYCAN_BIOSYNTHESIS  79868
# 2 KEGG_N_GLYCAN_BIOSYNTHESIS  57171
# 3 KEGG_N_GLYCAN_BIOSYNTHESIS   6184
# 4 KEGG_N_GLYCAN_BIOSYNTHESIS 199857
# 5 KEGG_N_GLYCAN_BIOSYNTHESIS  11253
# 6 KEGG_N_GLYCAN_BIOSYNTHESIS  10195
genesets = split(genesets$gene, genesets$term)
str(head(genesets))

2、gsva()分析示例

2.1 例1:胶质母细胞瘤亚型GSVA分析

表达矩阵数据
data(gbm_VerhaakEtAl)
exprs(gbm_eset)[1:4,1:4]
#           TCGA.02.0003.01A.01 TCGA.02.0010.01A.01 TCGA.02.0011.01B.01 TCGA.02.0014.01A.01
# AACS                0.51451            -0.36739             0.27668             0.71333
# FSTL1              -0.28438            -1.52133            -1.02120            -2.06326
# ELMO2               0.09697             0.32328             0.59386             0.41703
# CREB3L1             0.27977            -0.64587             0.37805            -0.60164
基因集数据
data(brainTxDbSets)
str(brainTxDbSets)
# List of 4
# $ astrocytic_up      : chr [1:85] "GRHL1" "GPAM" "PAPSS2" "MERTK" ...
# $ astroglia_up       : chr [1:88] "BST2" "SERPING1" "ACTA2" "C9orf167" ...
# $ neuronal_up        : chr [1:98] "STXBP1" "JPH4" "CACNG3" "BRUNOL6" ...
# $ oligodendrocytic_up: chr [1:70] "DCT" "ZNF536" "GNG8" "ELOVL6" ...
gsva()分析
gbm_es <- gsva(gbm_eset, brainTxDbSets)
# Estimating GSVA scores for 4 gene sets.
# Estimating ECDFs with Gaussian kernels
# |==========================================================================================================| 100%
class(gbm_es)
# [1] "ExpressionSet"
# attr(,"package")
# [1] "Biobase"
exprs(gbm_es)[1:4,1:3]
#                       TCGA.02.0003.01A.01 TCGA.02.0010.01A.01 TCGA.02.0011.01B.01
# astrocytic_up                -0.2785436          -0.2676394         -0.01994194
# astroglia_up                 -0.5038491          -0.5199252         -0.44371190
# neuronal_up                   0.5406985           0.1856688         -0.12880731
# oligodendrocytic_up           0.3049166           0.2399523          0.26027077
热图可视化
subtypeOrder <- c("Proneural", "Neural", "Classical", "Mesenchymal")
sampleOrderBySubtype <- sort(match(gbm_es$subtype, subtypeOrder),
                             index.return=TRUE)$ix
geneSetOrder <- c("astroglia_up", "astrocytic_up", "neuronal_up",
                  "oligodendrocytic_up")
library(pheatmap)
pheatmap(exprs(gbm_es)[geneSetOrder, sampleOrderBySubtype],
         show_colnames = F, cluster_cols = F, 
         annotation_col = pData(gbm_es[,sampleOrderBySubtype]))

2.2 例2:白血病亚型GSVA分析+差异分析

表达矩阵数据
data(leukemia)
exprs(leukemia_eset)[1:4,1:4]
#           CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL CL2001011105AA.CEL
# 1000_at            11.354426          10.932543          11.185906          11.251631
# 1001_at             9.185470           8.823661           8.687186           8.958305
# 1002_f_at           7.806993           8.127591           7.842353           8.319227
# 1003_s_at          10.164370          10.048514          10.006014          10.474046

#探针的基因名注释
library(hgu95a.db)
ids = as.data.frame(hgu95aENTREZID)
fData(leukemia_eset)$ENTREZID = ids$gene_id[match(rownames(leukemia_eset), ids$probe_id)]
leukemia_eset = leukemia_eset[!is.na(fData(leukemia_eset)$ENTREZID),]
leukemia_eset = leukemia_eset[!duplicated(fData(leukemia_eset)$ENTREZID),]
rownames(leukemia_eset) = fData(leukemia_eset)$ENTREZID
exprs(leukemia_eset)[1:4,1:4]
#       CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL CL2001011105AA.CEL
# 5595          11.354426          10.932543          11.185906          11.251631
# 7075           9.185470           8.823661           8.687186           8.958305
# 1557           7.806993           8.127591           7.842353           8.319227
# 643           10.164370          10.048514          10.006014          10.474046

#样本分组
table(leukemia_eset$subtype)
# ALL MLL 
# 20  17 
通路基因集
length(genesets)
#[1] 1790
str(head(genesets))
# List of 6
# $ KEGG_N_GLYCAN_BIOSYNTHESIS                         : chr [1:46] "79868" "57171" "6184" "199857" ...
# $ KEGG_OTHER_GLYCAN_DEGRADATION                      : chr [1:16] "64772" "2720" "4126" "4125" ...
# $ KEGG_O_GLYCAN_BIOSYNTHESIS                         : chr [1:30] "8693" "117248" "168391" "11226" ...
# $ KEGG_GLYCOSAMINOGLYCAN_DEGRADATION                 : chr [1:21] "9955" "10855" "60495" "2720" ...
# $ KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SULFATE: chr [1:15] "9435" "11041" "10678" "8534" ...
# $ KEGG_GLYCEROLIPID_METABOLISM                       : chr [1:49] "129642" "57678" "9388" "8525" ...
gsva()分析
leukemia_es <- gsva(leukemia_eset, genesets, min.sz=10, max.sz=500)
exprs(leukemia_es)[1:4,1:3]         
#                                                     CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL
# KEGG_N_GLYCAN_BIOSYNTHESIS                                 0.056098397        -0.21254643          0.3442295
# KEGG_OTHER_GLYCAN_DEGRADATION                             -0.371743215        -0.43330368          0.3012659
# KEGG_GLYCOSAMINOGLYCAN_DEGRADATION                         0.044683884        -0.04355861         -0.1263370
# KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SULFATE       -0.002049164        -0.59098416         -0.1913861
limma差异分析
library(limma)
mod <- model.matrix(~ factor(leukemia_es$subtype))
colnames(mod) <- c("ALL", "MLLvsALL")
fit <- lmFit(leukemia_es, mod)
fit <- eBayes(fit)
tt <- topTable(fit, coef=2, n=Inf)
head(tt)
#                                                       logFC      AveExpr         t      P.Value    adj.P.Val        B
# REACTOME_DCC_MEDIATED_ATTRACTIVE_SIGNALING           -0.4364816 -0.015338411 -6.356475 1.035005e-07 0.0001322736 7.597326
# REACTOME_GLYCOSPHINGOLIPID_METABOLISM                 0.3490613 -0.008042234  5.201199 5.022153e-06 0.0032091556 4.015100
# REACTOME_RESPONSE_TO_METAL_IONS                       0.5102309 -0.009041289  4.864142 1.526901e-05 0.0039353010 2.988990
# REACTOME_REGULATION_OF_MECP2_EXPRESSION_AND_ACTIVITY -0.3540451  0.004755112 -4.859177 1.551936e-05 0.0039353010 2.973992
# REACTOME_OTHER_SEMAPHORIN_INTERACTIONS                0.3638330  0.013275406  4.792116 1.932383e-05 0.0039353010 2.771837
# KEGG_PROXIMAL_TUBULE_BICARBONATE_RECLAMATION          0.2701284 -0.003998529  4.782970 1.990926e-05 0.0039353010 2.744324
差异分析可视化:火山图
library(ggplot2)
library(ggrepel)
#自定义阈值 adj.P.Val < 0.05 ;abs(tt$logFC) > 0.25
tt$change = as.factor(ifelse(tt$adj.P.Val < 0.05 & abs(tt$logFC) > 0.25,
                                   ifelse(tt$logFC > 0.25 ,'UP','DOWN'),'NOT'))

up_gs = rownames(subset(tt,logFC > 0))[1:3]
down_gs = rownames(subset(tt,logFC < 0))[1:3]
label_gs = tt[c(up_gs,down_gs),]
label_gs$name = rownames(label_gs)
ggplot(data=tt, 
           aes(x=logFC, y=-log10(P.Value), 
               color=change)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("log2 fold change") + ylab("-log10 p-value") +
  theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red')) +
  geom_text_repel(
    data = label_gs,
    aes(label = name),min.segment.length = 0) 
差异分析可视化:分组热图
tt_sig = subset(tt, adj.P.Val < 0.1)
pheatmap(leukemia_es[rownames(tt_sig),],
         show_colnames = F, cluster_cols = F,
         annotation_col = pData(leukemia_es))
上一篇下一篇

猜你喜欢

热点阅读