GSVA&ssGSEA:单样本通路富集分析流程
2021-08-13 本文已影响0人
小贝学生信
- ORA富集分析只需要提供差异基因列表,GSEA富集分析还需要提供对应差异倍数信息。但二者都是基于差异分析得到的结果。
- Gene set variation analysis (GSVA)与Single sample GSEA (ssGSEA)这两种方法是都基于单样本的基因表达信息计算每个通路的相对表达活性,然后基于此可计算样本间的通路表达活性的差异分析。由于这两种算法相似,就不做过多的区分。
- 相当于把基因表达矩阵(列:样本,行:基因 )变为一个通路/基因集活性表达矩阵(列:样本,行:通路/基因集)
- 可使用
GSVA
包进行分析,以下的学习代码最主要来自该包的官方文档:https://www.bioconductor.org/packages/release/bioc/vignettes/GSVA/inst/doc/GSVA.html
1、gsva()
的两个核心参数
#安装加载包
BiocManager::install("GSVA")
library(GSVA)
#示例数据
BiocManager::install("GSVAdata")
library(GSVAdata)
形如gsva(expr, gset.idx.list)
,进行GSVA分析需要两个必要参数
(1)expr表达矩阵
- 表达矩阵可以是单纯的
matrix
格式,也可以是ExpressionSet/SummarizedExperiment对象; - 对于microarray或者经过log-CPMs, log-RPKMs or log-TPMs之类标准化处理的RNA-seq数据,使用默认的参数即可;如果是原始count的RNA-seq表达水平,需要添加
kcdf="Poisson"
参数。
(2)gset.idx.list通路基因集
- 通路基因集的最直接、简单的格式就是list对象,每个元素包含特定基因集的所有基因,元素名为通路基因名;
- The Molecular Signatures Database (MSigDB)数据库包含了多种主流的注释基因集供下载使用
https://www.gsea-msigdb.org/gsea/msigdb/
- 值得注意的是文件为
gmt
格式,可使用clusterProfiler::read.gmt()
函数读为数据框格式。建议下载Entrez
基因id格式
c2.cp.kegg = clusterProfiler::read.gmt("c2.cp.kegg.v7.4.entrez.gmt")
c2.cp.reactome = clusterProfiler::read.gmt("c2.cp.reactome.v7.4.entrez.gmt")
genesets = rbind(c2.cp.kegg,c2.cp.reactome)
head(genesets)
# term gene
# 1 KEGG_N_GLYCAN_BIOSYNTHESIS 79868
# 2 KEGG_N_GLYCAN_BIOSYNTHESIS 57171
# 3 KEGG_N_GLYCAN_BIOSYNTHESIS 6184
# 4 KEGG_N_GLYCAN_BIOSYNTHESIS 199857
# 5 KEGG_N_GLYCAN_BIOSYNTHESIS 11253
# 6 KEGG_N_GLYCAN_BIOSYNTHESIS 10195
genesets = split(genesets$gene, genesets$term)
str(head(genesets))
2、gsva()
分析示例
2.1 例1:胶质母细胞瘤亚型GSVA分析
- 目的:对于胶质母细胞瘤GBM的四种亚型(proneural, classical, neural and mesenchymal),给定的4个不同大脑细胞类型基因集的富集程度分布情况。
表达矩阵数据
data(gbm_VerhaakEtAl)
exprs(gbm_eset)[1:4,1:4]
# TCGA.02.0003.01A.01 TCGA.02.0010.01A.01 TCGA.02.0011.01B.01 TCGA.02.0014.01A.01
# AACS 0.51451 -0.36739 0.27668 0.71333
# FSTL1 -0.28438 -1.52133 -1.02120 -2.06326
# ELMO2 0.09697 0.32328 0.59386 0.41703
# CREB3L1 0.27977 -0.64587 0.37805 -0.60164
基因集数据
data(brainTxDbSets)
str(brainTxDbSets)
# List of 4
# $ astrocytic_up : chr [1:85] "GRHL1" "GPAM" "PAPSS2" "MERTK" ...
# $ astroglia_up : chr [1:88] "BST2" "SERPING1" "ACTA2" "C9orf167" ...
# $ neuronal_up : chr [1:98] "STXBP1" "JPH4" "CACNG3" "BRUNOL6" ...
# $ oligodendrocytic_up: chr [1:70] "DCT" "ZNF536" "GNG8" "ELOVL6" ...
gsva()分析
gbm_es <- gsva(gbm_eset, brainTxDbSets)
# Estimating GSVA scores for 4 gene sets.
# Estimating ECDFs with Gaussian kernels
# |==========================================================================================================| 100%
class(gbm_es)
# [1] "ExpressionSet"
# attr(,"package")
# [1] "Biobase"
exprs(gbm_es)[1:4,1:3]
# TCGA.02.0003.01A.01 TCGA.02.0010.01A.01 TCGA.02.0011.01B.01
# astrocytic_up -0.2785436 -0.2676394 -0.01994194
# astroglia_up -0.5038491 -0.5199252 -0.44371190
# neuronal_up 0.5406985 0.1856688 -0.12880731
# oligodendrocytic_up 0.3049166 0.2399523 0.26027077
热图可视化
subtypeOrder <- c("Proneural", "Neural", "Classical", "Mesenchymal")
sampleOrderBySubtype <- sort(match(gbm_es$subtype, subtypeOrder),
index.return=TRUE)$ix
geneSetOrder <- c("astroglia_up", "astrocytic_up", "neuronal_up",
"oligodendrocytic_up")
library(pheatmap)
pheatmap(exprs(gbm_es)[geneSetOrder, sampleOrderBySubtype],
show_colnames = F, cluster_cols = F,
annotation_col = pData(gbm_es[,sampleOrderBySubtype]))
2.2 例2:白血病亚型GSVA分析+差异分析
- 目的:研究常规儿童急性淋巴母细胞白血病(ALL)与MLL(混合系白血病基因)易位的通路富集差异,并进行差异分析,得到在两种亚型中明显差异表达的通路基因集。
表达矩阵数据
data(leukemia)
exprs(leukemia_eset)[1:4,1:4]
# CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL CL2001011105AA.CEL
# 1000_at 11.354426 10.932543 11.185906 11.251631
# 1001_at 9.185470 8.823661 8.687186 8.958305
# 1002_f_at 7.806993 8.127591 7.842353 8.319227
# 1003_s_at 10.164370 10.048514 10.006014 10.474046
#探针的基因名注释
library(hgu95a.db)
ids = as.data.frame(hgu95aENTREZID)
fData(leukemia_eset)$ENTREZID = ids$gene_id[match(rownames(leukemia_eset), ids$probe_id)]
leukemia_eset = leukemia_eset[!is.na(fData(leukemia_eset)$ENTREZID),]
leukemia_eset = leukemia_eset[!duplicated(fData(leukemia_eset)$ENTREZID),]
rownames(leukemia_eset) = fData(leukemia_eset)$ENTREZID
exprs(leukemia_eset)[1:4,1:4]
# CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL CL2001011105AA.CEL
# 5595 11.354426 10.932543 11.185906 11.251631
# 7075 9.185470 8.823661 8.687186 8.958305
# 1557 7.806993 8.127591 7.842353 8.319227
# 643 10.164370 10.048514 10.006014 10.474046
#样本分组
table(leukemia_eset$subtype)
# ALL MLL
# 20 17
通路基因集
- 为1.2点整理的kegg、reactome基因集
length(genesets)
#[1] 1790
str(head(genesets))
# List of 6
# $ KEGG_N_GLYCAN_BIOSYNTHESIS : chr [1:46] "79868" "57171" "6184" "199857" ...
# $ KEGG_OTHER_GLYCAN_DEGRADATION : chr [1:16] "64772" "2720" "4126" "4125" ...
# $ KEGG_O_GLYCAN_BIOSYNTHESIS : chr [1:30] "8693" "117248" "168391" "11226" ...
# $ KEGG_GLYCOSAMINOGLYCAN_DEGRADATION : chr [1:21] "9955" "10855" "60495" "2720" ...
# $ KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SULFATE: chr [1:15] "9435" "11041" "10678" "8534" ...
# $ KEGG_GLYCEROLIPID_METABOLISM : chr [1:49] "129642" "57678" "9388" "8525" ...
gsva()分析
leukemia_es <- gsva(leukemia_eset, genesets, min.sz=10, max.sz=500)
exprs(leukemia_es)[1:4,1:3]
# CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL
# KEGG_N_GLYCAN_BIOSYNTHESIS 0.056098397 -0.21254643 0.3442295
# KEGG_OTHER_GLYCAN_DEGRADATION -0.371743215 -0.43330368 0.3012659
# KEGG_GLYCOSAMINOGLYCAN_DEGRADATION 0.044683884 -0.04355861 -0.1263370
# KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SULFATE -0.002049164 -0.59098416 -0.1913861
limma差异分析
library(limma)
mod <- model.matrix(~ factor(leukemia_es$subtype))
colnames(mod) <- c("ALL", "MLLvsALL")
fit <- lmFit(leukemia_es, mod)
fit <- eBayes(fit)
tt <- topTable(fit, coef=2, n=Inf)
head(tt)
# logFC AveExpr t P.Value adj.P.Val B
# REACTOME_DCC_MEDIATED_ATTRACTIVE_SIGNALING -0.4364816 -0.015338411 -6.356475 1.035005e-07 0.0001322736 7.597326
# REACTOME_GLYCOSPHINGOLIPID_METABOLISM 0.3490613 -0.008042234 5.201199 5.022153e-06 0.0032091556 4.015100
# REACTOME_RESPONSE_TO_METAL_IONS 0.5102309 -0.009041289 4.864142 1.526901e-05 0.0039353010 2.988990
# REACTOME_REGULATION_OF_MECP2_EXPRESSION_AND_ACTIVITY -0.3540451 0.004755112 -4.859177 1.551936e-05 0.0039353010 2.973992
# REACTOME_OTHER_SEMAPHORIN_INTERACTIONS 0.3638330 0.013275406 4.792116 1.932383e-05 0.0039353010 2.771837
# KEGG_PROXIMAL_TUBULE_BICARBONATE_RECLAMATION 0.2701284 -0.003998529 4.782970 1.990926e-05 0.0039353010 2.744324
差异分析可视化:火山图
library(ggplot2)
library(ggrepel)
#自定义阈值 adj.P.Val < 0.05 ;abs(tt$logFC) > 0.25
tt$change = as.factor(ifelse(tt$adj.P.Val < 0.05 & abs(tt$logFC) > 0.25,
ifelse(tt$logFC > 0.25 ,'UP','DOWN'),'NOT'))
up_gs = rownames(subset(tt,logFC > 0))[1:3]
down_gs = rownames(subset(tt,logFC < 0))[1:3]
label_gs = tt[c(up_gs,down_gs),]
label_gs$name = rownames(label_gs)
ggplot(data=tt,
aes(x=logFC, y=-log10(P.Value),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) +
geom_text_repel(
data = label_gs,
aes(label = name),min.segment.length = 0)
差异分析可视化:分组热图
tt_sig = subset(tt, adj.P.Val < 0.1)
pheatmap(leukemia_es[rownames(tt_sig),],
show_colnames = F, cluster_cols = F,
annotation_col = pData(leukemia_es))