从GEO数据下载到limma差异分析,再到topGO富集分析
一个失败案例,不要点进来
GSE2034 topGO
1. DEG
1.1 下载GEO数据
library(GEOquery)
f <- 'GSE2034.Rdata'
if(!file.exists(f)){
gset <- getGEO('GSE2034',destdir = '.',
AnnotGPL = F,
getGPL = F)
save(gset,file=f)
}
load('GSE2034.Rdata')
1.2 表达矩阵
eset <- gset[[1]]
expmat <- exprs(eset)
## 检查数据是否已经经过log转换,并进行log2转换
ex <- expmat
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
expmat <- log2(ex)
print("log2 transform finished")}else{print("log2 transform not needed")}
# [1] "log2 transform finished"
fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit, 0.01)
DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGstb <- DEGstb[order(DEGstb[,4], decreasing = F),]
before:
after:
1.3 设计矩阵和对比矩阵
pdata <- pData(eset)
group_list <- pdata$`bone relapses (1=yes, 0=no):ch1`
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design) <- c("free", "relapses")
rownames(design) <- colnames(expmat)
contrast.matrix <- makeContrasts('free-relapses',levels = design)
1.4 差异分析
fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit1)
DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGmtx <- na.omit(DEGstb)
head(DEGmtx)
# logFC AveExpr t P.Value adj.P.Val
# 209835_x_at 0.7010262 10.616209 6.251930 1.460149e-09 3.253649e-05
# 212014_x_at 0.7264678 10.372145 5.927511 8.785625e-09 9.788505e-05
# 204490_s_at 0.4928252 9.788234 5.558565 6.206262e-08 4.609804e-04
# 209380_s_at -0.5373120 10.062837 -5.421342 1.253664e-07 6.983850e-04
# 209831_x_at 0.2666780 10.594035 5.228288 3.295703e-07 1.338433e-03
# 204489_s_at 0.5619588 10.302607 5.181790 4.142994e-07 1.338433e-03
# B result
# 209835_x_at 11.154503 UP
# 212014_x_at 9.526820 UP
# 204490_s_at 7.757191 UP
# 209380_s_at 7.121927 DOWN
# 209831_x_at 6.249843 NOT
# 204489_s_at 6.043635 UP
1.5 火山图
logFC_cutoff <- with(DEGmtx, mean(abs(logFC)) +
2*sd(abs(logFC)))
DEGdf$result <- ifelse(DEGdf$P.Value < 0.05 &
abs(DEGdf$logFC) > logFC_cutoff,
ifelse(DEGdf$logFC >logFC_cutoff,
"UP", "DOWN"), "NOT")
title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEGmtx[DEGmtx$result =='UP',]) ,
'\nThe number of down gene is ',nrow(DEGmtx[DEGmtx$result =='DOWN',]))
library(ggplot2)
ggplot(data=DEGmtx, aes(x=logFC, y=-log10(P.Value), color=result)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20))) +
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( title ) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
scale_colour_manual(values = c('blue','black','red'))
1.6 PCA图
pca_data <- prcomp(t(expmat),scale=TRUE)
pcx <- data.frame(pca_data$x)
pcr <- cbind(samples=rownames(pcx),group_list, pcx)
ggplot(pcr, aes(PC1, PC2)) + geom_point(size=5, aes(color = group_list)) +
geom_text(aes(label="samples"),hjust=-0.1, vjust=-0.3)
这个数据中的分组:bone relapses: 1=yes, 0=no
分别是加样本名和不加样本名的亚子:
1.7 热图
选择&调整矩阵,设置各种参数:
chosedgene <- rownames(DEGmtx)[1:50]
chosedmatx <- expmat[chosedgene,]
re <- subset(pdata,pdata[,28]==1)
re <- row.names(re)
fr <- row.names(subset(pdata,pdata[,28]==0))
sampleorder <- as.vector(c(fr,re))
do_mtx <- t(scale(t(chosedmatx)))
do_mtx[do_mtx > 2] = 2
do_mtx[do_mtx < -2] = -2
do_mtx <- do_mtx[,sampleorder]
chosedprob <- row.names(do_mtx)
chosedsymbol <- AnnotationDbi::select(hgu133a.db, chosedprob,
"SYMBOL",
"PROBEID")[,2]
row.names(do_mtx) <- chosedsymbol
anno_col <- data.frame(sample=factor(rep(c("free", "relapses"), c(217, 69))))
row.names(anno_col) <- colnames(do_mtx)
ann_color <- list(sample=c(free = "#3a5fcd", relapses = "#cd0000"))
画图:
png(filename = "GSE2034_hm_nmlmatrix_nocluscol.png", height = 5000, width = 6000,
res = 500, units = "px")
pheatmap::pheatmap(do_mtx, scale = "none", color = color,
annotation_col = anno_col,
annotation_colors = ann_color,
fontsize = 8, fontsize_row = 8,
show_colnames = F,
cluster_cols = F,
main = "GSE2034: free vs relapses HEATMAP (delected outliers)",)
dev.off()
2. topGO
2.1 topGO: Predefined list of interesting genes
过表征分析 (over representation analysis, ORA)
Tests based on gene counts. This is the most popular family of tests, given that it only requires the presence of a list of interesting genes and nothing more. Tests like Fisher's exact test, Hypegeometric test and binomial test belong to this family. Draghici et al. (2006) .
2.1.1 构建 topGOdata
对象
构建 geneID2GO
list
library(topGO)
library(hgu133a.db)
library(dplyr)
probe_id <- row.names(expmat)
entrez_id <- AnnotationDbi::select(hgu133a.db,
probe_id,
"ENTREZID",
"PROBEID")[,2]
geneID2GOtb <- AnnotationDbi::select(hgu133a.db, entrez_id,
"GO", "ENTREZID")
geneID2GOsb <- geneID2GOtb %>%
group_by(ENTREZID) %>%
summarise(GO=list(GO))
geneID2GO <- geneID2GOsb$GO
names(geneID2GO) <- rownames(geneID2GOsb)
构建 geneList
myInterestingDEG <- subset(DEGdf, abs(DEGdf$logFC) > 1)
myInterestingDEG <- subset(myInterestingDEG, myInterestingDEG$adj.P.Val < 0.05)
logFC | AveExpr | t | P.Value | adj.P.Val | B | result | |
---|---|---|---|---|---|---|---|
214777_at | 1.621019 | 7.578387 | 4.723681 | 3.63E-06 | 0.005771 | 4.093824 | UP |
216365_x_at | 1.340734 | 5.631291 | 4.599642 | 6.35E-06 | 0.007075 | 3.592027 | UP |
217157_x_at | 1.02288 | 8.543867 | 4.564419 | 7.43E-06 | 0.007348 | 3.45161 | UP |
216984_x_at | 1.482014 | 7.826338 | 4.56344 | 7.46E-06 | 0.007348 | 3.447722 | UP |
217378_x_at | 1.740957 | 8.055472 | 4.500623 | 9.85E-06 | 0.008778 | 3.199637 | UP |
217258_x_at | 1.217204 | 7.180579 | 4.485677 | 1.05E-05 | 0.009013 | 3.14105 | UP |
217148_x_at | 1.236926 | 9.662857 | 4.468183 | 1.14E-05 | 0.009236 | 3.072682 | UP |
211798_x_at | 1.129989 | 7.765948 | 4.411513 | 1.45E-05 | 0.009236 | 2.852795 | UP |
214669_x_at | 1.146124 | 11.22506 | 4.382578 | 1.65E-05 | 0.009236 | 2.741463 | UP |
211644_x_at | 1.51252 | 8.895729 | 4.353739 | 1.86E-05 | 0.00958 | 2.631127 | UP |
215176_x_at | 1.554616 | 9.781743 | 4.350516 | 1.89E-05 | 0.00958 | 2.618838 | UP |
211637_x_at | 1.319264 | 6.128292 | 4.291975 | 2.42E-05 | 0.01036 | 2.396959 | UP |
215121_x_at | 1.279395 | 12.98246 | 4.247872 | 2.92E-05 | 0.011018 | 2.231533 | UP |
216576_x_at | 1.355247 | 8.468197 | 4.198257 | 3.59E-05 | 0.011951 | 2.047218 | UP |
204540_at | -1.06758 | 8.610066 | -4.19044 | 3.71E-05 | 0.011982 | 2.018367 | DOWN |
216401_x_at | 1.475812 | 7.711042 | 4.180749 | 3.86E-05 | 0.011983 | 1.982631 | UP |
207430_s_at | -1.3399 | 6.097727 | -4.15096 | 4.37E-05 | 0.012368 | 1.873294 | DOWN |
213201_s_at | -1.27114 | 6.641464 | -4.13499 | 4.66E-05 | 0.01267 | 1.814941 | DOWN |
215214_at | 1.239997 | 7.376844 | 4.125793 | 4.84E-05 | 0.012763 | 1.781441 | UP |
204933_s_at | 1.073296 | 4.540879 | 4.031533 | 7.10E-05 | 0.015467 | 1.441822 | UP |
217281_x_at | 1.355866 | 5.417214 | 4.015399 | 7.58E-05 | 0.015784 | 1.384391 | UP |
214858_at | -1.08985 | 6.455748 | -4.00976 | 7.75E-05 | 0.015861 | 1.36435 | DOWN |
216510_x_at | 1.132241 | 6.304655 | 3.99392 | 8.26E-05 | 0.016195 | 1.308245 | UP |
215379_x_at | 1.123251 | 11.77926 | 3.943786 | 0.000101 | 0.016689 | 1.131931 | UP |
211634_x_at | 1.014891 | 6.92074 | 3.93688 | 0.000104 | 0.016689 | 1.107799 | UP |
210297_s_at | -1.201 | 6.705364 | -3.93263 | 0.000105 | 0.016689 | 1.092959 | DOWN |
209138_x_at | 1.136452 | 13.28617 | 3.9291 | 0.000107 | 0.016689 | 1.080658 | UP |
211645_x_at | 1.244501 | 9.308405 | 3.889339 | 0.000125 | 0.017262 | 0.942694 | UP |
216491_x_at | 1.292163 | 5.893802 | 3.845647 | 0.000148 | 0.018642 | 0.79254 | UP |
216560_x_at | 1.278329 | 5.810675 | 3.778756 | 0.000192 | 0.020615 | 0.565602 | UP |
221286_s_at | 1.10116 | 7.542805 | 3.777608 | 0.000192 | 0.020615 | 0.561739 | UP |
206391_at | 1.082072 | 6.946425 | 3.505906 | 0.000528 | 0.031189 | -0.32275 | UP |
205509_at | -1.20626 | 8.869439 | -3.48369 | 0.000572 | 0.032322 | -0.39241 | DOWN |
209374_s_at | 1.034312 | 10.40725 | 3.443591 | 0.000659 | 0.034236 | -0.51714 | UP |
204623_at | -1.04503 | 11.21622 | -3.25463 | 0.001271 | 0.045188 | -1.08699 | DOWN |
geneNames <- names(geneID2GO)
myInterestingGenestb <- AnnotationDbi::select(hgu133a.db, row.names(myInterestingDEG),
"ENTREZID", "PROBEID")
myInterestingGenes <- geneNames[match(myInterestingGenestb[,2],
geneNames)]
myInterestingGenes <- na.omit(myInterestingGenes)
ORAgeneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(ORAgeneList) <- geneNames
str(ORAgeneList)
# Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# - attr(*, "names")= chr [1:13755] "10" "100" "1000" "10000" ...
构建 topGOdata
对象:
ORAGOdata <- new("topGOdata",
ontology = "BP",
allGenes = ORAgeneList,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
#
# Building most specific GOs .....
# ( 11302 GO terms found. )
#
# Build GO DAG topology ..........
# ( 15326 GO terms and 36394 relations. )
#
# Annotating nodes ...............
# ( 11973 genes annotated to the GO terms. )
ORAGOdata
#
# ------------------------- topGOdata object -------------------------
#
# Description:
# -
#
# Ontology:
# - BP
#
# 13755 available genes (all genes from the array):
# - symbol: 10 100 1000 10000 100008586 ...
# - 29 significant genes.
#
# 11973 feasible genes (genes that can be used in the analysis):
# - symbol: 10 100 1000 10000 10001 ...
# - 21 significant genes.
#
# GO graph (nodes with at least 1 genes):
# - a graph with directed edges
# - number of nodes = 15326
# - number of edges = 36394
#
# ------------------------- topGOdata object -------------------------
#
2.1.2 富集分析
we perform a classical enrichment analysis by testing the over-representation of GO terms within the group of di erentially expressed genes.
ORAresultFisher <- runTest(ORAGOdata, algorithm = "classic", statistic = "fisher")
#
# -- Classic Algorithm --
#
# the algorithm is scoring 449 nontrivial nodes
# parameters:
# test statistic: fisher
#
ORAresultFisher
#
# Description:
# Ontology: BP
# 'classic' algorithm with the 'fisher' test
# 15326 GO terms scored: 86 terms with p < 0.01
# Annotation data:
# Annotated genes: 11973
# Significant genes: 21
# Min. no. of genes annotated to a GO: 1
# Nontrivial nodes: 449
使用 topGO
包时,所构建的 topGOdata
包含了 gene universe (ORAGOdata) 和 p-value (GSEAGOdata), ORA 和 GSEA 对应的统计方法可以同时应用于同一个 topGOdata
对象。所以区别只是基于不同的已知信息,构建 topGOdata
的方法不同。(maybe?
基于 classic 和 elim 算法,对 ORAdata
进行 Kolmogorov-Smirnov test:
ORAresultKS <- runTest(ORAGOdata, algorithm = "classic", statistic = "ks")
#
# -- Classic Algorithm --
#
# the algorithm is scoring 15326 nontrivial nodes
# parameters:
# test statistic: ks
# score order: increasing
ORAresultKS
#
# Description:
# Ontology: BP
# 'classic' algorithm with the 'ks' test
# 15326 GO terms scored: 903 terms with p < 0.01
# Annotation data:
# Annotated genes: 11973
# Significant genes: 21
# Min. no. of genes annotated to a GO: 1
# Nontrivial nodes: 15326
ORAresult.elim <- runTest(ORAGOdata, algorithm = "elim", statistic = "ks")
#
# -- Elim Algorithm --
#
# the algorithm is scoring 15326 nontrivial nodes
# parameters:
# test statistic: ks
# cutOff: 0.01
# score order: increasing
# ...
ORAresult.elim
#
# Description:
# Ontology: BP
# 'elim' algorithm with the 'ks : 0.01' test
# 15326 GO terms scored: 291 terms with p < 0.01
# Annotation data:
# Annotated genes: 11973
# Significant genes: 21
# Min. no. of genes annotated to a GO: 1
# Nontrivial nodes: 15326
2.1.3 Summary
ORAallRes <- GenTable(ORAGOdata,classicFisher = ORAresultFisher,
classicKS = ORAresultKS,
elimKS = ORAresult.elim,
orderBy = "elimKS",
ranksOf = "classicFisher",
topNodes = 20)
GO.ID | Term | Annotated | Significant | Expected | Rank in classicFisher | classicFisher | classicKS | elimKS | |
---|---|---|---|---|---|---|---|---|---|
1 | GO:0030198 | extracellular matrix organization | 303 | 1 | 0.53 | 233 | 0.42 | 2.20E-11 | 4.10E-08 |
2 | GO:0019886 | antigen processing and presentation of e... | 88 | 0 | 0.15 | 449 | 1 | 5.50E-08 | 5.50E-08 |
3 | GO:0098664 | G protein-coupled serotonin receptor sig... | 23 | 0 | 0.04 | 450 | 1 | 7.60E-08 | 7.60E-08 |
4 | GO:0006120 | mitochondrial electron transport, NADH t... | 42 | 0 | 0.07 | 451 | 1 | 2.10E-07 | 2.10E-07 |
5 | GO:0007190 | activation of adenylate cyclase activity | 29 | 0 | 0.05 | 452 | 1 | 5.10E-07 | 5.10E-07 |
6 | GO:0060337 | type I interferon signaling pathway | 87 | 0 | 0.15 | 453 | 1 | 3.70E-06 | 3.70E-06 |
7 | GO:0007193 | adenylate cyclase-inhibiting G protein-c... | 63 | 0 | 0.11 | 454 | 1 | 1.70E-08 | 4.80E-06 |
8 | GO:0006069 | ethanol oxidation | 9 | 0 | 0.02 | 455 | 1 | 4.90E-06 | 4.90E-06 |
9 | GO:0060078 | regulation of postsynaptic membrane pote... | 107 | 0 | 0.19 | 456 | 1 | 5.60E-07 | 5.70E-06 |
10 | GO:0007155 | cell adhesion | 1134 | 0 | 1.99 | 457 | 1 | 1.30E-06 | 6.20E-06 |
11 | GO:0042531 | positive regulation of tyrosine phosphor... | 54 | 0 | 0.09 | 458 | 1 | 6.60E-06 | 6.60E-06 |
12 | GO:0000184 | nuclear-transcribed mRNA catabolic proce... | 114 | 0 | 0.2 | 459 | 1 | 9.90E-06 | 9.90E-06 |
13 | GO:0048013 | ephrin receptor signaling pathway | 81 | 0 | 0.14 | 460 | 1 | 1.10E-05 | 1.10E-05 |
14 | GO:0006749 | glutathione metabolic process | 43 | 0 | 0.08 | 461 | 1 | 1.40E-05 | 1.40E-05 |
15 | GO:0006123 | mitochondrial electron transport, cytoch... | 13 | 0 | 0.02 | 462 | 1 | 1.40E-05 | 1.40E-05 |
16 | GO:0009636 | response to toxic substance | 446 | 0 | 0.78 | 463 | 1 | 1.00E-08 | 1.70E-05 |
17 | GO:1901687 | glutathione derivative biosynthetic proc... | 19 | 0 | 0.03 | 464 | 1 | 2.70E-05 | 2.70E-05 |
18 | GO:1901701 | cellular response to oxygen-containing c... | 924 | 1 | 1.62 | 335 | 0.82 | 3.30E-06 | 3.20E-05 |
19 | GO:1904322 | cellular response to forskolin | 10 | 0 | 0.02 | 465 | 1 | 3.20E-05 | 3.20E-05 |
20 | GO:0097267 | omega-hydroxylase P450 pathway | 9 | 0 | 0.02 | 466 | 1 | 3.50E-05 | 3.50E-05 |
2.2 topGO: Using the genes score
基因富集分析 (gene set enrichment analysis, GSEA)
Tests based on gene scores or gene ranks. It includes Kolmogorov-Smirnov like tests (also known as GSEA), Gentleman's Category, t-test, etc. Ackermann and Strimmer (2009).
2.2.1 构建 topGOdata
对象
构建 geneList
GSEAgeneList <- subset(DEGdf, select = "adj.P.Val")
GSEAgeneList <- as.numeric(GSEAgeneList[,1])
names(GSEAgeneList) <- row.names(DEGdf)
str(GSEAgeneList)
# Named num [1:22283] 3.25e-05 9.79e-05 4.61e-04 6.98e-04 1.34e-03 ...
# - attr(*, "names")= chr [1:22283] "209835_x_at" "212014_x_at" "204490_s_at" "209380_s_at" ...
通过函数 topDiffGenes()
筛选q值小于0.01的基因。
topDiffGenes <- function(allScore) {
return(allScore < 0.01)
}
length(GSEAgeneList)
# [1] 22283
sum(topDiffGenes(GSEAgeneList))
# [1] 50
构建 topGOdata
对象:
GSEAGOdata <- new("topGOdata",
description = "GO analysis of GSE2034 data; free vs relapses",
ontology = "BP",
allGenes = GSEAgeneList,
geneSel = topDiffGenes,
nodeSize = 5,
annot = annFUN.db,
affyLib = "hgu133a.db")
#
# Building most specific GOs .....
# ( 11302 GO terms found. )
#
# Build GO DAG topology ..........
# ( 15326 GO terms and 36394 relations. )
#
# Annotating nodes ...............
# ( 19375 genes annotated to the GO terms. )
GSEAGOdata
# ------------------------- topGOdata object -------------------------
#
# Description:
# - GO analysis of GSE2034 data; free vs relapses
#
# Ontology:
# - BP
#
# 22283 available genes (all genes from the array):
# - symbol: 209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at ...
# - score : 3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333 ...
# - 50 significant genes.
#
# 19375 feasible genes (genes that can be used in the analysis):
# - symbol: 209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at ...
# - score : 3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333 ...
# - 47 significant genes.
#
# GO graph (nodes with at least 5 genes):
# - a graph with directed edges
# - number of nodes = 10532
# - number of edges = 24696
#
# ------------------------- topGOdata object -------------------------
#
2.2.2 富集分析
2.2.3 Summary
GenTable
导出表格
未完不更
References
最后,向大家隆重推荐生信技能树的一系列干货!
- 生信技能树全球公益巡讲:https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g
- B站公益74小时生信工程师教学视频合辑:https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw
- 招学徒:https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw