从GEO数据下载到limma差异分析，再到topGO富集分析

2019-07-05 本文已影响30人美式永不加糖

一个失败案例，不要点进来

GSE2034 topGO

1. DEG

1.1 下载GEO数据

library(GEOquery)
f <- 'GSE2034.Rdata'
if(!file.exists(f)){
  gset <- getGEO('GSE2034',destdir = '.',
                 AnnotGPL = F,
                 getGPL = F)
  save(gset,file=f)
}
load('GSE2034.Rdata')

1.2 表达矩阵

eset <- gset[[1]]
expmat <- exprs(eset)

## 检查数据是否已经经过log转换，并进行log2转换
ex <- expmat
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))

LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)

if (LogC) { ex[which(ex <= 0)] <- NaN
expmat <- log2(ex)
print("log2 transform finished")}else{print("log2 transform not needed")}
# [1] "log2 transform finished"

fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix) 
fit2 <- eBayes(fit, 0.01)

DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGstb <- DEGstb[order(DEGstb[,4], decreasing = F),]

before:

after:

1.3 设计矩阵和对比矩阵

pdata <- pData(eset)
group_list <- pdata$`bone relapses (1=yes, 0=no):ch1`
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design) <- c("free", "relapses")
rownames(design) <- colnames(expmat)
contrast.matrix <- makeContrasts('free-relapses',levels = design)

1.4 差异分析

fit <- lmFit(expmat, design)
fit1 <- contrasts.fit(fit, contrast.matrix) 
fit2 <- eBayes(fit1)
DEGstb <- topTable(fit2, coef=1, n=Inf)
DEGmtx <- na.omit(DEGstb)
head(DEGmtx)
#                  logFC   AveExpr         t      P.Value    adj.P.Val
# 209835_x_at  0.7010262 10.616209  6.251930 1.460149e-09 3.253649e-05
# 212014_x_at  0.7264678 10.372145  5.927511 8.785625e-09 9.788505e-05
# 204490_s_at  0.4928252  9.788234  5.558565 6.206262e-08 4.609804e-04
# 209380_s_at -0.5373120 10.062837 -5.421342 1.253664e-07 6.983850e-04
# 209831_x_at  0.2666780 10.594035  5.228288 3.295703e-07 1.338433e-03
# 204489_s_at  0.5619588 10.302607  5.181790 4.142994e-07 1.338433e-03
#                     B result
# 209835_x_at 11.154503     UP
# 212014_x_at  9.526820     UP
# 204490_s_at  7.757191     UP
# 209380_s_at  7.121927   DOWN
# 209831_x_at  6.249843    NOT
# 204489_s_at  6.043635     UP

1.5 火山图

logFC_cutoff <- with(DEGmtx, mean(abs(logFC)) + 
                       2*sd(abs(logFC)))

DEGdf$result <- ifelse(DEGdf$P.Value < 0.05 &
                                    abs(DEGdf$logFC) > logFC_cutoff,
                                  ifelse(DEGdf$logFC >logFC_cutoff,
                                         "UP", "DOWN"), "NOT")

title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3), 
                '\nThe number of up gene is ',nrow(DEGmtx[DEGmtx$result =='UP',]) ,
                '\nThe number of down gene is ',nrow(DEGmtx[DEGmtx$result =='DOWN',]))

library(ggplot2)
ggplot(data=DEGmtx, aes(x=logFC, y=-log10(P.Value), color=result)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20))) +
  xlab("log2 fold change") + ylab("-log10 p-value") +
  ggtitle( title ) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
  scale_colour_manual(values = c('blue','black','red'))

1.6 PCA图

pca_data <- prcomp(t(expmat),scale=TRUE)
pcx <- data.frame(pca_data$x)
pcr <- cbind(samples=rownames(pcx),group_list, pcx) 
ggplot(pcr, aes(PC1, PC2)) + geom_point(size=5, aes(color = group_list)) +
  geom_text(aes(label="samples"),hjust=-0.1, vjust=-0.3)

这个数据中的分组：bone relapses: 1=yes, 0=no

分别是加样本名和不加样本名的亚子：

1.7 热图

选择&调整矩阵，设置各种参数：

chosedgene <- rownames(DEGmtx)[1:50]
chosedmatx <- expmat[chosedgene,]      

re <- subset(pdata,pdata[,28]==1)
re <- row.names(re)
fr <- row.names(subset(pdata,pdata[,28]==0))
sampleorder <- as.vector(c(fr,re))

do_mtx <- t(scale(t(chosedmatx)))
do_mtx[do_mtx > 2] = 2
do_mtx[do_mtx < -2] = -2

do_mtx <- do_mtx[,sampleorder]

chosedprob <- row.names(do_mtx)
chosedsymbol <- AnnotationDbi::select(hgu133a.db, chosedprob,
                                      "SYMBOL",
                                      "PROBEID")[,2]
row.names(do_mtx) <- chosedsymbol

anno_col <- data.frame(sample=factor(rep(c("free", "relapses"), c(217, 69))))
row.names(anno_col) <- colnames(do_mtx)
ann_color <- list(sample=c(free = "#3a5fcd", relapses = "#cd0000"))

画图：

png(filename = "GSE2034_hm_nmlmatrix_nocluscol.png", height = 5000, width = 6000,
   res = 500, units = "px")

pheatmap::pheatmap(do_mtx, scale = "none", color = color,
                   annotation_col = anno_col,
                   annotation_colors = ann_color,
                   fontsize = 8, fontsize_row = 8, 
                   show_colnames = F,
                   cluster_cols = F,
                   main = "GSE2034: free vs relapses HEATMAP (delected outliers)",)
dev.off()

2. topGO

2.1 topGO: Predefined list of interesting genes

过表征分析 (over representation analysis, ORA)

Tests based on gene counts. This is the most popular family of tests, given that it only requires the presence of a list of interesting genes and nothing more. Tests like Fisher's exact test, Hypegeometric test and binomial test belong to this family. Draghici et al. (2006) .

2.1.1 构建 `topGOdata` 对象

构建 geneID2GO list

library(topGO)
library(hgu133a.db)
library(dplyr)

probe_id <- row.names(expmat)
entrez_id <- AnnotationDbi::select(hgu133a.db,
                                   probe_id,
                                   "ENTREZID",
                                   "PROBEID")[,2]

geneID2GOtb <- AnnotationDbi::select(hgu133a.db, entrez_id,
                                     "GO", "ENTREZID")
geneID2GOsb <- geneID2GOtb %>%
  group_by(ENTREZID) %>%
  summarise(GO=list(GO))
geneID2GO <- geneID2GOsb$GO
names(geneID2GO) <- rownames(geneID2GOsb)

构建 geneList

myInterestingDEG <- subset(DEGdf, abs(DEGdf$logFC) > 1)
myInterestingDEG <- subset(myInterestingDEG, myInterestingDEG$adj.P.Val < 0.05)

	logFC	AveExpr	t	P.Value	adj.P.Val	B	result
214777_at	1.621019	7.578387	4.723681	3.63E-06	0.005771	4.093824	UP
216365_x_at	1.340734	5.631291	4.599642	6.35E-06	0.007075	3.592027	UP
217157_x_at	1.02288	8.543867	4.564419	7.43E-06	0.007348	3.45161	UP
216984_x_at	1.482014	7.826338	4.56344	7.46E-06	0.007348	3.447722	UP
217378_x_at	1.740957	8.055472	4.500623	9.85E-06	0.008778	3.199637	UP
217258_x_at	1.217204	7.180579	4.485677	1.05E-05	0.009013	3.14105	UP
217148_x_at	1.236926	9.662857	4.468183	1.14E-05	0.009236	3.072682	UP
211798_x_at	1.129989	7.765948	4.411513	1.45E-05	0.009236	2.852795	UP
214669_x_at	1.146124	11.22506	4.382578	1.65E-05	0.009236	2.741463	UP
211644_x_at	1.51252	8.895729	4.353739	1.86E-05	0.00958	2.631127	UP
215176_x_at	1.554616	9.781743	4.350516	1.89E-05	0.00958	2.618838	UP
211637_x_at	1.319264	6.128292	4.291975	2.42E-05	0.01036	2.396959	UP
215121_x_at	1.279395	12.98246	4.247872	2.92E-05	0.011018	2.231533	UP
216576_x_at	1.355247	8.468197	4.198257	3.59E-05	0.011951	2.047218	UP
204540_at	-1.06758	8.610066	-4.19044	3.71E-05	0.011982	2.018367	DOWN
216401_x_at	1.475812	7.711042	4.180749	3.86E-05	0.011983	1.982631	UP
207430_s_at	-1.3399	6.097727	-4.15096	4.37E-05	0.012368	1.873294	DOWN
213201_s_at	-1.27114	6.641464	-4.13499	4.66E-05	0.01267	1.814941	DOWN
215214_at	1.239997	7.376844	4.125793	4.84E-05	0.012763	1.781441	UP
204933_s_at	1.073296	4.540879	4.031533	7.10E-05	0.015467	1.441822	UP
217281_x_at	1.355866	5.417214	4.015399	7.58E-05	0.015784	1.384391	UP
214858_at	-1.08985	6.455748	-4.00976	7.75E-05	0.015861	1.36435	DOWN
216510_x_at	1.132241	6.304655	3.99392	8.26E-05	0.016195	1.308245	UP
215379_x_at	1.123251	11.77926	3.943786	0.000101	0.016689	1.131931	UP
211634_x_at	1.014891	6.92074	3.93688	0.000104	0.016689	1.107799	UP
210297_s_at	-1.201	6.705364	-3.93263	0.000105	0.016689	1.092959	DOWN
209138_x_at	1.136452	13.28617	3.9291	0.000107	0.016689	1.080658	UP
211645_x_at	1.244501	9.308405	3.889339	0.000125	0.017262	0.942694	UP
216491_x_at	1.292163	5.893802	3.845647	0.000148	0.018642	0.79254	UP
216560_x_at	1.278329	5.810675	3.778756	0.000192	0.020615	0.565602	UP
221286_s_at	1.10116	7.542805	3.777608	0.000192	0.020615	0.561739	UP
206391_at	1.082072	6.946425	3.505906	0.000528	0.031189	-0.32275	UP
205509_at	-1.20626	8.869439	-3.48369	0.000572	0.032322	-0.39241	DOWN
209374_s_at	1.034312	10.40725	3.443591	0.000659	0.034236	-0.51714	UP
204623_at	-1.04503	11.21622	-3.25463	0.001271	0.045188	-1.08699	DOWN

geneNames <- names(geneID2GO) 
myInterestingGenestb <- AnnotationDbi::select(hgu133a.db, row.names(myInterestingDEG),
                                            "ENTREZID", "PROBEID")

myInterestingGenes <- geneNames[match(myInterestingGenestb[,2], 
                                      geneNames)]
myInterestingGenes <- na.omit(myInterestingGenes)

ORAgeneList <- factor(as.integer(geneNames %in% myInterestingGenes)) 
names(ORAgeneList) <- geneNames 
str(ORAgeneList) 
#  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#  - attr(*, "names")= chr [1:13755] "10" "100" "1000" "10000" ...

构建 topGOdata 对象：

ORAGOdata <- new("topGOdata",
              ontology = "BP",
              allGenes = ORAgeneList,
              annot = annFUN.gene2GO,
              gene2GO = geneID2GO)
# 
# Building most specific GOs .....
#   ( 11302 GO terms found. )
# 
# Build GO DAG topology ..........
#   ( 15326 GO terms and 36394 relations. )
# 
# Annotating nodes ...............
#   ( 11973 genes annotated to the GO terms. )
ORAGOdata
# 
# ------------------------- topGOdata object -------------------------
# 
#  Description:
#    -   
# 
#  Ontology:
#    -  BP 
# 
#  13755 available genes (all genes from the array):
#    - symbol:  10 100 1000 10000 100008586  ...
#    - 29  significant genes. 
# 
#  11973 feasible genes (genes that can be used in the analysis):
#    - symbol:  10 100 1000 10000 10001  ...
#    - 21  significant genes. 
# 
#  GO graph (nodes with at least  1  genes):
#    - a graph with directed edges
#    - number of nodes = 15326 
#    - number of edges = 36394 
# 
# ------------------------- topGOdata object -------------------------
#

2.1.2 富集分析

we perform a classical enrichment analysis by testing the over-representation of GO terms within the group of di erentially expressed genes.

ORAresultFisher <- runTest(ORAGOdata, algorithm = "classic", statistic = "fisher") 
# 
#            -- Classic Algorithm -- 
# 
#        the algorithm is scoring 449 nontrivial nodes
#        parameters: 
#            test statistic: fisher
# 
ORAresultFisher
# 
# Description:  
# Ontology: BP 
# 'classic' algorithm with the 'fisher' test
# 15326 GO terms scored: 86 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 449

使用 topGO 包时，所构建的 topGOdata 包含了 gene universe (ORAGOdata) 和 p-value (GSEAGOdata), ORA 和 GSEA 对应的统计方法可以同时应用于同一个 topGOdata 对象。所以区别只是基于不同的已知信息，构建 topGOdata 的方法不同。（maybe?

基于 classic 和 elim 算法，对 ORAdata 进行 Kolmogorov-Smirnov test:

ORAresultKS <- runTest(ORAGOdata, algorithm = "classic", statistic = "ks") 
# 
#            -- Classic Algorithm -- 
# 
#        the algorithm is scoring 15326 nontrivial nodes
#        parameters: 
#            test statistic: ks
#            score order: increasing
ORAresultKS
# 
# Description:  
# Ontology: BP 
# 'classic' algorithm with the 'ks' test
# 15326 GO terms scored: 903 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 15326

ORAresult.elim <- runTest(ORAGOdata, algorithm = "elim", statistic = "ks") 
# 
#            -- Elim Algorithm -- 
# 
#        the algorithm is scoring 15326 nontrivial nodes
#        parameters: 
#            test statistic: ks
#            cutOff: 0.01
#            score order: increasing
# ...
ORAresult.elim
# 
# Description:  
# Ontology: BP 
# 'elim' algorithm with the 'ks : 0.01' test
# 15326 GO terms scored: 291 terms with p < 0.01
# Annotation data:
#     Annotated genes: 11973 
#     Significant genes: 21 
#     Min. no. of genes annotated to a GO: 1 
#     Nontrivial nodes: 15326

2.1.3 Summary

ORAallRes <- GenTable(ORAGOdata,classicFisher = ORAresultFisher,
                   classicKS = ORAresultKS,
                   elimKS = ORAresult.elim,
                   orderBy = "elimKS",
                   ranksOf = "classicFisher",
                   topNodes = 20)

	GO.ID	Term	Annotated	Significant	Expected	Rank in classicFisher	classicFisher	classicKS	elimKS
1	GO:0030198	extracellular matrix organization	303	1	0.53	233	0.42	2.20E-11	4.10E-08
2	GO:0019886	antigen processing and presentation of e...	88	0	0.15	449	1	5.50E-08	5.50E-08
3	GO:0098664	G protein-coupled serotonin receptor sig...	23	0	0.04	450	1	7.60E-08	7.60E-08
4	GO:0006120	mitochondrial electron transport, NADH t...	42	0	0.07	451	1	2.10E-07	2.10E-07
5	GO:0007190	activation of adenylate cyclase activity	29	0	0.05	452	1	5.10E-07	5.10E-07
6	GO:0060337	type I interferon signaling pathway	87	0	0.15	453	1	3.70E-06	3.70E-06
7	GO:0007193	adenylate cyclase-inhibiting G protein-c...	63	0	0.11	454	1	1.70E-08	4.80E-06
8	GO:0006069	ethanol oxidation	9	0	0.02	455	1	4.90E-06	4.90E-06
9	GO:0060078	regulation of postsynaptic membrane pote...	107	0	0.19	456	1	5.60E-07	5.70E-06
10	GO:0007155	cell adhesion	1134	0	1.99	457	1	1.30E-06	6.20E-06
11	GO:0042531	positive regulation of tyrosine phosphor...	54	0	0.09	458	1	6.60E-06	6.60E-06
12	GO:0000184	nuclear-transcribed mRNA catabolic proce...	114	0	0.2	459	1	9.90E-06	9.90E-06
13	GO:0048013	ephrin receptor signaling pathway	81	0	0.14	460	1	1.10E-05	1.10E-05
14	GO:0006749	glutathione metabolic process	43	0	0.08	461	1	1.40E-05	1.40E-05
15	GO:0006123	mitochondrial electron transport, cytoch...	13	0	0.02	462	1	1.40E-05	1.40E-05
16	GO:0009636	response to toxic substance	446	0	0.78	463	1	1.00E-08	1.70E-05
17	GO:1901687	glutathione derivative biosynthetic proc...	19	0	0.03	464	1	2.70E-05	2.70E-05
18	GO:1901701	cellular response to oxygen-containing c...	924	1	1.62	335	0.82	3.30E-06	3.20E-05
19	GO:1904322	cellular response to forskolin	10	0	0.02	465	1	3.20E-05	3.20E-05
20	GO:0097267	omega-hydroxylase P450 pathway	9	0	0.02	466	1	3.50E-05	3.50E-05

2.2 topGO: Using the genes score

基因富集分析 (gene set enrichment analysis, GSEA)

Tests based on gene scores or gene ranks. It includes Kolmogorov-Smirnov like tests (also known as GSEA), Gentleman's Category, t-test, etc. Ackermann and Strimmer (2009).

2.2.1 构建 `topGOdata` 对象

构建 geneList

GSEAgeneList <- subset(DEGdf, select = "adj.P.Val")
GSEAgeneList <- as.numeric(GSEAgeneList[,1])
names(GSEAgeneList) <- row.names(DEGdf)
str(GSEAgeneList)
#  Named num [1:22283] 3.25e-05 9.79e-05 4.61e-04 6.98e-04 1.34e-03 ...
#  - attr(*, "names")= chr [1:22283] "209835_x_at" "212014_x_at" "204490_s_at" "209380_s_at" ...

通过函数 topDiffGenes() 筛选q值小于0.01的基因。

topDiffGenes <- function(allScore) {
  return(allScore < 0.01)
}
length(GSEAgeneList)
# [1] 22283
sum(topDiffGenes(GSEAgeneList))
# [1] 50

构建 topGOdata 对象：

GSEAGOdata <- new("topGOdata",
                  description = "GO analysis of GSE2034 data; free vs relapses",
                  ontology = "BP",
                  allGenes = GSEAgeneList,
                  geneSel = topDiffGenes,
                  nodeSize = 5,
                  annot = annFUN.db,
                  affyLib = "hgu133a.db")
# 
# Building most specific GOs .....
#   ( 11302 GO terms found. )
# 
# Build GO DAG topology ..........
#   ( 15326 GO terms and 36394 relations. )
# 
# Annotating nodes ...............
#   ( 19375 genes annotated to the GO terms. )

GSEAGOdata
# ------------------------- topGOdata object -------------------------
# 
#  Description:
#    -  GO analysis of GSE2034 data; free vs relapses 
# 
#  Ontology:
#    -  BP 
# 
#  22283 available genes (all genes from the array):
#    - symbol:  209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at  ...
#    - score :  3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333  ...
#    - 50  significant genes. 
# 
#  19375 feasible genes (genes that can be used in the analysis):
#    - symbol:  209835_x_at 212014_x_at 204490_s_at 209380_s_at 209831_x_at  ...
#    - score :  3.25364921e-05 9.78850462e-05 0.000460980422 0.000698385002 0.0013384333  ...
#    - 47  significant genes. 
# 
#  GO graph (nodes with at least  5  genes):
#    - a graph with directed edges
#    - number of nodes = 10532 
#    - number of edges = 24696 
# 
# ------------------------- topGOdata object -------------------------
#

2.2.2 富集分析

2.2.3 Summary

GenTable 导出表格

未完不更

References

https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/

最后，向大家隆重推荐生信技能树的一系列干货！

生信技能树全球公益巡讲：https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g
B站公益74小时生信工程师教学视频合辑：https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw
招学徒：https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw

从GEO数据下载到limma差异分析，再到topGO富集分析

GSE2034 topGO

1. DEG

1.1 下载GEO数据

1.2 表达矩阵

1.3 设计矩阵和对比矩阵

1.4 差异分析

1.5 火山图

1.6 PCA图

1.7 热图

2. topGO

2.1 topGO: Predefined list of interesting genes

2.1.1 构建 `topGOdata` 对象

2.1.2 富集分析

2.1.3 Summary

2.2 topGO: Using the genes score

2.2.1 构建 `topGOdata` 对象

2.2.2 富集分析

2.2.3 Summary

References

最后，向大家隆重推荐生信技能树的一系列干货！

猜你喜欢

热点阅读

从GEO数据下载到limma差异分析，再到topGO富集分析

GSE2034 topGO

1. DEG

1.1 下载GEO数据

1.2 表达矩阵

1.3 设计矩阵和对比矩阵

1.4 差异分析

1.5 火山图

1.6 PCA图

1.7 热图

2. topGO

2.1 topGO: Predefined list of interesting genes

2.1.1 构建 topGOdata 对象

2.1.2 富集分析

2.1.3 Summary

2.2 topGO: Using the genes score

2.2.1 构建 topGOdata 对象

2.2.2 富集分析

2.2.3 Summary

References

最后，向大家隆重推荐生信技能树的一系列干货！

猜你喜欢

热点阅读

2.1.1 构建 `topGOdata` 对象

2.2.1 构建 `topGOdata` 对象