GEO数据挖掘富集分析

第三步:简单的GEO数据的KEGG分析和GSEA分析

2020-05-24  本文已影响0人  碌碌无为的杰少

KEGG分析

gene_up = deg[deg$change == 'up','ENTREZID'] 
gene_down = deg[deg$change == 'down','ENTREZID'] 
gene_diff = c(gene_up,gene_down)
gene_all = deg[,'ENTREZID']
#(2)对上调/下调/所有差异基因进行富集分析
#注意这里又有个F
if(T){
  kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'hsa',
                      universe     = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff = 0.9)
  kk.down <- enrichKEGG(gene         =  gene_down,
                        organism     = 'hsa',
                        universe     = gene_all,
                        pvalueCutoff = 0.9,
                        qvalueCutoff =0.9)
  kk.diff <- enrichKEGG(gene         = gene_diff,
                        organism     = 'hsa',
                        pvalueCutoff = 0.9)
  save(kk.diff,kk.down,kk.up,file = "GSE4107kegg.Rdata")
}
load("GSE4107kegg.Rdata")
#(3)从富集结果中提取出结果数据框
kegg_diff_dt <-  kk.up@result

#(4)按照pvalue筛选通路
#在enrichkegg时没有设置pvaluecutoff,在此处筛选
down_kegg <- kk.down@result %>%
  filter(pvalue<0.01) %>% #筛选行
  mutate(group=-1) #新增列

up_kegg <- kk.up@result %>%
  filter(pvalue<0.01) %>%
  mutate(group=1)
#(5)可视化
g_kegg <- kegg_plot(up_kegg,down_kegg)
g_kegg 
g_kegg +scale_y_continuous(labels = c(15,10,5,0,5,10))
ggsave(g_kegg,filename = 'kegg_up_down.png')
image.png

按照count数目进行KEGG

count数目进行kegg,并且可视化

if(F){
  x = kk.up
  df = data.frame(x)
  ## 计算富集分数
  x@result$richFactor =x@result$Count / as.numeric(sub("/\\d+", "", x@result$BgRatio))
  y =x@result
  library(dplyr)
  library(ggplot2)
  showCategory = 20
  y %>% 
    arrange(p.adjust) %>% 
    slice(1:showCategory) %>% 
    ggplot(aes(richFactor,forcats::fct_reorder(Description, richFactor))) + 
    geom_segment(aes(xend=0, yend = Description)) +
    geom_point(aes(color=p.adjust, size = Count)) +
    ## 调整颜色的区间,begin越大,整体颜色越明艳
    scale_color_viridis_c(begin = 0.3, end = 1) +
    ## 调整泡泡的大小
    scale_size_continuous(range=c(2, 10)) +
    theme_minimal() + 
    xlab("rich factor") +
    ylab(NULL) + 
    ggtitle("")
}
browseKEGG( kk.up, 'hsa04510')
image.png
image.png

GSEA分析

GSEA与KEGG不同的是,GSEA是所有基因的增减,而kegg是争对 上调基因下调基因

library(GSEABase) 
library(clusterProfiler)
library(DOSE)
library(org.Hs.eg.db)
library(ggplot2)
library(stringr)
options(stringsAsFactors = F)

data(geneList)
geneList = deg$logFC
names(geneList) = deg$ENTREZID
geneList = sort(geneList,decreasing = T)

#准备gmt文件
geneset <- read.gmt("h.all.v7.1.entrez.gmt")  
geneset$ont = str_remove(geneset$ont,"HALLMARK_")
egmt <- GSEA(geneList, TERM2GENE=geneset,verbose=F)
#> Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize = minGSSize, : There are ties in the preranked stats (0.05% of the list).
#> The order of those tied genes will be arbitrary, which may produce unexpected results.
egmt2<- setReadable(egmt,OrgDb=org.Hs.eg.db, keyType = "ENTREZID")
y=data.frame(egmt2)
#气泡图,展示geneset被激活还是抑制
dotplot(egmt2,split=".sign")+facet_grid(~.sign)
#dotplot(egmt2,split=".sign")+facet_grid(~.sign)+
  #scale_y_discrete(labels=function(x) str_wrap(str_replace_all(x,"_"," "), width=30))
#  scale_y_discrete(labels=function(x) str_remove(x,"HALLMARK_"))
#dotplot(y,showCategory=20,split=".sign")+facet_grid(~.sign)
#山峦图,展示每个geneset的基因logFC分布
ridgeplot(egmt)
image.png
image.png

单通道和多通道的gsea

library(enrichplot)
gseaplot2(egmt, geneSetID = 1, title = egmt$Description[1])
#多个gnenset合并展示
gseaplot2(egmt, geneSetID = 1:3,pvalue_table = T)
#gseaplot2(egmt,2,color = "blue",pvalue_table = T)
image.png
image.png
上一篇下一篇

猜你喜欢

热点阅读