R语言GEO数据挖掘:步骤四:富集分析KEGG,GO
2019-03-27 本文已影响211人
mayoneday
1.读取第三部存储数据(基因差异表达情况)
rm(list = ls()) ## 魔幻操作,一键清空~
load(file = 'deg.Rdata')
head(deg)
deg
2.设定阈值计算基因上调下调数量
## 不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
logFC_t=1.5
deg$g=ifelse(deg$P.Value>0.05,'stable',
ifelse( deg$logFC > logFC_t,'UP',
ifelse( deg$logFC < -logFC_t,'DOWN','stable') )
)#P>0.05输出stable,其中设定当logFC大于1.5为上调输出'UP',大于-1.5为下调输出'DOWN',,如果都不是则输出'stable',从而增加了一列g,筛选出了上调和下调的基因
table(deg$g)
得出上调41,下调88
head(deg)
deg多了提示基因上下调的一列g
deg$symbol=rownames(deg)
给deg增加一列基因名
3.ID转换
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
#bitr功能为ID转换,
#bitr(geneID, fromType, toType, OrgDb, drop = TRUE);
#geneid :基因ID输入 ; fromtype : 输入ID型;toType:输出ID型;orgdb :注释数据库)
head(df)
QQ截图20190327194950.jpg
DEG=deg#把deg数据赋值给DEG数据
head(DEG)
得到DEG
DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
#把数据DEG,df通过,DEG的'symbol'列,df的'SYMBOL'列连接在一起,转化ID
head(DEG)
save(DEG,file = 'anno_DEG.Rdata')
DEG
4.得出差异基因
gene_up= DEG[DEG$g == 'UP','ENTREZID'] #选出上调基因ID
gene_down=DEG[DEG$g == 'DOWN','ENTREZID'] #选出下调基因ID
gene_diff=c(gene_up,gene_down)#得出上下调基因ID
gene_all=as.character(DEG[ ,'ENTREZID'] )#得出所有基因ID
data(geneList, package="DOSE")#得出geneList数据
head(geneList)#查看数据
geneList
boxplot(geneList)#画箱线图
boxplot(DEG$logFC)#画箱线图
geneList=DEG$logFC#把DEG数据logFC列值赋值给数据geneList
QQ截图20190327214317.jpg
names(geneList)=DEG$ENTREZID#把ID赋值给geneList数据的名字
得到geneList:ID和表达量的关系
geneList=sort(geneList,decreasing = T)#把数据进行排序
排序之后的geneList
5. KEGG pathway analysis
做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
if(T){
### over-representation test
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.up)[,1:6]
dotplot(kk.up );ggsave('kk.up.dotplot.png')
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.down)[,1:6]
dotplot(kk.down );ggsave('kk.down.dotplot.png')
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
source('functions.R')
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down.png')
6. GSEA
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.9,
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
}
7.GO database analysis
做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
{
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
if(F){
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
return(ego)
})
})
save(go_enrich_results,file = 'go_enrich_results.Rdata')
}
load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
print( dotplot(go_enrich_results[[i]][[j]] ))
dev.off()
}
}
}
把之前的数据设置好之后,后面的富集分析也是傻瓜式的