GSEA富集分析流程及解释(R代码)【待完善】
2022-12-01 本文已影响0人
zyp1997
GSEA简介
数据准备
GSEA只需要gene id和log2FoldChange两列
library(clusterProfiler)
library(enrichplot)
library(org.Mm.eg.db) # mouse全基因组注释包,用于不同版本基因名的转换
# 这个是属于注释包,每个基因集可能对应的注释包不一样,要从基因集所在的平台找到对应的注释包,然后从bioconductor获取安装。
library(dplyr)
library(stringr)
library(msigdbr) # 提供多个物种的MSigDB数据
dif_genes <- read.csv("dif_genes.csv")
# 得到差异分析结果:all_different_genes
geneList <- dif_genes$avg_logFC
# 如果是Ensemble ID,并且如果还带着版本号,需要去除版本号,再进行基因ID转换,得到Entrez ID
names(geneList) <- dif_genes$gene_id
# 最后从大到小排序,得到一个字符串
geneList <- sort(geneList, decreasing = T)
# 检查是否有重复基因名
genelist <- dif_genes$gene_id
genelist[duplicated(genelist)]
数据集准备
MSigDB数据库是GSEA的官方数据库
misgdbr包
msigdbr包可提供多个物种的MSigDB数据
# GSEA #mdigbd dataset
msigdbr_df <- msigdbr(species = "Mus musculus",category = "C2")
# msigdbr_df$gene_symbol:"SYMBOL";$entrez_gene:"ENTREZID"
# msigdbr_list <- split(x = msigdbr_df$entrez_gene, f = msigdbr_df$gs_name)
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, entrez_gene) %>% as.data.frame()
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, ensembl_gene) %>% as.data.frame()
msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, gene_symbol) %>% as.data.frame()
用clusterProfiler实现GSEA
gsea_result <- GSEA(geneList = geneList,
minGSSize = 1,
maxGSSize = 1000,
pvalueCutoff = 1,
TERM2GENE = msigdbr_t2g)
# 将其中的基因名变成symbol ID
#gsea_result <- setReadable(gsea_result, OrgDb = org.Mm.eg.db)
# 还可以直接点击查看,只需要转成数据框
#gsea_result_df <- as.data.frame(gsea_result)
# 导出结果
write.csv(gsea_result@result, file = "gsea_result.csv")
# 对感兴趣的通路可视化
gseaplot2(gsea_result, geneSetID = c("PATHWAY_NAME"))
或进行GO分析
## gseGO
# 想看biological process(BP),可以先不过滤p值
go_result <- gseGO(geneList = geneList,
ont = "BP",
OrgDb = org.Mm.eg.db,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1)
# 将其中的基因名变成symbol ID
go_result <- setReadable(go_result, OrgDb = org.Mm.eg.db)
# 还可以直接点击查看,只需要转成数据框
go_result_df <- as.data.frame(go_result)
# 导出结果
write.csv(go_result@result, file = "gseGO_result.csv")
# 对感兴趣的通路可视化
gseaplot2(go_result, geneSetID = c("GO:0032981"))
KEGG分析
##gseKEGG
kk <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
verbose = FALSE)
gseaplot(kk, geneSetID = "hsa04110")
结果注释,可视化解读
enrichmentscore 或 NES 大于0表示上调,小于0表示下调
GSEA详细解释及结果解读
===================================================
完整代码
rm(list=ls())
setwd("C:\\Users\\DELL\\Desktop\\gsea")
memory.limit(102400)
library(clusterProfiler)
library(enrichplot)
library(org.Mm.eg.db) # mouse全基因组注释包,用于不同版本基因名的转换
library(dplyr)
library(stringr)
library(msigdbr)
# GSEA #mdigbd dataset
msigdbr_df <- msigdbr(species = "Mus musculus", category = "C2")
# msigdbr_df$gene_symbol:"SYMBOL";$entrez_gene:"ENTREZID"
# msigdbr_list <- split(x = msigdbr_df$entrez_gene, f = msigdbr_df$gs_name)
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, entrez_gene) %>% as.data.frame()
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, ensembl_gene) %>% as.data.frame()
msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, gene_symbol) %>% as.data.frame()
files <- list.files(getwd(), pattern = "*.csv")
# 多组数据,用for循环
for (i in 1:length(files)){
file <- files[i]
print(file)
dif_genes <- read.csv(file)
# 得到差异分析结果:all_different_genes
geneList <- dif_genes$avg_logFC
# 如果是Ensemble ID,并且如果还带着版本号,需要去除版本号,再进行基因ID转换,得到Entrez ID
names(geneList) <- dif_genes$gene_id
# 最后从大到小排序,得到一个字符串
geneList <- sort(geneList, decreasing = T)
# 检查是否有重复基因名
genelist <- dif_genes$gene_id
genelist <- genelist[duplicated(genelist)]
gsea_result <- GSEA(geneList = geneList,
minGSSize = 1,
maxGSSize = 1000,
pvalueCutoff = 1,
TERM2GENE = msigdbr_t2g)
# 将其中的基因名变成symbol ID
#gsea_result <- setReadable(gsea_result, OrgDb = org.Mm.eg.db)
# 还可以直接点击查看,只需要转成数据框
#gsea_result_df <- as.data.frame(gsea_result)
# 导出结果
write.csv(gsea_result@result, file = "gsea_result.csv")
}