GSEA&GSVA分析-单细胞转录组高阶分析03

2023-09-25 本文已影响0人一车小面包人

背景：分析单细胞转录组中不同细胞类型的特定基因集的差异表达。

1.GSEA分析
下载并加载会用到的R包：

library(tidyverse)
library(data.table)
library(org.Hs.eg.db)
library(clusterProfiler)
library(biomaRt)
library(enrichplot)
library(dplyr)
library(GSEABase)
library(Seurat)
library(ggsci)

函数代码：

#' sc the seurat object
#' group_name the Idents(sc)
#' group_1 the FindMarkers ident.1
#' group_2 the FindMarkers ident.2
my_GSEA<-function(sc,group_name,group_1,group_2,min.pct=0.25,logfc.threshold = 0.25,thresh.use = 0.99,gmtpath="./geneset_gmt/",gmt=NULL,nrow_all=10,nrow_down=10,nrow_up=10){
#' 数据预处理
#sc<-readRDS("./sc_umap.rds")
dir.create("./GSEA_results")
sc<-sc
Idents(sc)<-sc@meta.data[,group_name]
deg<-FindMarkers(sc,ident.1=group_1,ident.2=group_2,min.pct=min.pct,logfc.threshold = logfc.threshold)
deg$gene<-rownames(deg)
sub.deg<-deg[,c("gene","avg_log2FC")]
genelist_input<-sub.deg
genename <- as.character(genelist_input[,1])
gene_map<-bitr(genename,fromType="SYMBOL",toType="ENTREZID",OrgDb = "org.Hs.eg.db")
colnames(genelist_input)<-c("SYMBOL","avg_log2FC")
final.gene<-left_join(gene_map,genelist_input,"SYMBOL")
final.gene<-final.gene%>%arrange(desc(avg_log2FC))
geneList=final.gene$avg_log2FC
names(geneList)<-as.character(final.gene$ENTREZID)

#' GSEA分析-GO
Go_gseresult <- gseGO(geneList, 'org.Hs.eg.db', keyType = "ENTREZID", ont="all", nPerm = 1000, minGSSize = 10, maxGSSize = 1000, pvalueCutoff=1)
#' GSEA分析-KEGG
KEGG_gseresult <- gseKEGG(geneList, nPerm = 1000, minGSSize = 10, maxGSSize = 1000, pvalueCutoff=1)
#' GSEA分析-Reactome
Go_Reactomeresult <- gsePathway(geneList, nPerm = 1000, minGSSize = 10, maxGSSize = 1000, pvalueCutoff=1)
#' 保存结果文件 csv
write.table (Go_gseresult, file ="./GSEA_results/Go_gseresult.csv", sep =",", row.names =TRUE)
write.table (KEGG_gseresult, file ="./GSEA_results/KEGG_gseresult.csv", sep =",", row.names =TRUE)
write.table (Go_Reactomeresult, file ="./GSEA_results/Go_Reactomeresult.csv", sep =",", row.names =TRUE)

#' 绘图
mycol<-pal_nejm()(8)
pdf("./GSEA_results/Go_gseresult.pdf",height=10,width=12)
lapply(1:nrow_all, function(i){
  pp<-gseaplot2(Go_gseresult,Go_gseresult$ID[i]
            title=Go_gseresult$Description[i],pvalue_table = T)
  print(pp)
})
dev.off()
pdf("./GSEA_results/KEGG_gseresult.pdf",height=10,width=12)
lapply(1:nrow_all, function(i){
  pp<-gseaplot2(KEGG_gseresult,KEGG_gseresult$ID[i],
            title=KEGG_gseresult$Description[i],pvalue_table = T)
  print(pp)
})
dev.off()
pdf("./GSEA_results/Go_Reactomeresult.pdf",height=10,width=12)
lapply(1:nrow_all, function(i){
  pp<-gseaplot2(Go_Reactomeresult,Go_Reactomeresult$ID[i],
            title=Go_Reactomeresult$Description[i],pvalue_table = T)
  print(pp)
})
dev.off()

使用msigdb数据库的基因集，需要先去msigdbGSEA | MSigDB (gsea-msigdb.org)官网下载基因集信息。

genesets.png
下载完的基因集保存在文件夹geneset_gmt中：

geneset_gmt.png
接着使用该数据库的基因集进行GSEA分析：

#' msigdb数据库
if(!is.null(gmt)){
gmtfiles<-list.files(gmtpath)
geneset<-read.gmt(paste0("./geneset_gmt/",gmt,".all.v2023.1.Hs.entrez.gmt"))
egmt <- GSEA(geneList, TERM2GENE=geneset,
               minGSSize = 1,
               pvalueCutoff = 0.99,
               verbose=FALSE)
gsea_results_df <- egmt@result
write.csv(gsea_results_df,file = paste0('./GSEA_results/',gmt,'_gsea_results_df.csv'))
down<-gsea_results_df[gsea_results_df$pvalue<0.05 & gsea_results_df$enrichmentScore < -0.3,]
down$group=-1
up<-gsea_results_df[gsea_results_df$pvalue<0.05 & gsea_results_df$enrichmentScore > 0.3,]
up$group=1
pro=paste0('GSEA_',gmt)
if(nrow(down)>0){
nrow_down=ifelse(nrow_down<nrow(down),nrow_down,nrow(down))
pdf(paste0('./GSEA_results/',pro,'_down.pdf'),height=10,width=12)
lapply(1:nrow_down, function(i){
  pp<-gseaplot2(egmt,down$ID[i],
            title=down$Description[i],pvalue_table = T)
  print(pp)
})
dev.off()
}
if(nrow(up)>0){
nrow_up=ifelse(nrow_up<nrow(up),nrow_up,nrow(up))
pdf(paste0('./GSEA_results/',pro,'_up.pdf'),height=10,width=12)
lapply(1:nrow_up, function(i){
  pp<-gseaplot2(egmt,up$ID[i],
            title=up$Description[i],pvalue_table = T)
  print(pp)
})
dev.off()
}
}
}

结果文件：

results.png

2.GSVA分析
加载用到的包：

library(Seurat)
library(tidyverse)
library(ggplot2)
library(ggsci)
library(org.Hs.eg.db)
library(clusterProfiler)
library(GSVA)
library(limma)

函数代码：

my_GSVA<-function(sc,nor="log",gmt="h",group_list_object="groups",barplot=T,sample_number=20){
sc<-sc
#' 数据预处理
#sc<-readRDS("sc_umap.rds")
if(nor=="log"){
expr<-as.data.frame(sc$RNA@data)
}else{
expr<-as.data.frame(sc$SCT@counts)
}
expr$ID<-rownames(expr)
s2e<-bitr(expr$ID,fromType="SYMBOL",toType="ENTREZID",OrgDb = "org.Hs.eg.db")
expr<-inner_join(expr,s2e,by=c("ID"="SYMBOL"))
rownames(expr)<-expr$ENTREZID
expr<-expr[,-c((ncol(expr)-1),ncol(expr))]
mycol<-pal_nejm()(8)
geneset<-read.gmt(paste0("./geneset_gmt/",gmt,".all.v2023.1.Hs.entrez.gmt"))
geneset_list<-split(geneset$gene,geneset$term) #' 通过geneset$term对geneset$gene进行分组
expr<-as.matrix(expr)
#' GSVA
gsval<-gsva(expr,geneset_list,kcdf="Gaussian",method="gsva",parallel.sz=12)
dir.create("./GSVA_results")
save(gsval,file="./GSVA_results/gsval.rds")

基于GSVA的差异分析：

#' DEGSVA
meta<-as.data.frame(sc@meta.data[,group_list_object,drop=F])
mydata<-t(as.data.frame(gsval))%>%as.data.frame()
mydata<-merge(meta,mydata,by.x=0,by.y=0)
rownames(mydata)<-mydata$Row.names
mydata<-mydata[,c(-1)] #' 只留下了groups列，删除了Row.names/orig.ident和seurat_clusters列
mydata<-mydata%>%arrange(desc(groups)) #' 排序
group_list=factor(mydata[,group_list_object])
design<-model.matrix(~0+group_list)
colnames(design)<-levels(group_list)
rownames(design)<-rownames(mydata)
contrast.matrix<-makeContrasts(paste0(rle(mydata[,group_list_object])$values[1],'-',rle(mydata[,group_list_object])$values[2]),levels=design)
fit<-lmFit(t(mydata[,-1]),design=design)
fit2<-contrasts.fit(fit,contrast.matrix)
fit2<-eBayes(fit2)
alldiff<-topTable(fit2,coef=1,n=Inf)
#sigdiff<-subset(alldiff,abs(logFC)>0.5&adj.P.Val<0.05)

绘制热图：

#' 绘制热图
alldiff<-alldiff[order(abs(alldiff$logFC),decreasing=T),]
plotdata<-t(mydata[,rownames(alldiff[1:20,])])
library(ComplexHeatmap)
library(circlize)
Groups<-mycol[c(1:2)]
names(Groups)<-c(rle(mydata[,group_list_object])$values[1],rle(mydata[,group_list_object])$values[2])
Top<-HeatmapAnnotation(Group=factor(mydata[,group_list_object]),
                        border=T,
                        col=list(Group=Groups),
                        show_annotation_name=T,
                        annotation_name_side="left",
                        annotation_name_gp=gpar(fontsize=10))
pdf("./GSVA_results/GSVA_heatmap.pdf",height=10,width=12)
pp<-Heatmap(plotdata,top_annotation=Top,cluster_rows=F,cluster_columns=F,border=T,
                row_names_side="left",column_order=NULL,show_column_names=F,
                column_split=c(rep(rle(mydata[,group_list_object])$values[1],rle(mydata[,group_list_object])$lengths[1]),rep(rle(mydata[,group_list_object])$values[2],rle(mydata[,group_list_object])$lengths[2])),
                row_names_gp = gpar(fontsize = 8))
print(pp)
dev.off()

绘制条形图：

if(barplot&gmt=="h"){
library(stringr)
library(ggplot2)
library(ggprism)
library(ggthemes)
Diff<-alldiff
Diff_sample<-sample_n(Diff,sample_number, replace = FALSE)
dat_plot <- data.frame(id = row.names(Diff_sample),t = Diff_sample$t)
dat_plot$id <- str_replace(dat_plot$id , "HALLMARK_","")
dat_plot$threshold = factor(ifelse(dat_plot$t  >-2, ifelse(dat_plot$t >= 2 ,'Up','Stable'),'Down'),levels=c('Up','Down','Stable'))
dat_plot <- dat_plot %>% arrange(t)
dat_plot$id <- factor(dat_plot$id,levels = dat_plot$id)
p <- ggplot(data = dat_plot,aes(x = id,y = t,fill = threshold)) +  geom_col()+  coord_flip() +  scale_fill_manual(values = c('Up'= '#36638a','NoSignifi'='#cccccc','Down'='#7bcd7b')) +  geom_hline(yintercept = c(-2,2),             color = 'white',             size = 0.5,lty='dashed') +  xlab('') +   ylab(paste0('t value of GSVA score,',rle(mydata[,group_list_object])$values[1],'-VS-',rle(mydata[,group_list_object])$values[2]) )+ guides(fill=F)+theme_prism(border = T) +  theme(    axis.text.y = element_blank(),    axis.ticks.y = element_blank())
low1 <- dat_plot %>% filter(t < -2) %>% nrow()
#low0 <- dat_plot %>% filter( t < 0) %>% nrow()
#high0 <- dat_plot %>% filter(t>0)%>%filter(t<2) %>% nrow()
low0 <- dat_plot %>% filter(t>-2)%>%filter( t < 2) %>% nrow()
high1 <- nrow(dat_plot)
pdf("./GSVA_results/GSVA_barplot.pdf",height=10,width=12)
pp<-p + geom_text(data = dat_plot[1:low1,],aes(x = id,y = 0.1,label = id), hjust = 0,color = 'black',size = 2.5) + geom_text(data = dat_plot[(low1 +1):(low1+low0),],aes(x = id,y = 0.1,label = id), hjust = 0,color = 'grey',size = 2.5) +geom_text(data = dat_plot[(low1+low0+1):high1,],aes(x = id,y = -0.1,label = id), hjust = 1,color = 'black',size = 2.5)
print(pp)
dev.off()
}

结果文件：

results.png

heatmap.png

barplot.png

总结：再见单细胞。

GSEA&GSVA分析-单细胞转录组高阶分析03

猜你喜欢

热点阅读