单细胞功能集GSVA以及转录调控SCENIC分析流程
2020-03-20 本文已影响0人
小光amateur
image
基因集变异分析(GSVA)
方法:
基因集变异分析在传统转录组分析时就已经使用了,针对单细胞数据,其实是一样的。首先通过GSVA
函数将细胞-基因表达矩阵转换为细胞-基因集表达矩阵,然后针对该矩阵计算两组或者多组之间的差异,并通过热图展示该差异。
单细胞的分组可能有多个层面,例如不同细胞活着亚类分组,对照组和实验组,不同时间段的样本等等。因此最好先按照不同的分组将表达量矩阵和细胞分组信息提取出来,以便后续分析
需要的R包:
- GSEAbase
- GSVA
- limma
- Seurat
- Biobase
- genefilter
- dplyr
- pystr (git)
- purrr
- tidyr
- tibble
提取不同分组的表达矩阵:
相同细胞分类下不同时间样本的表达矩阵和细胞分组
library(Seurat)
library(dplyr)
##我这里有7个cluster,5个时间阶段
for (i in 1:7){
name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
endo<-readRDS(name)
time<-endo@meta.data$case
time[which(grepl("^pf_",time))]<-"PF"
time[which(grepl("^po3_",time))]<-"PO3"
time[which(grepl("^po7_",time))]<-"PO7"
time[which(grepl("^po11_",time))]<-"PO11"
time[which(grepl("^rif_",time))]<-"RIF"
endo@meta.data$time<-time
endo[['Cell.type']]<-endo[['seurat_clusters']]
endo2<-subset(endo,subset=time%in%c("PF","PO3","PO7","PO11"))
for (cls in unique(endo2$Cell.type)){
exam<-subset(endo2,idents=cls)
DefaultAssay(exam)<-"integrated"
exprMat<-GetAssayData(exam,slot="data")
exprMat<-as.matrix(exprMat)
cellInfo<-exam@meta.data%>%select(Cell.type,time)
write.table(exprMat,paste0("./cluster_",i,"/cluster_",i,"_",cls,".exprMat_.txt"),sep="\t",quote=FALSE)
write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,"_",cls,".cellInfo_.txt"),sep="\t",quote=FALSE)
}}
相同时间不同细胞的矩阵和信息
library(Seurat)
library(dplyr)
for (i in 1:7){
name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
endo<-readRDS(name)
time<-endo@meta.data$case
time[which(grepl("^pf_",time))]<-"PF"
time[which(grepl("^po3_",time))]<-"PO3"
time[which(grepl("^po7_",time))]<-"PO7"
time[which(grepl("^po11_",time))]<-"PO11"
time[which(grepl("^rif_",time))]<-"RIF"
endo@meta.data$time<-time
endo[['Cell.type']]<-endo[['seurat_clusters']]
for (ti in c("PF","PO3","PO7","PO11")){
exam<-subset(endo,subset=time==ti)
DefaultAssay(exam)<-"integrated"
exprMat<-GetAssayData(exam,slot="data")
exprMat<-as.matrix(exprMat)
cellInfo<-exam@meta.data%>%select(Cell.type,time)
write.table(exprMat,paste0("./cluster_",i,"/cluster_",i,".exprMat_",ti,".txt"),sep="\t",quote=FALSE)
write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,".cellInfo_",ti,".txt"),sep="\t",quote=FALSE)
}
}
去GSEA官网下载基因集合gmt文件
这里有多个基因集,具体的介绍参考:
https://www.gsea-msigdb.org/gsea/msigdb/index.jsp
整合流程完成GSEA分析和差异分析
一些注意事项:
接着,做GSVA分析,获取GSVA矩阵,做GSVA分析可以参考传统bulk-RNA分析的方法
这里还需要注意几个问题:
- 处理普通RNA数据需要预先过滤,但是单细胞数据取自Seurat对象,已经预先过滤好了
- 如果输入是原始counts值,需要设置参数
kcdf="Possion"
,但如果是TPM值,默认就好,因为我们输入是标准化后的数据,所以用默认参数- 默认参数
mx.diff=TURE
,结果是一个类似于负二项分布,因为后面要做差异分析,所以需要使用该参数,如果设置mx.diff=FALSE
,则为高斯分布
先计算相同时间不同细胞的基因集差异
library(GSEABase)
library(Biobase)
library(genefilter)
library(limma)
library(RColorBrewer)
library(GSVA)
library(dplyr)
library(pystr)
##基因集这里看你用哪个了
geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c7.all.v7.0.symbols.gmt")
#geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c3.tft.v7.0.symbols.gmt")
###因为分析的是亚类,这里是针对细胞大类,意思是这
###里针对的是细胞大类1的所有亚类进行的分析
clus<-1
###定义不同时间点
time<-c("PF","PO3","PO7","PO11")
###输入一个列表或向量,进行重新编组
re_grp<-function(x,s){
purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
}
##计算差异并返回差异数据框,x=分组信息,cellInfo=细胞分类信息(必须是数据框,
##行名为细胞,列名为Cell.type,里面是分组信息),res_es=GSVA表达矩阵,
##num=想要留下top-n个基因集
cal_Diff<-function(x,cellInfo,res_es,num){
need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
grouP<-as.factor(need)
desigN<-model.matrix(~grouP+0)
rownames(desigN)<-rownames(cellInfo)
comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
fiT <- lmFit(res_es, desigN)
fiT2 <- contrasts.fit(fiT, comparE)
fiT3 <- eBayes(fiT2)
Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
if (nrow(Diff)>0){
Diff$cluster<-x
Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
}else{
Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
}
}
###计算不同分组的平均表达量,mat=表达量矩阵,info=cellInfo
Average_express<-function(mat,info){
new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
new_cellinfo<-info
new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
all<-left_join(new_res,new_cellinfo,by="cell")
fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
fuck
}
for(i in time){
expMat<- read.table(pystr_format("cluster_{1}.exprMat_{2}.txt",clus,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_{2}.txt",clus,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
mydata<-as.matrix(expMat)
all_cell<-unique(cellInfo$Cell.type)
res_es <- gsva(mydata, geneSets, min.sz=10, max.sz=500, verbose=FALSE, parallel.sz=8)
res_es<-as.data.frame(res_es)
write.table(res_es,pystr_format("./cell_diff/cluster_{1}_{2}_c7.txt",clus,i),sep="\t",quote=FALSE)
fin<-do.call('rbind',lapply(all_cell,cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10))
fin<-fin%>%filter(!is.na(cluster))
write.table(fin,pystr_format("./cell_diff/cluster_{1}_{2}_c7_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
heihei2<-Average_express(res_es,cellInfo)
heihei_need<-filter(heihei2,Name%in%unique(fin$Name))
if (nrow(heihei_need)>0){
houhou2<-tibble::column_to_rownames(heihei_need,var="Name")
pheatmap::pheatmap(houhou2,file=pystr_format("./cell_diff/cluster_{1}_{2}_c7_diff_heat.pdf",clus,i),fontsize=8)
}else{
next
}
}
再计算相同细胞不同时间的差异基因集
library(GSEABase)
library(Biobase)
library(genefilter)
library(limma)
library(RColorBrewer)
library(GSVA)
library(dplyr)
library(pystr)
geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c7.all.v7.0.symbols.gmt")
#geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c3.tft.v7.0.symbols.gmt")
##clus是细胞亚类分型,这个不能多写
clus<-c(0,1,2,3)
##细胞大类的名字
big_cls<-1
re_grp<-function(x,s){
purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
}
cal_Diff<-function(x,cellInfo,res_es,num){
need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
grouP<-as.factor(need)
desigN<-model.matrix(~grouP+0)
rownames(desigN)<-rownames(cellInfo)
comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
fiT <- lmFit(res_es, desigN)
fiT2 <- contrasts.fit(fiT, comparE)
fiT3 <- eBayes(fiT2)
Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
if (nrow(Diff)>0){
Diff$cluster<-x
Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
}else{
Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
}
}
Average_express<-function(mat,info){
new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
new_cellinfo<-info
new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
all<-left_join(new_res,new_cellinfo,by="cell")
fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
fuck
}
for(i in clus){
expMat<- read.table(pystr_format("cluster_{1}_{2}.exprMat_.txt",big_cls,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
cellInfo<-read.table(pystr_format("cluster_{1}_{2}.cellInfo_.txt",big_cls,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
colnames(cellInfo)<-c("haha","Cell.type")
all_time<-unique(cellInfo$Cell.type)
mydata<-as.matrix(expMat)
res_es <- gsva(mydata, geneSets, min.sz=10, max.sz=500, verbose=FALSE, parallel.sz=8)
res_es<-as.data.frame(res_es)
write.table(res_es,pystr_format("./time_diff/cluster_{1}_{2}_c7.txt",big_cls,i),sep="\t",quote=FALSE)
fin<-do.call('rbind',lapply(all_time,cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10))
fin<-fin%>%filter(!is.na(cluster))
write.table(fin,pystr_format("./time_diff/cluster_{1}_{2}_c7_diff_geneset.txt",big_cls,i),sep="\t",quote=FALSE,row.names=FALSE)
heihei2<-Average_express(res_es,cellInfo)
heihei_need<-filter(heihei2,Name%in%unique(fin$Name))
if (nrow(heihei_need)>0){
houhou2<-tibble::column_to_rownames(heihei_need,var="Name")
pheatmap::pheatmap(houhou2,file=pystr_format("./time_diff/cluster_{1}_{2}_c7_diff_heat.pdf",big_cls,i),fontsize=8)
}else{
next
}
}
经过以上两轮的分析,你将每个层面得到三个结果
- GSVA富集分数矩阵,正态分布
- 每次比对的差异Top10基因集
- 一个多组差异热图
转录调控分析(SCENIC)
按照SCENIC的官方教程,跑完整个流程(这里耗时很长,所以不能直接跑,要提交作业或者screen,避免中间间断
SCENIC 基因互作数据库需要提前下载,人类的一共两个,每个1.2G
hg19-500bp-upstream-7species.mc9nr.feather
hg19-tss-centered-10kb-7species.mc9nr.feather
由于R-Genie计算网络需要随机森林,所以耗时非常长,一个6K细胞的矩阵大概需要数天,所以,我们分三步计算
- 提取表达量矩阵和细胞分组信息
- 计算SCENIC网络
- 计算差异并输出最后结果
需要的R包
- SCENIC
- AUCell
提取表达矩阵
SCENIC 把整个矩阵一起打分,避免多次计算网络,消耗时间
library(Seurat)
library(dplyr)
for (i in 1:7){
name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
endo<-readRDS(name)
time<-endo@meta.data$case
time[which(grepl("^pf_",time))]<-"PF"
time[which(grepl("^po3_",time))]<-"PO3"
time[which(grepl("^po7_",time))]<-"PO7"
time[which(grepl("^po11_",time))]<-"PO11"
time[which(grepl("^rif_",time))]<-"RIF"
endo@meta.data$time<-time
endo[['Cell.type']]<-endo[['seurat_clusters']]
exam<-subset(endo,subset=time%in%c("PF","PO3","PO7","PO11"))
DefaultAssay(exam)<-"integrated"
cellInfo<-exam@meta.data%>%select(Cell.type,time)
exprMat<-GetAssayData(exam,slot="data")
exprMat<-as.matrix(exprMat)
write.table(exprMat,paste0("cluster_",i,".exprMat_noRIF.txt"),sep="\t",quote=FALSE)
write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,".cellInfo_noRIF.txt"),sep="\t",quote=FALSE)
}
计算TF网络
library(SCENIC)
library(AUCell)
library(pystr)
##填写不同的clus,对不同细胞进行分析
clus<-1
exprMat<-read.table(pystr_format("cluster_{1}.exprMat_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
exprMat<-as.matrix(exprMat)
cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
cellInfo<-dplyr::select(cellInfo,Cell.type)
colnames(cellInfo)<-"CellType"
dir.create("int")
saveRDS(cellInfo, file="int/cellInfo.Rds")
org="hgnc"
##这个是你自己下载数据库存放的位置
dbDir="/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/SCENIC/cisTarget_databases"
myDatasetTitle="cluster_1_SCENIC_analysis"
data(defaultDbNames)
dbs <- defaultDbNames[[org]]
scenicOptions <- initializeScenic(org=org, dbDir=dbDir, dbs=dbs, datasetTitle=myDatasetTitle, nCores=4)
scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
exprMat_filtered <- exprMat
runCorrelation(exprMat_filtered, scenicOptions)
runGenie3(exprMat_filtered, scenicOptions)
scenicOptions@settings$verbose <- TRUE
scenicOptions@settings$nCores <- 4
scenicOptions@settings$seed <- 123
runSCENIC_1_coexNetwork2modules(scenicOptions)
runSCENIC_2_createRegulons(scenicOptions)
runSCENIC_3_scoreCells(scenicOptions, exprMat)
批量计算针对每个细胞类型的不同分组的差异TF
library(dplyr)
library(SCENIC)
library(AUCell)
library(pystr)
library(limma)
re_grp<-function(x,s){
purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
}
cal_Diff<-function(x,cellInfo,res_es,num){
need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
grouP<-as.factor(need)
desigN<-model.matrix(~grouP+0)
rownames(desigN)<-rownames(cellInfo)
comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
fiT <- lmFit(res_es, desigN)
fiT2 <- contrasts.fit(fiT, comparE)
fiT3 <- eBayes(fiT2)
Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
if (nrow(Diff)>0){
Diff$cluster<-x
Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
}else{
Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
}
}
Average_express<-function(mat,info){
new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
new_cellinfo<-info
new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
all<-left_join(new_res,new_cellinfo,by="cell")
fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
fuck
}
###不同的clus针对不同的细胞类
clus<-1
scenicOptions <- readRDS("int/scenicOptions.Rds")
regulons <- loadInt(scenicOptions, "regulons")
regulons2 <- loadInt(scenicOptions, "aucell_regulons")
regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
regulonAUC <- regulonAUC[onlyNonDuplicatedExtended(rownames(regulonAUC)),]
cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
for (i in c("PF","PO3","PO7","PO11")){
cellInfo2<-cellInfo[which(cellInfo$time==i),]
exp<-as.data.frame(getAUC(regulonAUC))[,rownames(cellInfo2)]
exp2<-t(scale(t(exp), center = T, scale=T))
fin<-do.call('rbind',lapply(unique(cellInfo2$Cell.type),cal_Diff,cellInfo=cellInfo2,res_es=exp2,num=10))
fin<-fin%>%filter(!is.na(cluster))
write.table(fin,pystr_format("cluster_{1}_{2}_TF_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
haha<-Average_express(exp,cellInfo2)%>%tibble::column_to_rownames(var="Name")%>%as.data.frame()
haha<-haha[unique(fin$Name),]
if (nrow(haha)>0){
haha_scale<-t(scale(t(haha), center = T, scale=T))
pheatmap::pheatmap(haha_scale,file=pystr_format("cluster_{1}_{2}_TF_heatmap.pdf",clus,i))
}else{
next
}
all_tf<-t(purrr::map_dfr(regulons2[unique(fin$Name)],function(x) {stringr::str_c(x,collapse=",")}))
write.table(all_tf,pystr_format("cluster_{1}_{2}_TF_duiying.txt",clus,i),sep="\t",quote=FALSE,col.names=FALSE)
}
for (i in unique(cellInfo$Cell.type)){
cellInfo2<-cellInfo[which(cellInfo$Cell.type==i),]
colnames(cellInfo2)<-c("haha","Cell.type")
exp<-as.data.frame(getAUC(regulonAUC))[,rownames(cellInfo2)]
exp2<-t(scale(t(exp), center = T, scale=T))
fin<-do.call('rbind',lapply(unique(cellInfo2$Cell.type),cal_Diff,cellInfo=cellInfo2,res_es=exp2,num=10))
fin<-fin%>%filter(!is.na(cluster))
write.table(fin,pystr_format("cluster_{1}_{2}_TF_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
haha<-Average_express(exp,cellInfo2)%>%tibble::column_to_rownames(var="Name")%>%as.data.frame()
haha<-haha[unique(fin$Name),]
if (nrow(haha)>0){
haha_scale<-t(scale(t(haha), center = T, scale=T))
pheatmap::pheatmap(haha_scale,file=pystr_format("cluster_{1}_{2}_TF_heatmap.pdf",clus,i))
}else{
next
}
all_tf<-t(purrr::map_dfr(regulons2[unique(fin$Name)],function(x) {stringr::str_c(x,collapse=",")}))
write.table(all_tf,pystr_format("cluster_{1}_{2}_TF_duiying.txt",clus,i),sep="\t",quote=FALSE,col.names=FALSE)
}
经过以上分析后,同样会得到三个文件:
- 一个不同组间差异的top10转录因子基因集合
- 一个不同分组的差异热图
- 一个TF对应的所有基因表格