单细胞功能集GSVA以及转录调控SCENIC分析流程

2020-03-20 本文已影响0人小光amateur

image

基因集变异分析（GSVA）

方法：

基因集变异分析在传统转录组分析时就已经使用了，针对单细胞数据，其实是一样的。首先通过GSVA函数将细胞-基因表达矩阵转换为细胞-基因集表达矩阵，然后针对该矩阵计算两组或者多组之间的差异，并通过热图展示该差异。

单细胞的分组可能有多个层面，例如不同细胞活着亚类分组，对照组和实验组，不同时间段的样本等等。因此最好先按照不同的分组将表达量矩阵和细胞分组信息提取出来，以便后续分析

需要的R包:

GSEAbase
GSVA
limma
Seurat
Biobase
genefilter
dplyr
pystr (git)
purrr
tidyr
tibble

提取不同分组的表达矩阵：

相同细胞分类下不同时间样本的表达矩阵和细胞分组

library(Seurat)
library(dplyr)
##我这里有7个cluster，5个时间阶段
for (i in 1:7){
    name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
    endo<-readRDS(name)
    time<-endo@meta.data$case
    time[which(grepl("^pf_",time))]<-"PF"
    time[which(grepl("^po3_",time))]<-"PO3"
    time[which(grepl("^po7_",time))]<-"PO7"
    time[which(grepl("^po11_",time))]<-"PO11"
    time[which(grepl("^rif_",time))]<-"RIF"
    endo@meta.data$time<-time
    endo[['Cell.type']]<-endo[['seurat_clusters']]
    endo2<-subset(endo,subset=time%in%c("PF","PO3","PO7","PO11"))
    for (cls in unique(endo2$Cell.type)){
        exam<-subset(endo2,idents=cls)
        DefaultAssay(exam)<-"integrated"
        exprMat<-GetAssayData(exam,slot="data")
        exprMat<-as.matrix(exprMat)
        cellInfo<-exam@meta.data%>%select(Cell.type,time)
        write.table(exprMat,paste0("./cluster_",i,"/cluster_",i,"_",cls,".exprMat_.txt"),sep="\t",quote=FALSE)
        write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,"_",cls,".cellInfo_.txt"),sep="\t",quote=FALSE)
}}

相同时间不同细胞的矩阵和信息

library(Seurat)
library(dplyr)
for (i in 1:7){
    name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
    endo<-readRDS(name)
    time<-endo@meta.data$case
    time[which(grepl("^pf_",time))]<-"PF"
    time[which(grepl("^po3_",time))]<-"PO3"
    time[which(grepl("^po7_",time))]<-"PO7"
    time[which(grepl("^po11_",time))]<-"PO11"
    time[which(grepl("^rif_",time))]<-"RIF"
    endo@meta.data$time<-time
    endo[['Cell.type']]<-endo[['seurat_clusters']]
    for (ti in c("PF","PO3","PO7","PO11")){
        exam<-subset(endo,subset=time==ti)
        DefaultAssay(exam)<-"integrated"
        exprMat<-GetAssayData(exam,slot="data")
        exprMat<-as.matrix(exprMat)
        cellInfo<-exam@meta.data%>%select(Cell.type,time)
        write.table(exprMat,paste0("./cluster_",i,"/cluster_",i,".exprMat_",ti,".txt"),sep="\t",quote=FALSE)
        write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,".cellInfo_",ti,".txt"),sep="\t",quote=FALSE)
}
}

去GSEA官网下载基因集合gmt文件

GSEA Download

这里有多个基因集，具体的介绍参考：
https://www.gsea-msigdb.org/gsea/msigdb/index.jsp

整合流程完成GSEA分析和差异分析

一些注意事项：

接着，做GSVA分析，获取GSVA矩阵，做GSVA分析可以参考传统bulk-RNA分析的方法

博客园

使用GSVA方法计算某基因集在各个样本的表现

这里还需要注意几个问题：

处理普通RNA数据需要预先过滤，但是单细胞数据取自Seurat对象，已经预先过滤好了

如果输入是原始counts值，需要设置参数kcdf="Possion"，但如果是TPM值，默认就好，因为我们输入是标准化后的数据，所以用默认参数

默认参数mx.diff=TURE，结果是一个类似于负二项分布，因为后面要做差异分析，所以需要使用该参数，如果设置mx.diff=FALSE，则为高斯分布

先计算相同时间不同细胞的基因集差异

library(GSEABase)
library(Biobase)
library(genefilter)
library(limma)
library(RColorBrewer)
library(GSVA)
library(dplyr)
library(pystr)
##基因集这里看你用哪个了
geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c7.all.v7.0.symbols.gmt")
#geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c3.tft.v7.0.symbols.gmt")
###因为分析的是亚类，这里是针对细胞大类，意思是这
###里针对的是细胞大类1的所有亚类进行的分析
clus<-1
###定义不同时间点
time<-c("PF","PO3","PO7","PO11")
###输入一个列表或向量，进行重新编组
re_grp<-function(x,s){
    purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
    }
##计算差异并返回差异数据框，x=分组信息，cellInfo=细胞分类信息（必须是数据框，
##行名为细胞，列名为Cell.type，里面是分组信息），res_es=GSVA表达矩阵，
##num=想要留下top-n个基因集
cal_Diff<-function(x,cellInfo,res_es,num){
    need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
    grouP<-as.factor(need)
    desigN<-model.matrix(~grouP+0)
    rownames(desigN)<-rownames(cellInfo)
    comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
    fiT <- lmFit(res_es, desigN)
    fiT2 <- contrasts.fit(fiT, comparE)
    fiT3 <- eBayes(fiT2)
    Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
    if (nrow(Diff)>0){
        Diff$cluster<-x
        Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
    }else{
        Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
    }
}
###计算不同分组的平均表达量，mat=表达量矩阵，info=cellInfo
Average_express<-function(mat,info){
    new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
    new_cellinfo<-info
    new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
    all<-left_join(new_res,new_cellinfo,by="cell")
    fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
    fuck
}
for(i in time){
    expMat<- read.table(pystr_format("cluster_{1}.exprMat_{2}.txt",clus,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
    cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_{2}.txt",clus,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
    mydata<-as.matrix(expMat)
    all_cell<-unique(cellInfo$Cell.type)
    res_es <- gsva(mydata, geneSets, min.sz=10, max.sz=500, verbose=FALSE, parallel.sz=8)
    res_es<-as.data.frame(res_es)
    write.table(res_es,pystr_format("./cell_diff/cluster_{1}_{2}_c7.txt",clus,i),sep="\t",quote=FALSE)
    fin<-do.call('rbind',lapply(all_cell,cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10))
    fin<-fin%>%filter(!is.na(cluster))
    write.table(fin,pystr_format("./cell_diff/cluster_{1}_{2}_c7_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
    heihei2<-Average_express(res_es,cellInfo)
    heihei_need<-filter(heihei2,Name%in%unique(fin$Name))
    if (nrow(heihei_need)>0){
      houhou2<-tibble::column_to_rownames(heihei_need,var="Name")
      pheatmap::pheatmap(houhou2,file=pystr_format("./cell_diff/cluster_{1}_{2}_c7_diff_heat.pdf",clus,i),fontsize=8)
    }else{
      next
    }
}

再计算相同细胞不同时间的差异基因集

library(GSEABase)
library(Biobase)
library(genefilter)
library(limma)
library(RColorBrewer)
library(GSVA)
library(dplyr)
library(pystr)
geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c7.all.v7.0.symbols.gmt")
#geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c3.tft.v7.0.symbols.gmt")
##clus是细胞亚类分型，这个不能多写
clus<-c(0,1,2,3)
##细胞大类的名字
big_cls<-1
re_grp<-function(x,s){
    purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
    }
cal_Diff<-function(x,cellInfo,res_es,num){
    need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
    grouP<-as.factor(need)
    desigN<-model.matrix(~grouP+0)
    rownames(desigN)<-rownames(cellInfo)
    comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
    fiT <- lmFit(res_es, desigN)
    fiT2 <- contrasts.fit(fiT, comparE)
    fiT3 <- eBayes(fiT2)
    Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
    if (nrow(Diff)>0){
        Diff$cluster<-x
        Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
    }else{
        Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
    }
}
Average_express<-function(mat,info){
    new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
    new_cellinfo<-info 
    new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
    all<-left_join(new_res,new_cellinfo,by="cell")
    fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
    fuck
}
for(i in clus){
    expMat<- read.table(pystr_format("cluster_{1}_{2}.exprMat_.txt",big_cls,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
    cellInfo<-read.table(pystr_format("cluster_{1}_{2}.cellInfo_.txt",big_cls,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
    colnames(cellInfo)<-c("haha","Cell.type")
    all_time<-unique(cellInfo$Cell.type)
    mydata<-as.matrix(expMat)
    res_es <- gsva(mydata, geneSets, min.sz=10, max.sz=500, verbose=FALSE, parallel.sz=8)
    res_es<-as.data.frame(res_es)
    write.table(res_es,pystr_format("./time_diff/cluster_{1}_{2}_c7.txt",big_cls,i),sep="\t",quote=FALSE)
    fin<-do.call('rbind',lapply(all_time,cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10))
    fin<-fin%>%filter(!is.na(cluster))
    write.table(fin,pystr_format("./time_diff/cluster_{1}_{2}_c7_diff_geneset.txt",big_cls,i),sep="\t",quote=FALSE,row.names=FALSE)
    heihei2<-Average_express(res_es,cellInfo)
    heihei_need<-filter(heihei2,Name%in%unique(fin$Name))
    if (nrow(heihei_need)>0){
      houhou2<-tibble::column_to_rownames(heihei_need,var="Name")
      pheatmap::pheatmap(houhou2,file=pystr_format("./time_diff/cluster_{1}_{2}_c7_diff_heat.pdf",big_cls,i),fontsize=8)
    }else{
      next
    }
}

经过以上两轮的分析，你将每个层面得到三个结果

GSVA富集分数矩阵，正态分布
每次比对的差异Top10基因集
一个多组差异热图

转录调控分析（SCENIC）

按照SCENIC的官方教程，跑完整个流程（这里耗时很长，所以不能直接跑，要提交作业或者screen,避免中间间断
SCENIC 基因互作数据库需要提前下载，人类的一共两个，每个1.2G
hg19-500bp-upstream-7species.mc9nr.feather
hg19-tss-centered-10kb-7species.mc9nr.feather

由于R-Genie计算网络需要随机森林，所以耗时非常长，一个6K细胞的矩阵大概需要数天，所以，我们分三步计算

提取表达量矩阵和细胞分组信息
计算SCENIC网络
计算差异并输出最后结果

需要的R包

SCENIC
AUCell

提取表达矩阵

SCENIC 把整个矩阵一起打分，避免多次计算网络，消耗时间

library(Seurat)
library(dplyr)
for (i in 1:7){
    name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
    endo<-readRDS(name)
    time<-endo@meta.data$case
    time[which(grepl("^pf_",time))]<-"PF"
    time[which(grepl("^po3_",time))]<-"PO3"
    time[which(grepl("^po7_",time))]<-"PO7"
    time[which(grepl("^po11_",time))]<-"PO11"
    time[which(grepl("^rif_",time))]<-"RIF"
    endo@meta.data$time<-time
    endo[['Cell.type']]<-endo[['seurat_clusters']]
    exam<-subset(endo,subset=time%in%c("PF","PO3","PO7","PO11"))
    DefaultAssay(exam)<-"integrated"
    cellInfo<-exam@meta.data%>%select(Cell.type,time)
    exprMat<-GetAssayData(exam,slot="data")
    exprMat<-as.matrix(exprMat)
    write.table(exprMat,paste0("cluster_",i,".exprMat_noRIF.txt"),sep="\t",quote=FALSE)
    write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,".cellInfo_noRIF.txt"),sep="\t",quote=FALSE)
}

计算TF网络

library(SCENIC)
library(AUCell)
library(pystr)
##填写不同的clus,对不同细胞进行分析
clus<-1
exprMat<-read.table(pystr_format("cluster_{1}.exprMat_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
exprMat<-as.matrix(exprMat)
cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
cellInfo<-dplyr::select(cellInfo,Cell.type)
colnames(cellInfo)<-"CellType"
dir.create("int")
saveRDS(cellInfo, file="int/cellInfo.Rds")
org="hgnc"
##这个是你自己下载数据库存放的位置
dbDir="/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/SCENIC/cisTarget_databases"
myDatasetTitle="cluster_1_SCENIC_analysis"
data(defaultDbNames)
dbs <- defaultDbNames[[org]]
scenicOptions <- initializeScenic(org=org, dbDir=dbDir, dbs=dbs, datasetTitle=myDatasetTitle, nCores=4)
scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
exprMat_filtered <- exprMat
runCorrelation(exprMat_filtered, scenicOptions)
runGenie3(exprMat_filtered, scenicOptions)
scenicOptions@settings$verbose <- TRUE
scenicOptions@settings$nCores <- 4
scenicOptions@settings$seed <- 123
runSCENIC_1_coexNetwork2modules(scenicOptions)
runSCENIC_2_createRegulons(scenicOptions)
runSCENIC_3_scoreCells(scenicOptions, exprMat)

批量计算针对每个细胞类型的不同分组的差异TF

library(dplyr)
library(SCENIC)
library(AUCell)
library(pystr)
library(limma)
re_grp<-function(x,s){
    purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
    }
cal_Diff<-function(x,cellInfo,res_es,num){
    need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
    grouP<-as.factor(need)
    desigN<-model.matrix(~grouP+0)
    rownames(desigN)<-rownames(cellInfo)
    comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
    fiT <- lmFit(res_es, desigN)
    fiT2 <- contrasts.fit(fiT, comparE)
    fiT3 <- eBayes(fiT2)
    Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
    if (nrow(Diff)>0){
        Diff$cluster<-x
        Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
    }else{
        Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
    }
}
Average_express<-function(mat,info){
    new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
    new_cellinfo<-info
    new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
    all<-left_join(new_res,new_cellinfo,by="cell")
    fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
    fuck
}
###不同的clus针对不同的细胞类
clus<-1
scenicOptions <- readRDS("int/scenicOptions.Rds")
regulons <- loadInt(scenicOptions, "regulons")
regulons2 <- loadInt(scenicOptions, "aucell_regulons")
regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
regulonAUC <- regulonAUC[onlyNonDuplicatedExtended(rownames(regulonAUC)),]
cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
for (i in c("PF","PO3","PO7","PO11")){
    cellInfo2<-cellInfo[which(cellInfo$time==i),]
    exp<-as.data.frame(getAUC(regulonAUC))[,rownames(cellInfo2)]
    exp2<-t(scale(t(exp), center = T, scale=T))
    fin<-do.call('rbind',lapply(unique(cellInfo2$Cell.type),cal_Diff,cellInfo=cellInfo2,res_es=exp2,num=10))
    fin<-fin%>%filter(!is.na(cluster))
    write.table(fin,pystr_format("cluster_{1}_{2}_TF_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
    haha<-Average_express(exp,cellInfo2)%>%tibble::column_to_rownames(var="Name")%>%as.data.frame()
    haha<-haha[unique(fin$Name),]
    if (nrow(haha)>0){
        haha_scale<-t(scale(t(haha), center = T, scale=T))
        pheatmap::pheatmap(haha_scale,file=pystr_format("cluster_{1}_{2}_TF_heatmap.pdf",clus,i))
    }else{
        next
    }
    all_tf<-t(purrr::map_dfr(regulons2[unique(fin$Name)],function(x) {stringr::str_c(x,collapse=",")}))
    write.table(all_tf,pystr_format("cluster_{1}_{2}_TF_duiying.txt",clus,i),sep="\t",quote=FALSE,col.names=FALSE)
}
for (i in unique(cellInfo$Cell.type)){
    cellInfo2<-cellInfo[which(cellInfo$Cell.type==i),]
    colnames(cellInfo2)<-c("haha","Cell.type")
    exp<-as.data.frame(getAUC(regulonAUC))[,rownames(cellInfo2)]
    exp2<-t(scale(t(exp), center = T, scale=T))
    fin<-do.call('rbind',lapply(unique(cellInfo2$Cell.type),cal_Diff,cellInfo=cellInfo2,res_es=exp2,num=10))
    fin<-fin%>%filter(!is.na(cluster))
    write.table(fin,pystr_format("cluster_{1}_{2}_TF_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
    haha<-Average_express(exp,cellInfo2)%>%tibble::column_to_rownames(var="Name")%>%as.data.frame()
    haha<-haha[unique(fin$Name),]
    if (nrow(haha)>0){
        haha_scale<-t(scale(t(haha), center = T, scale=T))
        pheatmap::pheatmap(haha_scale,file=pystr_format("cluster_{1}_{2}_TF_heatmap.pdf",clus,i))
    }else{
        next
    }
    all_tf<-t(purrr::map_dfr(regulons2[unique(fin$Name)],function(x) {stringr::str_c(x,collapse=",")}))
    write.table(all_tf,pystr_format("cluster_{1}_{2}_TF_duiying.txt",clus,i),sep="\t",quote=FALSE,col.names=FALSE)
}

经过以上分析后，同样会得到三个文件：

一个不同组间差异的top10转录因子基因集合
一个不同分组的差异热图
一个TF对应的所有基因表格