seurat单细胞数据处理小技巧
1 亚群合并
当有几类亚群同属于某类细胞时,比如CD4+ T细胞和CD8+ T细胞均属于T细胞,想要将他们合并在一起时,可以使用此代码。
#接seurat标准流程代码
#命名pbmc为sce,表明其是seurat对象
sce=pbmc
Idents(sce)#细胞身份,属于哪个亚群
levels(sce)#获取sce的亚群名
> [1] "Naive CD4 T" "CD14+ Mono" "Memory > CD4 T"
> [4] "B" "CD8 T" "FCGR3A+ > Mono"
> [7] "NK" "DC" "Platelet"
#可以看到Naive CD4 T,Memory CD4T及CD8 T都属于T细胞
head(sce@meta.data) #可以看到meta.data中多了seurat_clusters列,由0-9构成。
# 方法1
#根据levels(sce)可将要合并的亚群写作如下
new.cluster.ids <- c("T", "Mono", "T",
"B", "T", "Mono",
"T", "DC", "Platelet")
names(new.cluster.ids) <- levels(sce)
#更改前
p1<-DimPlot(pbmc, reduction = 'umap',
label = TRUE, pt.size = 0.5) + NoLegend()
#重命名亚群
sce <- RenameIdents(sce, new.cluster.ids)
#更改后
p2<-DimPlot(sce, reduction = 'umap',
label = TRUE, pt.size = 0.5) + NoLegend()
#方法2
根据亚群对应关系,创建一个向量
cluster2celltype <- c("0"="T",
"1"="Mono",
"2"="T",
"3"= "B",
"4"= "T",
"5"= "Mono",
"6"= "T",
"7"= "DC",
"8"= "Platelet")
#unname去掉对象的名(行名列名)
sce[['cell_type']] = unname(cluster2celltype[sce@meta.data$seurat_clusters])
#绘制散点图,reduction:使用哪种降维。如果没有指定,首先搜索 umap,然后是 tsne,然后是 pca;label:是否标记分群
DimPlot(sce, reduction = 'umap', group.by = 'cell_type',
label = TRUE, pt.size = 0.5) + NoLegend()#删除图例
p3<-DimPlot(sce, reduction = 'umap', group.by = 'cell_type',
label = TRUE, pt.size = 0.5) + NoLegend()#删除图例
p1+p2+p3#此函数需要library(dplyr)
推荐使用第二种方法,这样不更改原始亚群分群,只是在metadata中增加了一列
2 提取子集
当我们想把表达感兴趣基因的细胞提取出来单独分析时,可使用此函数。
p1<-DimPlot(pbmc, reduction = 'umap', group.by ="seurat_clusters",label = TRUE, pt.size = 0.5) + NoLegend()
p2<-FeaturePlot(pbmc, features = c("CD4"))
p3<-DotPlot(sce, group.by = 'seurat_clusters',
features = unique(genes_to_check)) + RotatedAxis()
p1+p2+p3
比如我们想把某几个亚群提取出来
sce=pbmc
#R 中,提取子集, 需要知道逻辑值,坐标及名字
# seurat 取0,2分群的子集
cd4_sce1 = sce[,sce@meta.data$seurat_clusters %in% c(0,2)]
#取Naive CD4 T , Memory CD4 T
cd4_sce2 = sce[, Idents(sce) %in% c( "Naive CD4 T" , "Memory CD4 T" )]
取子集后就是新的项目,需要重新标准化,重新用子集走一遍seurat标准流程
3 如何分群是合理的
我们知道FindClusters函数中不同的resolution参数会带来不同的结果,而且即使某个亚群内部的细胞也会有一定的异质性,那么到底分群多少是合适的呢?
#先执行不同resolution 下的分群
library(Seurat)
library(clustree)
sce <- FindClusters(
object = sce,
resolution = c(seq(from=.1,to=1.6,by=.2))
)
#clustree创建一个聚类树图,依赖于clustree包,显示不同分辨率的聚类之间的关系。prefix指示包含聚类信息的列的字符串
colnames(sce@meta.data)
pz<-clustree(sce@meta.data, prefix = "RNA_snn_res.")
p2=DimPlot(sce, reduction = 'umap', group.by = 'RNA_snn_res.0.5',
label = TRUE, pt.size = 0.5) + NoLegend()
p3=DimPlot(sce, reduction = 'umap',group.by = 'RNA_snn_res.1.5',
label = TRUE, pt.size = 0.5) + NoLegend()
library(patchwork)
p1+p2
根据clustree可确定合适的分群数,若resolution设置过大,会导致分群过度,把原来分群的中应有的异质性也提炼出来单独作为一群。无论如何分群,都应该结合FindMarkers函数,确认亚群标志基因,结合生物学背景来解释分群结果。
4 细胞亚群等比例抽样
当数据特别大时,可以采用等比例抽样的方式,降低运算时间。
sce=pbmc
p1<-DoHeatmap(sce ,
features = features,
size = 3
)
table(Idents(sce))
#使用subset函数,每个亚群抽取15个细胞,做热图
p2<-DoHeatmap(subset(sce, downsample = 15),
features = features, size = 3)+
labs(title = paste("downsample = 15"))
p1+p2
# 每个细胞亚群抽10
allCells=names(Idents(sce))
allType = levels(Idents(sce))
choose_Cells = unlist(lapply(allType, function(x){
cgCells = allCells[Idents(sce)== x ]
cg=sample(cgCells,10) #sample 从指定的元素中获取指定大小的样本。
cg
}))
#将抽取出的细胞组成一个新的sce对象
cg_sce = sce[, allCells %in% choose_Cells]
> An object of class Seurat
> 13714 features across 90 samples within 1 > assay
> Active assay: RNA (13714 features, 2000 > > variable features)
> 2 dimensional reductions calculated: pca, umap
as.data.frame(table(Idents(cg_sce)))
5 如何查看基因的原始表达量
# 如何看原始表达量 slot:要使用的数据槽,从“raw.data”、“data”或“scale.data”中选择;size 颜色条上方文字大小
#不加slot默认是从之前2000个FindVariableFeatures基因作图
DoHeatmap(subset(sce ),
features = features,
size = 3, slot='data'
)
6 多个亚群的多个标记基因可视化
#首先需要将各个亚群的标记基因找出来
sce.markers <- FindAllMarkers(object = sce,
only.pos = TRUE,min.pct = 0.25, thresh.use = 0.25)
#此时可以创建一个HTML小部件来显示矩形数据
DT::datatable(sce.markers)
可以以交互形式的HTML格式查看各亚群的标记基因
#挑选各个亚群差异基因的前5名
top5 <- sce.markers %>% group_by(cluster) %>% top_n(5,avg_log2FC)
head(top5)
#检测并去除重复值 !表示“与、或、非”中的“非”的意思
top5=top5[!duplicated(top5$gene),]
select_genes_all=split(top5$gene,top5$cluster)#将表拆分成列表
select_genes_all
DotPlot(object = sce, features=select_genes_all,
assay = "RNA") + theme(axis.text.x = element_text(angle= 45, vjust = 0.5, hjust=0.5))
不同亚群多个差异基因的可视化
7 对任意细胞亚群做差异分析
#若没有运行过FindAllMarkers,运行FindAllMarkers函数,查找各亚群标记基因
sce.markers <- FindAllMarkers(object = sce,
only.pos = TRUE,min.pct = 0.25, thresh.use = 0.25)
#table使用交叉分类的因素建立一个列联表,计数在每个组合的因素水平。
table(sce.markers$cluster)
Naive CD4 T CD14+ Mono Memory CD4 T B CD8 T
164 391 178 148 144
FCGR3A+ Mono NK DC Platelet
608 369 633 242
挑选亚群和对应基因
# 挑选细胞,若挑选多个亚群细胞,可采用正则表达式选择,比如挑选 Mono和T细胞:grepl(("Mono|T") , Idents(sce )),这里只挑选Mono类亚群
kp=grepl(("Mono") , Idents(sce ))
table(kp)
sce=sce[,kp]
sce
table( Idents(sce ))
# 挑选对应的标记基因
kp=grepl('Mono',sce.markers$cluster)#返回匹配与否的逻辑值
table(kp)
cg_sce.markers = sce.markers [ kp ,]
#这里认为vg_log2FC>2的才是标记基因
cg_sce.markers=cg_sce.markers[cg_sce.markers$avg_log2FC>2,]
查找两群的差异基因
markers_df <- FindMarkers(object = sce,
ident.1 = 'FCGR3A+ Mono',#ident 聚类名
ident.2 = 'CD14+ Mono',
#logfc.threshold = 0,
min.pct = 0.25)
head(markers_df)
筛选差异基因
#选取avg_log2FC绝对值大于1基因
cg_markers_df=markers_df[abs(markers_df$avg_log2FC) >1,]
dim(cg_markers_df)
DoHeatmap(subset(sce, downsample = 50),
slot = 'counts',
unique(rownames(cg_markers_df)),size=3)
#取两个基因集的交集(基因在某个群中即高表达,又有差异)
intersect(rownames(cg_markers_df) ,
cg_sce.markers$gene)
8 人为划分亚群挑选差异基因
当我们对表达某类基因的细胞感兴趣时,可以把表达这类基因的的细胞挑选出来,单独划分为一个群。
highCells= colnames(subset(x = sce, subset = FCGR3A > 1,slot = 'counts'))
#a %in% table,a值是否包含于table中,为真输出TURE,否者输出FALSE
highORlow=ifelse(colnames(sce) %in% highCells,'high','low')
table(highORlow)
table(Idents(sce))
table(Idents(sce),highORlow)
#将highORlow添加到meta.data中
sce@meta.data$highORlow<-highORlow
#查找划分出的亚群与其他亚群的差异基因
markers <- FindMarkers(sce, ident.1 = "high", group.by = 'highORlow' )
head(markers)
cg_markers=markers[abs(markers$avg_log2FC) >1,]
dim(cg_markers)
DoHeatmap(subset(sce, downsample = 15),
rownames(cg_markers) ,size=3 )
9 查看任意文献的单细胞亚群标记基因
可以直接使用DotPlot函数以文献中标记基因作图
#若想查看下述marker_genes在T细胞中的表达,首先需要将T细胞相关亚群提取出来
sce=pbmc
table(Idents(sce))
#提取单细胞相关亚群
t_sce = sce[, Idents(sce) %in% c("Naive CD4 T" , "Memory CD4 T" , "CD8 T","NK")]
# 然后进行标准的降维聚类分群,代码不要变动
sce=t_sce
sce <- NormalizeData(sce, normalization.method = "LogNormalize",
scale.factor = 1e4)
sce <- FindVariableFeatures(sce, selection.method = 'vst',
nfeatures = 2000)
sce <- ScaleData(sce, vars.to.regress = "percent.mt")
sce <- RunPCA(sce, features = VariableFeatures(object = sce))
sce <- FindNeighbors(sce, dims = 1:10)
sce <- FindClusters(sce, resolution = 1 )
head(Idents(sce), 5)
table(sce$seurat_clusters)
sce <- RunUMAP(sce, dims = 1:10)
DimPlot(sce, reduction = 'umap')
查看文献中提到的基因
marker_genes=c("LEF1","TCF7","SELL","IL7R","CD40LG","ANXA1","FOS","JUN","FOXP3","SAT1","IL2RA","CTLA4","PDCD1","CXCL13","CD200","TNFRSF18","CCR7","NELL2","CD55","KLF2","TOB1","ZNF683","CCL5","GZMK","EOMES","ITM2C","CX3CR1","GNLY","GZMH","GZMB","LAG3","CCL4L2","FCGR3A","FGFBP2","TYROBP","AREG","XCL1","KLRC1","TRDV2","TRGV9","MTRNR2L8","KLRD1","TRDV1","KLRC3","CTSW","CD7","MKI67","STMN1","TUBA1B","HIST1H4C" )
DotPlot(sce, features = marker_genes,
assay='RNA' ) + coord_flip() +
theme(axis.text.x = element_text(angle = 90))
10 多个单细胞亚群各自标记基因联合进行GO及kegg数据库注释
#加载需要的包
library(Seurat)
library(gplots)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
#加载上述cg_sce.markers
pro='all'
load('sce.markers.all_10_celltype.Rdata')
table(sce.markers$cluster)
#基因ID转换bitr(geneID, fromType, toType, OrgDb, drop = TRUE)
ids=bitr(sce.markers$gene,'SYMBOL','ENTREZID','org.Hs.eg.db')
#合并数据
sce.markers=merge(sce.markers,ids,by.x='gene',by.y='SYMBOL')
#拆分id和分群,将ENTREZID拆分成cluster定义的组
gcSample=split(sce.markers$ENTREZID, sce.markers$cluster)
#比较亚群功能
xx <- compareCluster(gcSample, fun="enrichKEGG",organism="hsa", pvalueCutoff=0.05)
p=dotplot(xx)
p+ theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5))
p
对相关基因进行富集分析
#将上升下降的差异基因挑选出来
deg=FindMarkers(object = sce, ident.1 = 'FCGR3A+ Mono',ident.2 = 'CD14+ Mono', #logfc.threshold = 0,min.pct = 0.25)
gene_up=rownames(deg[deg$avg_log2FC>0,])
gene_down=rownames(deg[deg$avg_log2FC < 0,])
library(org.Hs.eg.db)
#把SYMBOL改为ENTREZID,这里会损失一部分无法匹配到的
gene_up=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db, keys = gene_up,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
gene_down=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,keys = gene_down,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
library(clusterProfiler)
这里用到了生信技能树-健明老师的一键分析代码,感谢健明老师的无私分享
## KEGG pathway analysis
### 做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
run_kegg <- function(gene_up,gene_down,pro='test'){
library(ggplot2)
gene_up=unique(gene_up)
gene_down=unique(gene_down)
gene_diff=unique(c(gene_up,gene_down))
### over-representation test
# 下面把3个基因集分开做超几何分布检验
# 首先是上调基因集。
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
#universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.up)[,1:6]
kk=kk.up
dotplot(kk)
kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv(kk@result,paste0(pro,'_kk.up.csv'))
# 首先是下调基因集。
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
#universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.down)[,1:6]
kk=kk.down
dotplot(kk)
kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv(kk@result,paste0(pro,'_kk.down.csv'))
# 最后是上下调合并后的基因集。
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
kk=kk.diff
dotplot(kk)
kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv(kk@result,paste0(pro,'_kk.diff.csv'))
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
#画图设置, 这个图很丑,大家可以自行修改。
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = paste0(pro,'_kegg_up_down.png') )
if(F){
### GSEA
## GSEA算法跟上面的使用差异基因集做超几何分布检验不一样。
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 20,
pvalueCutoff = 0.9,
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
gseaplot(kk_gse, 'hsa04110',title = 'Cell cycle')
kk=DOSE::setReadable(kk_gse, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
tmp=kk@result
write.csv(kk@result,paste0(pro,'_kegg.gsea.csv'))
# 这里找不到显著下调的通路,可以选择调整阈值,或者其它。
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = paste0(pro,'_kegg_gsea.png'))
#
}
}
### GO database analysis
### 做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
run_go <- function(gene_up,gene_down,pro='test'){
gene_up=unique(gene_up)
gene_down=unique(gene_down)
gene_diff=unique(c(gene_up,gene_down))
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
# 因为go数据库非常多基因集,所以运行速度会很慢。
if(T){
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = gene,
#universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
return(ego)
})
})
save(go_enrich_results,file =paste0(pro, '_go_enrich_results.Rdata'))
}
# 只有第一次需要运行,就保存结果哈,下次需要探索结果,就载入即可。
load(file=paste0(pro, '_go_enrich_results.Rdata'))
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0(pro, '_dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
print( dotplot(go_enrich_results[[i]][[j]] ))
dev.off()
}
}
}
kegg_plot <- function(up_kegg,down_kegg){
dat=rbind(up_kegg,down_kegg)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) +
geom_bar(stat="identity") +
scale_fill_gradient(low="blue",high="red",guide = FALSE) +
scale_x_discrete(name ="Pathway names") +
scale_y_continuous(name ="log10P-value") +
coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Pathway Enrichment")
}
运行以上代码后,就可以使用run_kegg,run_go和kegg_plot函数了
#直接利用脚本中的代码对正、负相关基因进行kegg富集分析
run_kegg(gene_up,gene_down,pro='test')
# 下面的 run_go 会比较慢,根据需要
# run_go(gene_up,gene_down,pro='shRNA')
#对正相关基因进行富集分析
go <- enrichGO(gene_up, OrgDb = "org.Hs.eg.db", ont="all")
library(ggplot2)
library(stringr)
p1<-barplot(go, split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free")
go=DOSE::setReadable(go, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv( go@result,file = 'gene_up_GO_all_barplot.csv')
#对负相关基因进行富集分析
go <- enrichGO(gene_down, OrgDb = "org.Hs.eg.db", ont="all")
p2<-barplot(go, split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free")
go=DOSE::setReadable(go, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv( go@result,file = 'gene_down_GO_all_barplot.csv')
p1+p2
参考来源
公众号:生信技能树
#section 3已更新#「生信技能树」单细胞公开课2021_哔哩哔哩_bilibili
致谢
I thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.
THE END