拟时序分析

2022-05-16 本文已影响0人 sucycy

单细胞测序可以检测一块组织中每个细胞的转录组情况，但其实一块组织中存在着不同发育状态的细胞，因此通过基因的表达情况我们可以了解一些细胞的发育状况和细胞转化的过程，目前为止做拟时序的软件就有monocle，velocity 等等，在这里我们对目前比较主流的进行测试，后期有发现了好用的软件也会持续进行更新。
测试1：monocle2:Monocle 2然后在自动选择的一组数据质心上构造一棵生成树（DDRTree算法）。然后，该算法将细胞移动到它们最近的树的顶点，更新顶点的位置以适应细胞，学习新的生成树，并迭代地继续这个过程，直到树和细胞的位置已经收敛。在这个过程中，Monocle 2保持了高维空间和低维空间之间的可逆映射，从而既学习了轨迹，又降低了数据的维数。
测试2：monocle3：相比于DDRTree算法，UMAP算法进行轨迹推断可能会更直观，算法上也进行了优化，相比于2，速度和内存消耗都感人了很多
测试3 ：velocity.R:刚转录的mRNA含有内含子与外显子，经过剪切后可形成成熟的mRNA，这个方法就是通过细胞中含有的前体RNA和成熟RNA的含量来预测谱系发生的方向的。
测试4：scvelo：感觉比在R里边操作，顺畅了很多

测试1：monocle2
monocle2安装,BiocManager真的好用，能用BiocManager安装的建议用BiocManager安装

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("monocle")

1 加载包

suppressMessages({
library(reshape2)
library(Seurat)
library(corrplot)
library(grid)
library(cowplot)
library(tidyverse)
library(monocle)
library(ggsci)
library(ggpubr)
})

2 读入Seurat数据，本次用的是上次Seurat分析的数据，但是monocle的计算速度太令人绝望了，所以一直在各种subset，subset完还剩17000+ cells，还是很占用计算资源。

PRO<-readRDS('/test1/PRO1.rds')
Idents(PRO)<-'orig.ident'
PRO<-subset(PRO,idents = 'Bong_marrow1')
Idents(PRO)<-'cluster'
PRO<-subset(PRO,idents = 'Neutrophils',invert = TRUE)

3 将Seurat对象转为monocle对象

PRO_matrix <- as(as.matrix(PRO@assays$RNA@counts), 'sparseMatrix')
pd <- new('AnnotatedDataFrame', data = PRO@meta.data) 
fd <- new('AnnotatedDataFrame', data = data.frame(gene_short_name = row.names(PRO),row.names = row.names(PRO)))
cds <- newCellDataSet(PRO_matrix,
                      phenoData = pd,
                      featureData = fd,
                      lowerDetectionLimit = 0.5,
                      expressionFamily = negbinomial.size())

4 数据均一化，这一步挺慢的

cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

5 过滤基因

cds <- detectGenes(cds, min_expr = 0.1)#在fDate（cds）中添加一列num_cells_expressed
expressed_genes <- row.names(subset(fData(cds),num_cells_expressed >= 10))#过滤掉小于10个细胞中表达的基因
#如果Seurat过滤过，这一步可以不用过滤

6 选择高变基因，这一步比较关键，算法也比较多，不同的算法可能会导致不同的结果，大家可以了解其原理，选择适合自己的算法

##使用seurat选择的高变基因
#express_genes <- VariableFeatures(PRO)
#cds <- setOrderingFilter(cds, express_genes)
#plot_ordering_genes(cds)
##使用clusters差异表达基因
#deg.cluster <- FindAllMarkers(PRO)
#express_genes <- subset(deg.cluster,p_val_adj<0.05)$gene
#cds <- setOrderingFilter(cds, express_genes)
#plot_ordering_genes(cds)
##使用monocle选择的高变基因
disp_table <- dispersionTable(cds)
disp.genes <- subset(disp_table, mean_expression >= 0.1 )
cds <- setOrderingFilter(cds, disp.genes$gene_id)
plot_ordering_genes(cds)
#dpFeature，本来想用这个方法做测试，但是跑了很久没跑出来，就放弃了，使用了monocle选择的高变基因来做
#diff <- differentialGeneTest(cds[expressed_genes,],fullModelFormulaStr="~cluster",cores=1)
#deg <- subset(diff, qval < 0.01) #选出2829个基因
#deg <- deg[order(deg$qval,decreasing=F),]
#write.table(deg,file="train.monocle.DEG.xls",col.names=T,row.names=F,sep="\t",quote=F)
#ordergene <- rownames(deg) 
#cds <- setOrderingFilter(cds, ordergene)
#pdf("train.ordergenes.pdf")
#plot_ordering

image.png

7 降维分析，细胞排序

cds <- reduceDimension(cds, max_components = 2, method = 'DDRTree')
cds <- orderCells(cds)

8差异基因

ordering_genes=names(tail(sort(apply(cds@assayData$exprs,1,mad)),2000)) #用作后边的差异基因分析，也可以用前边得到的disp.genes，seurat得到的差异基因，自己指定基因啥的
diff_test_res <- differentialGeneTest(cds[ordering_genes,],fullModelFormulaStr = "~cluster") #这里可以选择用各种基因集，比如说seurat中发现的高变基因，monocle中的基因集等，这里用到了上一步中基因，很多教程中用到了expreesed gene ，但是这个基因集基因有点多，这一步跑的巨慢
deg <- subset(diff_test_res, qval < 0.01)
deg <- deg[order(deg$qval,decreasing=F),]
##差异基因的结果文件保存
write.table(deg,file="train.monocle.DEG.xls",col.names=T,row.names=F,sep="\t",quote=F)
ordergene <- rownames(deg)
Time_diff <- differentialGeneTest(cds[ordergene,], cores = 1, fullModelFormulaStr = "~sm.ns(Pseudotime)")
#Time_diff <- Time_diff[,c(5,2,3,4,1,6,7)] #调整列的顺序，可以不用跑
write.csv(Time_diff, "Time_diff_all.csv", row.names = F)
Time_genes <- Time_diff %>% pull(gene_short_name) %>% as.character()
p=plot_pseudotime_heatmap(cds[Time_genes,], num_clusters=4, show_rownames=T, return_heatmap=T)
ggsave("Time_heatmapAll.pdf", p, width = 5, height = 10)

9 绘制拟时序分析图

p1<-plot_cell_trajectory(cds, color_by = "cluster")
p2<-plot_cell_trajectory(cds,color_by="Pseudotime", size=1,show_backbone=TRUE) 
p <- p1|p2
ggsave("trajectory_plot.pdf", plot = plot3, width = 16, height = 8)
p3<-plot_cell_trajectory(cds, color_by = "cluster") + facet_wrap("~cluster", nrow = 1)#可以看每个cluster的分布情况
p3
p1<-plot_complex_cell_trajectory(cds, x = 1, y = 2,color_by = "cluster")#+scale_color_manual() +theme(legend.title = element_blank())
p2<-ggplot(pData(cds), aes(Pseudotime, colour = cluster, fill=cluster)) +geom_density(bw=0.5,size=1,alpha = 0.5)+theme_classic2()
p4<-p1|p2#绘制树状图和细胞密度图
ggsave("trajectory_2plot.pdf", plot = p4, width = 16, height = 8)

还可以选择不同
10分支点分析
选择自己感兴趣的细胞亚型和分化路径，然后可以看发生变化的基因和基因的变化趋势

BEAM_res=BEAM(cds[Time_genes,],branch_point = 3)#在这里我们看一下第三个节点的分化基因
BEAM_res <- BEAM_res[order(BEAM_res$qval),] #将得到的基因进行排序
write.csv(BEAM_res, "BEAM_res.csv", row.names = F)# 这一笔跑了好久，为了之后不重复的跑，把它存在表格里，以后直接读取就好。
#plot_genes_branched_heatmap(cds[row.names(subset(BEAM_res,qval < 1e-4)),],branch_point = 3 ，num_clusters = 4, use_gene_short_name = T,show_rownames = T)#有737个基因个gene，太多了，展示在图上不好看，所以下边选了top50的基因进行展示，这一步跑了比较久
BEAM_genes <- top_n(BEAM_res, n = 50, desc(qval)) %>% pull(gene_short_name) %>% as.character()
#在这里可以看不同的基因在不同的分化的时候的表达情况
p<-plot_genes_branched_heatmap(cds[BEAM_genes,],
                            branch_point = 3, #绘制的是哪个分支
                            num_clusters = 3, #分成几个cluster，根据需要调整
                            show_rownames = T,#热图上现实基因名
                            return_heatmap = T)

image.png

这时候还可以看一下基因的变化

keygenes <- head(BEAM_genes ,4)#在这里我们选择top4的gene看一下
cds_subset <- cds[keygenes,]
p1 <- plot_genes_in_pseudotime(cds_subset, color_by = "State")
p2 <- plot_genes_in_pseudotime(cds_subset, color_by = "cluster")
p3 <- plot_genes_in_pseudotime(cds_subset, color_by = "Pseudotime")
P<-p1|p2|p3
P
ggsave("gene_plot.pdf", plot = P, width = 16, height = 8) #width和height可以指定图片的长宽比，是很好用的参数。

image.png

关于monocle2的介绍就先到这里把，这个真的是太耗cpu了

Monocle3
软件安装,有些包如果安装了，就可不必重新安装，如果过程中遇到报错，缺啥补啥，总的来说，没有yum权限，装的痛苦无比。

BiocManager::install(c('BiocGenerics', 'DelayedArray', 'DelayedMatrixStats',
                       'limma', 'S4Vectors', 'SingleCellExperiment',
                       'SummarizedExperiment', 'batchelor', 'Matrix.utils'))
install.packages("devtools")
devtools::install_github('cole-trapnell-lab/leidenbase')
devtools::install_github('cole-trapnell-lab/monocle3')

1加载包

suppressMessages({
library(reshape2)
library(Seurat)
library(corrplot)
library(grid)
library(cowplot)
library(tidyverse)
library(monocle3)
library(ggsci)
library(ggpubr)
})

2 读入Seurat的数据，为了增加分析速度，还是单挑出来一个样本进行分析，monocle3的分析速度真的是比2感人很多.....

PRO<-readRDS('/test1/PRO1.rds')
Idents(PRO)<-'orig.ident'
PRO<-subset(PRO,idents = 'Bong_marrow1')

3 生成monocle格式的数据,并进行均一化，降维分析

data <- GetAssayData(PRO, assay = 'RNA', slot = 'counts')
cell_metadata <- PRO@meta.data
gene_annotation <- data.frame(gene_short_name = rownames(data))
rownames(gene_annotation) <- rownames(data)
cds <- new_cell_data_set(data,cell_metadata = PRO@meta.data,gene_metadata = gene_annotation)
cds <- preprocess_cds(cds, num_dim = 50) #preprocess_cds函数相当于seurat中NormalizeData+ScaleData+RunPCA
#plot_pc_variance_explained(cds)
cds <- reduce_dimension(cds, preprocess_method = "PCA")
cds <- cluster_cells(cds) 
##从seurat导入整合过的umap坐标
cds.embed <- cds@int_colData$reducedDims$UMAP
int.embed <- Embeddings(PRO, reduction = "umap")
int.embed <- int.embed[rownames(cds.embed),]
cds@int_colData$reducedDims$UMAP <- int.embed
#识别轨迹
cds <- learn_graph(cds)
plot_cells(cds, color_cells_by = "celltype")

image.png

4 细胞排序

#手动选择root
#cds <- order_cells(cds) 教程中使用的代码，但是由于juputer中不能出现交互界面，所以分析的时候不能用，没有使用jupyter的小伙伴可以试试是否可以出现交互界面，可以直接在交互界面上进行选择，否则，可能得想点其它的办法。
p<-plot_cells(cds, color_cells_by = "celltype")
p + geom_vline(xintercept = seq(8,9,0.25)) + geom_hline(yintercept = seq(-5,-4,0.25)) #使用坐标定位我们想要的root，定位完选择我们想要的格子，在这里我们选择交叉的右上角
embed <- data.frame(Embeddings(PRO, reduction = "umap"))
embed <- subset(embed, UMAP_1 > 8.75 & UMAP_1 < 9 & UMAP_2 > -4.25 & UMAP_2 < -4)
root.cell <- rownames(embed)
cds <- order_cells(cds, root_cells = root.cell)#可以选择多个root
p2<-plot_cells(cds, color_cells_by = "pseudotime", label_cell_groups = FALSE, label_leaves = FALSE,  label_branch_points = FALSE)
p<-p1|p2
p
ggsave("odercell.pdf", plot = p, width = 10, height = 5)

image.png

5 寻找轨迹分析差异基因

Track_genes <- graph_test(cds, neighbor_graph="principal_graph", cores=2) # 如果出现了Error: 'rBind' is defunct.的报错，运行trace('calculateLW', edit = T, where = asNamespace("monocle3"))然后将出现的脚本中的 Matrix::rBind 改为 rbind
#trace('calculateLW', edit = T, where = asNamespace("monocle3"))
write.csv(Track_genes, "Trackgenes1.csv", row.names = T)
Track_genes_sig <- Track_genes %>% top_n(n=10, morans_I) %>%pull(gene_short_name) %>% as.character()
p <- plot_genes_in_pseudotime(cds[Track_genes_sig,], color_cells_by="celltype", min_expr=0.5, ncol = 2)
p
ggsave("Genes_Jitterplot.pdf", plot = p, width = 8, height = 6)
# plot_cells(cds, genes=Track_genes_sig, show_trajectory_graph=FALSE,label_cell_groups=FALSE,  label_leaves=FALSE) #也可以查看这些基因在featureplot中的表达情况

image.png

6 共表达模块分析

genelist <- pull(Track_genes, gene_short_name) %>% as.character()
gene_module <- find_gene_modules(cds[genelist,], resolution=0.01, cores = 6)#resolution越大，在这里分出来的module越多
#write.csv(gene_module, "Genes_Module.csv", row.names = F)
cell_group <- tibble::tibble(cell=row.names(colData(cds)), cell_group=colData(cds)$celltype)
agg_mat <- aggregate_gene_expression(cds, gene_module, cell_group)
row.names(agg_mat) <- stringr::str_c("Module ", row.names(agg_mat))
p <- pheatmap::pheatmap(agg_mat, scale="column", clustering_method="ward.D2")
ggsave("Genes_Module.pdf", plot = p, width = 8, height = 8)

共表达模块

3安装velocyto
直接在原来的singlecell_test环境下安装

pip install -i https://pypi.tuna.tsinghua.edu.cn/simple velocyto

1 首先我们先加载一下包

suppressMessages({
library(loomR)
library(velocyto.R)
library(Seurat)
library(SeuratWrappers)
library(SeuratDisk)
library(ggplot2)
})

2 接下来我们读入数据，还是使用同一套数据,为了降低运算量，我们就选择一个样本来做

PRO<-readRDS('./PRO1.rds')
Idents(PRO)<-'orig.ident'
PRO<-subset(PRO,idents = 'Bong_marrow1')
ldat <- ReadVelocity(file = "./looms.loom") #读入loom数据

3 整理数据格式

#统一seurat和velocity的名称
colnames(ldat$spliced)<-gsub("x","_1",colnames(ldat$spliced))
colnames(ldat$spliced)<-gsub("looms:","",colnames(ldat$spliced))
colnames(ldat$unspliced)<-colnames(ldat$spliced)
colnames(ldat$ambiguous)<-colnames(ldat$spliced)
## 由于Seurat是选用了筛选后的barcode，而velocyto并没有，在这里我们将细胞统一一下的
ldat$spliced<-ldat$spliced[,rownames(PRO@meta.data)]
ldat$unspliced<-ldat$unspliced[,rownames(PRO@meta.data)]
ldat$ambiguous<-ldat$ambiguous[,rownames(PRO@meta.data)]
emat <- ldat$spliced
#emat <- emat[,colSums(emat)>=1e3] 根据需要看数据需不需过滤
nmat <- ldat$unspliced # disregarding spanning reads, as there are too few of them

4 数据过滤

cluster.label <- PRO@active.ident
emat <- filter.genes.by.cluster.expression(emat,cluster.label,min.max.cluster.average = 0.2)#筛选基因
nmat <- filter.genes.by.cluster.expression(nmat,cluster.label,min.max.cluster.average = 0.05)
length(intersect(rownames(emat),rownames(nmat)))
PROumap <- PRO@reductions$umap@cell.embeddings 
cell.dist <- as.dist(1-armaCor(t(PRO@reductions$umap@cell.embeddings)))#计算细胞距离
fit.quantile <- 0.02
rvel.cd <- gene.relative.velocity.estimates(emat,nmat,deltaT=1,kCells=25,cell.dist=cell.dist,fit.quantile=fit.quantile) #计算RNA速率

5 绘图

Idents(PRO)<-'cluster'
gg<-UMAPPlot(PRO)
gg
ggplot_build(gg)$data
colors <- as.list(ggplot_build(gg)$data[[1]]$colour) 
p1 <- show.velocity.on.embedding.cor(PROumap,rvel.cd,n=1000,scale='sqrt',cell.colors=ac(colors,alpha=0.5),cex=0.8,arrow.scale=2,show.grid.flow=T,min.grid.cell.mass=1.0,grid.n=50,arrow.lwd=1,do.par=F,cell.border.alpha =0.1,USE_OPENMP=1,n.cores=24,main="Cell Velocity") #n可以好好设置一下，细胞数多的时候不宜设的太小
p1

测试4：
使用R语言先输出需要的信息

suppressMessages({
library(loomR)
library(velocyto.R)
library(Seurat)
library(SeuratWrappers)
library(SeuratDisk)
library(ggplot2)
})

读入数据，输出我们需要的数据

PRO<-readRDS('./PRO1.rds')
Idents(PRO)<-'orig.ident'
PRO<-subset(PRO,idents = 'Bong_marrow1')
#提取坐标信息
write.csv(Embeddings(PRO, reduction = "umap"), file = "cell_embeddings.csv")
# 获取每个细胞的barcode
write.csv(Cells(PRO), file = "cellID_obs.csv", row.names = FALSE)
# 提取每个细胞的cluster信息
write.csv(PRO@meta.data[, 'RNA_snn_res.0.8', drop = FALSE], file = "cell_clusters.csv")
# 提取每个细胞的celltype信息
write.csv(PRO@meta.data[, 'cluster', drop = FALSE], file = "cell_celltype.csv")
library(scales)
# 获取celltype的颜色信息
hue_pal()(length(levels(PRO$cluster)))
# 获取cluster的颜色信息
hue_pal()(length(levels(PRO$cluster)))

下面使用python进行分析
1 导入包

import scanpy
import scvelo as scv
import pandas as pd
import numpy as np
import os

2 读入数据

loom_data = scv.read('./looms.loom', cache=False)
# barcode名字去重后缀x，与seurat导出的barcode名称一致
loom_data.obs = loom_data.obs.rename(index = lambda x: x.replace('looms:','').replace('x', '_1'))
# 读取seurat中的meta信息
sample_obs = pd.read_csv('./cellID_obs.csv')
cell_umap= pd.read_csv('./cell_embeddings.csv', header=0, names=["Cell ID", "UMAP_1", "UMAP_2"])
cell_clusters = pd.read_csv('./cell_clusters.csv', header=0, names=["Cell ID", "cluster"])
cell_celltype = pd.read_csv('./cell_celltype.csv', header=0, names=["Cell ID", "celltype"])
# 对细胞文件和RNA剪切速率文件取交集，保留关注的细胞类型
sample_one = loom_data[np.isin(loom_data.obs.index, sample_obs)]
# 构建umap坐标, cluster, celltype信息数据框
sample_one_index = pd.DataFrame(sample_one.obs.index)
sample_one_index = sample_one_index.rename(columns = {'CellID':'Cell ID'})
umap_ordered = sample_one_index.merge(cell_umap, on = 'Cell ID')
celltype_ordered = sample_one_index.merge(cell_celltype, on = "Cell ID")
celltype_ordered.head()
clusters_ordered = sample_one_index.merge(cell_clusters, on = "Cell ID")
clusters_ordered.head()
umap_ordered = umap_ordered.iloc[:,1:]
clusters_ordered = clusters_ordered.iloc[:,1:]
celltype_ordered = celltype_ordered.iloc[:,1:]
sample_one.obsm['X_umap'] = umap_ordered.values
sample_one.uns['clusters'] = clusters_ordered.values
sample_one.obs['celltype'] = celltype_ordered.values
adata = sample_one
# some gene labels are duplicated (Ensembl IDs are still unique!!)
adata.var_names_make_unique()
# save model to file
adata.write('Allcelltype_dynamicModel.h5ad', compression = 'gzip')
# read h5ad file
#adata= scv.read('Allcelltype_dynamicModel.h5ad')

3 进行数据整理

scv.pp.filter_and_normalize(adata,min_shared_counts=30, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)#在基本预处理（基因选择和归一化）之后，我们计算一阶和二阶矩（均值和非中心方差）以进行速度估计：
scv.tl.velocity(adata, mode = "stochastic")#核心步骤，估计转录速率
scv.tl.velocity_graph(adata) #[Getting Started — scVelo 0.2.5.dev6+g1805ab4.d20210829 documentation](https://scvelo.readthedocs.io/getting_started/)没读懂这一步的算法，感兴趣的小伙伴自己去看吧 
ident_colours = ['#F8766D','#E38900','#C49A00','#99A800','#53B400','#00BC56','#00C094','#00BFC4','#00B6EB','#06A4FF','#A58AFF','#DF70F8','#FB61D7','#FF66A8']#Seurat导出来的颜色，不用也没关系，会有自动生成的颜色
scv.pl.velocity_embedding_stream(adata, basis='X_umap',color = "celltype", palette = ident_colours, size = 20,alpha =0.8) #可以进行绘图了
scv.pl.velocity_embedding_grid(adata, basis='umap', color='celltype', save='embedding_grid.pdf', title='', scale=0.25)#另一种展示形式

image.png

scv.pl.proportions(adata, groupby='celltype') #未剪切的mRNA多可能说明这个cluster转录比较活跃

image.png

# 看一下我们关注的基因的表达情况
scv.pl.velocity(adata, var_names=['Camp'], color='celltype')

image.png

拟时序分析

猜你喜欢

热点阅读