Seurat之整合分析(2)---跨物种
上面那个帖子介绍了FindIntegrationAnchors和IntegrateData,说也适合整合两个datasets只有部分重叠。那么,从这个角度出发,也适合整合两个跨物种的数据集,利用重叠的homolog基因来实现重叠的部分,所以我们拿arabidopsis和rice的单细胞数据来试试看(随便从paper里面选了一组拟南芥根和水稻根的数据作为测试而已)。
注:我是用cellranger获得了比对矩阵文件。然后根据arabidopsis和rice的ortholog基因,把rice ortholog的基因ID换成了arabidopsis,方便后面构造arabidopsis和rice的可比文件。
library(Seurat)
library(SeuratData)
library(patchwork)
library(harmony)
#library(rliger)
library(RColorBrewer)
library(tidyverse)
library(reshape2)
library(ggsci)
library(ggstatsplot)
#常规的构建seurat对象
arabidopsis.data <- Read10X(data.dir = "arabidopsis/filtered_feature_bc_matrix/")
rice.data <- Read10X(data.dir = "rice/filtered_feature_bc_matrix/")
arabidopsis <- CreateSeuratObject(counts = arabidopsis.data, project = "arabidopsis", min.cells = 3, min.features = 200)
rice <- CreateSeuratObject(counts = rice.data, project = "rice", min.cells = 3, min.features = 200)
#做了一些简单的过滤
arabidopsis[["percent.mg"]] <- PercentageFeatureSet(arabidopsis, pattern ="^ATMG")
arabidopsis[["percent.cg"]] <- PercentageFeatureSet(arabidopsis, pattern ="^ATCG")
arabidopsis <- subset(arabidopsis, subset = nFeature_RNA >500& nFeature_RNA <5000& percent.mg <1 & percent.cg <3)
rice<- subset(rice, subset = nFeature_RNA >500& nFeature_RNA <7000)
#获得拟南芥和水稻共有基因的
common_genes <- rownames(arabidopsis)[rownames(arabidopsis) %in% rownames(rice)]
> length(common_genes)
[1] 9042
#提取共有的部分进行对比
arabidopsis <- arabidopsis[rownames(arabidopsis) %in% common_genes,]
rice <- rice[rownames(rice) %in% common_genes,]
#下面的和上面一个帖子一样了
wb_list <- list()
wb_list[["arabidopsis"]] <- arabidopsis
wb_list[["rice"]] <- rice
wb_list <- lapply(X = wb_list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
wb_anchors <- FindIntegrationAnchors(object.list = wb_list, dims = 1:30)
wb_seurat <- IntegrateData(anchorset = wb_anchors, dims = 1:30)
DefaultAssay(wb_seurat) <- "RNA"
wb_seurat <- NormalizeData(wb_seurat, verbose = F)
wb_seurat <- FindVariableFeatures(wb_seurat, selection.method = "vst", nfeatures = 2000, verbose = F)
wb_seurat <- ScaleData(wb_seurat, verbose = F)
wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F)
p1<-DimPlot(wb_seurat,reduction = "umap") + scale_color_npg()+
plot_annotation(title = "arabidopsis and rice, before integration")
DefaultAssay(wb_seurat) <- "integrated"
wb_seurat <- ScaleData(wb_seurat, verbose = F)
wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F)
p2<-DimPlot(wb_seurat, reduction = "umap") +
scale_color_npg()+
plot_annotation(title = "arabidopsis and rice, after integration")
p1|p2
#我们可以明显看出合并之前和合并之后的差异。但是我个人觉得貌似合并的效果也没有那么好,不确定是真实的细胞类型的差异,还是算法带来的误差差异。
#下面就是常规的聚类和可视化
wb_seurat <- FindNeighbors(wb_seurat, dims = 1:30, k.param = 10, verbose = F)
wb_seurat <- FindClusters(wb_seurat, verbose = F)
ncluster <- length(unique(wb_seurat[[]]$seurat_clusters))
mycol <- colorRampPalette(brewer.pal(8, "Set2"))(ncluster)
DimPlot(wb_seurat,label = T, reduction = "umap",cols = mycol, repel = T) +NoLegend()
#我们也可以查看每个cluster下面,不同物种之间的差异。
count_table <- table(wb_seurat@meta.data$seurat_clusters, wb_seurat@meta.data$orig.ident)
count_table
#### 可视化
count_table %>%
as.data.frame() %>%
ggbarstats(x = Var2, y = Var1,counts = Freq)+
scale_fill_npg()
后面,我们也可以根据marker基因来注释每个cluster,同时查看不同细胞类型在物种间的差异,例如上面一个帖子讲的那样。