跨物种单细胞比较(1)
前面关于MetaNeighbor的帖子,我们学习了如何用MetaNeighbor来比较两个数据集细胞类型注释的一致性。而用Seurat的FindIntegrationAnchors和IntegrateData函数,我们学习了如何跨物种对两个seurat数据集进行整合,今天结合这2个学习,我们尝试能否衡量两个物种细胞类型注释的一致性。(但是方法不一定正确和有效,也没有进行合理性的评价)
数据集我们依旧选取的和前面一样,拿arabidopsis和rice的单细胞数据来试试看(随便从paper里面选了一组拟南芥根和水稻根的数据作为测试而已)。
对于测试数据集的构建,和前面的部分是一样的。
library(Seurat)
library(SeuratData)
library(patchwork)
library(harmony)
#library(rliger)
library(RColorBrewer)
library(tidyverse)
library(reshape2)
library(ggsci)
library(ggstatsplot)
#常规的构建seurat对象
arabidopsis.data <- Read10X(data.dir = "arabidopsis/filtered_feature_bc_matrix/")
rice.data <- Read10X(data.dir = "rice/filtered_feature_bc_matrix/")
arabidopsis <- CreateSeuratObject(counts = arabidopsis.data, project = "arabidopsis", min.cells = 3, min.features = 200)
rice <- CreateSeuratObject(counts = rice.data, project = "rice", min.cells = 3, min.features = 200)
#做了一些简单的过滤
arabidopsis[["percent.mg"]] <- PercentageFeatureSet(arabidopsis, pattern ="^ATMG")
arabidopsis[["percent.cg"]] <- PercentageFeatureSet(arabidopsis, pattern ="^ATCG")
arabidopsis <- subset(arabidopsis, subset = nFeature_RNA >500& nFeature_RNA <5000& percent.mg <1 & percent.cg <3)
rice<- subset(rice, subset = nFeature_RNA >500& nFeature_RNA <7000)
#下面就是常规的seurat的一些分析
arabidopsis<- NormalizeData(arabidopsis, normalization.method ="LogNormalize", scale.factor =10000)
arabidopsis <- FindVariableFeatures(arabidopsis, selection.method ="vst", nfeatures =5000)
all.genes <- rownames(arabidopsis)
arabidopsis <- ScaleData(arabidopsis, features = all.genes)
arabidopsis <- RunPCA(arabidopsis, features = VariableFeatures(object = arabidopsis))
arabidopsis <- FindNeighbors(arabidopsis, dims = 1:30)
arabidopsis <- FindClusters(arabidopsis, resolution = 0.5)
arabidopsis <- RunUMAP(arabidopsis, dims = 1:30)
arabidopsis <- RunTSNE(arabidopsis, dims = 1:30)
rice<- NormalizeData(rice, normalization.method ="LogNormalize", scale.factor =10000)
rice <- FindVariableFeatures(rice, selection.method ="vst", nfeatures =5000)
all.genes <- rownames(rice)
rice <- ScaleData(rice, features = all.genes)
rice <- RunPCA(rice, features = VariableFeatures(object = rice))
rice <- FindNeighbors(rice, dims = 1:30)
rice <- FindClusters(rice, resolution = 0.5)
rice <- RunUMAP(rice, dims = 1:30)
rice <- RunTSNE(rice, dims = 1:30)
#下面用一些常用的marker基因来标记每个cluster,简单做一下,没有深入对比
Phloem_companion_cell<-c("AT3G19550")
Quiescent_center<-c("AT1G69490","AT3G16150","AT3G53040")
Xylem<-c("AT1G68810")
Non_hair_cell<-c("AT1G68560","AT5G66800")
Cortex<-c("AT1G44970","AT3G12700","AT5G44130")
Epidermis<-c("AT2G16230")
Root_endodermis<-c("AT1G61590","AT4G02090","AT5G42180")
Phloem<-c("AT1G79430","AT4G03500")
Protophloem_sieve_element<-c("AT1G14730")
Protoxylem<-c("AT1G66810")
Trichoblast<-c("AT1G48930")
Lateral_root_cap<-c("AT4G36250","AT5G17520")
Stele<-c("AT4G37650")
all <- c(Phloem_companion_cell,Quiescent_center,Xylem,Non_hair_cell,Cortex,Epidermis,Root_endodermis,Phloem,Protophloem_sieve_element,Protoxylem,Trichoblast,Lateral_root_cap,Stele)
DotPlot(arabidopsis, features = all, cols = c("blue", "red"), dot.scale = 8) +RotatedAxis()
arabidopsis<- RenameIdents(arabidopsis, '0' = "Cortex", '1' = "Lateral root cap",
'2' = "Non hair cell",'3' = "Cortex", '4' = "Cortex", '5' = "Non hair cell",
'6' = "Stele", '7' = "Xylem", '8' = "Trichoblast", '9' = "Quiescent center",
'10' = "Cortex", '11' = "Stele", '12' = "Quiescent center", '13' = "Quiescent center",
'14' = "Root endodermis",'15'="Root endodermis",'16'="Trichoblast",'17'="Root endodermis",'18'="Xylem",
'19'="Phloem",'20'="Protophloem sieve element",'21'="Phloem",'22'="Cortex")
DimPlot(arabidopsis, reduction = "tsne", label = TRUE)
list <- c("EXPA8","AT5G50200","OsPAP10c","OsSNDP1","PT2","CSLD1","OsEXPA17","OsbHLH115",
"OsMST1","LSI2","OsADF3","AT1G33260","OsAAP6","LSI1","OsSCR2",
"ERF131","AT1G51340","OsEMSA1","OsCLIP","AT3G02850","CESA7","BC6","OsSWN6",
"OsHB4","OsFTIP1","NRAMP3","AT1G59750","Os03g0279200","AT3G45980","AT2G29570","OsRPA32",
"AT1G20930","CycB1","ND1","AT3G25980","AT4G14330","AT1G63100","OsDLK","AT2G38905")
DotPlot(rice, features = all, cols = c("blue", "red"), dot.scale = 8) +RotatedAxis()
rice<- RenameIdents(rice, '0' = "Exodermis", '1' = "Root endodermis",
'2' = "Meristem",'3' = "Root endodermis", '4' = "Vascular cylinder", '5' = "Vascular cylinder",
'6' = "Epidermis", '7' = "Epidermis", '8' = "Exodermis", '9' = "Vascular cylinder",
'10' = "Exodermis", '11' = "Meristem", '12' = "Root endodermis", '13' = "Vascular cylinder",
'14' = "Vascular cylinder",'15'="Vascular cylinder",'16'="Meristem",'17'="Pericyle",'18'="Vascular cylinder",
'19'="Epidermis",'20'="Exodermis")
DimPlot(rice, reduction = "tsne", label = TRUE)++NoLegend()
arabidopsis$celltype <- Idents(arabidopsis)
rice$celltype <- Idents(rice)
#获得拟南芥和水稻共有基因的
common_genes <- rownames(arabidopsis)[rownames(arabidopsis) %in% rownames(rice)]
> length(common_genes)
[1] 9042
#提取共有的部分进行对比
arabidopsis <- arabidopsis[rownames(arabidopsis) %in% common_genes,]
rice <- rice[rownames(rice) %in% common_genes,]
#转化成SCE格式的对象
arabidopsis.sce <- as.SingleCellExperiment(arabidopsis)
rice.sce <- as.SingleCellExperiment(rice)
下面,利用前面的MetaNeighbor来衡量拟南芥和水稻之间cell type的相似性。
#拟南芥的数据作为训练集,当然也可以利用MetaNeighbor例子1的做法,把拟南芥和水稻作为一个整体。
pretrained_model <- MetaNeighbor::trainModel(var_genes = rownames(arabidopsis.sce),
dat = arabidopsis.sce,
study_id = arabidopsis.sce$orig.ident,
cell_type = arabidopsis.sce$celltype)
write.table(pretrained_model,"pretrained_arabidopsis_celltype.txt")
arabidopsis_celltype <- read.table("pretrained_arabidopsis_celltype.txt",check.names=FALSE)
colnames(colData(rice.sce))
aurocs <- MetaNeighborUS(trained_model=arabidopsis_celltype ,
dat = rice.sce,
study_id = rice.sce$orig.ident,
cell_type = rice.sce$celltype,
fast_version=TRUE)
plotHeatmapPretrained(aurocs)
aurocs里面就是拟南芥和水稻之间每个cell type之间的AUROC值,可以认为一致性,也可以当做相似性(但是,因为我在这个测试数据里面只是大概粗糙的注释了一下,不具备太多实际的生物学意义)。
我们还可以以桑基图的形式展示各个细胞类型在2个物种之间的关系。