Day4-单细胞数据降维聚类分群
2022-04-18 本文已影响0人
Sun506
昨天cellranger完后在outs文件夹里面就有这三个文件了,随后就可以在R里面读取这个单细胞数据了。
![](https://img.haomeiwen.com/i26913743/32f233eb0cc7e9e0.png)
1.加载R包及读取数据
library(Seurat)
#新建文件夹
dir.create('1.QC')
dir.create('2.Cluster')
#读入第1个数据
ZsGreen1Ldlr_KO_0weekWD <- Read10X("/mnt/SSS/database/GSE155513RAW/ZsGreenNega_Ldlr_KO_0_week_WD/outs/filtered_feature_bc_matrix")
dim(ZsGreen1Ldlr_KO_0weekWD)
ZsLdlr0 <- CreateSeuratObject(ZsGreen1Ldlr_KO_0weekWD,min.cells=3,min.features = 100,project="GSE155513_LDLR")
ZsLdlr0[['group']]<- 'ZsN_Ldlr0w'
ZsLdlr0[['source']]<- 'Ldlr0w'
ZsLdlr0[['ZsGreen']]<- 'nonSMC'
ZsLdlr0[['dataset']]<- 'GSE155513'
ZsLdlr0[['tissue']]<- 'AO'
ZsLdlr0[['condition']]<- 'Ldlr_AS'
dim(ZsLdlr0)#[1] 16171 3811
rm(ZsGreen1Ldlr_KO_0weekWD)
cellranger完一共有14个数据,8个LDLR,6个APOE,这边只读取一个示范
2.去除双细胞
library(DoubletFinder)
library(tidyverse)
library(patchwork)
############ZsLdlr0
## Pre-process Seurat object (standard)
ZsLdlr0 <- NormalizeData(ZsLdlr0)
ZsLdlr0 <- FindVariableFeatures(ZsLdlr0, selection.method = "vst", nfeatures = 2000)
ZsLdlr0 <- ScaleData(ZsLdlr0)
ZsLdlr0 <- RunPCA(ZsLdlr0)
ElbowPlot(ZsLdlr0)
pc.num=1:30
ZsLdlr0 <- RunUMAP(ZsLdlr0, dims=pc.num)
ZsLdlr0 <- FindNeighbors(ZsLdlr0, dims = pc.num) %>% FindClusters(resolution = 0.5)
## pK Identification (no ground-truth)
sweep.res.list <- paramSweep_v3(ZsLdlr0, PCs = pc.num, sct = FALSE)#使用SCT标准化方法,如果使用log标准化,把所有函数中的sct参数设置为 sct = F 即可。
sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
bcmvn <- find.pK(sweep.stats)
pK_bcmvn <- bcmvn$pK[which.max(bcmvn$BCmetric)] %>% as.character() %>% as.numeric()
## 排除不能检出的同源doublets,优化期望的doublets数量
number<-dim(ZsLdlr0)[2]
if(number<800){DoubletRate = 0.004}
if(number>=800 & number<1600){DoubletRate = 0.008}
if(number>=1600 & number<3200){DoubletRate = 0.016}
if(number>=3200 & number<4800){DoubletRate = 0.023}
if(number>=4800 & number<6400){DoubletRate = 0.031}
if(number>=6400 & number<8000){DoubletRate = 0.039}
if(number>=8000 & number<9600){DoubletRate = 0.046}
if(number>=9600 & number<11200){DoubletRate = 0.054}
if(number>=11200 & number<12800){DoubletRate = 0.061}
if(number>=12800 & number<14400){DoubletRate = 0.069}
#提供预估的doublets比例:根据10x官方doublets估计比例,针对自己细胞数进行估计:
# cell loaded 800---cell recovered 500---doublets rate 0.4%
# cell loaded 1600---cell recovered 1000---doublets rate 0.8%
# cell loaded 3200---cell recovered 2000---doublets rate 1.6%
# cell loaded 4800---cell recovered 3000---doublets rate 2.3%
# cell loaded 6400---cell recovered 4000---doublets rate 3.1%
# cell loaded 8000---cell recovered 5000---doublets rate 3.9%
# cell loaded 9600---cell recovered 6000---doublets rate 4.6%
# cell loaded 11200---cell recovered 7000---doublets rate 5.4%
# cell loaded 12800---cell recovered 8000---doublets rate 6.1%
# cell loaded 14400---cell recovered 9000---doublets rate 6.9%
# cell loaded 16000---cell recovered 10000---doublets rate 7.6%
homotypic.prop <- modelHomotypic(ZsLdlr0$seurat_clusters) # 最好提供celltype
nExp_poi <- round(DoubletRate*ncol(ZsLdlr0))
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
## Run DoubletFinder with varying classification stringencies
ZsLdlr0 <- doubletFinder_v3(ZsLdlr0, PCs = 1:30, pN = 0.25, pK = pK_bcmvn, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
## 结果展示,分类结果在ZsLdlr0@meta.data中
DF<-colnames(ZsLdlr0@meta.data)[grep("DF.classifications",colnames(ZsLdlr0@meta.data))]
DimPlot(ZsLdlr0, reduction = "umap", group.by = DF)
#group名字按照metadata里面自动创建的改掉,是计算出来的最优参数
##提取单细胞去除双细胞
colnames(ZsLdlr0@meta.data)[grep("DF.classifications",colnames(ZsLdlr0@meta.data))]<-'DF'
ZsLdlr0 <- subset(ZsLdlr0,subset = DF == 'Singlet')
saveRDS(ZsLdlr0,'ZsLdlr0_singlet_raw.rds')
3.Merge LDLR数据
上面只示范了一个数据,需要将所有数据都读取进来后merge
##使用merge函数合并seurat对象
GSE155513LDLR_raw<- merge(ZsLdlr0,
y = c(ZsPLdlr0,ZsLdlr8,ZsPLdlr8,ZsLdlr16,ZsPLdlr16,ZsLdlr26,ZsPLdlr26),
add.cell.ids = c("ZsLdlr0","ZsPLdlr0","ZsLdlr8","ZsPLdlr8","ZsLdlr16","ZsPLdlr16","ZsLdlr26","ZsPLdlr26"),
project = "GSE155513_LDLR")
table(GSE155513LDLR_raw$group)
## ZsN_Ldlr0w ZsN_Ldlr16w ZsN_Ldlr26w ZsN_Ldlr8w ZsP_Ldlr0w ZsP_Ldlr16w ZsP_Ldlr26w ZsP_Ldlr8w
## 3723 2963 3706 1799 3315 4675 6373 2550
4.QC
#mitochondrial QC metrics
GSE155513LDLR_raw[["percent.mt"]] <- PercentageFeatureSet(GSE155513LDLR_raw, pattern = "^mt-")
#计算红细胞比例
HB1 <- grep(pattern = "^Hba-" , rownames(GSE155513LDLR_raw), value = T)#红细胞
HB2 <- grep(pattern = "^Hbb-" , rownames(GSE155513LDLR_raw), value = T)
Hb.genes <- c(HB1,HB2)
GSE155513LDLR_raw[["percent.Hb"]] <- PercentageFeatureSet(GSE155513LDLR_raw, features = Hb.genes)
# Visualize QC metrics as a violin plot
Idents(GSE155513LDLR_raw)<-GSE155513LDLR_raw[['group']]
pdf('1.QC/LDLR-QC_metrics.pdf',width=10,height = 5)
VlnPlot(GSE155513LDLR_raw, features = c("nFeature_RNA", "nCount_RNA", "percent.mt",'percent.Hb'), ncol = 3,group.by = "group",pt.size = 0.01)
dev.off()
![](https://img.haomeiwen.com/i26913743/dfde0b64bd423cbf.png)
# feature-feature relationships
pdf('1.QC/LDLRfeature-feature relationships.pdf',width=10,height = 5)
plot1 <- FeatureScatter(GSE155513LDLR_raw, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(GSE155513LDLR_raw, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 <- FeatureScatter(GSE155513LDLR_raw, feature1 = "nCount_RNA", feature2 = "percent.Hb")
pearplot<-CombinePlots(plots = list(plot1, plot2,plot3))
dev.off()
![](https://img.haomeiwen.com/i26913743/d7edf0b615f9bf3e.png)
5.过滤
GSE155513LDLR <- subset(GSE155513LDLR_raw, subset =
nFeature_RNA > 200 &
nCount_RNA >1000 &
nCount_RNA <20000 &
percent.mt < 10 &
percent.Hb< 0.3)
table(GSE155513LDLR$group)
## ZsN_Ldlr0w ZsN_Ldlr16w ZsN_Ldlr26w ZsN_Ldlr8w ZsP_Ldlr0w ZsP_Ldlr16w ZsP_Ldlr26w ZsP_Ldlr8w
## 3368 2617 3090 1646 3176 4128 5788 2467
6.CCA去除批次效应
############ Integrate data#############
GSE155513LDLR.list <- SplitObject(GSE155513LDLR, split.by = 'group')
DefaultAssay(GSE155513LDLR)<-"RNA"
#To change the variable features, please set manually with VariableFeatures
for (i in 1:length(GSE155513LDLR.list)) {
GSE155513LDLR.list[[i]] <- NormalizeData(GSE155513LDLR.list[[i]], verbose = FALSE)
GSE155513LDLR.list[[i]] <- FindVariableFeatures(GSE155513LDLR.list[[i]], selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
}
dims=1:30
GSE155513LDLR.anchors <- FindIntegrationAnchors(object.list = GSE155513LDLR.list, dims = dims)
GSE155513LDLR <- IntegrateData(anchorset = GSE155513LDLR.anchors, dims = dims)
7.降维聚类
GSE155513LDLR <- ScaleData(GSE155513LDLR, verbose = FALSE)
GSE155513LDLR <- RunPCA(GSE155513LDLR, npcs = 30, verbose = FALSE)
GSE155513LDLR <- FindNeighbors(GSE155513LDLR, reduction = "pca", dims = dims)
GSE155513LDLR <- FindClusters(GSE155513LDLR, resolution = 0.5)#22
#新加一列,Cluster从1开始计数
new.cluster.ids <- c(1:length(levels(GSE155513LDLR)))
names(x = new.cluster.ids) <- levels(x = GSE155513LDLR)
GSE155513LDLR <- RenameIdents(object = GSE155513LDLR, new.cluster.ids)
GSE155513LDLR[["Clusters"]]<-Idents(GSE155513LDLR)
dim(GSE155513LDLR)#22362
GSE155513LDLR <- RunUMAP(GSE155513LDLR, dims =dims,perplexity=30)
DimPlot(object = GSE155513LDLR , reduction = "umap", label = T,label.size = 6,repel = T,pt.size = 1) + NoLegend()+NoAxes()
umap,19群
![](https://img.haomeiwen.com/i26913743/666593169cbdcfc8.png)
8.FindMarkers
DefaultAssay(GSE155513LDLR)<-"RNA"
GSE155513LDLR.markers <- FindAllMarkers(GSE155513LDLR, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
GSE155513LDLR.markers <- GSE155513LDLR.markers
GSE155513LDLR.markers$nLog_padj <- -log10(GSE155513LDLR.markers$p_val_adj)
GSE155513LDLR.markers$ratio <- GSE155513LDLR.markers$pct.1/GSE155513LDLR.markers$pct.2
out <- cbind(gene=GSE155513LDLR.markers$gene,GSE155513LDLR.markers)
write.table(out,'2.Cluster/GSE155513LDLR_marker_res0.5.xls',sep = '\t',quote = F,row.names = F)
write.csv(GSE155513LDLR.markers,"2.Cluster/GSE155513LDLR_markers_res0.5.csv")
top10markers <- GSE155513LDLR.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DotPlot(GSE155513LDLR,features =rev(unique(top10markers$gene)),dot.scale = 4)+RotatedAxis()+
theme(axis.text.x = element_text(face="italic",angle = 90,vjust = 0.5),axis.title = element_blank())+
theme(panel.border = element_rect(colour = "black", fill=NA))+
scale_color_viridis(option = "D")#20X5.1
top10ratio <- GSE155513LDLR.markers %>% group_by(cluster) %>% top_n(n = 10, wt = ratio)
DotPlot(GSE155513LDLR,features =rev(unique(top10ratio$gene)),dot.scale = 4)+RotatedAxis()+
theme(axis.text.x = element_text(face="italic",angle = 90,vjust = 0.5),axis.title = element_blank())+
theme(panel.border = element_rect(colour = "black", fill=NA))+
scale_color_viridis(option = "D")#20X5.1
saveRDS(GSE155513LDLR,'GSE155513LDLR.rds')
这边显示top5 markers
![](https://img.haomeiwen.com/i26913743/e51f980875eb83d0.png)
这边显示top5 ratio
![](https://img.haomeiwen.com/i26913743/e0860458d3caeb1a.png)
根据你的marker就可以鉴定细胞类型了。
今天到这里就结束了,明天继续。