一文学会 | 采用Seurat3对10X Genomics示例数
2020-04-07 本文已影响0人
yingyonghui
摘要:从https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz处下载得到2700个PBMC细胞测序数据,采用Seurat(v3.1.0)程序包进行分析。详细介绍见https://satijalab.org/seurat/v3.1/pbmc3k_tutorial.html
分析过程如下:
library(dplyr)
library(Seurat)
library(patchwork)
### 构建Seurat对象
pbmc.data <- Read10X(data.dir = "filtered_gene_bc_matrices/hg19/")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
### 预处理与质量控制
#计算线粒体RNA比例
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
head(pbmc@meta.data, 5)
head(pbmc@meta.data, 5).png
#使用violin plot查看QC指标分布,如图1所示
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
图1 PBMC QC指标分布.png
#查看feature-feature之间关系,如图2所示
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
图2 RNA count与线粒体RNA比例、基因数量间关系.png
#去除表达基因数量少于200或多于2500的细胞,去除线粒体RNA比例高于5%的细胞
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
### 校正表达值
#校正细胞的表达值,将count数量根据表达总量矫正之后,乘以10000,并进行对数转换
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
#以上代码的计算原理如下
pbmc@assays$RNA@data <-log(t(t(pbmc@assays$RNA@counts)*10000/colSums(pbmc@assays$RNA@counts) + 1))
#以上是默认参数,下列代码实现相同功能
pbmc <- NormalizeData(pbmc)
### 鉴定高变异基因
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
#查看基因的平均表达与标准方差之间的关系,如图3所示
plot1 <- VariableFeaturePlot(pbmc)
top10 <- head(VariableFeatures(pbmc), 10)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
图3 基因的平均表达与标准方差.png
### 归一化表达值
#对基因在细胞间的表达量进行线性变换,使平均值为0,方差为1,去除高表达基因的影响
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
#以上代码的计算原理如下
pbmc@assays$RNA@scale.data <- t(scale(t(pbmc@assays$RNA@data)))
#scale是个十分耗时的过程,为缩短计算时间,ScaleData默认仅对高变异基因进行scale;不论是对全部基因或仅对高变异基因进行scale,并不影响下游的PCA和聚类过程
# ScaleData同样可以用来去除其他因素(如细胞周期状态或线粒体RNA比例)带来的影响
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")
### 线性降维
#在scale.data上运行PCA,默认使用的基因为高变异基因
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
#观察PC上重要基因
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
print(pbmc[["pca"]], dims = 1-5, nfeatures = 5).png
#在PC1与PC2上表示基因,如图4所示,这里展示的实际上是细胞进行PCA降维时每个基因的系数,即PCA feature loading,根据PCA算法,有pbmc@reductions$pca@cell.embeddings = t(pbmc@assays$RNA@scale.data[VariableFeatures(pbmc),]) %*% pbmc@reductions$pca@feature.loadings
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
图4 PC1与PC2主要基因降维系数.png
#细胞的PCA降维,如图5所示
DimPlot(pbmc, reduction = "pca")
图5 细胞的PCA降维图.png
#查看每个细胞中单个基因在不同PC维度上的得分,如图6所示
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
图6 细胞中基因在PC1上得分.png
### 确定PCA维度数量
#根据JackStrawPlot来确定下一步降维作用的PC数量,如图7所示。P值越小说明PC越重要,应该纳入下一步分析
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
图7 JackStrawPlot.png
#由图7可以看出,在PC10-PC12之后,PC的P值出现显著降低
#或根据ElbowPlot确定PC数量,如图8所示。标准差较大的PC所解释的数据变异性更大,应当纳入下一步分析
ElbowPlot(pbmc)
图8 ElbowPlot.png
#由图8可看出,在PC9-PC10后出现标准差数据拐点。综上,下游分析中选择了前10个PC
### 细胞聚类
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
### 非线性降维(UMAP/tSNE)
#UMAP降维,如图9所示
pbmc <- RunUMAP(pbmc, dims = 1:10)
DimPlot(pbmc, reduction = "umap")
图9 PBMC细胞UMAP降维.png
#tSNE降维,如图10所示
pbmc <- RunTSNE(pbmc, dims = 1:10)
DimPlot(pbmc, reduction = "tsne")
图10 PBMC细胞tSNE降维.png
### 差异表达分析
# find all markers of cluster 1
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
### 这一步的计算原理中
### logFC = log(mean(expm1(ident.1细胞的data)) + pseudocount.use)- log(mean(expm1(非ident.1细胞的data)) + pseudocount.use)
head(cluster1.markers, n = 5)
head(cluster1.markers, n = 5).png
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
head(cluster5.markers, n = 5).png
# find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
pbmc.markers.png
#使用不同的假设检验方法来进行差异表达分析
cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
#差异基因可视化,如图11-12所示,此外还可以通过RidgePlot, CellScatter, DotPlot等进行展示
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
图11 基因表达violin plot.png
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP","CD8A"))
图12 基因表达UMAP plot.png
#热图显示,如图13所示
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
图13 基因表达热图.png
### 鉴定细胞类型
#根据已知细胞类型的marker gene对各个cluster的细胞进行命名,如图14所示
marker genes.png
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
图14 在UMAP降维图中对细胞类群进行标注.png
### 保存结果
saveRDS(pbmc, file = "../output/pbmc3k_final.rds")