Supercell代码实操(二):COVID-19数据的整合分析
前言
在上一期推文:Supercell代码实操(一):5种肿瘤细胞系的整合分析中,Immugent已经初步带大家感受了一下基于Supercell流程的快捷,本期推文小编将会通过一个更大的数据集带大家更加深入的感受基于元细胞分析的重要性。
说到大的单细胞数据集,除了各种大型图谱之外,就数新冠相关研究的数据了。说到新冠研究的相关数据集,小到几千个细胞的数量级,大到上百万的数量级。Immugent本来想为大家展示的是1.4 millions of cells distributed in 284 samples coming from 196 patients的数据。但考虑到有很多小伙伴没有服务器,于是Immugent退而求其次,下面就使用了其中26个样本的数据进行实操。
主要内容
Analysis at the single cell level with Seurat
library(SuperCell)
library(Seurat)
library(SingleCellExperiment)
library(ggplot2)
library(harmony)
library(reshape2)
gene_blacklist <- readRDS("../data/gene_blacklist.rds")
pbmc <- readRDS("../data/pbmc_COVID19_sce/S-S086-2_sce.rds")
sceRawToSeurat<- function(pbmc,nfeatures = 1500) {
pbmc <- CreateSeuratObject(counts = assay(pbmc),meta.data = data.frame(colData(pbmc)))
pbmc <- FindVariableFeatures(pbmc,nfeatures = nfeatures)
VariableFeatures(pbmc) <- VariableFeatures(pbmc)[!VariableFeatures(pbmc) %in% gene_blacklist]
pbmc <- NormalizeData(pbmc)
return(pbmc)
}
pbmc <- sceRawToSeurat(pbmc)
pbmc <- ScaleData(pbmc)
pbmc <- RunPCA(pbmc, npcs = 20)
ElbowPlot(pbmc)
PCAPlot(pbmc,group.by = "majorType")
pbmc <-RunUMAP(pbmc, dims = c(1:20))
UMAPPlot(pbmc, group.by = "majorType")
![](https://img.haomeiwen.com/i21134748/498b7d5586d5fe2b.png)
![](https://img.haomeiwen.com/i21134748/a639104e9d4b74b5.png)
We compute purity of metacells according to their assignment to the majorType annotation.
makeSeuratMC <- function(pbmc,
genes = NULL,
metaFields = c("majorType", "sampleID", "Age", "Sex", "celltype", "PatientID", "SARS.CoV.2", "Outcome", "datasets", "CoVID.19.severity", "Sample.time"),
returnMC = F) {
if(is.null(genes)) {
genes <- VariableFeatures(pbmc)
}
MC <- SCimplify(GetAssayData(pbmc,slot = "data"), # normalized gene expression matrix
n.pc = 20,
k.knn = 5, # number of nearest neighbors to build kNN network
gamma = 20, # graining level
genes.use = genes )# will be the ones used for integration if input is seurat integrated data
MC$purity <- supercell_purity(clusters = pbmc$majorType,
supercell_membership = MC$membership)
for (m in metaFields) {
MC[[m]]<- supercell_assign(clusters = pbmc@meta.data[,m], # single-cell assigment to cell lines (clusters)
supercell_membership = MC$membership, # single-cell assignment to super-cells
method = "absolute")
}
GE <- supercell_GE(as.matrix(GetAssayData(pbmc,slot = "data")),groups = MC$membership)
seuratMC <- supercell_2_Seurat(SC.GE = GE,MC,fields = c(metaFields,"purity"))
res <- seuratMC
if (returnMC) {
res <- list(seuratMC = seuratMC,SC = MC)
}
return(res)
}
supercells <- makeSeuratMC(pbmc,returnMC = T)
seuratMC <- supercells$seuratMC
seuratMC <- RunUMAP(seuratMC,dims = c(1:20))
UMAPPlot(seuratMC,group.by = "majorType")
![](https://img.haomeiwen.com/i21134748/1a91529616dc6c0d.png)
gene<-as.matrix(GetAssayData(seuratMC)["CXCL8",])
correlations<-apply(as.matrix(GetAssayData(seuratMC)),1,function(x){cor(gene,x)})
tf <- read.table("../data/transcription.factor.activity.GO0003700.symbol.list")
correlations <- correlations[names(correlations) %in% tf$V1]
head(sort(abs(correlations),decreasing = T),n = 10)
## KLF10 ZNF467 ZNF385A GAS7 KLF4 SPI1 CREB5 FOS
## 0.7478192 0.7443197 0.7143247 0.7117197 0.7097348 0.7096040 0.7070228 0.7046132
## CEBPD LMO2
## 0.6903713 0.6868367
FeatureScatter(object = pbmc, feature1 = "CXCL8", feature2 = 'KLF10',group.by = "majorType")
FeatureScatter(object = seuratMC, feature1 = "CXCL8", feature2 = 'KLF10',group.by = "majorType")
![](https://img.haomeiwen.com/i21134748/a710252dc68ac106.png)
![](https://img.haomeiwen.com/i21134748/10da622a8f3872b6.png)
supercell_GeneGenePlot(GetAssayData(seuratMC),
gene_x = "CXCL8",gene_y = "FOS",
supercell_size = seuratMC$size,
clusters = seuratMC$majorType)
![](https://img.haomeiwen.com/i21134748/8650752708b97222.png)
files <- list.files("../data/pbmc_COVID19_sce",full.names = T)
# Uncomment if you have less than 20GB of memory
files <- readRDS("../data/lowMemFileList.rds")
# Uncomment if you less than 16GB of memory
files <- files[c(1:6)]
MC.list <- list()
MC.list[["S-S086-2"]] <- seuratMC
for (f in files[which(files != "../data/pbmc_COVID19_sce/S-S086-2_sce.rds")]) {
smp <- readRDS(f)
smp <- sceRawToSeurat(smp)
seuratMC <- makeSeuratMC(smp)
MC.list[[seuratMC$sampleID[1]]] <- seuratMC
}
features <- SelectIntegrationFeatures(MC.list,nfeatures = 1500)
features <- features[!features %in% gene_blacklist]
nSingleCells <- 0
for (i in 1:length(MC.list)) {
MC.list[[i]] <- RenameCells(MC.list[[i]],add.cell.id = unique(MC.list[[i]]$sampleID))
MC.list[[i]] <- ScaleData(MC.list[[i]],features = features)
MC.list[[i]] <- RunPCA(MC.list[[i]] ,features = features,npcs = 20)
nSingleCells <- nSingleCells + sum(MC.list[[i]]$size)
}
nSingleCells ## [1] 51595
reference <- which(c("S-S086-2","S-M064") %in% names(MC.list))
anchors <- FindIntegrationAnchors(object.list = MC.list,
reference = reference,
reduction = "rpca",
anchor.features = features,
dims = 1:20)
integrated <- IntegrateData(anchorset = anchors, dims = 1:20)
DefaultAssay(integrated) = "integrated"
integrated <- ScaleData(integrated, verbose = T)
integrated <- RunPCA(integrated, verbose = FALSE)
integrated <- RunUMAP(integrated, dims = 1:20)
UMAPPlot(integrated,group.by = "sampleID")
UMAPPlot(integrated,group.by = "majorType")
![](https://img.haomeiwen.com/i21134748/c37c611d4757ffe6.png)
![](https://img.haomeiwen.com/i21134748/d6f0508884f86732.png)
majorTypeCounts <- aggregate(integrated$size, by=list(majorType = integrated$majorType,Severity = integrated$CoVID.19.severity,Time = integrated$Sample.time), FUN=sum)
majorTypeCounts$group = paste(majorTypeCounts$Severity,majorTypeCounts$Time,sep = "_")
contingencyTable <- xtabs(x ~ group + majorType,data = majorTypeCounts)
res <- chisq.test(contingencyTable)
Roe <- res$observed/res$expected
melted_Roe <- melt(Roe, na.rm = TRUE)
# Heatmap
ggplot(data = melted_Roe, aes(majorType,group, fill = value))+
geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 1, space = "Lab",
name="Ro/e") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
![](https://img.haomeiwen.com/i21134748/d5a399814774bc57.png)
Idents(integrated) <- "majorType"
markersMega <- FindMarkers(integrated,ident.1 = "Mega", only.pos = T)
markersMega <- markersMega[markersMega$p_val_adj < 0.05, ]
markersMega[order(markersMega$avg_log2FC, decreasing = T), ]
cytokines_Mega_ori_study <- c("PDGFA", "TGFB1", "TNFSF4", "PF4V1", "PF4", "PPBP")
cytokines_Mega_ori_study %in% rownames(markersMega)
Idents(integrated) <- "majorType"
VlnPlot(integrated, features = cytokines_Mega_ori_study, pt.size = 0.1)
Idents(MC.list[["S-S086-2"]]) <- "majorType"
pbmc2 <- pbmc[, pbmc$majorType %in% levels(MC.list[["S-S086-2"]])] #to have the same majorType in MC and single cell object see below
pbmc2$majorType <- factor(pbmc2$majorType,levels = levels(MC.list[["S-S086-2"]]))
Idents(pbmc2) <- "majorType"
VlnPlot(pbmc2, features = cytokines_Mega_ori_study, pt.size = 0.1)
![](https://img.haomeiwen.com/i21134748/1e0bb0aadc537668.png)
![](https://img.haomeiwen.com/i21134748/63243a543e93dca2.png)
About metacell purity
Neu <- colnames(pbmc[,pbmc$majorType == "Neu"])
Neu
supercells$seuratMC$majorType[supercells$SC$membership[Neu]]
MC_majorTypes <- lapply(X = unique(supercells$seuratMC$majorType), FUN = function(x){SC <- SCimplify(GetAssayData(pbmc[,pbmc$majorType == x],slot = "data"),
n.pc = 20,
k.knn = 5, # number of nearest neighbors to build kNN network
gamma = 20 # graining level
);SC$majorType = rep(x,SC$N.SC);return(SC)})
Neu_MC <- SCimplify(GetAssayData(pbmc[,pbmc$majorType == "Neu"],slot = "data"),
n.pc = 3,
k.knn = 4, # number of nearest neighbors to build kNN network
gamma = 1 # graining level
)
Neu_MC$majorType <- rep("Neu",Neu_MC$N.SC)
MC_majorTypes <- c(MC_majorTypes,list(Neu_MC))
MC_majorTypes2 <- supercell_merge(MC_majorTypes, fields = "majorType")
MC_majorTypes_GE <- supercell_GE(as.matrix(GetAssayData(pbmc)[,names(MC_majorTypes2$membership)]),groups = MC_majorTypes2$membership)
MC_majorTypes2$genes.use <- VariableFeatures(pbmc)
seuratPureMC <- supercell_2_Seurat(SC.GE = MC_majorTypes_GE,
SC = MC_majorTypes2,
fields = c("majorType"))
cell.split.condition option of the SCimplify function that is faster but lead to a less good separation of T, B and NK cells at the metacell level…
seuratPureMC <- RunUMAP(seuratPureMC,dims = c(1:20))
UMAPPlot(seuratPureMC,group.by = "majorType")
pureMC <- SCimplify(GetAssayData(pbmc,slot = "data"), # normalized gene expression matrix
n.pc = 20,
k.knn = 5, # number of nearest neighbors to build kNN network
gamma = 20, # graining level
genes.use = VariableFeatures(pbmc),
cell.split.condition = pbmc$majorType)
pureMC[["majorType"]]<- supercell_assign(clusters = pbmc@meta.data[,"majorType"], # single-cell assignment to cell lines (clusters)
supercell_membership = pureMC$membership, # single-cell assignment to super-cells
method = "absolute")
GE <- supercell_GE(as.matrix(GetAssayData(pbmc,slot = "data")),groups = pureMC$membership)
seuratPureMC <- supercell_2_Seurat(SC.GE = GE,
SC = pureMC,
fields = c("majorType"))
seuratPureMC <- RunUMAP(seuratPureMC,dims = c(1:20))
UMAPPlot(seuratPureMC,group.by = "majorType")
seuratPureMC <- ScaleData(seuratPureMC)
seuratPureMC <- RunPCA(seuratPureMC, npcs = 20)
seuratPureMC <- RunUMAP(seuratPureMC,dims = c(1:20))
UMAPPlot(seuratPureMC,group.by = "majorType")
![](https://img.haomeiwen.com/i21134748/cf1f0bb6a3a1008d.png)
![](https://img.haomeiwen.com/i21134748/1705527274d26140.png)
展望
想必跟着今天的代码进行实操的小伙伴就会有更深刻的体会,使用原始数据跑seurat流程和使用supercell整合后的元细胞数据跑seurat流程有天差地别的时间体验。对于supercell整合后的元细胞数据的其它优点,Immugent在前面一篇推文中就已经介绍过了,这里就不再赘述。
此外,我们还需要注意的是并不是所有的单细胞数据分析流程都是适合使用supercell进行整合的,很大一定程度上取决于要解决的科学问题。比如,想通过单细胞数据来找到一群占比很少的稀有细胞,那么就不能使用supercell进行整合,因为这样很可能就会丢失这部分细胞的信息。相反,如果只是想基于单细胞数据分析找到某一种处理或者疾病状态下对主要细胞类型的影响,那使用upercell进行数据整合是非常具有优势的。
好啦,本期分享到这就结束了,我们下期再会~~