生信单细胞测序

OSCA单细胞数据分析笔记-13、Multi-sample co

2021-07-06  本文已影响0人  小贝学生信

对应原版教程第14章http://bioconductor.org/books/release/OSCA/overview.html
单细胞的多组对照设计(例如正常组与给药组)可以为细胞类型水平比较提供以往Bulk RNA-seq分析所不能达到的精度。对此一般有两种进阶分析思路:
(1)DE(Differential expression)--两组样本的同一细胞类型的基因表达差异分析;(2)DA(Differential abundance)--两组样本的同一细胞类型的丰度差异分析

源网侵删

笔记要点

1、示例数据解读
2、DE分析
3、DA分析
4、最后关于DE&DA的思考

1、示例数据解读

#已完成质控、聚类、批次矫正、细胞类型注释
merged

table(merged$tomato,merged$pool)      
#           3    4    5
#  FALSE 1047 3097 6829
#  TRUE  2411 3007 4544

library(scater)
#同一聚类图谱中两组样本的细胞数目分布
head(table(colLabels(merged), merged$tomato))
#    FALSE TRUE
# 1   431  322
# 2   541  404
# 3   335  250
# 4  1257  979
# 5   610  456
# 6   504  433

#观察细胞类型注释情况
by.label <- table(colLabels(merged), merged$celltype.mapped)
pheatmap::pheatmap(log2(by.label+1), color=viridis::viridis(101))

2、DE分析

2.1 流程

summed <- aggregateAcrossCells(merged, 
                               id=colData(merged)[,c("celltype.mapped", "sample")])
summed
#15179   192 即6个样本的32种细胞类型的15179个基因表达情况
#以 Mesenchyme 基因为例
label <- "Mesenchyme"  #间质细胞
current <- summed[,label==summed$celltype.mapped]
head(counts(current))
#         [,1] [,2] [,3] [,4] [,5] [,6]
# Xkr4       2    0    0    0    3    0
# Rp1        0    0    1    0    0    0
# Sox17      7    0    3    0   14    9
# Mrpl15  1420  271 1009  379 1578  749
# Gm37988    1    0    0    0    0    0
# Rgs20      3    0    1    1    0    0

colData(current)[,c("sample","tomato","pool","celltype.mapped")]
# DataFrame with 6 rows and 4 columns
# sample    tomato      pool celltype.mapped
# <integer> <logical> <integer>     <character>
# 1         5      TRUE         3      Mesenchyme
# 2         6     FALSE         3      Mesenchyme
# 3         7      TRUE         4      Mesenchyme
# 4         8     FALSE         4      Mesenchyme
# 5         9      TRUE         5      Mesenchyme
# 6        10     FALSE         5      Mesenchyme

# follow edgeR的差异分析流程,构建DGElist对象
library(edgeR)
y <- DGEList(counts(current), samples=colData(current))
y

3 advantages of "pseudo-bulk"
(1) Larger counts are more amenable to standard DE analysis pipelines designed for bulk RNA-seq data.
(2) Collapsing cells into samples reflects the fact that our biological replication occurs at the sample level.
(3) Variance between cells within each sample is masked, provided it does not affect variance across (replicate) samples.
接下面的流程均以6个样本的Mesenchyme间质细胞 Bulk RNA-seq表达矩阵为例进行分析。

discarded <- current$ncells < 10
y <- y[,!discarded]
summary(discarded)   #如下没有样本被过滤掉
#   Mode   FALSE 
# logical       6

然后进一步过滤低表达的基因,其在大部分样本里均低于最低阈值的表达水平。

keep <- filterByExpr(y, group=current$tomato)
y <- y[keep,]
summary(keep)
#   Mode   FALSE    TRUE 
#logical      83    5815
y <- calcNormFactors(y)
y
# tomato 为分组情况
# pool 为批次情况
y$samples[,c("tomato","pool")]
#        tomato pool
#Sample1   TRUE    3
#Sample2  FALSE    3
#Sample3   TRUE    4
#Sample4  FALSE    4
#Sample5   TRUE    5
#Sample6  FALSE    5

design <- model.matrix(~factor(pool) + factor(tomato), y$samples)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design, robust=TRUE)
res <- glmQLFTest(fit, coef=ncol(design))
topTags(res)

DEGs are defined as those with non-zero log-fold changes at a false discovery rate of 5%.

# Removing all pseudo-bulk samples with 'insufficient' cells.
summed.filt <- summed[,summed$ncells >= 10]

library(scran)
de.results <- pseudoBulkDGE(summed.filt, 
    label=summed.filt$celltype.mapped,
    design=~factor(pool) + tomato,
    coef="tomatoTRUE",
    condition=summed.filt$tomato 
)

#查看某一种细胞类型的差异分析结果
cur.results <- de.results[["Allantois"]]
cur.results[order(cur.results$PValue),]

#因为没有对比样本而未能差异分析的失败的细胞类型(之前的过滤步骤)
metadata(de.results)$failed
#[1] "Blood progenitors 1" "Caudal epiblast"     "Caudal neurectoderm" "ExE ectoderm"       
#[5] "Parietal endoderm"   "Stripped" 

2.2 结果解读

is.de <- decideTestsPerLabel(de.results, threshold=0.05)
summarizeTestsPerLabel(is.de)
up.de <- is.de > 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)

如下图,举例来说Mid1基因在91%的细胞类型中均上调差异表达

#同理计算下调的common deg
down.de <- is.de < 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)
# Finding all genes that are not remotely DE in all other labels.
remotely.de <- decideTestsPerLabel(de.results, threshold=0.5)
not.de <- remotely.de==0 | is.na(remotely.de)
not.de.other <- rowMeans(not.de[,colnames(not.de)!="Allantois"])==1

# Intersecting with genes that are DE inthe allantois.
unique.degs <- is.de[,"Allantois"]!=0 & not.de.other
unique.degs <- names(which(unique.degs))

# Inspecting the results.
de.allantois <- de.results$Allantois
de.allantois <- de.allantois[unique.degs,]
de.allantois <- de.allantois[order(de.allantois$PValue),]
de.allantois

另一种思路就是改变差异分析的零假设,直接寻找celltype-specific的基因

de.specific <- pseudoBulkSpecific(summed.filt,
    label=summed.filt$celltype.mapped,
    design=~factor(pool) + tomato,
    coef="tomatoTRUE",
    condition=summed.filt$tomato
)

cur.specific <- de.specific[["Allantois"]]
cur.specific <- cur.specific[order(cur.specific$PValue),]
cur.specific
# sizeFactors(summed.filt) <- NULL
plotExpression(logNormCounts(summed.filt),
    features="Rbp4",
    x="tomato", colour_by="tomato",
    other_fields="celltype.mapped") +
    facet_wrap(~celltype.mapped)

2.3 矫正测序胞外RNA干扰

library(MouseGastrulationData)
sce.tal1 <- Tal1ChimeraData()
library(scuttle)
rownames(sce.tal1) <- uniquifyFeatureNames(
  rowData(sce.tal1)$ENSEMBL, 
  rowData(sce.tal1)$SYMBOL
)
summed.tal1 <- aggregateAcrossCells(sce.tal1, 
                                    ids=DataFrame(sample=sce.tal1$sample,
                                                  label=sce.tal1$celltype.mapped)
)
summed.tal1$block <- summed.tal1$sample %% 2 == 0 # Add blocking factor.
# Subset to our neural crest cells.
summed.neural <- summed.tal1[,summed.tal1$label=="Neural crest"]
# Standard edgeR analysis, as described above.
res.neural <- pseudoBulkDGE(summed.neural, 
                            label=summed.neural$label,
                            design=~factor(block) + tomato,
                            coef="tomatoTRUE",
                            condition=summed.neural$tomato)
summarizeTestsPerLabel(decideTestsPerLabel(res.neural))
# Summary of the direction of log-fold changes.
tab.neural <- res.neural[[1]]
tab.neural <- tab.neural[order(tab.neural$PValue),]
head(tab.neural, 10)
library(DropletUtils)
#定义一个空列表,用于储存每个样本的sample  ambient RNA
ambient <- vector("list", ncol(summed.neural))

# Looping over all raw (unfiltered) count matrices and
# computing the ambient profile based on its low-count barcodes.
# Turning off rounding, as we know this is count data.
for (s in seq_along(ambient)) {
    #获取每一个样本还没有过滤空液滴的最原始表达矩阵
    raw.tal1 <- Tal1ChimeraData(type="raw", samples=s)[[1]]
    #预估每一个样本的ambient情况
    ambient[[s]] <- estimateAmbience(counts(raw.tal1), 
        good.turing=FALSE, round=FALSE)
}

# Cleaning up the output for pretty printing.
#合并所有样本的ambient effect
ambient <- do.call(cbind, ambient)
colnames(ambient) <- seq_len(ncol(ambient))
rownames(ambient) <- uniquifyFeatureNames(
    rowData(raw.tal1)$ENSEMBL, 
    rowData(raw.tal1)$SYMBOL
)
head(ambient)
##          1  2  3  4
## Xkr4     1  0  0  0
## Gm1992   0  0  0  0
## Gm37381  1  0  1  0
## Rp1      0  1  0  1
## Sox17   76 76 31 53
## Gm37323  0  0  0  0

#然后据此预测过滤后的表达矩阵的可能受污染的基因表达比例
max.ambient <- maximumAmbience(counts(summed.neural), 
    ambient, mode="proportion")
head(max.ambient)
##           [,1]   [,2]  [,3] [,4]
## Xkr4       NaN    NaN   NaN  NaN
## Gm1992     NaN    NaN   NaN  NaN
## Gm37381    NaN    NaN   NaN  NaN
## Rp1        NaN    NaN   NaN  NaN
## Sox17   0.1775 0.1833 0.468    1
## Gm37323    NaN    NaN   NaN  NaN

#最后将样本平均污染比例超过10%的基因认为是不合格基因,将其从DEG列表中过滤掉
contamination <- rowMeans(max.ambient, na.rm=TRUE)
non.ambient <- contamination <= 0.1
okay.genes <- names(non.ambient)[which(non.ambient)]
tab.neural2 <- tab.neural[rownames(tab.neural) %in% okay.genes,]
table(Direction=tab.neural2$logFC > 0, Significant=tab.neural2$FDR <= 0.05)
head(tab.neural2, 10)
#根据结果可以看到原本sig DEG的血红蛋白基因已经被过滤掉了。

在上述方法中,得到ambient后,如果知道其中某些基因在样本细胞中一定是不表达的,作为阴性对照参考,可提高预估的精度。
#血红蛋白基因一定是不表达的
is.hbb <- grep("^Hb[ab]-", rownames(summed.neural))
alt.prop <- controlAmbience(counts(summed.neural), ambient,
    features=is.hbb,  mode="proportion")
alt.non.ambient <- rowMeans(alt.prop, na.rm=TRUE) <= 0.1
okay.genes <- names(alt.non.ambient)[which(alt.non.ambient)]
tab.neural4 <- tab.neural[rownames(tab.neural) %in% okay.genes,]
head(tab.neural4)

第二种思路:上述的方法的前提是提供有未过滤空液滴的原始表达矩阵,但对于挖掘公共来源的单细胞表达矩阵一般都是过滤后的,不能够提供可以参考的ambient profile。所以再推测ambient profile是很难的。一种想法是,即假设ambient RNA对所有细胞类型的影响都是相同的,所以specific-common DEG是很值得被怀疑的,但也存在很多问题。具体可参看原教程,这里不多阐述了。

3、DA分析

3.1 思路

细胞类型丰度的差异分析是指对于同一种细胞类型的数目在不同分组中是否有显著的差异。基本流程类似上面的DE pipeline,只是表达矩阵(列为样本细胞类型,行名为基因,值为基因表达水平)变成了细胞丰度矩阵(列为样本,行为细胞类型,值为细胞组成数目),同样采用 edgeR pipeline。

3.2 流程

#计算细胞类型数目总和
abundances <- table(merged$celltype.mapped, merged$sample) 
abundances <- unclass(abundances) 
head(abundances)
##                      
##                        5  6   7   8   9  10
##   Allantois           97 15 139 127 318 259
##   Blood progenitors 1  6  3  16   6   8  17
##   Blood progenitors 2 31  8  28  21  43 114
##   Cardiomyocytes      85 21  79  31 174 211
##   Caudal Mesoderm     10 10   9   3  10  29
##   Caudal epiblast      2  2   0   0  22  45

#构建DEGlist对象
extra.info <- colData(merged)[match(colnames(abundances), merged$sample),]
y.ab <- DGEList(abundances, samples=extra.info)
#y.ab$samples$sizeFactor

#过滤低丰度的细胞类型
keep <- filterByExpr(y.ab, group=y.ab$samples$tomato)
y.ab <- y.ab[keep,]

#差异分析
design <- model.matrix(~factor(pool) + factor(tomato), y.ab$samples)
y.ab <- estimateDisp(y.ab, design, trend="none")
fit.ab <- glmQLFit(y.ab, design, robust=TRUE, abundance.trend=FALSE)
res <- glmQLFTest(fit.ab, coef=ncol(design))
summary(decideTests(res))
##        factor(tomato)TRUE
## Down                    1
## NotSig                 22
## Up                      1
topTags(res)

上述流程中,未执行DE里的calcNormFactoors()这一步骤;仅根据DEGlist计算library size进行统一文库大小。这可能会引起composition effect,即一种细胞类型的“高表达”在同一文库大小情况下,势必会引起其它细胞类型丰度的非正常减小。当然可以使用calcNormFactoors()进行矫正,但这需要假设大部分细胞类型的丰度没有显著差异,这对于DA分析是比较困难的。可以通过设定较高的DEG阈值、不考虑明显高丰度的细胞类型进行一定程度的消减。

4 最后关于DE&DA的思考

上一篇 下一篇

猜你喜欢

热点阅读