单细胞轨迹推断(Trajectory inference)分析(
练习需要的数据在这里下载:https://github.com/NBISweden/excelerate-scRNAseq/blob/master/session-trajectories/session-trajectories.md
这篇练习的代码完全出自上面的链接,只不过是自己敲一遍感受一下过程。
这个数据是来自Nestorowa, Hamey等人在2016年发表在blood上的文章。dataset包含1600个从小鼠骨髓里分离出的造血干细胞和祖细胞(使用SMART-seq2技术进行测序)。使用流式和index进行筛选的12个不同表型的HSPC(每个大约10个细胞)已经被纳入dataset,并将在该实验室作为识别转录轨迹模型中的根和分支的生物学基础。
数据下载:
$ wget -nH -nd http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_log2_transformed_counts.txt.gz \
http://blood.stemcells.cam.ac.uk/data/nestorowa_corrected_population_annotation.txt.gz
$ wget -nH -nd ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81682/suppl/GSE81682%5FHTSeq%5Fcounts%2Etxt%2Egz
$ gunzip *
$ ls -l
你会得到三个文件:
GSE81682_HTSeq_counts.txt
nestorowa_corrected_log2_transformed_counts.txt
nestorowa_corrected_population_annotation.txt
方法一:Monocle2/DDRtree
加载数据:
> library(monocle)
> library(biomaRt)
#作者提供了一个表达矩阵,是已经被过滤的(高水平表达的基因,高质量的细胞),并且经过scaled和标准化。
#其中一个文件是注释文件,是根据免疫表型标记的细胞类型
> lognorm <- t(read.table('nestorowa_corrected_log2_transformed_counts.txt', sep=" ", header=TRUE))
> anno_table <- read.table('nestorowa_corrected_population_annotation.txt')
#为了使用Monocle2/DDRtree来进行轨迹推断,推荐使用未经标准化的基于UMI的count值,是因为Monocle2内部会scale并且标准化你的Data
#你刚才下载的文件里有一个就是原始count文件
> counts <- read.table('GSE81682_HTSeq_counts.txt', sep="\t", header=TRUE, row.names='ID')
> counts[1:5,1:5]
HSPC_007 HSPC_013 HSPC_019 HSPC_025 HSPC_031
ENSMUSG00000000001 0 7 1 185 2
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 4 1 2 4 3
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 0 0 0 0 0
> dim(counts)
[1] 46175 1920 #基因 细胞
接下来过滤细胞
#需要注意的是,count矩阵是没有经过过滤的,基因的名称是ensembl ID。首先我们根据作者选择的基因来进行矩阵的过滤
#只保留高质量细胞
> counts <- counts[ , colnames(lognorm) ]
> dim(counts)
[1] 46175 1645
下面是先创立一个frame,里面储存细胞的分类
#我们创建一个注释细胞的frame,来标记细胞类型
> pDat <- data.frame(cell=colnames(counts), celltype='undefined', stringsAsFactors=FALSE)
> rownames(pDat) <- pDat$cell
> pDat[rownames(anno_table), 2] <- as.character(anno_table$celltype)
> head(pDat)
cell celltype
HSPC_001 HSPC_001 undefined
HSPC_002 HSPC_002 undefined
HSPC_003 HSPC_003 undefined
HSPC_004 HSPC_004 undefined
HSPC_006 HSPC_006 undefined
HSPC_008 HSPC_008 undefined
再创建一个基因注释的data frame:
#再创建一个基因注释的data frame,将基因的ID与symbls相对应,count矩阵里基因的ID是用biomaRt Bioconductor包来注释的
> mart <- biomaRt::useDataset("mmusculus_gene_ensembl", biomaRt::useMart("ensembl"))
> genes_table <- biomaRt::getBM(attributes=c("ensembl_gene_id", "external_gene_name"), values=rownames(counts), mart=mart)
> rownames(genes_table) <- genes_table$ensembl_gene_id
> head(genes_table)
ensembl_gene_id external_gene_name
ENSMUSG00000064372 ENSMUSG00000064372 mt-Tp
ENSMUSG00000064371 ENSMUSG00000064371 mt-Tt
ENSMUSG00000064370 ENSMUSG00000064370 mt-Cytb
ENSMUSG00000064369 ENSMUSG00000064369 mt-Te
ENSMUSG00000064368 ENSMUSG00000064368 mt-Nd6
ENSMUSG00000064367 ENSMUSG00000064367 mt-Nd5
> fDat <- genes_table[ rownames(counts), ]
> colnames(fDat) <- c('ensembl_gene_id', 'gene_short_name')
> head(fDat)
ensembl_gene_id gene_short_name
ENSMUSG00000000001 ENSMUSG00000000001 Gnai3
ENSMUSG00000000003 ENSMUSG00000000003 Pbsn
ENSMUSG00000000028 ENSMUSG00000000028 Cdc45
ENSMUSG00000000031 ENSMUSG00000000031 H19
ENSMUSG00000000037 ENSMUSG00000000037 Scml2
ENSMUSG00000000049 ENSMUSG00000000049 Apoh
过滤基因:
#现在来过滤count矩阵的基因,标准按照作者的走
> fDat <- fDat[ fDat$gene_short_name %in% rownames(lognorm), ]
> counts <- counts[ rownames(fDat), ]
> dim(counts)
[1] 3804 1645
> dim(fDat)
[1] 3804 2
> dim(pDat)
[1] 1645 2
上面的pDat和fDat是不是很眼熟?是的,是我们接下来要创建的对象的构成要素:
> cds <- newCellDataSet(as.matrix(counts), phenoData=Biobase::AnnotatedDataFrame(pDat), featureData=Biobase::AnnotatedDataFrame(fDat))
> cds
CellDataSet (storageMode: environment)
assayData: 3804 features, 1645 samples
element names: exprs
protocolData: none
phenoData
sampleNames: HSPC_001 HSPC_002 ... Prog_852 (1645 total)
varLabels: cell celltype Size_Factor
varMetadata: labelDescription
featureData
featureNames: ENSMUSG00000000001 ENSMUSG00000000028 ... ENSMUSG00000107235 (3804 total)
fvarLabels: ensembl_gene_id gene_short_name
fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:
保存一下这个对象:
> saveRDS(cds, 'cds_hematopoiesis.rds')
接下来就是标准化、scale:
#Monocle的标准化和scale
> cds <- estimateSizeFactors(cds)
> cds <- estimateDispersions(cds)
Removing 18 outliers
> cds <- detectGenes(cds, min_expr=0.1)
> print(head(fData(cds)))
ensembl_gene_id gene_short_name num_cells_expressed
ENSMUSG00000000001 ENSMUSG00000000001 Gnai3 1613
ENSMUSG00000000028 ENSMUSG00000000028 Cdc45 1438
ENSMUSG00000000056 ENSMUSG00000000056 Narf 1333
ENSMUSG00000000058 ENSMUSG00000000058 Cav2 577
ENSMUSG00000000078 ENSMUSG00000000078 Klf6 1560
ENSMUSG00000000127 ENSMUSG00000000127 Fer 578
找出那些至少在10个细胞里表达的基因:
> expressed_genes <- row.names(subset(fData(cds), num_cells_expressed >= 10))
> length(expressed_genes)
[1] 3785
> print(head(pData(cds)))
cell celltype Size_Factor num_genes_expressed
HSPC_001 HSPC_001 undefined 3.0721956 2034
HSPC_002 HSPC_002 undefined 0.3131489 1778
HSPC_003 HSPC_003 undefined 1.3243296 2195
HSPC_004 HSPC_004 undefined 0.6408191 2084
HSPC_006 HSPC_006 undefined 0.6474894 2171
HSPC_008 HSPC_008 undefined 0.7359116 2021
下面用作者的标准来定义细胞类型。或者你也可以聚类,然后鉴定每一个cluster的marker基因
> diff_test_res <- differentialGeneTest(cds[ expressed_genes, ], fullModelFormulaStr="~ celltype")
> ordering_genes <- row.names(subset(diff_test_res, qval < 0.01))
> length(ordering_genes)
[1] 683
标记将用来排序的基因
> cds <- setOrderingFilter(cds, ordering_genes)
用DDRtree算法推断轨迹分支:
> cds <- reduceDimension(cds, max_components = 2, method='DDRTree')
> cds <- orderCells(cds)
> plot_cell_trajectory(cds, color_by="celltype")
改变细胞颜色:
> cell_colors <- c('lightblue','blue','red','black','orange','yellow','turquoise','lightgrey')
> plot_cell_trajectory(cds, color_by="celltype") + scale_color_manual(values=cell_colors)
绝大多数的非成熟的造血干细胞表达E-Slam,所以用这个标准定义轨迹的“root”:
> table(pData(cds)$State, pData(cds)$celltype)[,"ESLAM"]
1 2 3 4 5
0 0 10 0 0
#这里说明第3个state作为root
在模型里定义root:
> cds <- orderCells(cds, root_state = 3)
定义拟时间(根据与root的距离):
> plot_cell_trajectory(cds, color_by = "Pseudotime")
把每一个branch进行差异基因的检测(根据拟时间模型):
> diff_test_res <- differentialGeneTest(cds[ ordering_genes, ], fullModelFormulaStr = "~sm.ns(Pseudotime)")
> sig_gene_names <- row.names(subset(diff_test_res, qval < 0.1))
> plot_pseudotime_heatmap(cds[ sig_gene_names[1:50], ], num_clusters = 3, cores=4, show_rownames=TRUE)
接下来利用分支表达分析模型(BEAM)来分析红系的通路(分支1所在的地方):
> BEAM_res <- BEAM(cds, branch_point = 1, cores = 4)
> BEAM_res <- BEAM_res[order(BEAM_res$qval),]
> BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")]
> head(BEAM_res)
gene_short_name pval qval
ENSMUSG00000004655 Aqp1 0 0
ENSMUSG00000009350 Mpo 0 0
ENSMUSG00000016494 Cd34 0 0
ENSMUSG00000020125 Elane 0 0
ENSMUSG00000021728 Emb 0 0
ENSMUSG00000021998 Lcp1 0 0
> plot_genes_branched_heatmap(cds[row.names(BEAM_res)[1:50]], branch_point = 1, num_clusters = 3, cores=4, use_gene_short_name=TRUE, show_rownames=TRUE)
看具体的基因在拟时间上的表达情况(分支1):
> plot_genes_branched_pseudotime(cds[row.names(BEAM_res)[1:5]], branch_point = 1, color_by = "celltype", ncol = 1) + scale_color_manual(values=cell_colors)
方法二:Diffusion map
> library(destiny)
> library(ggplot2)
> lognorm <- t(read.table('nestorowa_corrected_log2_transformed_counts.txt', sep=" ", header=TRUE))
> lognorm[1:5,1:5]
HSPC_001 HSPC_002 HSPC_003 HSPC_004 HSPC_006
X1110032F04Rik 0.000000 0.000000 0.000000 0.000000 0.000000
X1110059E24Rik 0.000000 0.000000 2.795189 1.326478 7.348663
X1300017J02Rik 0.000000 0.000000 0.000000 0.000000 0.000000
X1600014C10Rik 0.000000 2.238601 0.000000 1.326478 4.946766
X1700017B05Rik 1.225439 2.238601 1.989360 2.005685 0.000000
# The cell subsets (final differentiation stages) will be used to validate the trajectory model
> anno_table <- read.table('nestorowa_corrected_population_annotation.txt')
> pDat <- data.frame(cell=colnames(lognorm), celltype='undefined', stringsAsFactors=FALSE)
> rownames(pDat) <- pDat$cell
> pDat[ rownames(anno_table), 2] <- as.character(anno_table$celltype)
创建对象:
> eset <- Biobase::ExpressionSet(lognorm, phenoData=Biobase::AnnotatedDataFrame(pDat))
> eset
ExpressionSet (storageMode: lockedEnvironment)
assayData: 3991 features, 1645 samples
element names: exprs
protocolData: none
phenoData
sampleNames: HSPC_001 HSPC_002 ... Prog_852 (1645 total)
varLabels: cell celltype
varMetadata: labelDescription
featureData: none
experimentData: use 'experimentData(object)'
Annotation:
> saveRDS(eset, 'eset_hematopoiesis.rds')
下面进行diffusion map分析:
> dmap <- DiffusionMap(eset)
> plot.DiffusionMap(dmap)
> plot.DiffusionMap(dmap, dims=c(1,2))
> plot.DiffusionMap(dmap, dims=c(2,3))
> plot.DiffusionMap(dmap, dims=c(1,3))
成分1和2可以很清楚的分出红系和白系:
> qplot(DC1, DC2, data=dmap, colour=celltype) + scale_color_manual(values=c('lightblue','brown','red','black','orange','yellow','blue','lightgrey')) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"))
定义root:
> which(anno_table$celltype=="ESLAM")
[1] 19 20 21 22 23 24 25 26 27 28
> dpt <- DPT(dmap, tips=19)
> plot(dpt)
我们还可以把已知的marker基因的表达水平在轨迹上表示出来:
# 基因Procr / 基因Endothelial protein C是HSC亚型的一个marker
> plot(dpt, col_by='Procr', pal=viridis::magma)
# Gata1 是红系的一个关键的TF
> plot(dpt, col_by='Gata1', pal=viridis::magma)
# cathepsin G是中性粒细胞的marker
> plot(dpt, col_by='Ctsg', pal=viridis::magma)