【单细胞转录组实战】十一、复现文章分析结果

2022-09-04 本文已影响0人佳奥

这里是佳奥！

我们进入到作者的GitHub下载一下代码来看看吧。

https://github.com/KPLab/SCS_CAF

1 作者原始代码

当然，由于package的版本日新月异，除非安装相同版本的package，我就不运行了，作为学习。

QQ截图20220903155159.png

2 复现文章分析结果

作者没有使用三大R包，这里我们使用之前讲到的R包来复现文章中的图。

step 1-qc

读入质控文件

主要是读取作者RNA-seq上游分析的一些结果找出离群的那些细胞。

qc1=read.table('qc/SS2_15_0048_qc.txt',header = T)
qc2=read.table('qc/SS2_15_0049_qc.txt',header = T)
qc=rbind(qc1,qc2)
DT::datatable(qc1)
DT::datatable(qc2)
DT::datatable(qc)

粗略的看一看各个细胞的一些统计学指标的分布情况

library(ggplot2)
library(cowplot)
box <- lapply(colnames(qc[,3:12]),function(i) {
    dat <-  qc[,i,drop=F] 
    dat$sample=rownames(dat)
    ## 画boxplot 
   ggplot(dat, aes('all cells', get(i))) +
          geom_boxplot() +
          xlab(NULL)+ylab(i)
})
plot_grid(plotlist=box, ncol=2 )
# ggsave(file="stat_all_cells.pdf")

QQ截图20220903191300.png

批量过滤指标

因为进行了简单探索，对表型数据就有了把握，接下来可以进行一定程度的过滤，因为细节太多，这里为了展现批量处理方式，就不考虑太多。

pa <- colnames(qc[,5:12])
tf <- lapply(pa,function(i) {
 # i=pa[1]
  dat <-  qc[,i]  
  # dat <- abs(log10(dat))
  fivenum(dat)
  (up <- mean(dat)+2*sd(dat))
  (down <- mean(dat)- 2*sd(dat) ) 
  valid <- ifelse(dat > down & dat < up, 1,0 ) 
})

tf <- do.call(cbind,tf)
choosed_cells <- apply(tf,1,function(x) all(x==1))

qc=qc[choosed_cells,]

检查随便QC和作者详细QC的区别

last_qc=read.table('qc/qc_2plates.filtered_cells.txt',header = T)
intersect(substr(rownames(last_qc),1,11),qc$experiment)
[1] "SS2_15_0048" "SS2_15_0049"

可以看到简单粗暴的随意QC和作者用心QC结果类似，作者删除的52个细胞我们随便QC也删除掉了其中的50个。

不过，我们简单粗暴的QC，删除的细胞数量稍微多了一点。

显示运行环境

sessionInfo()

step 2-get matrix

把counts值进行过滤并且储存为R数据

需要读取作者GitHub项目的文件SS2_15_0048里面的counts值矩阵文件，然后进行一系列的过滤手段。

考虑到GitHub文件大小限制，下面的部分代码不运行。

https://github.com/KPLab/SCS_CAF

#ENDOGENOUS GENES
# created by Nikolay Oskolkov, nikolay.oskolkov@scilifelabs.se, 2017
Path_Main<-"/CAF_SCS/"
Path_Main<-"./"
plate1_raw<-read.delim(paste(Path_Main,"/SS2_15_0048/counts.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_raw$gene<-make.unique(as.character(plate1_raw$gene))
colnames(plate1_raw)[2:length(colnames(plate1_raw))]<-paste("SS2_15_0048_",colnames(plate1_raw)[2:length(colnames(plate1_raw))],sep="")
plate2_raw<-read.delim(paste(Path_Main,"/SS2_15_0049/counts.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_raw$gene<-make.unique(as.character(plate2_raw$gene))
colnames(plate2_raw)[2:length(colnames(plate2_raw))]<-paste("SS2_15_0049_",colnames(plate2_raw)[2:length(colnames(plate2_raw))],sep="")
expr_raw<-merge(plate1_raw,plate2_raw,by="gene",all=TRUE)
rownames(expr_raw)<-as.character(expr_raw$gene)
expr_raw$gene<-NULL
sum(expr_raw==0)/(dim(expr_raw)[1]*dim(expr_raw)[2])
#SPIKES
plate1_raw_ercc<-read.delim(paste(Path_Main,"/SS2_15_0048/counts-ercc.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_raw_ercc$gene<-make.unique(as.character(plate1_raw_ercc$gene))
colnames(plate1_raw_ercc)[2:length(colnames(plate1_raw_ercc))]<-paste("SS2_15_0048_",colnames(plate1_raw_ercc)[2:length(colnames(plate1_raw_ercc))],sep="")
plate2_raw_ercc<-read.delim(paste(Path_Main,"/SS2_15_0049/counts-ercc.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_raw_ercc$gene<-make.unique(as.character(plate2_raw_ercc$gene))
colnames(plate2_raw_ercc)[2:length(colnames(plate2_raw_ercc))]<-paste("SS2_15_0049_",colnames(plate2_raw_ercc)[2:length(colnames(plate2_raw_ercc))],sep="")
expr_raw_ercc<-merge(plate1_raw_ercc,plate2_raw_ercc,by="gene",all=TRUE)
rownames(expr_raw_ercc)<-as.character(expr_raw_ercc$gene)
expr_raw_ercc$gene<-NULL
sum(expr_raw_ercc==0)/(dim(expr_raw_ercc)[1]*dim(expr_raw_ercc)[2])
barplot(sort(as.numeric(colSums(expr_raw_ercc)),decreasing=TRUE),ylab="SPIKE LIBRARY SIZE",xlab="CELL INDEX")
hist(as.numeric(colSums(expr_raw_ercc)),col="brown",main="Distribution of Spike Library Sizes",xlab="Spike Library Size",breaks=20)
#MERGING ENDOGENOUS GENES WITH SPIKES
print(paste0("There are ",dim(expr_raw)[1]," endogenous genes"))
print(paste0("There are ",dim(expr_raw_ercc)[1]," spikes"))
all.counts.raw<-rbind(expr_raw,expr_raw_ercc)
sum(all.counts.raw==0)/(dim(all.counts.raw)[1]*dim(all.counts.raw)[2])
dim(all.counts.raw[rowSums(all.counts.raw)==0,])

cell_QC<-read.delim(paste(Path_Main,"/qc/qc_2plates.filtered_cells.txt",sep=""),row.names=1,header=TRUE,sep="\t")
rownames(cell_QC)<-gsub("__","_",rownames(cell_QC))
all.counts.raw<-subset(all.counts.raw,select=colnames(all.counts.raw)[!colnames(all.counts.raw)%in%rownames(cell_QC)])
expr_raw<-subset(expr_raw,select=colnames(expr_raw)[!colnames(expr_raw)%in%rownames(cell_QC)])
expr_raw_ercc<-subset(expr_raw_ercc,select=colnames(expr_raw_ercc)[!colnames(expr_raw_ercc)%in%rownames(cell_QC)])
#FILTER OUT GENES HAVING LESS THAN 1 COUNT IN AVERAGE OVER ALL CELLS
all.counts.raw<-all.counts.raw[rowMeans(all.counts.raw)>0,]
expr_raw<-expr_raw[rowMeans(expr_raw)>=1,]
expr_raw_ercc<-expr_raw_ercc[rowMeans(expr_raw_ercc)>0,]
save(expr_raw,expr_raw_ercc,file = 'expr_raw_counts.Rdata')

QC过后的counts矩阵被保存为R对象保存为文件，所以现在可以检查一下过滤后的矩阵

load(file = 'expr_raw_counts.Rdata')
expr_raw[1:4,1:4]
#PLOT CV^2 vs. MEAN RAW COUNT
library("matrixStats")
mean_expr_raw<-as.numeric(rowMeans(expr_raw,na.rm=TRUE))
sd_expr_raw<-rowSds(as.matrix(expr_raw),na.rm=TRUE)
cv_squared_expr_raw<-(sd_expr_raw/mean_expr_raw)^2
plot(log10(cv_squared_expr_raw)~log10(mean_expr_raw),pch=20,cex=0.5,xlab="log10 ( mean raw count )",ylab="log10 ( CV? )",main="RAW COUNTS")
mean_expr_raw_ercc<-as.numeric(rowMeans(expr_raw_ercc,na.rm=TRUE))
sd_expr_raw_ercc<-rowSds(as.matrix(expr_raw_ercc),na.rm=TRUE)
cv_squared_expr_raw_ercc<-(sd_expr_raw_ercc/mean_expr_raw_ercc)^2
points(log10(cv_squared_expr_raw_ercc)~log10(mean_expr_raw_ercc),col="red",pch=20,cex=1.5)
#FIT SPIKEINS WITH A CURVE
fit_expr_raw_ercc<-loess(log10(cv_squared_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))]~log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))],span=1)
j<-order(log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))])
lines(fit_expr_raw_ercc$fitted[j]~log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))][j],col="red",lwd=3)
pred_expr_raw<-predict(fit_expr_raw_ercc,log10(mean_expr_raw))
#DETERMINE VARIABLE GENES THAT ARE ABOVE THE SPIKEINS CURVE
filtered_expr_raw<-expr_raw[log10(cv_squared_expr_raw)>=pred_expr_raw,]
filtered_expr_raw<-filtered_expr_raw[grepl("NA",rownames(filtered_expr_raw))==FALSE,]
COUNTS=filtered_expr_raw
save(COUNTS,file='last-COUNTS.Rdata')

QQ截图20220903191748.png

对最后的counts矩阵进行详细检查

load(file='last-COUNTS.Rdata')
counts=COUNTS
counts[1:4,1:4]

fivenum(apply(counts,1,function(x) sum(x>0) ))
boxplot(apply(counts,1,function(x) sum(x>0) ))

fivenum(apply(counts,2,function(x) sum(x>0) ))
hist(apply(counts,2,function(x) sum(x>0) ))

QQ截图20220903191936.png

把RPKM值进行过滤并且存储为R数据

需要读取作者GitHub项目的文件SS2_15_0048里面的counts值矩阵文件，然后进行一系列的过滤手段。

plate1_rpkm<-read.delim(paste(Path_Main,"/SS2_15_0048/rpkms.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_rpkm$gene<-make.unique(as.character(plate1_rpkm$gene))
colnames(plate1_rpkm)[2:length(colnames(plate1_rpkm))]<-paste("SS2_15_0048_",colnames(plate1_rpkm)[2:length(colnames(plate1_rpkm))],sep="")
plate2_rpkm<-read.delim(paste(Path_Main,"/SS2_15_0049/rpkms.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_rpkm$gene<-make.unique(as.character(plate2_rpkm$gene))
colnames(plate2_rpkm)[2:length(colnames(plate2_rpkm))]<-paste("SS2_15_0049_",colnames(plate2_rpkm)[2:length(colnames(plate2_rpkm))],sep="")
expr_rpkm<-merge(plate1_rpkm,plate2_rpkm,by="gene",all=TRUE)
rownames(expr_rpkm)<-as.character(expr_rpkm$gene)
expr_rpkm$gene<-NULL
sum(expr_rpkm==0)/(dim(expr_rpkm)[1]*dim(expr_rpkm)[2])
#SPIKES
plate1_rpkm_ercc<-read.delim(paste(Path_Main,"/SS2_15_0048/rpkms-ercc.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_rpkm_ercc$gene<-make.unique(as.character(plate1_rpkm_ercc$gene))
colnames(plate1_rpkm_ercc)[2:length(colnames(plate1_rpkm_ercc))]<-paste("SS2_15_0048_",colnames(plate1_rpkm_ercc)[2:length(colnames(plate1_rpkm_ercc))],sep="")
plate2_rpkm_ercc<-read.delim(paste(Path_Main,"/SS2_15_0049/rpkms-ercc.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_rpkm_ercc$gene<-make.unique(as.character(plate2_rpkm_ercc$gene))
colnames(plate2_rpkm_ercc)[2:length(colnames(plate2_rpkm_ercc))]<-paste("SS2_15_0049_",colnames(plate2_rpkm_ercc)[2:length(colnames(plate2_rpkm_ercc))],sep="")
expr_rpkm_ercc<-merge(plate1_rpkm_ercc,plate2_rpkm_ercc,by="gene",all=TRUE)
rownames(expr_rpkm_ercc)<-as.character(expr_rpkm_ercc$gene)
expr_rpkm_ercc$gene<-NULL
#MERGING ENDOGENOUS GENES WITH SPIKES
print(paste0("There are ",dim(expr_rpkm)[1]," endogenous genes"))
print(paste0("There are ",dim(expr_rpkm_ercc)[1]," spikes"))
all.counts.rpkm<-rbind(expr_rpkm,expr_rpkm_ercc)
sum(all.counts.rpkm==0)/(dim(all.counts.rpkm)[1]*dim(all.counts.rpkm)[2])
dim(all.counts.rpkm[rowSums(all.counts.rpkm)==0,])
#PLOT BARPLOT OF TOP EXPRESSED GENES
mean_rpkms<-data.frame(Gene=rownames(all.counts.rpkm),Mean_rpkm=rowMedians(as.matrix(all.counts.rpkm)))
mean_rpkms<-mean_rpkms[order(-mean_rpkms$Mean_rpkm),]
head(mean_rpkms,20)
par(mfrow=c(1,1))
barplot(log10(as.numeric(mean_rpkms$Mean_rpkm)[1:50]),names.arg=mean_rpkms$Gene[1:50],las=2,ylim=c(0,5),cex.names=0.8,ylab="Median RPKM")


#FILTER POOR CELLS OUT
cell_QC<-read.delim(paste(Path_Main,"/qc/qc_2plates.filtered_cells.txt",sep=""),row.names=1,header=TRUE,sep="\t")
rownames(cell_QC)<-gsub("__","_",rownames(cell_QC))
all.counts.rpkm<-subset(all.counts.rpkm,select=colnames(all.counts.rpkm)[!colnames(all.counts.rpkm)%in%rownames(cell_QC)])
expr_rpkm<-subset(expr_rpkm,select=colnames(expr_rpkm)[!colnames(expr_rpkm)%in%rownames(cell_QC)])
expr_rpkm_ercc<-subset(expr_rpkm_ercc,select=colnames(expr_rpkm_ercc)[!colnames(expr_rpkm_ercc)%in%rownames(cell_QC)])

#FILTER OUT GENES HAVING LESS THAN 1 RPKM COUNT IN AVERAGE OVER ALL CELLS
RPKM.full<-all.counts.rpkm[rowSums(all.counts.rpkm)>0,]
save(RPKM.full,file='RPKM.full.Rdata') ## 找差异基因

all.counts.rpkm<-all.counts.rpkm[rowMeans(all.counts.rpkm)>=1,]
expr_rpkm<-expr_rpkm[rowMeans(expr_rpkm)>=1,]
expr_rpkm_ercc<-expr_rpkm_ercc[rowMeans(expr_rpkm_ercc)>=1,]

# save(expr_rpkm,expr_rpkm_ercc,file = 'expr_raw_rpkm.Rdata')

对最后的 RPKM 矩阵进行详细检查

load(file='last-RPKM.Rdata')
dat=RPKM
dim(dat)
fivenum(apply(dat,1,function(x) sum(x>0) ))
boxplot(apply(dat,1,function(x) sum(x>0) ))

fivenum(apply(dat,2,function(x) sum(x>0) ))
hist(apply(dat,2,function(x) sum(x>0) ))

QQ截图20220903192607.png
这个 RPKM 矩阵会进入下一步分析, 主要是降维和聚类。

对最后的 RPKM.full 矩阵进行详细检查

load(file='RPKM.full.Rdata')
dat=RPKM.full
dim(dat)
fivenum(apply(dat,1,function(x) sum(x>0) ))
boxplot(apply(dat,1,function(x) sum(x>0) ))

fivenum(apply(dat,2,function(x) sum(x>0) ))
hist(apply(dat,2,function(x) sum(x>0) ))

QQ截图20220903192649.png
这个 RPKM.full 矩阵会进入下一步分析, 主要是差异分析及表达量可视化。

显示运行环境

sessionInfo()

step 3-db scan

首先载入上一步的RPKM矩阵

load('last-RPKM.Rdata')
#run t-SNE 50 times and select the optimal run based on the itercosts parameter
#subgroups of cells are assigned by dbscan
library(Rtsne)
N_tsne <- 50
tsne_out <- list(length = N_tsne)
KL <- vector(length = N_tsne)
set.seed(1234)

然后进行50次tsne降维

因为运行耗时很可观，所以我们保存运行结果到文件，以便重复使用。

for(k in 1:N_tsne)
{
  tsne_out[[k]]<-Rtsne(t(log10(RPKM+1)),initial_dims=30,verbose=FALSE,check_duplicates=FALSE,
                       perplexity=27, dims=2,max_iter=5000)
  KL[k]<-tail(tsne_out[[k]]$itercosts,1)
  print(paste0("FINISHED ",k," TSNE ITERATION"))
}
names(KL) <- c(1:N_tsne)
save(tsne_out,KL,file = 'tsne_out.Rdata')

查看最佳tsne结果

load(file = 'tsne_out.Rdata')
opt_tsne <- tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]$Y
opt_tsne_full<-tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]
head(opt_tsne)
           [,1]       [,2]
[1,]   8.779314  -7.376137
[2,]  11.023719  -7.026307
[3,]  18.795992 -25.336875
[4,] -11.526798  30.196473
[5,]  11.672634 -26.438114
[6,]  -8.153627  30.434412

对tsne结果进行kmeans

table(kmeans(opt_tsne,centers = 4)$clust)
plot(opt_tsne,  col=kmeans(opt_tsne,centers = 4)$clust, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")

QQ截图20220903195808.png

对tsne结果进行dbscan

library(dbscan)
plot(opt_tsne,  col=dbscan(opt_tsne,eps=3.1)$cluster, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
table(dbscan(opt_tsne,eps=3.1)$cluster)
# 比较两个聚类算法区别
table(kmeans(opt_tsne,centers = 4)$clust,dbscan(opt_tsne,eps=3.1)$cluster)

QQ截图20220903195845.png

得到最后的分组信息

有一个样本是离群点，把它归入到细胞数量最多的那个组别。

library(dbscan)
CAFgroups<-dbscan(opt_tsne,eps=3.1)$cluster
CAFgroups_full<-dbscan(opt_tsne,eps=3.1)
CAFgroups[CAFgroups==0]<-1
CAFgroups_full$cluster[CAFgroups_full$cluster==0]<-1
plot(opt_tsne,  col=CAFgroups, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")

QQ截图20220903195919.png

直接PCA看看dbscan聚类效果

RPKM.PCA<-prcomp(log2(t(RPKM)+1), center=TRUE)
plot(RPKM.PCA$x,main="first PCA", pch=19, col=CAFgroups)
save(CAFgroups,CAFgroups_full,file='CAFgroups.Rdata')

QQ截图20220903195959.png

表达量的可视化(散点图)

首先作者自定义了一个绘图函数, 接受基因名字，tsne的坐标矩阵，以及表达量矩阵。

#feature plots represent gene expression for each cell on the t-SNE display. 
# It requires the name of the gene as a string, 
# the output of Rtsne and the expression matrix with rownames representing the gene names.
plot.feature2<-function(gene, tsne.output=tsne.out, DATAuse=DATA){
  plot.frame<-data.frame(x=tsne.output$Y[,1], y=tsne.output$Y[,2], log2expr=as.numeric(log2(DATAuse[gene,]+1)))
  
  
  p<-ggplot(plot.frame,aes(x=x, y=y, col=log2expr))+
    geom_point(size=1) +
    labs(title=paste(gene))+
    theme_classic()+
    scale_color_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+
    theme(axis.title = element_blank())+
    theme(axis.text = element_blank())+
    theme(axis.line = element_blank())+
    theme(axis.ticks = element_blank())+
    theme(plot.title = element_text(size=20,face="italic"))+
    theme(legend.title  = element_blank())+
    
    theme(legend.position = "none")
  
  return(p)
}

就可以根据基因名，从表达量矩阵里面获取该基因的表达情况，然后把tsne的坐标矩阵绘制在画布后，使用该基因的表达量化值来上颜色。

load(file = 'tsne_out.Rdata')
library(ggplot2)
opt_tsne <- tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]$Y
opt_tsne_full<-tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]
load(file='RPKM.full.Rdata')
load(file='CAFgroups.Rdata')
plot.feature2("Pdgfra", opt_tsne_full, RPKM.full)

QQ截图20220903200136.png

表达量的可视化(小提琴图)

首先作者自定义了一个绘图函数, 接受基因名字，tsne的坐标矩阵，以及表达量矩阵。

#violin plots represent gene expression for each subpopulation. The color of each violin represents the mean gene expression after log2 transformation.
#gene: Gene name of interest as string. DATAuse: Gene expression matrix with rownames containing gene names. tsne.popus = dbscan output, axis= if FALSE no axis is printed. legend_position= default "none" indicates where legend is plotted. gene_name = if FALSE gene name will not be included in the plot.
plot.violin2 <- function(gene, DATAuse, tsne.popus, axis=FALSE, legend_position="none", gene_name=FALSE){
  
  testframe<-data.frame(expression=as.numeric(DATAuse[paste(gene),]), Population=tsne.popus$cluster)
  testframe$Population <- as.factor(testframe$Population)
  colnames(testframe)<-c("expression", "Population")
  
  col.mean<-vector()
  for(i in levels(testframe$Population)){
    col.mean<-c(col.mean,mean(testframe$expression[which(testframe$Population ==i)]))
  }
  col.mean<-log2(col.mean+1)
  
  col.means<-vector()
  
  for(i in testframe$Population){
    col.means<-c(col.means,col.mean[as.numeric(i)])
  }
  testframe$Mean<-col.means
  testframe$expression<-log2(testframe$expression+1)
    
  p <- ggplot(testframe, aes(x=Population, y=expression, fill= Mean, color=Mean))+
    geom_violin(scale="width") +
    labs(title=paste(gene), y ="log2(expression)", x="Population")+
    theme_classic() +
       
    scale_color_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+
    scale_fill_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+
    
    theme(axis.title.y =  element_blank())+
    theme(axis.ticks.y =  element_blank())+
    theme(axis.line.y =   element_blank())+
    theme(axis.text.y =   element_blank())+
    theme(axis.title.x = element_blank())+
        
    theme(legend.position=legend_position )
  
  if(axis == FALSE){
    p<-p+
      theme( axis.line.x=element_blank(),
             axis.text.x = element_blank(),
             axis.ticks.x = element_blank())
    
  }
  
  if(gene_name == FALSE){
    p<-p+  theme(plot.title = element_blank())   
  }else{ p<-p + theme(plot.title = element_text(size=10,face="bold"))}
  
  p
  
}

定义好绘图函数后，理论上可以绘制任意基因的表达量在不同的聚类的分组表达情况

plot.violin2(gene = "Pdgfra", DATAuse = RPKM.full, tsne.popus = CAFgroups_full)

QQ截图20220903200252.png

显示运行环境

sessionInfo()

step 4-DEG

文献描述是：
Differentually expressed genes are detected using Reproducibility-optimized test statistic (ROTS), for each subgroup compared to the other subgroups.

也就是说作者并没有使用3大单细胞转录组R包。

载入RPKM矩阵

load(file='RPKM.full.Rdata')
load(file='CAFgroups.Rdata')

分开做4次差异分析

运行速度非常慢，建议过夜等待，然后保存结果，方便重复使用。

library(ROTS)
library(plyr)
groups<-CAFgroups
groups[groups!=1]<-234

ROTS_input<-RPKM.full[rowMeans(RPKM.full)>=1,]
ROTS_input<-as.matrix(log2(ROTS_input+1))
Sys.time()
results_pop1 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop1<-data.frame(summary(results_pop1, fdr=1))

groups<-CAFgroups
groups[groups!=2]<-134
Sys.time()
results_pop2 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop2<-data.frame(summary(results_pop2, fdr=1))


groups<-CAFgroups
groups[groups!=3]<-124
Sys.time()
results_pop3 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop3<-data.frame(summary(results_pop3, fdr=1))


groups<-CAFgroups
groups[groups!=4]<-123
Sys.time()
results_pop4 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop4<-data.frame(summary(results_pop4, fdr=1))

save(summary_pop1,summary_pop2,summary_pop3,summary_pop4,
     file = 'ROTS_summary_pop.Rdata')

这里直接载入 ROTS 的差异分析结果即可。

load(file = 'ROTS_summary_pop.Rdata')
head(summary_pop1)
head(summary_pop2)
head(summary_pop3)
head(summary_pop4)

QQ截图20220903200802.png

每个亚群挑选top18的差异基因绘制热图

population_subset<-c(rownames(summary_pop1[summary_pop1$ROTS.statistic<0,])[1:18],rownames(summary_pop2[summary_pop2$ROTS.statistic<0,])[1:18],rownames(summary_pop3[summary_pop3$ROTS.statistic<0,])[1:18],rownames(summary_pop4[summary_pop4$ROTS.statistic<0,])[1:18])
RPKM_heatmap<-RPKM.full[population_subset,]

RPKM_heatmap<-RPKM_heatmap[,order(CAFgroups_full$cluster)]
RPKM_heatmap<-log2(RPKM_heatmap+1)

popul.col<-sort(CAFgroups_full$cluster)
popul.col<-replace(popul.col, popul.col==1,"#1C86EE" )
popul.col<-replace(popul.col, popul.col==2,"#00EE00" )
popul.col<-replace(popul.col, popul.col==3,"#FF9912" )
popul.col<-replace(popul.col, popul.col==4,"#FF3E96" )
library(gplots)

#pdf("heatmap_genes_population.pdf")
heatmap.2(as.matrix(RPKM_heatmap),ColSideColors = as.character(popul.col), tracecol = NA, dendrogram = "none",col=bluered, labCol = FALSE, scale="none", key = TRUE, symkey = F, symm=F,  key.xlab = "", key.ylab = "", density.info = "density", key.title = "log2(RPKM+1)", keysize = 1.2, denscol="black", Colv=FALSE)
# dev.off()

QQ截图20220903200841.png

显示运行环境

sessionInfo()

step 5-run Seurat

rm(list = ls()) 
Sys.setenv(R_MAX_NUM_DLLS=999)
## 首先counts数据
load(file='expr_raw_counts.Rdata')
counts=expr_raw
# using raw counts is the easiest way to process data through Seurat.
counts[1:4,1:4];dim(counts)
fivenum(apply(counts,1,function(x) sum(x>0) ))
boxplot(apply(counts,1,function(x) sum(x>0) ))
fivenum(apply(counts,2,function(x) sum(x>0) ))
hist(apply(counts,2,function(x) sum(x>0) ))

dat=log2(edgeR::cpm(counts)+1) 
hc=hclust(dist(t(dat)))  
plot(hc,labels = FALSE)  
clus = cutree(hc, 4) 
group_list= as.factor(clus) 
table(group_list) 

group_list
  1   2   3   4 
351 151 161  53

#提取批次信息
colnames(dat) #取列名
library(stringr)
plate=str_split(colnames(dat),'_',simplify = T)[,3] #取列名，以'_'号分割，提取第三列。
#str_split()函数可以分割字符串
table(plate)

n_g = apply(counts,2,function(x) sum(x>1)) #统计每个样本有表达的有多少行（基因）
# 这里我们定义， reads数量大于1的那些基因为有表达，一般来说单细胞转录组过半数的基因是不会表达的。
# 而且大部分单细胞转录组技术很烂，通常超过75%的基因都没办法检测到。

df=data.frame(g=group_list,plate=plate,n_g=n_g) #新建数据框(细胞的属性信息)
library(stringr) 
meta=df
head(meta)  

# 上面得到的 counts 和 meta 两个变量，Seurat 需要使用

library(Seurat)
# https://satijalab.org/seurat/mca.html
# 构建 Seurat 需要的对象
# 其中 min.cells 和 min.genes 两个参数是经验值
sce <- CreateSeuratObject(counts, 
                          meta.data =meta,
                          min.cells = 5, 
                          min.genes = 1000, 
                          project = "sce")
# 参考：https://github.com/satijalab/seurat/issues/668

sce
?seurat
table(apply(counts,2,function(x) sum(x>0) )>1000)
table(apply(counts,1,function(x) sum(x>0) )>4)
## 可以看到上面的过滤参数是如何发挥作用的
head(sce@meta.data) 
#dim(sce@data)
            
## 默认使用细胞名字字符串的切割第一列来对细胞进行分组
# 所以在这里只有一个SS2分组信息, 我们就增加一个 group.by 参数
VlnPlot(sce, 
        features = c("nFeature_RNA", "nCount_RNA" ), 
        group.by = 'plate',
        ncol = 2)
VlnPlot(sce, 
        features = c("nFeature_RNA", "nCount_RNA" ), 
        group.by = 'g',
        nCol = 2)
            
sce <- NormalizeData(object = sce, 
                     normalization.method = "LogNormalize", 
                     scale.factor = 10000)
sce@data[1:4,1:4] 
# 这里需要理解  dispersion 值
#sce <- FindVariableGenes(object = sce, 
                         mean.function = ExpMean, 
                         dispersion.function = LogVMR )

sce <- FindVariableFeatures(sce, selection.method = "vst")            

all.genes <- rownames(sce)
      
## 根据经验阈值挑选的变化基因个数。
#length( sce@var.genes)
length(rownames(sce))
      
## 去除一些技术误差，比如 nUMI或者ERCC
head(sce@meta.data) 
#sce <- ScaleData(object = sce, 
                 vars.to.regress = c("nUMI",'nGene' ))
            
sce <- ScaleData(sce, features = all.genes,
                 vars.to.regress = c("nCount_RNA","percent.ercc" ))               
            
# 后面就不需要考虑ERCC序列了。
sce@meta.data[1:4,1:4]

## 普通的PCA降维
sce <- RunPCA(sce, features = VariableFeatures(object = sce),
              ndims.print = 1:5, nfeatures.print = 5)

#它利用的也就是sce@reductions$pca@feature.loadings结果
VizDimLoadings(sce, dims = 1:2, reduction = "pca")

## 根据参数来调整最后的分组个数
#其中包含了一个resolution的选项，它会设置一个”间隔“值，该值越大，间隔越大，得到的cluster数量越多
sce <- FindNeighbors(sce, dims = 1:15)
sce1 <- FindClusters(sce, resolution = 0.4)

## resolution 是最关键的参数

sce=sce1

sce <- RunTSNE(object = sce, dims.use = 1:10, do.fast = TRUE)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = sce)

QQ截图20220903203530.png

## 对每个类别细胞都找到自己的marker基因
# Then find markers for every cluster compared to all remaining cells, report # only the positive ones
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, 
                              min.pct = 0.25, 
                              thresh.use = 0.25)
library(dplyr)
top10 <- sce.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
head(meta) 

# 下面的基因是文章作者给出的
gs=read.table('top18-genes-in-4-subgroup.txt')[,1]
gs
library(pheatmap)
intersect(top10$gene,gs)
# setting slim.col.label to TRUE will print just the cluster IDS instead of# every cell name

DoHeatmap(sce, features = top10$gene) + NoLegend()

# save(sce.markers,sce,file='sce_seurat.Rdata')
# load(file='sce_seurat.Rdata')

QQ截图20220903204205.png

step 6-SC3

之前已做绘图，这部分代码仅作展示。

####### SC3 Heatmaps #########
library(scater)
library(SC3)

anno<-data.frame(Plate = unlist(lapply(strsplit(colnames(RPKM),"_"),function(x) x[3])), Population = CAFgroups_full$cluster)
rownames(anno)<-colnames(RPKM)
pd <- new("AnnotatedDataFrame", data = anno)
sceset <-  SingleCellExperiment(assays = list(normcounts = as.matrix(RPKM)) ,colData =anno)

counts(sceset)<-normcounts(sceset)
logcounts(sceset)<- log2(normcounts(sceset)+1)
rowData(sceset)$feature_symbol <- rownames(sceset)
sceset<-sceset[!duplicated(rownames(sceset)),]
sceset <- sc3(sceset, ks = 3:5, biology = TRUE)

pdf("sc3_heatmap.pdf", useDingbats = FALSE, width=6, height = 4.5)

sc3_plot_consensus(
  sceset, k=4,
  show_pdata = c(
    "Plate",
    "Population"
  )
)
dev.off()

matrisome<-as.character(read.table("GitHub/170210_naba_matrisome.txt")[,1])
matrisome<-lookuptable[lookuptable[,1] %in% matrisome,4]

sce.matrisome <- SingleCellExperiment(assays = list(normcounts = as.matrix(RPKM.full[matrisome,])), colData = anno)
counts(sce.matrisome)<-normcounts(sce.matrisome)
logcounts(sce.matrisome)<- log2(normcounts(sce.matrisome)+1)
rowData(sce.matrisome)$feature_symbol <- rownames(sce.matrisome)
sceset<-sce.matrisome[!duplicated(rownames(sce.matrisome)),]

sce.matrisome <- sc3(sce.matrisome, ks = 4, biology = TRUE)

pdf("sce_matrisome_expression.pdf",useDingbats = FALSE, width=6, height = 4.5)
sc3_plot_expression(
  sce.matrisome, k=4,
  show_pdata = c(
    "Plate",
    "Population"
  )
)
dev.off()

有关复现文章分析结果就到这里。

下一篇我们进入新章节，关于公共数据。

我们下一篇再见！

【单细胞转录组实战】十一、复现文章分析结果

1 作者原始代码

2 复现文章分析结果

step 1-qc

读入质控文件

批量过滤指标

检查随便QC和作者详细QC的区别

显示运行环境

step 2-get matrix

把counts值进行过滤并且储存为R数据

把RPKM值进行过滤并且存储为R数据

对最后的 RPKM 矩阵进行详细检查

对最后的 RPKM.full 矩阵进行详细检查

显示运行环境

step 3-db scan

首先载入上一步的RPKM矩阵

然后进行50次tsne降维

查看最佳tsne结果

对tsne结果进行kmeans

对tsne结果进行dbscan

得到最后的分组信息

直接PCA看看dbscan聚类效果

表达量的可视化(散点图)

表达量的可视化(小提琴图)

显示运行环境

step 4-DEG

载入RPKM矩阵

分开做4次差异分析

每个亚群挑选top18的差异基因绘制热图

显示运行环境

step 5-run Seurat

step 6-SC3

猜你喜欢

热点阅读

【单细胞转录组 实战】十一、复现文章分析结果

1 作者原始代码

2 复现文章分析结果

step 1-qc

读入质控文件

批量过滤指标

检查随便QC和作者详细QC的区别

显示运行环境

step 2-get matrix

把counts值进行过滤并且储存为R数据

把RPKM值进行过滤并且存储为R数据

对最后的 RPKM 矩阵进行详细检查

对最后的 RPKM.full 矩阵进行详细检查

显示运行环境

step 3-db scan

首先载入上一步的RPKM矩阵

然后进行50次tsne降维

查看最佳tsne结果

对tsne结果进行kmeans

对tsne结果进行dbscan

得到最后的分组信息

直接PCA看看dbscan聚类效果

表达量的可视化(散点图)

表达量的可视化(小提琴图)

显示运行环境

step 4-DEG

载入RPKM矩阵

分开做4次差异分析

每个亚群挑选top18的差异基因绘制热图

显示运行环境

step 5-run Seurat

step 6-SC3

猜你喜欢

热点阅读

【单细胞转录组实战】十一、复现文章分析结果