拿到基因表达矩阵之后的那点事(二)
2020-08-05 本文已影响0人
凯凯何_Boy
上次说了除了FPKM/RPKM标准化,我们可以直接拿原始Counts去进行差异分析,基于Deseq2、edgeR、limma三个包简单介绍一下分析流程。其中edgeR包在无生物学重复的研究中也用的较多~~
数据准备:counts矩阵及分组文件
Deseq2
rm(list = ls())
#Step1 Deseq2分析
library(DESeq2)
##数据预处理
database <- read.table(file = "../data/CvsNc.txt", sep = "\t", header = T, row.names = 1)
database <- round(as.matrix(database))
##设置分组信息并构建dds对象
condition <- factor(c(rep("C",3),rep("Nc",3)), levels = c("C", "Nc"))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
#注: 这一步到下一步之间可以过滤掉一些low count数据,节省内存,提高运行速度
nrow(dds)
dds <- dds[ rowSums(counts(dds)) > 1, ]
##使用DESeq函数估计离散度,然后差异分析获得res对象
dds <- DESeq(dds, parallel = FALSE) #parallel = TRUE 将启用多线程模式
suppressMessages(dds)
res <- results(dds,pAdjustMethod = 'fdr',alpha = 0.05)#默认p阈值0.1,可以通过alpha设定
#最后设定阈值,筛选差异基因,导出数据(全部数据。包括标准化后的count数)
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
write.csv(resdata,file = "../result1/CvsNC.csv",row.names = F)
#此方法得到的差异基因
diff_gene <- subset(res, padj < 0.05 & (log2FoldChange > 1 | log2FoldChange < -1))
diff_gene_Deseq2 <- row.names(diff_gene)
edgeR
对于该包,官方文档上针对有无生物学重复都有相应的流程
To view documentation for the version of this package installed in your system, start R and enter: browseVignettes("edgeR")
library(edgeR)
##跟DESeq2一样,导入数据,预处理(用了cpm函数)
exprSet<- read.table(file = "../data/CvsNc.txt", sep = "\t", header = TRUE, row.names = 1, stringsAsFactors = FALSE)
group_list <- factor(c(rep("C",3),rep("Nc",3)))
exprSet <- exprSet[rowSums(cpm(exprSet) > 1) >= 2,]
##设置分组信息,并做TMM标准化
exprSet <- DGEList(counts = exprSet, group = group_list)
exprSet <- calcNormFactors(exprSet)
##使用qCML(quantile-adjusted conditional maximum likelihood)估计离散度(只针对单因素实验设计)
exprSet <- estimateCommonDisp(exprSet)
exprSet <- estimateTagwiseDisp(exprSet)
##寻找差异gene(这里的exactTest函数还是基于qCML并且只针对单因素实验设计),然后按照阈值进行筛选即可
et <- exactTest(exprSet)
tTag <- topTags(et, n=nrow(exprSet))
resdata2 <- merge(as.data.frame(tTag$table),as.data.frame(exprSet$counts),by="row.names",sort=FALSE)%>%tibble::column_to_rownames('Row.names')
write.csv(resdata2,file = "../result1/CvsNc_edgeR.csv")
diff_gene_edgeR <- subset(resdata2, FDR < 0.05 & (logFC > 1 | logFC < -1))
diff_gene_edgeR <- row.names(diff_gene_edgeR)
无生物学重复时候
要指定其中的bcv参数, 就是选择一个认为合理的离散度的值。通常如果是实验控制的好的人类数据,那么选择BCV=0.4,比较好的模式生物选择BCV=0.1,技术重复的话选择BCV=0.1
targets <- as.matrix(read.delim('gene.txt', sep = '\t', row.names = 1))
dgelist <- DGEList(counts = targets, group = group) #参数很简单 你的矩阵和分组文件
# bcv是官方文档的推荐数值(对应人的,对应其他物种的值不清楚),可以自己调整
bcv = 0.1
et <- exactTest(y, dispersion=bcv^2)
write.csv(topTags(dge_et, n = nrow(dgelist$counts)), 'exactTest.csv', quote = FALSE) #输出主要结果
limma
##数据预处理
library(limma)
library(edgeR)
#limma包里对RNA-Seq差异分析的limma-trend方法,该方法主要适用于样本间测序深度基本保持一致的情况,也就是说两个样本的文库(reads数目)大小相差的不悬殊,当文库大小在样本间变化幅度相当大的话,可以使用limma的voom方法,可按照下面的代码流程:
counts <- read.table(file = "../data/CvsNc.txt", sep = "\t", header = T, row.names = 1)
dge <- DGEList(counts = counts)
dge <- calcNormFactors(dge) #标准化下
#设置分组信息
group_list <- factor(c(rep("C",3), rep("Nc",3)))
design <- model.matrix(~group_list)
colnames(design) <- levels(group_list)
rownames(design) <- colnames(counts)
#接下来进行voom转化
v <- voom(dge, design, plot=TRUE)
#其实可以不进行TMM标准化,直接用count数据进行voom转化,如:
v <- voom(counts, design, plot=TRUE)
#差异分析
fit <- lmFit(v, design)
fit <- eBayes(fit)
output <- topTable(fit, coef=2,n=Inf)
resdata3 <- merge(output,counts,by="row.names",sort=FALSE)%>%tibble::column_to_rownames('Row.names')
write.csv(resdata3,file = "../result1/CvsNc_limma.csv")
diff_gene_limma <- subset(resdata3, adj.P.Val < 0.05 & (logFC > 1 | logFC < -1))
diff_gene_limma <- row.names(diff_gene_limma)
至此我们已经用了三种方法针对同一基因矩阵进行了差异分析,我们平常做时候可以都跑一遍流程,然后再去取个交集,这里我简单画个Venn图展示下相交情况.
交集
#展示3种方法的相交情况
all_gene <- intersect(intersect(diff_gene_Deseq2,diff_gene_edgeR),diff_gene_limma)
library(VennDiagram)
作图
venn_list <- list(diff_gene_Deseq2,diff_gene_edgeR,diff_gene_limma)
names(venn_list) <- c('Deseq2','edgeR','limma')
venn.diagram(venn_list, filename = '../figures//venn.png', fill = c('red', 'blue','gold'), alpha = 0.50, col = 'black', cex = 1, fontfamily = 'serif', cat.col = c('black', 'black','black'), cat.cex = 1, cat.fontfamily = 'serif', margin = 0.2)
图片.png
我们再以Deseq2的结果简单画个火山图展示一下吧~由于火山图点太多,我们只展示上调和下调按照p.adjust值排名的前10个吧
#确定是上调还是下调,用于给图中点上色
library(ggrepel)
colnames(resdata)[1] <- 'Gene'
resdata$threshold = factor(ifelse(resdata$padj < 0.05 & abs(resdata$log2FoldChange) >= 1, ifelse(resdata$log2FoldChange>= 1 ,'Up','Down'),'NoSignifi'),levels=c('Up','Down','NoSignifi'))
#上调 和下调的前10个标记出来
up <- subset(resdata, threshold == 'Up')
up <- up[order(up$padj), ][1:10, ]
down <- subset(resdata, threshold == 'Down')
down <- down[order(down$padj), ][1:10, ]
plot <- ggplot(resdata,aes(x=log2FoldChange,y=-log10(padj),color=threshold))+
geom_point()+
scale_color_manual(values=c("#DC143C","#00008B","#808080"))+#确定点的颜色
geom_text_repel(data = rbind(up, down), aes(x = log2FoldChange, y = -log10(padj), label = Gene),
size = 3,box.padding = unit(0.5, 'lines'), segment.color = 'black', show.legend = FALSE)+
theme_bw()+#修改图片背景
theme(
legend.title = element_blank()#不显示图例标题
)+
ylab('-log10 (p-adj)')+#修改y轴名称
xlab('log2 (FoldChange)')+#修改x轴名称
geom_vline(xintercept=c(-1,1),lty=3,col="black",lwd=0.5) +#添加横线|FoldChange|>2
geom_hline(yintercept = -log10(0.05),lty=3,col="black",lwd=0.5)#添加竖线padj<0.05
plot
ggsave(paste('../figures/','TgvsNc.tiff'),plot, width = 16, height = 10)
diff_gene <- subset(resdata, padj < 0.05 & (log2FoldChange > 1 | log2FoldChange < -1))
write.csv(diff_gene, paste('../results/','CvsNc_DESeq2.csv'), row.names = FALSE, quote = FALSE)
图片.png
OK, 拿到差异基因之后,我们就可以去拿着这些geneID去做富集分析了,或者GSEA基因集富集,这些就留到下次再说吧~~