基因注释/富集分析与功能分类

DESeq2:基于Counts分析

2019-10-14  本文已影响0人  小杜的生信筆記

data:2019年10月14日

来源文章:RNA-Seq workflow: gene-level exploratory analysis and differential expression

(一)、数据准备

1 数据制备(Data)

​ 使用FeatureCounts读取*.bam文件的counts数目,得到counts.txt文件。

1—countdata文件


image.png

2-coldata


image.png

数据准备好后,即可进行DESeq2分析

2、DSEeq2数据导入及构建dds

#进行root_lncRNA的DESeq2分析
setwd("I:/DKH实时工作进展/蒺藜苜蓿冷胁迫/分析数据结果/FPKM_lncRNA/root")
library("DESeq2")
install.packages("airway")
library("airway")
countdata_root <- read.csv("lncRNA_root.csv", header = T, row.names = 1)
head(countdata_root)
coldata_root <- read.csv("coldata_root.csv", header = T, row.names = 1)
head(coldata_root)
##--------------
dds <- DESeqDataSetFromMatrix(countData = countdata_root, colData = coldata_root, design = ~ sample)
head(dds)
##删除样本中count < 1 的样本
dds <- dds[rowSums(counts(dds)) > 1,]
nrow(dds)

3、rlog变型

rld <- rlog(dds, blind = FALSE) # “FALSE"意思指各处理之间的差异不增加试验的方差平均数
head(assay(rld), 3)
##相互样本之间进行比较
par(mfrow = c(1,2))
dds <- estimateSizeFactors(dds)
plot(log2(counts(dds, normalized = TRUE)[,1:2]+1),
     pch = 16, cex =0.3)
plot(assay(rld)[,1:2],
     pch=16, cex=0.3)
image.png
我们可以看到低计数的基因(左下角)在普通对数尺度上似乎是过度可变的,而rlog变换压缩低计数基因的差异,而数据提供的差异表达信息很少

4、样本的距离计算 (sample distance)

第一步通常是评估样本之间的总体相似性:哪些样本彼此相似,哪些不同?这符合实验设计的预期吗?

sampleDists <- dist(t(assay(rld)))
sampleDists
library("pheatmap")
library("RColorBrewer")
#BiocManager::install("RColorBrewer")
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$dex, rld$sample, sep = "-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)
image.png

5、PCAfenx

plotPCA(rld, intgroup = c("dex", "sample"))
(data <- plotPCA(rld, intgroup = c("dex", "sample"), returnData=TRUE))
image.png
percentVar <- round(100 * attr(data, "percentVar"))
library("ggplot2")
ggplot(data,aes(PC1,PC2, color=dex, shape=sample)) + geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance"))
image.png
mdsData <- data.frame(cmdscale(sampleDistMatrix))
mds <- cbind(mdsData, as.data.frame(colData(rld)))
ggplot(mds, aes(X1,X2,color=dex,shape=sample)) + geom_point(size=3)
image.png
#creating the sample plot for poissonDistance 
mdsPoisData <- data.frame(cmdscale(sampleDistMatrix))
mdsPois <- cbind(mdsPoisData, as.data.frame(colData(dds)))
ggplot(mdsPois, aes(X1,X2,color=dex,shape=sample)) + geom_point(size=3)
image.png

(二)DESeq - Differential expression analysis

Running the differential expression piple

1 DESeq()

dds <- DESeq(dds)

(res <- results(dds))
image.png
#As res is a DataFrame object, it carries metadata with information on the meaning of the columns:
mcols(res, use.names = TRUE)

summary(res)   #结果展示

2 过滤

###---------------挑选出res进行过滤
res.05 <- results(dds, alpha = .05)
table(res.05$padj < .05)

##如果我们想提高log2倍的变化阈值,以便我们测试由于治疗而表现出更多实质性变化的基因,我们只需提供log2量表上的一个值
resLFC1 <- results(dds, lfcThreshold = 1)
table(resLFC1$padj < 0.1)
#FALSE  TRUE 
# 644   280 

3 多重比较

高通量生物学中,我们注意不要直接使用p值作为空值的证据,而是要对多个测试进行更正。

#报告p-value低于0.05
sum(res$pvalue < 0.05, na.rm = TRUE)
# or 
sum(!is.na(res$pvalue))

## P-value 0.1 
sum(res$padj < 0.1, na.rm = TRUE)
resSig <- subset(res,padj < 0.1)
head(resSig[order(resSig$log2FoldChange),])
head(resSig[ order(resSig$log2FoldChange, decreasing=TRUE), ])
image.png

4 Counts可视化

A quick way to visualize the counts for a particular gene is to use the plotCounts function that takes as arguments the DESeqDataSet, a gene name, and the group over which to plot the counts

topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup = c("dex"))
##----using the ggplot function from the ggplot2 package
data <- plotCounts(dds, gene=topGene, intgroup=c("dex","sample"), returnData=TRUE)
ggplot(data, aes(x=dex, y=count, color=dex)) +
  scale_y_log10() +
  geom_point(position = position_jitter(width = .1, height = 0), size = 3)
image.png
#可视化其表达形式
ggplot(data, aes(x=dex, y=count, fill=dex)) +
  scale_y_log10() +
  geom_dotplot(binaxis="y", stackdir="center")
##---------------------
ggplot(data, aes(x=dex, y=count, color=dex, group=sample)) +
  scale_y_log10() + geom_point(size=3) + geom_line()

5 plot-MA

plotMA(res, ylim=c(-5,5))
#为提高阙值所得图

6 筛选出在所有样本中方差最大的20个基因

In the sample distance heatmap made previously, the dendrogram at the side shows us a hierarchical clustering of the samples. Such a clustering can also be performed for the genes. Since the clustering is only relevant for genes that actually carry a signal, one usually would only cluster a subset of the most highly variable genes. Here, for demonstration, let us select the 20 genes with the highest variance across samples. We will work with the rlog transformed counts

library("genefilter")
topVarGnes <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20)
head(topVarGnes)

mat <- assay(rld)[topVarGnes, ]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(rld)[,c("sample","dex")])
pheatmap(mat, annotation_col = df)
image.png
Heatmap of relative rlog-transformed values asorss sample .In the center of the heatmap, we see a set of genes for which the dexamethasone treated samples have higher gene expression.
上一篇下一篇

猜你喜欢

热点阅读