三、差异分析
2021-04-27 本文已影响0人
白米饭睡不醒
三大R包差异分析
if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(DESeq2))BiocManager::install('DESeq2')
if(!require(edgeR))BiocManager::install('edgeR')
if(!require(limma))BiocManager::install('limma')
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
table(Group)
#> Group
#> normal tumor
#> 9 36
#deseq2----
library(DESeq2)
colData <- data.frame(row.names =colnames(exp),
condition=Group)#行名是表达矩阵的列名,只包含一列分组信息
#根据matrix生成deseq需要的输入数据,countData所需表达矩阵,design为colData的condition列
if(!file.exists(paste0(cancer_type,"_dd.Rdata"))){
dds <- DESeqDataSetFromMatrix(
countData = exp,
colData = colData,
design = ~ condition)
dds <- DESeq(dds)
save(dds,file = paste0(cancer_type,"_dd.Rdata"))
}#注意有设置跳过
load(file = paste0(cancer_type,"_dd.Rdata"))
# 两两比较
res <- results(dds, contrast = c("condition",rev(levels(Group)))) #results提取结果,提供谁是处理谁是对照,顺序为tumor在前normal在后,level提取水平,rev更换顺序
resOrdered <- res[order(res$pvalue),] #按照P值排序,res提取代码,order排序
DEG <- as.data.frame(resOrdered)#变成数据框
DEG = na.omit(DEG)#如果筛选标准较低,表达量低的基因得到的logFC和pValue可能为NA,这步为去掉NA
head(DEG)
#> baseMean log2FoldChange lfcSE stat pvalue
#> ENSG00000105607.11 2295.7526 -3.312580 0.1838483 -18.01800 1.407287e-72
#> ENSG00000052802.11 7540.8645 -3.770384 0.2190204 -17.21476 2.057882e-66
#> ENSG00000080709.13 468.7633 -5.608766 0.3347435 -16.75541 5.169907e-63
#> ENSG00000179152.17 1256.3050 -2.234469 0.1334827 -16.73976 6.725972e-63
#> ENSG00000042781.11 550.0380 -6.475292 0.3879044 -16.69301 1.473499e-62
#> ENSG00000182551.12 12197.3635 -3.543548 0.2163788 -16.37660 2.810313e-60
#> padj
#> ENSG00000105607.11 4.270836e-68
#> ENSG00000052802.11 3.122630e-62
#> ENSG00000080709.13 5.102995e-59
#> ENSG00000179152.17 5.102995e-59
#> ENSG00000042781.11 8.943551e-59
#> ENSG00000182551.12 1.229040e-56
#添加change列标记基因上调下调
logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )#95%置信区间
#logFC_cutoff <- 2
k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
#>
#> DOWN NOT UP
#> 783 28724 841
head(DEG)
#> baseMean log2FoldChange lfcSE stat pvalue
#> ENSG00000105607.11 2295.7526 -3.312580 0.1838483 -18.01800 1.407287e-72
#> ENSG00000052802.11 7540.8645 -3.770384 0.2190204 -17.21476 2.057882e-66
#> ENSG00000080709.13 468.7633 -5.608766 0.3347435 -16.75541 5.169907e-63
#> ENSG00000179152.17 1256.3050 -2.234469 0.1334827 -16.73976 6.725972e-63
#> ENSG00000042781.11 550.0380 -6.475292 0.3879044 -16.69301 1.473499e-62
#> ENSG00000182551.12 12197.3635 -3.543548 0.2163788 -16.37660 2.810313e-60
#> padj change
#> ENSG00000105607.11 4.270836e-68 NOT
#> ENSG00000052802.11 3.122630e-62 NOT
#> ENSG00000080709.13 5.102995e-59 DOWN
#> ENSG00000179152.17 5.102995e-59 NOT
#> ENSG00000042781.11 8.943551e-59 DOWN
#> ENSG00000182551.12 1.229040e-56 NOT
DESeq2_DEG <- DEG#为了不被后面覆盖
#edgeR----
library(edgeR)
dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~0+Group)
rownames(design)<-colnames(dge)
colnames(design)<-levels(Group)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1)) #这是结果
DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
#logFC_cutoff <- 2
k1 = (DEG$PValue < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$PValue < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
head(DEG)
#> logFC logCPM LR PValue FDR change
#> ENSG00000042781.11 -6.287813 2.978966 338.9533 1.078532e-75 3.273130e-71 DOWN
#> ENSG00000080709.13 -5.388444 2.749969 305.3198 2.284579e-68 3.466619e-64 DOWN
#> ENSG00000109929.8 -3.992376 5.734121 296.8176 1.625990e-66 1.644851e-62 NOT
#> ENSG00000213398.6 -4.555608 5.835332 295.3276 3.433673e-66 2.605128e-62 DOWN
#> ENSG00000105607.11 -3.112617 5.091139 292.9179 1.150241e-65 6.981505e-62 NOT
#> ENSG00000120158.10 -3.633224 4.588093 285.4740 4.816671e-64 2.436272e-60 NOT
table(DEG$change)
#>
#> DOWN NOT UP
#> 533 28643 1172
edgeR_DEG <- DEG
#####查看数据结果是否算反了,根据logFC -6.287813来看
as.numeric(exp["ENSG00000042781.11",])
boxplot(exp["ENSG00000042781.11",] ~ Group)#根据箱线图看tumor低为下调基因logFC小于零
###limma----
library(limma)
design <- model.matrix(~0+Group)
colnames(design)=levels(Group)
rownames(design)=colnames(exp)
dge <- DGEList(counts=exp)
dge <- calcNormFactors(dge)
v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)
constrasts = paste(rev(levels(Group)),collapse = "-")
cont.matrix <- makeContrasts(contrasts=constrasts,levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
DEG = topTable(fit2, coef=constrasts, n=Inf)
DEG = na.omit(DEG)
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
#logFC_cutoff <- 2
k1 = (DEG$P.Value < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$P.Value < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
#>
#> DOWN NOT UP
#> 1060 28915 373
head(DEG)
#> logFC AveExpr t P.Value adj.P.Val
#> ENSG00000042781.11 -6.464608 -0.1996274 -23.43702 9.460362e-28 2.871031e-23
#> ENSG00000080709.13 -5.525461 0.2993289 -21.42402 4.691172e-26 7.118385e-22
#> ENSG00000266964.4 -6.882650 0.1094543 -20.82023 1.604506e-25 1.217338e-21
#> ENSG00000253959.1 -5.171740 -5.6877937 -20.93329 1.271725e-25 1.217338e-21
#> ENSG00000251049.2 -5.861260 -5.4549397 -20.50083 3.111286e-25 1.888426e-21
#> ENSG00000204653.8 -7.427044 0.5359110 -19.20496 4.985070e-24 1.827535e-20
#> B change
#> ENSG00000042781.11 52.55580 DOWN
#> ENSG00000080709.13 48.85594 DOWN
#> ENSG00000266964.4 47.73131 DOWN
#> ENSG00000253959.1 46.44808 DOWN
#> ENSG00000251049.2 45.82235 DOWN
#> ENSG00000204653.8 44.45722 DOWN
limma_voom_DEG <- DEG
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
edgeR = as.integer(table(edgeR_DEG$change)),
limma_voom = as.integer(table(limma_voom_DEG$change)),
row.names = c("down","not","up")
);tj
#> deseq2 edgeR limma_voom
#> down 783 533 1060
#> not 28724 28643 28915
#> up 841 1172 373
save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,Group,tj,file = paste0(cancer_type,"_DEG.Rdata"))
可视化
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
if(!require(tinyarray))devtools::install_local("tinyarray-master.zip",upgrade = F)#如果需要更新的话卸载重装更快
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
#> TCGA-W5-AA36-01A-11R-A41I-07 TCGA-W5-AA2H-01A-31R-A41I-07
#> ENSG00000000003.13 2504 226
#> ENSG00000000419.11 1272 1146
#> ENSG00000000457.12 504 602
#> ENSG00000000460.15 123 162
#> TCGA-ZU-A8S4-11A-11R-A41I-07 TCGA-WD-A7RX-01A-12R-A41I-07
#> ENSG00000000003.13 4107 9646
#> ENSG00000000419.11 741 1266
#> ENSG00000000457.12 312 1317
#> ENSG00000000460.15 170 451
# cpm 去除文库大小的影响(标准化)
dat = log2(cpm(exp)+1)
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(cancer_type,"_pcaplot.Rdata"))
cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT"]
cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT"]
h1 = draw_heatmap(dat[cg1,],Group,n_cutoff = 2)#n_cutoff参数截止范围,使对比度更鲜明
#选取n_cutoff值可使用以下代码看取值范围
#m = t(scale(t(dat[cg1,])))
#range(m)
#boxplot(m)
h2 = draw_heatmap(dat[cg2,],Group,n_cutoff = 2)
h3 = draw_heatmap(dat[cg3,],Group,n_cutoff = 2)
m2d = function(x){
mean(abs(x))+2*sd(abs(x))
}#与上面阈值一样
v1 = draw_volcano(DESeq2_DEG,pkg = 1,logFC_cutoff = m2d(DESeq2_DEG$log2FoldChange),symmetry = T)#pvalue默认参数0.05,加参数symmetry = T 可以让图居中
v2 = draw_volcano(edgeR_DEG,pkg = 2,logFC_cutoff = m2d(edgeR_DEG$logFC))
v3 = draw_volcano(limma_voom_DEG,pkg = 3,logFC_cutoff = m2d(limma_voom_DEG$logFC))
library(patchwork)#只能拼ggplot2的图,热图可以使用as.ggplot改变格式拼在一起
(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(cancer_type,"_heat_vo.png"),width = 15,height = 10)
三大R包差异基因对比
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
load("TCGA-CHOL_pcaplot.Rdata")
UP=function(df){
rownames(df)[df$change=="UP"]
}
DOWN=function(df){
rownames(df)[df$change=="DOWN"]
}
up = intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG))
down = intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG))
dat = log2(cpm(exp)+1)
hp = draw_heatmap(dat[c(up,down),],Group,n_cutoff = 2)
#上调、下调基因分别画维恩图
up_genes = list(Deseq2 = UP(DESeq2_DEG),
edgeR = UP(edgeR_DEG),
limma = UP(limma_voom_DEG))
down_genes = list(Deseq2 = DOWN(DESeq2_DEG),
edgeR = DOWN(edgeR_DEG),
limma = DOWN(limma_voom_DEG))
up.plot <- draw_venn(up_genes,"UPgene")
down.plot <- draw_venn(down_genes,"DOWNgene")
#维恩图拼图,终于搞定
library(patchwork)
#up.plot + down.plot
# 拼图
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(cancer_type,"_heat_ve_pca.png"),width = 15,height = 10)
#如果热图按照原顺序画的没有分组,将原矩阵按group顺序重新排列再画
m = dat[c(up,down),]
m = m[,order(Group)]
Group = sort(Group)
draw_heatmap(m,Group,n_cutoff = 2,cluster_cols=F)
#########生信技能树课程笔记