组学数据分析生信医学猴哥差异基因/富集分析

三、差异分析

2021-04-27  本文已影响0人  白米饭睡不醒

三大R包差异分析

if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(DESeq2))BiocManager::install('DESeq2')
if(!require(edgeR))BiocManager::install('edgeR')
if(!require(limma))BiocManager::install('limma')

rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
table(Group)
#> Group
#> normal  tumor 
#>      9     36

#deseq2----
library(DESeq2)
colData <- data.frame(row.names =colnames(exp), 
                      condition=Group)#行名是表达矩阵的列名,只包含一列分组信息
#根据matrix生成deseq需要的输入数据,countData所需表达矩阵,design为colData的condition列
if(!file.exists(paste0(cancer_type,"_dd.Rdata"))){
  dds <- DESeqDataSetFromMatrix(
  countData = exp,
  colData = colData,
  design = ~ condition)
  dds <- DESeq(dds)
  save(dds,file = paste0(cancer_type,"_dd.Rdata"))
}#注意有设置跳过
load(file = paste0(cancer_type,"_dd.Rdata"))

# 两两比较
res <- results(dds, contrast = c("condition",rev(levels(Group)))) #results提取结果,提供谁是处理谁是对照,顺序为tumor在前normal在后,level提取水平,rev更换顺序 
resOrdered <- res[order(res$pvalue),] #按照P值排序,res提取代码,order排序
DEG <- as.data.frame(resOrdered)#变成数据框
DEG = na.omit(DEG)#如果筛选标准较低,表达量低的基因得到的logFC和pValue可能为NA,这步为去掉NA
head(DEG)
#>                      baseMean log2FoldChange     lfcSE      stat       pvalue
#> ENSG00000105607.11  2295.7526      -3.312580 0.1838483 -18.01800 1.407287e-72
#> ENSG00000052802.11  7540.8645      -3.770384 0.2190204 -17.21476 2.057882e-66
#> ENSG00000080709.13   468.7633      -5.608766 0.3347435 -16.75541 5.169907e-63
#> ENSG00000179152.17  1256.3050      -2.234469 0.1334827 -16.73976 6.725972e-63
#> ENSG00000042781.11   550.0380      -6.475292 0.3879044 -16.69301 1.473499e-62
#> ENSG00000182551.12 12197.3635      -3.543548 0.2163788 -16.37660 2.810313e-60
#>                            padj
#> ENSG00000105607.11 4.270836e-68
#> ENSG00000052802.11 3.122630e-62
#> ENSG00000080709.13 5.102995e-59
#> ENSG00000179152.17 5.102995e-59
#> ENSG00000042781.11 8.943551e-59
#> ENSG00000182551.12 1.229040e-56

#添加change列标记基因上调下调
logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )#95%置信区间
#logFC_cutoff <- 2
k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
#> 
#>  DOWN   NOT    UP 
#>   783 28724   841
head(DEG)
#>                      baseMean log2FoldChange     lfcSE      stat       pvalue
#> ENSG00000105607.11  2295.7526      -3.312580 0.1838483 -18.01800 1.407287e-72
#> ENSG00000052802.11  7540.8645      -3.770384 0.2190204 -17.21476 2.057882e-66
#> ENSG00000080709.13   468.7633      -5.608766 0.3347435 -16.75541 5.169907e-63
#> ENSG00000179152.17  1256.3050      -2.234469 0.1334827 -16.73976 6.725972e-63
#> ENSG00000042781.11   550.0380      -6.475292 0.3879044 -16.69301 1.473499e-62
#> ENSG00000182551.12 12197.3635      -3.543548 0.2163788 -16.37660 2.810313e-60
#>                            padj change
#> ENSG00000105607.11 4.270836e-68    NOT
#> ENSG00000052802.11 3.122630e-62    NOT
#> ENSG00000080709.13 5.102995e-59   DOWN
#> ENSG00000179152.17 5.102995e-59    NOT
#> ENSG00000042781.11 8.943551e-59   DOWN
#> ENSG00000182551.12 1.229040e-56    NOT

DESeq2_DEG <- DEG#为了不被后面覆盖


#edgeR----
library(edgeR)

dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

design <- model.matrix(~0+Group)
rownames(design)<-colnames(dge)
colnames(design)<-levels(Group)

dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1)) #这是结果

DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )

#logFC_cutoff <- 2
k1 = (DEG$PValue < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$PValue < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))

head(DEG)
#>                        logFC   logCPM       LR       PValue          FDR change
#> ENSG00000042781.11 -6.287813 2.978966 338.9533 1.078532e-75 3.273130e-71   DOWN
#> ENSG00000080709.13 -5.388444 2.749969 305.3198 2.284579e-68 3.466619e-64   DOWN
#> ENSG00000109929.8  -3.992376 5.734121 296.8176 1.625990e-66 1.644851e-62    NOT
#> ENSG00000213398.6  -4.555608 5.835332 295.3276 3.433673e-66 2.605128e-62   DOWN
#> ENSG00000105607.11 -3.112617 5.091139 292.9179 1.150241e-65 6.981505e-62    NOT
#> ENSG00000120158.10 -3.633224 4.588093 285.4740 4.816671e-64 2.436272e-60    NOT
table(DEG$change)
#> 
#>  DOWN   NOT    UP 
#>   533 28643  1172
edgeR_DEG <- DEG


#####查看数据结果是否算反了,根据logFC -6.287813来看
as.numeric(exp["ENSG00000042781.11",])
boxplot(exp["ENSG00000042781.11",] ~ Group)#根据箱线图看tumor低为下调基因logFC小于零


###limma----
library(limma)

design <- model.matrix(~0+Group)
colnames(design)=levels(Group)
rownames(design)=colnames(exp)

dge <- DGEList(counts=exp)
dge <- calcNormFactors(dge)

v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)

constrasts = paste(rev(levels(Group)),collapse = "-")
cont.matrix <- makeContrasts(contrasts=constrasts,levels = design) 
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)

DEG = topTable(fit2, coef=constrasts, n=Inf)
DEG = na.omit(DEG)

logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )

#logFC_cutoff <- 2
k1 = (DEG$P.Value < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$P.Value < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
#> 
#>  DOWN   NOT    UP 
#>  1060 28915   373
head(DEG)
#>                        logFC    AveExpr         t      P.Value    adj.P.Val
#> ENSG00000042781.11 -6.464608 -0.1996274 -23.43702 9.460362e-28 2.871031e-23
#> ENSG00000080709.13 -5.525461  0.2993289 -21.42402 4.691172e-26 7.118385e-22
#> ENSG00000266964.4  -6.882650  0.1094543 -20.82023 1.604506e-25 1.217338e-21
#> ENSG00000253959.1  -5.171740 -5.6877937 -20.93329 1.271725e-25 1.217338e-21
#> ENSG00000251049.2  -5.861260 -5.4549397 -20.50083 3.111286e-25 1.888426e-21
#> ENSG00000204653.8  -7.427044  0.5359110 -19.20496 4.985070e-24 1.827535e-20
#>                           B change
#> ENSG00000042781.11 52.55580   DOWN
#> ENSG00000080709.13 48.85594   DOWN
#> ENSG00000266964.4  47.73131   DOWN
#> ENSG00000253959.1  46.44808   DOWN
#> ENSG00000251049.2  45.82235   DOWN
#> ENSG00000204653.8  44.45722   DOWN

limma_voom_DEG <- DEG
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
           edgeR = as.integer(table(edgeR_DEG$change)),
           limma_voom = as.integer(table(limma_voom_DEG$change)),
           row.names = c("down","not","up")
          );tj
#>      deseq2 edgeR limma_voom
#> down    783   533       1060
#> not   28724 28643      28915
#> up      841  1172        373
save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,Group,tj,file = paste0(cancer_type,"_DEG.Rdata"))

可视化

rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
if(!require(tinyarray))devtools::install_local("tinyarray-master.zip",upgrade = F)#如果需要更新的话卸载重装更快
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
#>                    TCGA-W5-AA36-01A-11R-A41I-07 TCGA-W5-AA2H-01A-31R-A41I-07
#> ENSG00000000003.13                         2504                          226
#> ENSG00000000419.11                         1272                         1146
#> ENSG00000000457.12                          504                          602
#> ENSG00000000460.15                          123                          162
#>                    TCGA-ZU-A8S4-11A-11R-A41I-07 TCGA-WD-A7RX-01A-12R-A41I-07
#> ENSG00000000003.13                         4107                         9646
#> ENSG00000000419.11                          741                         1266
#> ENSG00000000457.12                          312                         1317
#> ENSG00000000460.15                          170                          451
# cpm 去除文库大小的影响(标准化)
dat = log2(cpm(exp)+1)
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(cancer_type,"_pcaplot.Rdata"))

cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT"]
cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT"]

h1 = draw_heatmap(dat[cg1,],Group,n_cutoff = 2)#n_cutoff参数截止范围,使对比度更鲜明
#选取n_cutoff值可使用以下代码看取值范围
#m = t(scale(t(dat[cg1,])))
#range(m)
#boxplot(m)

h2 = draw_heatmap(dat[cg2,],Group,n_cutoff = 2)
h3 = draw_heatmap(dat[cg3,],Group,n_cutoff = 2)

m2d = function(x){
  mean(abs(x))+2*sd(abs(x))
}#与上面阈值一样

v1 = draw_volcano(DESeq2_DEG,pkg = 1,logFC_cutoff = m2d(DESeq2_DEG$log2FoldChange),symmetry = T)#pvalue默认参数0.05,加参数symmetry = T 可以让图居中
v2 = draw_volcano(edgeR_DEG,pkg = 2,logFC_cutoff = m2d(edgeR_DEG$logFC))
v3 = draw_volcano(limma_voom_DEG,pkg = 3,logFC_cutoff = m2d(limma_voom_DEG$logFC))

library(patchwork)#只能拼ggplot2的图,热图可以使用as.ggplot改变格式拼在一起
(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(cancer_type,"_heat_vo.png"),width = 15,height = 10)

三大R包差异基因对比

rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
load("TCGA-CHOL_pcaplot.Rdata")
UP=function(df){
  rownames(df)[df$change=="UP"]
}
DOWN=function(df){
  rownames(df)[df$change=="DOWN"]
}

up = intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG))
down = intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG))
dat = log2(cpm(exp)+1)
hp = draw_heatmap(dat[c(up,down),],Group,n_cutoff = 2)

#上调、下调基因分别画维恩图
up_genes = list(Deseq2 = UP(DESeq2_DEG),
          edgeR = UP(edgeR_DEG),
          limma = UP(limma_voom_DEG))

down_genes = list(Deseq2 = DOWN(DESeq2_DEG),
          edgeR = DOWN(edgeR_DEG),
          limma = DOWN(limma_voom_DEG))

up.plot <- draw_venn(up_genes,"UPgene")
down.plot <- draw_venn(down_genes,"DOWNgene")

#维恩图拼图,终于搞定

library(patchwork)
#up.plot + down.plot
# 拼图
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(cancer_type,"_heat_ve_pca.png"),width = 15,height = 10)
#如果热图按照原顺序画的没有分组,将原矩阵按group顺序重新排列再画
m = dat[c(up,down),]
m = m[,order(Group)]
Group = sort(Group)
draw_heatmap(m,Group,n_cutoff = 2,cluster_cols=F)

#########生信技能树课程笔记

上一篇 下一篇

猜你喜欢

热点阅读