数据整理组学数据分析

七、数据标准化和差异表达分析

2021-03-10  本文已影响0人  白米饭睡不醒

1.数据标准化

(1)标准化

##  R包准备

rm(list = ls())

## Installing R packages
bioPackages <-c( 
  "corrplot","ggrepel", #绘制相关性图形
  "stringr", #处理字符串的包
  "readr","tximport","dplyr", #处理salmon表达量的扩展包
  "FactoMineR","factoextra", #PCA分析软件
  "limma","edgeR","DESeq2", #差异分析的三个软件包
  "clusterProfiler", "org.Hs.eg.db", #安装进行GO和Kegg分析的扩展包
  "GSEABase","GSVA", #安装进行GSEA分析的扩展包
  "airway" # 包含数据集的bioconductor软件包
  )


## If you are in China, run the command below
local({
  r <- getOption( "repos" );# set CRAN mirror for users in China
  r[ "CRAN" ] <- "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"; # CRAN的镜像地址
  options( repos = r )
  
  BioC <- getOption( "BioC_mirror" ); # set bioconductor mirror for users in China
  BioC[ "BioC_mirror" ] <- "https://mirrors.ustc.edu.cn/bioc/"; # bioconductor的镜像地址
  options( BioC_mirror = BioC )
})

# 检查是否设定完毕
getOption("BioC_mirror")
getOption("CRAN")

if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")

# 安装devtools管理github上的软件包
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")


## Installing missing packages
lapply( bioPackages, 
        function( bioPackage ){
          if(!bioPackage %in% rownames(installed.packages())){
              CRANpackages <- available.packages()

              if(bioPackage %in% rownames(CRANpackages)){
                install.packages( bioPackage)
              }else{
                  BiocManager::install(bioPackage,suppressUpdates=F,ask=F)
              }
          }
        })


## 验证R扩展包是否安装成功
library(limma)
library(edgeR)
library(DESeq2)
library(FactoMineR)
library(factoextra)
library(clusterProfiler)
library(org.Hs.eg.db)

# 不显示加载信息
suppressMessages(library(limma))

##标准化

# 魔幻操作,一键清空
rm(list = ls()) 
#4.0以后不用  options(stringsAsFactors = F)

# 加载airway数据集并转换为表达矩阵
library(airway,quietly = T)
data(airway)
class(airway)

rawcount <- assay(airway)#用assay函数取表达矩阵
colnames(rawcount)

# 查看表达谱
rawcount[1:4,1:4]

# 去除前的基因表达矩阵情况
dim(rawcount)

# 获取分组信息
group_list <- colData(airway)$dex
group_list

# 过滤在至少在75%的样本中都不表达的基因
keep <- rowSums(rawcount>0) >= floor(0.75*ncol(rawcount))
table(keep)

filter_count <- rawcount[keep,]
filter_count[1:4,1:4]
dim(filter_count)

# 加载edgeR包计算counts per millio(cpm) 表达矩阵,并对结果取log2值
library(edgeR)
express_cpm <- log2(cpm(filter_count)+1)
express_cpm[1:6,1:6]

# 保存表达矩阵和分组结果(路径可改)
save(filter_count,express_cpm,group_list,file = "../Analysis/data/Step01-airwayData.Rdata")

1.1.1 1.1.2 1.1.3 1.1.4 1.1.5 1.1.6 1.1.7

(2)样本总体分布

rm(list = ls())
# options(stringsAsFactors = F)

# 加载原始表达的数据
lname <- load(file = "../code-down/Step01-airwayData.Rdata")
lname

exprSet <- express_cpm
exprSet[1:6,1:6]#检查表达谱是否正确

# 样本表达总体分布-箱式图
library(ggplot2)
# 构造绘图数据
data <- data.frame(expression=c(exprSet),sample=rep(colnames(exprSet),each=nrow(exprSet)))
head(data)#第一列表达矩阵,第二列样本名

p <- ggplot(data = data,aes(x=sample,y=expression,fill=sample))#sample填充颜色
p1 <- p + geom_boxplot()+ theme(axis.text.x = element_text(angle = 90))+ xlab(NULL) + ylab("log2(CPM+1)")
#angle横轴90度展示,xlab,ylab是x和y的标签
p1

# 保存图片(还有其他保存图片的方法,路径要改)
pdf(file = "../Analysis/sample_cor/sample_boxplot.pdf",width = 6,height = 8)
print(p1)
dev.off()

# 样本表达总体分布-小提琴图
p2 <- p + geom_violin() + theme(axis.text = element_text(size = 12),axis.text.x = element_text(angle = 90)) + xlab(NULL) + ylab("log2(CPM+1)")
p2

# 保存图片(路径要改)
pdf(file = "../Analysis/sample_cor/sample_violin.pdf",width = 6,height = 8)
print(p2)
dev.off()

# 样本表达总体分布-概率密度分布图
m <- ggplot(data=data, aes(x=expression))
p3 <- m +  geom_density(aes(fill=sample, colour=sample),alpha = 0.2) + xlab("log2(CPM+1)")
p3

# 保存图片(路径要改)
pdf(file = "../Analysis/sample_cor/sample_density.pdf",width = 7,height = 8)
print(p3)
dev.off()

1.2.1 1.2.2 1.2.3

(3)样本之间的相关性

## 3.样本之间的相关性-cor----
# 利用绝对中位差mad/标准差sd统计学方法进行数据异常值检测
# 将表达量的绝对中位差mad从大到小排列取前500的结果
dat <- express_cpm
dat <- log2(as.matrix(filter_count)+1)
tmp <- sort(apply(dat,1, mad),decreasing = T)[1:500]#用差异表达最大的五百个基因
exprSet <-dat[names(tmp),]

# 使用500个基因的表达量来做相关性图
library(corrplot)
dim(exprSet)

# 计算相关性
#0-0.3 基本不相关  0.3-0.6 弱相关    0.6-0.8 强相关    0.8-1 非常强相关
M <- cor(exprSet)#cor默认皮尔逊相关,M为对称矩阵
g <- corrplot(M,order = "AOE",addCoef.col = "white")

corrplot(M,order = "AOE",type="upper",tl.pos = "d",method = "color")
corrplot(M,add=TRUE, type="lower", method="number",order="AOE",diag=FALSE,tl.pos="n", cl.pos="n")

# 绘制样本相关性的热图
library(pheatmap)
anno <- data.frame(sampleType=group_list)
rownames(anno) <- colnames(exprSet)#确保注释行的行名等于数据的列名
anno
p <- pheatmap::pheatmap(M,display_numbers = T,annotation_col = anno,fontsize = 12,cellheight = 50,cellwidth = 50,cluster_rows = T,cluster_cols = T)
#display_numbers方块中显示数字,annotation_col注释条,fontsize字体大小,cellheight、cellwidth格子大小,cluster_cols是否聚类
p

pdf(file = "../Analysis/sample_cor/cor.pdf")
print(p)
dev.off()

2.3.1 2.3.2 2.3.3 2.3.4

2.差异表达分析

2.1 2.2 2.3 2.4 2.5 2.6

(1)limma包分析

2.1.1

# 清空当前对象
rm(list = ls())
options(stringsAsFactors = F)

# 读取基因表达矩阵
lname <- load(file = "../code-down/Step01-airwayData.Rdata")
lname

exprSet <- filter_count
# 检查表达谱
dim(exprSet)
exprSet[1:6,1:6]
table(group_list)

# 加载包
library(limma)
library(edgeR)

## 第一步,创建设计矩阵和对比:假设数据符合正态分布,构建线性模型
# 0代表x线性模型的截距为0
design <- model.matrix(~0+factor(group_list))#设计对比矩阵
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(exprSet)
design

# 设置需要进行对比的分组,需要修改
comp <- 'trt-untrt'#前面是处理组后面是对照组
cont.matrix <- makeContrasts(contrasts=c(comp),levels = design)



## 第二步,进行差异表达分析
# 将表达矩阵转换为edgeR的DGEList对象
dge <- DGEList(counts=exprSet)

# 进行标准化
dge <- calcNormFactors(dge)   

#Use voom() [15] to convert the read counts to log2-cpm, with associated weights, ready for linear modelling:
v <- voom(dge,design,plot=TRUE, normalize="quantile") 
fit <- lmFit(v, design)
fit2 <- contrasts.fit(fit,cont.matrix)
fit2 <- eBayes(fit2)


## 第三步,提取过滤差异分析结果
tmp <- topTable(fit2, coef=comp, n=Inf,adjust.method="BH")#对p值进行BH校正
DEG_limma_voom <- na.omit(tmp)
head(DEG_limma_voom)

# 筛选上下调,设定阈值
fc_cutoff <- 1.5
pvalue <- 0.05

DEG_limma_voom$regulated <- "normal"#normal为不显著

loc_up <- intersect(which(DEG_limma_voom$logFC>log2(fc_cutoff)),which(DEG_limma_voom$P.Value<pvalue))
loc_down <- intersect(which(DEG_limma_voom$logFC< (-log2(fc_cutoff))),which(DEG_limma_voom$P.Value<pvalue))

DEG_limma_voom$regulated[loc_up] <- "up"
DEG_limma_voom$regulated[loc_down] <- "down"
  
table(DEG_limma_voom$regulated)

# 添加一列gene symbol
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)

library(clusterProfiler)
id2symbol <- bitr(rownames(DEG_limma_voom), fromType = "ENSEMBL", toType = "SYMBOL", OrgDb = org.Hs.eg.db )
head(id2symbol)

symbol <- rep("NA",time=nrow(DEG_limma_voom))
symbol[match(id2symbol[,1],rownames(DEG_limma_voom))] <- id2symbol[,2]
DEG_limma_voom <- cbind(rownames(DEG_limma_voom),symbol,DEG_limma_voom)
colnames(DEG_limma_voom)[1] <- "GeneID"


# 保存
write.table(DEG_limma_voom,"../Analysis/deg_analysis/DEG_limma_voom_all-1.xls",row.names = F,sep="\t",quote = F)

## 取表达差异倍数和p值,矫正后的pvalue
DEG_limma_voom <- DEG_limma_voom[,c(1,2,3,6,7,9)]
save(DEG_limma_voom, file = "../Analysis/deg_analysis/Step03-limma_voom_nrDEG.Rdata")


## 检查是否上下调设置错了
# 挑选一个差异表达基因
head(DEG_limma_voom)

exp <- c(t(express_cpm[match("ENSG00000178695",rownames(express_cpm)),]))
test <- data.frame(value=exp,group=group_list)
library(ggplot2)
ggplot(data=test,aes(x=group,y=value,fill=group)) + geom_boxplot()

(2)edgeR分析

2.2.1

# 参考链接:https://www.biostars.org/p/110861/

rm(list = ls())
options(stringsAsFactors = F)

# 读取基因表达矩阵信息并查看分组信息和表达矩阵数据
lname <- load(file = "../code-down/Step01-airwayData.Rdata")
lname

exprSet <- filter_count
dim(exprSet)
exprSet[1:6,1:6]
table(group_list)

# 加载包
library(edgeR)

# 假设数据符合正态分布,构建线性模型。0代表x线性模型的截距为0
design <- model.matrix(~0+factor(group_list))
rownames(design) <- colnames(exprSet)
colnames(design) <- levels(factor(group_list))
design

# 构建edgeR的DGEList对象
DEG <- DGEList(counts=exprSet,group=factor(group_list))

# 增加一列$norm.factors
DEG$samples$lib.size <- colSums(DEG$counts)#lib.size测序深度
DEG$samples

# 归一化基因表达分布
DEG <- calcNormFactors(DEG)

# 计算线性模型的参数
DEG <- estimateGLMCommonDisp(DEG,design)
DEG <- estimateGLMTrendedDisp(DEG, design)
DEG <- estimateGLMTagwiseDisp(DEG, design)

# 拟合线性模型
fit <- glmFit(DEG, design)

# 进行差异分析,1,-1意味着前比后
lrt <- glmLRT(fit, contrast=c(1,-1)) 

# 提取过滤差异分析结果
DEG_edgeR <- as.data.frame(topTags(lrt, n=nrow(DEG)))
head(DEG_edgeR)

# 筛选上下调,设定阈值
fc_cutoff <- 1.5
fdr <- 0.05

DEG_edgeR$regulated <- "normal"

loc_up <- intersect(which(DEG_edgeR$logFC>log2(fc_cutoff)),which(DEG_edgeR$FDR<fdr))
loc_down <- intersect(which(DEG_edgeR$logFC< (-log2(fc_cutoff))),which(DEG_edgeR$FDR<fdr))

DEG_edgeR$regulated[loc_up] <- "up"
DEG_edgeR$regulated[loc_down] <- "down"

table(DEG_edgeR$regulated)

# 添加一列gene symbol
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)

library(clusterProfiler)
id2symbol <- bitr(rownames(DEG_edgeR), fromType = "ENSEMBL", toType = "SYMBOL", OrgDb = org.Hs.eg.db )
head(id2symbol)

symbol <- "NA"
symbol[match(id2symbol[,1],rownames(DEG_edgeR))] <- id2symbol[,2]
DEG_edgeR <- cbind(rownames(DEG_edgeR),symbol,DEG_edgeR)
colnames(DEG_edgeR)[1] <- "GeneID"

# 保存
DEG_edgeR_up <-  DEG_edgeR[DEG_edgeR$regulated=="up",]
write.table(DEG_edgeR,"../Analysis/deg_analysis/DEG_edgeR_all.xls",row.names = F,sep="\t",quote = F)

## 取表达差异倍数和p值,矫正后的pvalue
colnames(DEG_edgeR)
DEG_edgeR <- DEG_edgeR[,c(1,2,3,6,7,8)]
save(DEG_edgeR, file = "../Analysis/deg_analysis/Step03-edgeR_nrDEG.Rdata")


## 检查是否上下调设置错了
# 挑选一个差异表达基因
head(DEG_edgeR)

exp <- c(t(express_cpm[match("ENSG00000152583",rownames(express_cpm)),]))
test <- data.frame(value=exp,group=group_list)
library(ggplot2)
ggplot(data=test,aes(x=group,y=value,fill=group)) + geom_boxplot()

(3)DESeq2分析

2.3.1
rm(list = ls())
options(stringsAsFactors = F)

# 读取基因表达矩阵信息
lname <- load(file = "../code-down/Step01-airwayData.Rdata")
lname 

# 查看分组信息和表达矩阵数据
exprSet <- filter_count
dim(exprSet)
exprSet[1:6,1:6]
table(group_list)

# 加载包
library(DESeq2)

# 第一步,构建DESeq2的DESeq对象
colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,design = ~ group_list)

# 第二步,进行差异表达分析
dds2 <- DESeq(dds)

# 提取差异分析结果,trt组对untrt组的差异分析结果
tmp <- results(dds2,contrast=c("group_list","trt","untrt"))
DEG_DESeq2 <- as.data.frame(tmp[order(tmp$padj),])
head(DEG_DESeq2)

# 去除差异分析结果中包含NA值的行
DEG_DESeq2 = na.omit(DEG_DESeq2)

# 筛选上下调,设定阈值
fc_cutoff <- 2
fdr <- 0.05

DEG_DESeq2$regulated <- "normal"

loc_up <- intersect(which(DEG_DESeq2$log2FoldChange>log2(fc_cutoff)),which(DEG_DESeq2$padj<fdr))
loc_down <- intersect(which(DEG_DESeq2$log2FoldChange< (-log2(fc_cutoff))),which(DEG_DESeq2$padj<fdr))

DEG_DESeq2$regulated[loc_up] <- "up"
DEG_DESeq2$regulated[loc_down] <- "down"

table(DEG_DESeq2$regulated)

# 添加一列gene symbol
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)

library(clusterProfiler)
id2symbol <- bitr(rownames(DEG_DESeq2), fromType = "ENSEMBL", toType = "SYMBOL", OrgDb = org.Hs.eg.db )
head(id2symbol)

symbol <- rep("NA",time=nrow(DEG_DESeq2))
symbol[match(id2symbol[,1],rownames(DEG_DESeq2))] <- id2symbol[,2]
DEG_DESeq2 <- cbind(rownames(DEG_DESeq2),symbol,DEG_DESeq2)
colnames(DEG_DESeq2)[1] <- "GeneID"
head(DEG_DESeq2)

# 保存
write.table(DEG_DESeq2,"../Analysis/deg_analysis/DEG_DESeq2_all.xls",row.names = F,sep="\t",quote = F)

## 取表达差异倍数和p值,矫正后的pvalue并保存
colnames(DEG_DESeq2)
DEG_DESeq2 <- DEG_DESeq2[,c(1,2,4,7,8,9)]
save(DEG_DESeq2, file = "../Analysis/deg_analysis/Step03-DESeq2_nrDEG.Rdata")


## 检查是否上下调设置错了
# 挑选一个差异表达基因
head(DEG_DESeq2)

exp <- c(t(express_cpm[match("ENSG00000152583",rownames(express_cpm)),]))
test <- data.frame(value=exp,group=group_list)
library(ggplot2)
ggplot(data=test,aes(x=group,y=value,fill=group)) + geom_boxplot()

2.3.2

3.差异结果可视化

3.1

rm(list = ls())
options(stringsAsFactors = F)

# 加载原始表达矩阵
load(file = "../Analysis/data/Step01-airwayData.Rdata")

# 读取3个软件的差异分析结果
load(file = "../Analysis/deg_analysis/Step03-limma_voom_nrDEG.Rdata")
load(file = "../Analysis/deg_analysis/Step03-DESeq2_nrDEG.Rdata")
load(file = "../Analysis/deg_analysis/Step03-edgeR_nrDEG.Rdata")
ls()

# 提取所有差异表达的基因名
limma_sigGene <- DEG_limma_voom[DEG_limma_voom$regulated!="normal",1]
edgeR_sigGene <- DEG_edgeR[DEG_edgeR$regulated!="normal",1]
DESeq2_sigGene <- DEG_DESeq2[DEG_DESeq2$regulated!="normal",1]

# 绘制热图
dat <- express_cpm[match(limma_sigGene,rownames(express_cpm)),]
dat[1:4,1:4]
group <- data.frame(group=group_list)
rownames(group)=colnames(dat)

# 加载包
library(pheatmap)
p <- pheatmap(dat,scale = "row",show_colnames =T,show_rownames = F, cluster_cols = T,annotation_col=group,main = "limma's DEG",treeheight_row = 0,treeheight_col = 0) 


pdf()
png()
tiff()


3.2
rm(list = ls())
options(stringsAsFactors = F)

# 加载原始表达矩阵
load(file = "../Analysis/data/Step01-airwayData.Rdata")

# 读取3个软件的差异分析结果
load(file = "../Analysis/deg_analysis/Step03-limma_voom_nrDEG.Rdata")
load(file = "../Analysis/deg_analysis/Step03-DESeq2_nrDEG.Rdata")
load(file = "../Analysis/deg_analysis/Step03-edgeR_nrDEG.Rdata")
ls()

# 根据需要修改DEG的值
data <- DEG_limma_voom
colnames(data)


# 绘制火山图
library(ggplot2)
colnames(data)
p <- ggplot(data=data, aes(x=logFC, y=-log10(adj.P.Val),color=regulated)) + 
     geom_point(alpha=0.5, size=1.8) + theme_set(theme_set(theme_bw(base_size=20))) + 
     xlab("log2FC") + ylab("-log10(FDR)") +scale_colour_manual(values = c('blue','black','red'))
p


上一篇下一篇

猜你喜欢

热点阅读