R语言作业GO, KEGG, DO富集分析富集分析

【生信技能树】2020-01-08作业:GEO挖掘标准流程分析G

2020-01-10  本文已影响0人  猫叽先森

题目出自【生信技能树】公众号文章:2011年的表达芯片分析和2019年的区别
要求:走GEO标准分析流程分析GSE27447


代码主要复制于Jimmy大佬的github:https://github.com/jmzeng1314/GEO/tree/master/GSE42872_main
主要步骤:
step1 - 读入GSE27447数据
step2 - 检查表达矩阵
step3 - 差异分析
step4 - GO/KEGG数据库注释
step5 - GSEA基因集富集分析(施工中)
step6 - GSVA基因集变异分析(施工中)


step1 - 读入GSE27447数据

rm(list=ls())
options(stringsAsFactors = F)

library(AnnoProbe) # 生信技能树出品的GEO数据下载利器
library(Biobase)
gset <- geoChina("GSE27447")
gse27447 <- gset[[1]]
exprSet <- exprs(gse27447)
boxplot(exprSet,las=2)
boxplot_before

根据boxplot可以得出,原始数据需要log2处理。

exprSet <- log2(exprSet+1)
boxplot(exprSet,las=2)
boxplot_after_log2
exprSet[1:4,1:4]
#        GSM678364 GSM678365 GSM678366 GSM678367
#7892501  3.221012  4.502222  1.271969  1.849167
#7892502  6.909593  7.305615  6.956150  7.078983
#7892503  5.125911  8.659878  5.395697  5.660467
#7892504 11.045603 10.060453 10.845937 11.145225

#行名是探针,列名是样本,中间的数据是某样本中某探针的表达量。

gse27447@annotation
#[1] "GPL6244"
checkGPL(gse27447@annotation)
#[1] TRUE 

# checkGPL()结果是TRUE说明AnnoProbe包中存在"GPL6244" 平台数据,于是可以使用另外两个超级厉害的函数idmap()filterEM(),得到探针对应的基因名,然后把表达矩阵的探针名转换为基因名。

ids <- idmap(gse27447@annotation)
dat <- filterEM(exprSet,ids)
dim(dat)
#[1] 18837    19
dat <- dat[order(rownames(dat)),]

获取临床信息,从中进一步获取分组信息

pd <- pData(gse27447)
library(stringr)
group_list=str_split(pd$title,' ',simplify = T)[,1]
table(group_list)
#group_list
#non-triple     triple 
#        14          5 

所以是5个三阴乳腺癌样本,14个非三阴乳腺癌样本。
把表达矩阵和分组信息保存到Rdata文件。

save(dat,group_list,file = 'step1-output.Rdata')

step2 - 检查表达矩阵

rm(list = ls())  ##清空Enviroment
options(stringsAsFactors = F)
load('step1-output.Rdata') ##读入第一步得到的数据,然后检查数据

table(group_list)
#group_list
#non-triple     triple 
#        14          5 
dat[1:4,1:4]
#        GSM678364 GSM678365 GSM678366 GSM678367
#A1CF     6.654092  6.074542  6.258994  6.609746
#A2M     10.266259 11.034813 10.494956 11.450283
#A2ML1    6.375017  6.118954  5.683851  4.278468
#A3GALT2  4.424096  4.144055  4.906982  5.065266

2.1 样本相关性分析

dim(dat)
#[1] 18837    19
ac=data.frame(groups=group_list)
rownames(ac)=colnames(dat) #把ac的行名给到n的列名,即对每一个探针标记上分组信息
pheatmap(cor(dat),annotation_col = ac)
heatmap - cor.png

2.2 主成分分析(PCA)

# 我们是对样本做主成分分析,所以要求行名是样本名,列名是探针(基因)名,所以需要对表达矩阵进行转置

dat=t(dat) 
dat=as.data.frame(dat)
dat=cbind(dat,group_list) ##给表达矩阵加上分组信息

library("FactoMineR")
library("factoextra") 
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#dat最后一列是group_list。pca分析需要一个纯数值矩阵,所以将dat最后一列去掉以后赋值给dat.pca
(fviz_pca_ind(dat.pca,geom.ind = "point",
             col.ind = dat$group_list,
             palette = c("#00AFBB", "#E7B800"),
             addEllipses = TRUE,
             legend.title = "Groups"
))
ggsave('all_samples_PCA.png')
all_samples_PCA.png

可以看到,两组样本并没有分开╮(╯▽╰)╭
不过大家都是肿瘤样本,相似性较高也可以理解。

2.3 取sd最大的1000个基因画热图

rm(list = ls())  ## 魔幻操作,一键清空~
load(file = 'step1-output.Rdata')
dat[1:4,1:4] 
cg=names(tail(sort(apply(dat,1,sd)),1000))

##使用 apply()函数计算获取表达矩阵dat每行(即每个基因表达量)的方差,从小到大排序后,取最大的1000个方差,获取其对应的基因名,赋值给变量cg
##然后就可以对这1000个基因画热图

library(pheatmap)
pheatmap(dat[cg,],show_colnames =F,show_rownames = F)
heatmap - cg.png

优化一下,比如对dat[cg,]归一化:

n=t(scale(t(dat[cg,]))) 
n[n>2]=2 
n[n< -2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
heatmap - n.png

给样本添加分组信息:

ac=data.frame(g=group_list)
rownames(ac)=colnames(n) #把ac的行名给到n的列名,即对每一个样本标记上分组信息
(heatmap <- pheatmap(n,show_colnames =F,show_rownames = F,
                    annotation_col=ac))
library(ggplot2)
ggsave(filename = 'heatmap_top1000_sd.png',heatmap)
heatmap_top1000_sd.png

step3 - 差异分析

3.1 使用limma包做差异分析

参考:【生信菜鸟团】用limma对芯片数据做差异分析

rm(list = ls())  ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = 'step1-output.Rdata') ##载入数据
dat[1:4,1:4] 
#        GSM678364 GSM678365 GSM678366 GSM678367
#A1CF     6.654092  6.074542  6.258994  6.609746
#A2M     10.266259 11.034813 10.494956 11.450283
#A2ML1    6.375017  6.118954  5.683851  4.278468
#A3GALT2  4.424096  4.144055  4.906982  5.065266
group_list[group_list=='non-triple'] <- 'non_triple'
table(group_list) 
#group_list
#non_triple     triple 
#        14          5 

使用箱线图boxplot肉眼观察一下前几个基因的数据分布。

boxplot(unlist(dat[1,])~group_list) 
boxplot - gene-1.png
自定义一个函数,用ggpubr包画漂亮点的boxplot
library(ggpubr)
bp=function(g){         #定义一个函数g,函数为{}里的内容
  df=data.frame(gene=g,stage=group_list)
  p <- ggboxplot(df, x = "stage", y = "gene",
                 color = "stage", palette = "jco",
                 add = "jitter")
  p + stat_compare_means()
}
p1 <- bp(as.numeric(dat[1,]))
p2 <- bp(as.numeric(dat[2,]))
p3 <- bp(as.numeric(dat[3,]))
p4 <- bp(as.numeric(dat[4,]))

library(patchwork)
(bp_4 <- p1|p2|p3|p4)
boxplot - 4gene.png

以上也只属于check data,下面开始真正的差异分析

library(limma)

design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
exprSet=dat
rownames(design)=colnames(exprSet)
contrast.matrix<-makeContrasts("triple-non_triple",levels = design)
contrast.matrix ##这个矩阵声明,我们要把 triple 组跟 non_triple 进行差异分析比较
#            Contrasts
#Levels       triple-non_triple
#  non_triple                -1
#  triple                     1

自定义函数DEG,功能是做差异分析。

DEG <- function(exprSet,design,contrast.matrix){
  fit <- lmFit(exprSet,design)
  fit2 <- contrasts.fit(fit, contrast.matrix) 
  fit2 <- eBayes(fit2) 
  tempOutput = topTable(fit2, coef=1, n=Inf)
  nrDEG = na.omit(tempOutput) 
  return(nrDEG)
}

把数据喂给DEG函数,获取差异分析结果,并保存到Rdata

deg <- DEG(exprSet,design,contrast.matrix)
head(deg)
#            logFC  AveExpr         t      P.Value  adj.P.Val        B
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433
#NEK11   -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194
#HOXB3   -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988
#EGR2     2.140287 6.380807  5.680934 1.350247e-05 0.06003476 2.575932
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696
# DNAH5   -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157
save(deg,file = 'deg.Rdata')

手动检查deg前几个基因表达情况

bp(as.numeric(dat[rownames(dat)=='ARFGEF3',]))
ARFGEF3_exprData.png

可以看到,与non-triple组比较,triple组表达ARFGEF3显著下调,与deg结果一致。

3.2 差异分析结果的可视化

3.2.1 火山图

设置P.ValuelogFC的阈值,将基因分为up,down,stable三类

df <- deg
colnames(deg)
df$v <- -log10(df$P.Value)
p_thred <- 0.05
logFC_thred <- 1.5
df$groups = ifelse(df$P.Value > p_thred, "stable", 
              ifelse(df$logFC > logFC_thred, "up", 
                ifelse(df$logFC < -logFC_thred, "down", 
                                 "stable")))
table(df$groups)
#  down stable     up 
#   164  18554    119 
library(ggplot2)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_thred,3),
                    '\nThe number of up gene is ',nrow(df[df$groups =='up',]) ,
                    '\nThe number of down gene is ',nrow(df[df$groups =='down',])
)
p <- ggplot(data = df, aes(x = logFC, y = v)) +
  geom_point(alpha=0.4, size=1.75, 
             aes(color=groups)) +
  scale_color_manual(values=c("blue", "grey","red")) +
  geom_vline(xintercept=c(-logFC_thred,logFC_thred),lty=4,col="black",lwd=0.8) +
  geom_hline(yintercept = -log10(p_thred),lty=4,col="black",lwd=0.8) +
  labs(x="logFC",y="-log10(P.value)")+
  ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  theme_bw()
print(p)
volcano.png
3.2.2 热图
  load(file = 'step1-output.Rdata')
  dat[1:4,1:4]
#        GSM678364 GSM678365 GSM678366 GSM678367
#A1CF     6.654092  6.074542  6.258994  6.609746
#A2M     10.266259 11.034813 10.494956 11.450283
#A2ML1    6.375017  6.118954  5.683851  4.278468
#A3GALT2  4.424096  4.144055  4.906982  5.065266
  table(group_list)
#group_list
#non-triple     triple 
#        14          5
x=deg$logFC #deg取logFC这列并将其重新赋值给x
names(x)=rownames(deg) #deg取probe_id这列,并将其作为名字给x
cg=c(names(head(sort(x),100)),#对x进行从小到大排列,取前100及后100,并取其对应的探针名,作为向量赋值给cg
       names(tail(sort(x),100)))

n=t(scale(t(dat[cg,])))
n[n>2]=2
n[n< -2]= -2

ac=data.frame(groups=group_list)
rownames(ac)=colnames(n) 
pheatmap(n,show_colnames =F,show_rownames = F,
           annotation_col=ac) 
heatmap - deg - top200.png

step4 - GO/KEGG数据库注释

参考:【生信技能树】公众号文章:为R包写一本书(像Y叔致敬)

4.0 整理数据

设置P.Value和logFC的阈值,将基因分为UPDOWNstable3组, deg矩阵新建一列g储存基因分组信息。

不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。

rm(list = ls())  ## 魔幻操作,一键清空~
load(file = 'deg.Rdata')
head(deg)
#            logFC  AveExpr         t      P.Value  adj.P.Val        B
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433
#NEK11   -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194
#HOXB3   -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988
#EGR2     2.140287 6.380807  5.680934 1.350247e-05 0.06003476 2.575932
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696
#DNAH5   -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157
p_thred <- 0.05
logFC_thred <- 1.5
deg$g=ifelse(deg$P.Value>p_thred,'stable',
             ifelse( deg$logFC > logFC_thred,'UP',
                     ifelse( deg$logFC < -logFC_thred,'DOWN','stable') )
)
table(deg$g)
#  DOWN stable     UP 
#   164  18554    119 
head(deg)
#            logFC  AveExpr         t      P.Value  adj.P.Val        B      g
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433   DOWN
#NEK11   -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194   DOWN
#HOXB3   -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988 stable
#EGR2     2.140287 6.380807  5.680934 1.350247e-05 0.06003476 2.575932     UP
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696   DOWN
#DNAH5   -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157   DOWN

富集分析需要基因的ENTREZID,使用Y叔神作clusterProfiler包里的bitr()函数获取对应关系,deg矩阵转存为DEG,并加上ENTREZID

deg$SYMBOL=rownames(deg)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(unique(deg$SYMBOL), fromType = "SYMBOL",
           toType = c( "ENTREZID"),
           OrgDb = org.Hs.eg.db)
head(df)
#   SYMBOL ENTREZID
#1 ARFGEF3    57221
#2   NEK11    79858
#3   HOXB3     3213
#4    EGR2     1959
#5 C3orf14    57415
#6   DNAH5     1767
DEG=deg
head(DEG)
#            logFC  AveExpr         t      P.Value  adj.P.Val        B      g  SYMBOL
#ARFGEF3 -1.933996 9.224776 -5.893853 8.360630e-06 0.06003476 2.930433   DOWN ARFGEF3
#NEK11   -1.644951 6.944406 -5.882179 8.582134e-06 0.06003476 2.911194   DOWN   NEK11
#HOXB3   -1.459656 7.038370 -5.696430 1.303732e-05 0.06003476 2.601988 stable   HOXB3
#EGR2     2.140287 6.380807  5.680934 1.350247e-05 0.06003476 2.575932     UP    EGR2
#C3orf14 -2.110725 6.862692 -5.555574 1.794713e-05 0.06003476 2.363696   DOWN C3orf14
#DNAH5   -3.742125 5.909818 -5.478036 2.141867e-05 0.06003476 2.231157   DOWN   DNAH5

DEG=merge(DEG,df,by='SYMBOL')
head(DEG)
#   SYMBOL       logFC   AveExpr           t    P.Value adj.P.Val         B      g ENTREZID
#1    A1CF -0.24279450  6.481373 -0.78084875 0.44383564 0.8444129 -5.353833 stable    29974
#2     A2M  0.74751848 11.069348  1.89303707 0.07258966 0.5827869 -4.132066 stable        2
#3   A2ML1  0.99851595  4.933066  0.92880174 0.36382423 0.8085547 -5.242500 stable   144568
#4 A3GALT2  0.03320466  4.269750  0.08468268 0.93333708 0.9894294 -5.625240 stable   127550
#5  A4GALT -0.28893317  6.883433 -0.71439103 0.48305962 0.8637983 -5.397984 stable    53947
#6   A4GNT  0.43189200  6.202285  1.60275087 0.12432137 0.6497896 -4.528644 stable    51146
save(DEG,file = 'anno_DEG.Rdata')

分别取出上调基因和下调基因,合并为差异基因

gene_up <- DEG[DEG$g == 'UP','ENTREZID'] 
gene_down <- DEG[DEG$g == 'DOWN','ENTREZID'] 
gene_diff <- c(gene_up,gene_down)
gene_all <- as.character(DEG[ ,'ENTREZID'] )

geneList <- DEG$logFC
names(geneList) <- DEG$ENTREZID
geneList <- sort(geneList,decreasing = T)

4.1 KEGG

4.1.1 上调基因集
library(ggplot2)
library(clusterProfiler)
kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'hsa',
                      universe     = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff = 0.9)
head(kk.up)[,1:6]
#               ID                       Description GeneRatio  BgRatio       pvalue    p.adjust
#hsa04640 hsa04640        Hematopoietic cell lineage      7/53  88/7130 3.334829e-06 0.000506894
#hsa04662 hsa04662 B cell receptor signaling pathway      6/53  78/7130 2.152723e-05 0.001636069
#hsa04062 hsa04062       Chemokine signaling pathway      6/53 175/7130 1.769682e-03 0.081549397
#hsa05340 hsa05340          Primary immunodeficiency      3/53  35/7130 2.146037e-03 0.081549397
#hsa04921 hsa04921        Oxytocin signaling pathway      5/53 147/7130 4.506704e-03 0.133247928
#hsa04064 hsa04064      NF-kappa B signaling pathway      4/53  95/7130 5.259787e-03 0.133247928

KEGG分析上调基因集结果可视化:
1)上调基因所属信号通路(气泡图)

dotplot(kk.up)
kk.up.dotplot.png

2)查看第一个结果hsa04640的信号通路示意图:

browseKEGG(kk.up, 'hsa04640')
KEGG_hsa04640.png

3)上调基因所属信号通路(条带图)

(gg_barplot <- barplot(kk.up,showCategory=20))
KEGG_up_barplot.png

4)通路与基因之间的关系可视化:

cnetplot(kk.up, categorySize="pvalue", foldChange=geneList,colorEdge = TRUE)  
KEGG_up_cnetplot.png

5)通路与通路之间的关系图:

emapplot(kk.up)
KEGG_up_emapplot.png

6)热图展现通路与基因之间的关系:

heatplot(kk.up)
KEGG_up_heatplot.png
4.1.2 下调基因集

同样的方法计算下调基因集KEGG

kk.down <- enrichKEGG(gene         =  gene_down,
                                        organism     = 'hsa',
                                        universe     = gene_all,
                                        pvalueCutoff = 0.9,
                                        qvalueCutoff = 0.9)
  head(kk.down)[,1:6]
#                 ID                      Description GeneRatio  BgRatio       pvalue  p.adjust
#hsa04915 hsa04915       Estrogen signaling pathway      6/62 130/7130 0.0008719172 0.1464821
#hsa00910 hsa00910              Nitrogen metabolism      2/62  16/7130 0.0082547476 0.4221930
#hsa04921 hsa04921       Oxytocin signaling pathway      5/62 147/7130 0.0087691281 0.4221930
#hsa04934 hsa04934                 Cushing syndrome      5/62 152/7130 0.0100522154 0.4221930
#hsa04360 hsa04360                    Axon guidance      5/62 177/7130 0.0184322243 0.4366278
#hsa04927 hsa04927 Cortisol synthesis and secretion      3/62  65/7130 0.0186643448 0.4366278
dotplot(kk.down );ggsave('kk.down.dotplot.png')
kk.down.dotplot.png
4.1.2 差异基因集
  kk.diff <- enrichKEGG(gene         = gene_diff,
                        organism     = 'hsa',
                        pvalueCutoff = 0.05)
  head(kk.diff)[,1:6]
  dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
kk.diff.dotplot.png

合并上下调基因后,几乎分析不出信号通路。╮(╯▽╰)╭

4.1.3 Pathway Enrichment
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1

kegg_plot <- function(up_kegg,down_kegg){
    dat=rbind(up_kegg,down_kegg)
    colnames(dat)
    dat$pvalue = -log10(dat$pvalue)
    dat$pvalue=dat$pvalue*dat$group 
    
    dat=dat[order(dat$pvalue,decreasing = F),]
    
    g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
      geom_bar(stat="identity") + 
      scale_fill_gradient(low="blue",high="red",guide = FALSE) + 
      scale_x_discrete(name ="Pathway names") +
      scale_y_continuous(name ="log10P-value") +
      coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
      ggtitle("Pathway Enrichment") 
}
 
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down.png')
kegg_up_down.png
这张图其实就是上调基因和下调基因pathway条带图合并,可以看到Oxytocin signaling pathway同时存在于上调基因和下调基因pathway,刚好跟差异基因集的分析结果一样。
4.1.4 GSEA
kk_gse <- gseKEGG(geneList     = geneList,
                    organism     = 'hsa',
                    nPerm        = 1000,
                    minGSSize    = 120,
                    pvalueCutoff = 0.9,
                    verbose      = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
gseaplot.png
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
  
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
kegg_up_down_gsea.png

4.2 GO

g_list=list(gene_up=gene_up,
            gene_down=gene_down,
            gene_diff=gene_diff)
  
go_enrich_results <- lapply( g_list , function(gene) {
      lapply( c('BP','MF','CC') , function(ont) {
      cat(paste('Now process ',ont ))
      ego <- enrichGO(gene          = gene,
                      universe      = gene_all,
                      OrgDb         = org.Hs.eg.db,
                      ont           = ont ,
                      pAdjustMethod = "BH",
                      pvalueCutoff  = 0.99,
                      qvalueCutoff  = 0.99,
                      readable      = TRUE)
        
      print( head(ego) )
      return(ego)
    })
})
save(go_enrich_results,file = 'go_enrich_results.Rdata')

load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC') 
for (i in 1:3){
  for (j in 1:3){
    fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
    cat(paste0(fn,'\n'))
    png(fn,res=150,width = 1080)
    dotplot(go_enrich_results[[i]][[j]],title=paste0('dotplot_',n1[i],'_',n2[j])) %>% print()
    dev.off()
  }
}
dotplot_gene_up_BP.png
dotplot_gene_up_MF.png
dotplot_gene_up_CC.png dotplot_gene_down_BP.png
dotplot_gene_down_MF.png
dotplot_gene_down_CC.png dotplot_gene_diff_BP.png
dotplot_gene_diff_MF.png
dotplot_gene_diff_CC.png

未完待续……

step5 - GSEA基因集富集分析

step6 - GSVA基因集变异分析

上一篇下一篇

猜你喜欢

热点阅读