GEO数据挖掘

第一步:简单的GEO数据提取及数据分析

2020-05-20  本文已影响0人  碌碌无为的杰少

简介

这是复现一篇结肠癌的文献图例,GSE4107,文献地址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4399548/pdf/ott-8-745.pdf,其中由于作者细节未公布,可能差异基因个数有稍许差别

数据提取

利用getGEO函数下载GSE4107,用exprs函数获的exp表达矩阵,pData函数获取临床信息pd临床数据,@annotation获得gpl平台数据。

#数据下载
rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse, 
               destdir = '.', 
               getGPL = F)
#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
exp = log2(exp+1)
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)调整pd的行名顺序与exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)提取芯片平台编号
gpl <- eSet[[1]]@annotation
save(gse,pd,exp,gpl,file = "step1output.Rdata")

获取GPL平台信息

首先利用pd中的title信息获取临床信息并factor化,再利用GPL获取探针信息。

rm(list = ls())  
load(file = "step1output.Rdata")
library(stringr)
pd$title
group_list=ifelse(str_detect(pd$title,"healthy control"),"control","treat")
#设置参考水平,对照在前,处理在后
group_list = factor(group_list,
                    levels = c("control","treat"))
#2.ids-----------------
#方法1 BioconductorR包
gpl 
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL)
head(ids)
# 方法2 读取gpl页面的soft文件,按列取子集
# 方法3 官网下载
# 方法4 自主注释 
save(exp,group_list,ids,file = "step2output.Rdata")

制作PCA图和热图

这一步只要提供expgrou_list其他不用变

rm(list = ls())  
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
boxplot(exp,outline=FALSE,notch=T,las=2)
#输入数据:exp和group_list
#Principal Component Analysis
#http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials

dat=as.data.frame(t(exp))
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra) 
# pca的统一操作走起
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point", # show points only (nbut not "text")
                         col.ind = group_list, # color by groups
                         #palette = c("#00AFBB", "#E7B800"),
                         addEllipses = TRUE, # Concentration ellipses
                         legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")

#热图 
cg=names(tail(sort(apply(exp,1,sd)),1000))
n=exp[cg,]

#绘制热图
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(pheatmap)
pheatmap1 <- pheatmap(n,
         show_colnames =F,
         show_rownames = F,
         annotation_col=annotation_col,
         scale = "row")
ggsave(plot = pheatmap1,filename = paste0(gse,"pheatmap1.png"))

做差异分析

利用贝叶斯获取表达差异矩阵,用inner_join获得探针信息,获得deg表达。

rm(list = ls()) 
load(file = "step2output.Rdata")
#差异分析,用limma包来做
#需要表达矩阵和group_list,不需要改
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)

#为deg数据框添加几列
#1.加probe_id列,把行名变成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg))
head(deg)
#2.加symbol列,火山图要用
deg <- inner_join(deg,ids,by="probe_id")
head(deg)
deg <- deg[!duplicated(deg$symbol),]
#3.加change列,标记上下调基因
logFC_t=1
P.Value_t = 0.05
k1 = (deg$adj.P.Val < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$adj.P.Val < P.Value_t)&(deg$logFC > logFC_t)
table(k1)
table(k2)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change)

火山图

提供了差异前几基因或者某一单独基因的实现方法

load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
#1.火山图----
library(dplyr)
library(ggplot2)

dat  = deg
if(T) {
  x1 = dat %>% 
    filter(change == "up") %>% 
    arrange(desc(logFC))%>%
    head(3)
  x2 = dat %>% 
    filter(change == "down") %>% 
    arrange(logFC)%>%
    head(3)
  for_label = rbind(x1,x2)
}

#%>%
if(F){
  for_label <- dat%>% 
    filter(symbol %in% c("VIP","SFRP1")) 
}
if(F){
  for_label <- dat %>% head(10)
}
if(F) {
  x1 = dat %>% 
    filter(change == "up") %>% 
    head(3)
  x2 = dat %>% 
    filter(change == "down") %>% 
    head(3)
  for_label = rbind(x1,x2)
}

#  head(10)
p <- ggplot(data = dat, 
            aes(x = logFC, 
                y = -log10(adj.P.Val))) +
  geom_point(alpha=0.4, size=3.5, 
             aes(color=change)) +
  ylab("-log10(Pvalue)")+
  scale_color_manual(values=c("blue", "grey","red"))+
  geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
  geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
  theme_bw()+
  theme(panel.border = element_blank(),panel.grid.major = element_blank(),
        
         panel.grid.minor = element_blank(),axis.line = element_line(colour = "black"))
p
volcano_plot <- p +
  geom_point(size = 3, shape = 1, data = for_label) +
  ggrepel::geom_label_repel(
    aes(label = symbol),
    data = for_label,
    color="black")
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))
image.png

热图和拼图

热图提供前多少基因或者所有差异基因的热图,if函数提供了标记列名的函数,最后用patchwork进行拼图

load(file = 'step2output.Rdata')
dd1=deg
x=dd1$logFC 
names(x)=dd1$probe_id
cg=c(names(head(sort(x),30)),names(tail(sort(x),30)))
cg
#cg = deg$probe_id[deg$change !="stable"]
rownames(dd1) <- dd1$probe_id
dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
dd2
if(F) {
  rownames(dd1) <- dd1$probe_id
  dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
  dd2
}
n=exp[cg,]
#rownames(n) <- dd2
dim(n)
#作热图
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
                                   show_rownames = T,
                                   scale = "row",
                                   #cluster_cols = F, 
                                   annotation_col=annotation_col
                              )) 
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))
load("pca_plot.Rdata")
library(patchwork)
(pca_plot | volcano_plot )/heatmap_plot+ plot_annotation(tag_levels = "A")
Cggsave(filename = "test.png",width = 15,height = 5)
image.png
上一篇下一篇

猜你喜欢

热点阅读