GEO——代码复现

2022-04-18 本文已影响0人 monkey_study
文章题目：Identification of the interaction network of hub genes for melanoma treated with vemurafenib based on microarray data####

GEO42872 GPL6244

B站学习了生信技能树jimmy老师的代码后，自己尝试运行复现一遍，从下载数据（GEOquery/GEOmirror）到检查数据（boxplot/PCA），分组的构建，接着是标准化流程：差异分析（可视化展示：热图、火山图），功能富集分析（GO、KEGG，这里用的是y叔的clusterProfiler包），GSEA（gene set enrichment analysis）。其中，火山图画的感觉还不错，哈哈！

rm(list = ls())  #一键清空
options(stringsAsFactors = F) #在调用as.data.frame的时，将stringsAsFactors设置为FALSE可以避免character类型自动转化为factor类型
##加载包
library(GEOmirror)
suppressMessages(library(GEOquery))
#GSE42872数据下载
f <- 'GSE42872_eSet.Rdata'
if (T) {
gse <- geoChina(gse = "GSE42872")
save(gse,file=f)
}
load(f)
exprSet <- exprs(gse[[1]])  #提取表达矩阵赋值给exprSet
#查看数据
dim(exprSet)  #行33297     列6
head(exprSet)
range(exprSet)   # [1]  2.66517 14.56410  初步判断数据可能经过了log转换
boxplot(exprSet,las=0)  #已经过标准化，las参数调整横轴标签方向0横，1横，2纵，3纵

#提取临床信息
pdat <- pData(gse[[1]])
View(pdat)
library(stringr)  #字符串处理包
group_list <- str_split(pdat$title," ",simplify = T)[,4]  #提取分组信息
table(group_list)  #查看分组
if (F) {
##注释包/GEO注释平台 GPL6244
BiocManager::install("hugene10sttranscriptcluster.db")  #下载对应注释包
library(hugene10sttranscriptcluster.db)  #加载包
ls("package:hugene10sttranscriptcluster.db") #查看包得内容
ids <- toTable(hugene10sttranscriptclusterSYMBOL)  #toTable函数提取探针id，symbol对应关系
head(ids)  #查看前六行 
exprs
dim(ids) # 19869     2
ids<- ids[ids$symbol!=" ",] # 19869     2 提取symbol 不为空得行
ids <- ids[ids$probe_id %in%  rownames(exprSet),] #19869 
exprSet[1:4,1:4]
head(ids)
exprSet <- exprSet[rownames(exprSet) %in%ids$probe_id,  ]
dim(exprSet) # 19869     6
table(sort(table(ids$symbol))) #查看基因探针对应关系
# 1     2     3     4     5     6     7     8    10    26 
# 18098   597   131    15     7     5     1     2     1     1 
#由于多个探针对应一个基因，所以取基因在样本中的均值并排序取最大得作为唯一探针与基因对应。
##加载注释函数
anno <- function(exprSet,ids){
  probes <- as.character(by(exprSet,ids$symbol,function(x)rownames(x)[which.max(rowMeans(x))]))
  exprSet=exprSet[rownames(exprSet) %in% probes ,]
  rownames(exprSet)=ids[ids$probe_id%in%rownames(exprSet),2]
  return(exprSet)
}
exprSet<- anno(exprSet,ids)
exprSet[1:4,1:4]  #查看表达矩阵
#保存注释后的表达矩阵以及分组信息
}
save(exprSet,group_list,file = "output.Rdata")


##检查分组是否合适，PCA。
load("output.Rdata")
dat<- exprSet
## 下面是画PCA的必须操作，需要看说明书。
#画PCA图时要求是行名时样本名，列名时探针名，因此此时需要转换。将matrix转换为data.frame
dat <- cbind(as.data.frame(t(dat)),group_list) #cbind横向追加，即将分组信息追加到最后一列
library("FactoMineR")#画主成分分析图需要加载这两个包
library("factoextra") 
dim(dat)  #[1]     6 18859
dim(exprSet)  #[1] 18858     6
# The variable group_list is removed before PCA analysis
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#现在dat最后一列是group_list，需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
fviz_pca_ind(dat.pca,
             geom.ind = "point", # 图形显示形式，可以是point，text
             col.ind = dat$group_list, # color by groups
             # palette = c("#00AFBB", "#E7B800"),
             addEllipses = F, # 图例是否加框
             legend.title = "Groups"
)  #PCA可视化fviz_pca_ind
ggsave('all_samples_PCA.png',width = 600,height = 600,units = "mm")

###DEGs package limma input: 1.expression matrix 2.design matrix 3. contrast matrix
suppressMessages(library(limma))
#1.expression matrix :exprSet
exprSet[1:4,1:4]
#2 design matrix
table(group_list)
design <- model.matrix(~0+factor(group_list,levels = c("Control","Vemurafenib"),ordered = T))
head(design)
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(exprSet)
#3. contrast matrix
contrast <- makeContrasts(paste0(rev(levels(factor(group_list))),collapse = "-"),
                          levels =design)
#DEGs
DEGs <- function(exprSet,design,contrast){
# step1
fit <- lmFit(exprSet,design)
# step2
fit1 <- contrasts.fit(fit,contrast)
# step3
fit2 <- eBayes(fit1)
nrDEG  <- na.omit(topTable(fit2,coef = paste0(rev(levels(factor(group_list))),collapse = "-"),number = Inf,adjust.method = "BH"))
return(nrDEG)
}
nrDEG <- DEGs(exprSet,design,contrast)
save(nrDEG,file = 'nrDEG.Rdata')
table(nrDEG$adj.P.Val<0.01&nrDEG$logFC>=2)
table(nrDEG$adj.P.Val<0.01&nrDEG$logFC<=-2)

#valcone plot
if(T){ 
  library(dplyr)
  library(tidyverse)
  library(ggplot2)
  DEG_valvone <- nrDEG
  colnames(DEG_valvone)
  DEG_valvone <- DEG_valvone %>%
    select(logFC,AveExpr, adj.P.Val) %>%
    mutate(v= -log10(adj.P.Val),
           gene_type = case_when(logFC >= 2 & adj.P.Val < 0.01 ~ "up",
                                 logFC <= -2 & adj.P.Val < 0.01 ~ "down",
                                 TRUE ~ "stable")) 
  ##查看x坐标轴设定范围
  DEG_valvone %>%
    pull(logFC) %>%
    min() %>%
    floor()   #查看x轴最小边界 -5
  DEG_valvone %>%
    pull(logFC) %>%
    max() %>%
    ceiling()  #最大边界6
  
  ###设定上调下调稳定点颜色大小透明度
  cols <- c("up" = "#ffad73", "down" = "#26b3ff", "stable" = "grey") 
  sizes <- c("up" = 2, "down" = 2, "stable" = 1) 
  alphas <- c("up" = 1, "down" = 1, "stable" = 0.5)
}
final_plot <- DEG_valvone %>%
  ggplot(aes(x = logFC,
             y = v,
             fill = gene_type,    
             size = gene_type,
             alpha = gene_type)) + 
  geom_point(shape = 21, # Specify shape and colour as fixed local parameters    
             colour = "black") + 
  geom_hline(yintercept = -log10(0.01),
             linetype = "dashed") + 
  geom_vline(xintercept = c(-2, 2),
             linetype = "dashed") +
  scale_fill_manual(values = cols) + # Modify point colour
  scale_size_manual(values = sizes) + # Modify point size
  scale_alpha_manual(values = alphas) + # Modify point transparency
  scale_x_continuous(breaks = c(seq(-6, 6, 1)),       
                     limits = c(-6, 6))+
  labs(title = "Gene expression changes in Vemurafenib versus control samples",
       x = "log2(fold change)",
       y = "-log10(adjusted P-value)",
       colour = "Expression \nchange") +
  theme_bw() + # Select theme with a white background  
  theme(panel.border = element_rect(colour = "black", fill = NA, size= 0.5),    
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank()) 
final_plot
ggsave("valcone_plot.png",plot = final_plot,width = 200,height = 150,units ="mm" )


## Plot MA plots
if(T){DEG_valvone %>%
    pull(logFC) %>%
    min() %>%
    floor()   #查看x轴最小边界 -5
  DEG_valvone %>%
    pull(logFC) %>%
    max() %>%
    ceiling()  #最大边界6
  res <- DEG_valvone[with(DEG_valvone, order(logFC)),]  #按照logFC倒序排列
  res$threshold <- as.factor(res$adj.P.Val < 0.01)
  MAplot<- ggplot(data=res, aes(x=AveExpr, y=logFC, colour=threshold)) + 
    geom_point(alpha=0.4, size=1.8) + 
    geom_hline(aes(yintercept = 0), colour = "blue", size = 1.2) +
    ylim(c(-5,6)) + 
    labs(title = "MA plot between Vemurafenib and control samples",
         x = "Mean expression",
         y = "Log2 Fold Change")+
    theme(axis.title.x = element_text(face = "bold", size = 15),
          axis.text.x = element_text(face = "bold", size = 12)) +
    theme(axis.title.y = element_text(face = "bold", size = 15),
          axis.text.y = element_text(face = "bold", size = 12)) +
    scale_colour_discrete(name = "p.adjusted < 0.01") +
    theme(legend.title = element_text(face = "bold", size = 15)) +
    theme(legend.text = element_text(size = 14))
  ggsave("MA_plot.png",plot = MAplot,width = 200,height = 200,units ="mm" )
}


#GO enrichment analysis
load("nrDEG.Rdata")
head(nrDEG)
if(T){ 
  suppressMessages(library(org.Hs.eg.db,
                           ggplot2,
                           dplyr,
                           tidyverse))
  nrDEG <- nrDEG %>%
    mutate(v= -log10(adj.P.Val),
           gene_type = case_when(logFC >= 2 & adj.P.Val < 0.01 ~ "up",
                                 logFC <= -2 & adj.P.Val < 0.01 ~ "down",
                                 TRUE ~ "stable")) 
  save(nrDEG,file="deg.Rdata")
}
load("deg.Rdata")
head(nrDEG)

nrDEG$SYMBOL <- rownames(nrDEG)
df <- bitr(nrDEG$SYMBOL, fromType = "SYMBOL",
           toType = c( "ENTREZID"),
           OrgDb = org.Hs.eg.db) 
nrDEG=merge(nrDEG,df,by.y='SYMBOL',by.x='SYMBOL')
gene_up <- nrDEG[nrDEG$gene_type=="up",'ENTREZID']
gene_down <- nrDEG[nrDEG$gene_type=="down",'ENTREZID']
gene_df <- c(gene_up,gene_down)
gene_all<- as.character(nrDEG[,'ENTREZID'])
#GO
if(F){ego_BP <- enrichGO(gene = gene_up,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "BP",  #生物学功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05) #pvalue test value
ego_MF <- enrichGO(gene = gene_up,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "MF",  #生物学功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05) #pvalue test value
ego_MF <- enrichGO(gene = gene_up,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "CC",  #生物学功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05) 
}
ego_all <- enrichGO(gene = gene_df,  #entrez gene id
                   keyType = "ENTREZID", 
                   OrgDb= org.Hs.eg.db,
                   ont = "CC",  #生物学功能
                   pAdjustMethod = "BH", 
                   universe=gene_all,
                   minGSSize = 10,
                   maxGSSize = 500,
                   pvalueCutoff = 0.05,  #adjust pvalue
                   qvalueCutoff = 0.05,
                   readable = T)
save(ego_all,file = "GO_result.Rdata")

#可视化
dotplot(ego_all)   #气泡图
cnetplot(ego_all)
barplot(ego_all)  #条带图
cnetplot(ego_all, categorySize="pvalue",colorEdge = TRUE,circular = TRUE) #简易弦图

##KEGG
geo_kegg <- enrichKEGG(gene_df,
                       organism = "hsa",keyType = "kegg",
                       pvalueCutoff = 0.05,
                       pAdjustMethod = "BH",
                       universe=gene_all,
                       minGSSize = 10,
                       maxGSSize = 500,
                       qvalueCutoff =2,
                       use_internal_data = FALSE
)
#查看富集结果
table(geo_kegg@result$p.adjust<0.05)  #1个
dotplot(geo_kegg)
GEO——代码复现

文章题目：Identification of the interaction network of hub genes for melanoma treated with vemurafenib based on microarray data####

GEO42872 GPL6244

猜你喜欢

热点阅读