GEO数据挖掘

GEO数据分析之GSEA

2020-01-06  本文已影响0人  天涯清水

GSEA-analysis

1.加载数据

载入前一步分析得到的表达矩阵
library(ggstatsplot);
library(cowplot);
library(clusterProfiler);
library(stringr);
library(dplyr);
library(tidyr);
library(ggplot2);
library(ggstatsplot);
load(file = 'GSE63067_GSEA.Rdata')#导入上一步分析的数据
exprSet <- data_plot
exprSet[1:3,1:3]
##                PAX8   CYP2A6   SCARB1
## GSM1539877 6.506860 11.94711 9.129116
## GSM1539878 6.313513 11.82544 9.402811
## GSM1539879 6.273058 11.42314 8.120977

2.批量相关性分析

将第一行目的基因跟其他行的编码基因批量做相关性分析,得到相关性系数以及p值。
y <- as.numeric(exprSet[,"CCL20"])

colnames <- colnames(exprSet)

cor_data_df <- data.frame(colnames)

for (i in 1:length(colnames)){

  test <- cor.test(as.numeric(exprSet[,i]),y,type="spearman")

  cor_data_df[i,2] <- test$estimate

  cor_data_df[i,3] <- test$p.value

}

names(cor_data_df) <- c("symbol","correlation","pvalue")

# 查看这个数据结构

head(cor_data_df)
##   symbol correlation      pvalue
## 1   PAX8 -0.23354999 0.350963277
## 2 CYP2A6 -0.60172099 0.008244347
## 3 SCARB1 -0.19907443 0.428394688
## 4 TTLL12 -0.57277340 0.012974684
## 5  CYTOR  0.35144428 0.152686677
## 6 ADAM32 -0.01286106 0.959604984

3.筛选最相关的基因

筛选p值小于0.05,按照相关性系数绝对值选前500个的基因, 数量可以自己定。
cor_data_sig <- cor_data_df %>% 

  filter(pvalue < 0.05) %>% 

  arrange(desc(abs(correlation)))%>% 

  dplyr::slice(1:500)

4.随机选取正的和负的分别作图验证

正相关的选取IL2RG;负相关选取MARK1
#正相关的选取IL2RG

ggscatterstats(data = exprSet, 

               y = CCL20, 

               x = IL2RG,

               centrality.para = "mean",                              

               margins = "both",                                         

               xfill = "#CC79A7", 

               yfill = "#009E73", 

               marginal.type = "histogram",

               title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
#负相关的选取MARK1

ggscatterstats(data = exprSet, 

               y = CCL20, 

               x = MARK1,

               centrality.para = "mean",                              

               margins = "both",                                         

               xfill = "#CC79A7", 

               yfill = "#009E73", 

               marginal.type = "histogram",

               title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
image.png
#还可以用cowplot拼图

p1 <- ggscatterstats(data = exprSet, 

                     y = CCL20, 

                     x = IL2RG,

                     centrality.para = "mean",                              

                     margins = "both",                                         

                     xfill = "#CC79A7", 

                     yfill = "#009E73", 

                     marginal.type = "histogram",

                     title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
p2 <- ggscatterstats(data = exprSet, 

                     y = CCL20, 

                     x = MARK1,

                     centrality.para = "mean",                              

                     margins = "both",                                         

                     xfill = "#CC79A7", 

                     yfill = "#009E73", 

                     marginal.type = "histogram",

                     title = "Relationship between CCL20 and IL2RG")
## Warning: This plot can't be further modified with `ggplot2` functions.
## In case you want a `ggplot` object, set `marginal = FALSE`.
plot_grid(p1,p2,nrow = 1,labels = LETTERS[1:2])
image.png

5.聚类分析

既然确定了相关性是正确的,那么用筛选的基因进行富集分析就可以反推这个基因的功能。
#获得基因列表

gene <- str_trim(cor_data_sig$symbol,'both')

#基因名称转换,返回的是数据框

gene = bitr(gene, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")

go <- enrichGO(gene = gene$ENTREZID, OrgDb = "org.Hs.eg.db", ont="all")

# 这里因为是计算的所有GO分析的三个分类,所以可以合并作图

# 这是条形图

barplot(go, split="ONTOLOGY")+ 
  facet_grid(ONTOLOGY~., scale="free")
image.png
# 这是气泡图

dotplot(go, split="ONTOLOGY")+ 
  facet_grid(ONTOLOGY~., scale="free")
image.png
# 
# 这时候,我们能推断CCL20这个基因主要参与免疫调控和T细胞激活,细胞因子受体活性调剂等功能,大致跟她本身的功能是一致的。 

参考

上一篇下一篇

猜你喜欢

热点阅读