基因组分析TCGA_CNV生物信息

TCGA 拷贝数变异(CNV)–使用maftools读取和汇总g

2021-03-14  本文已影响0人  庄小尽

读取和汇总gistic输出文件

maftools : Summarize, Analyze and Visualize MAF Files

rm(list = ls())
library(tidyverse)
# 建议下载github最新版本的maftools包
library(maftools)

#读入GISTIC输出的文件
laml.gistic = readGistic(gisticAllLesionsFile = './Gistic 2.0 results/all_lesions.conf_99.txt',
                         gisticAmpGenesFile = './Gistic 2.0 results/amp_genes.conf_99.txt', 
                         gisticDelGenesFile = './Gistic 2.0 results/del_genes.conf_99.txt', 
                         gisticScoresFile = './Gistic 2.0 results/scores.gistic', 
                         isTCGA = T)
## -Processing Gistic files..
## --Processing amp_genes.conf_99.txt
## --Processing del_genes.conf_99.txt
## --Processing scores.gistic
## --Summarizing by samples
laml.gistic
## An object of class  GISTIC 
##           ID summary
## 1:   Samples     500
## 2:    nGenes    8606
## 3: cytoBands      37
## 4:       Amp      39
## 5:       Del   59539
## 6:     total   59578

可视化重要的结果

##绘图
#genome plot 部分突出了明显的扩增和删失区域
gisticChromPlot(gistic = laml.gistic, ref.build = 'hg38') ## 参考基因组选hg38
image.png
#Bubble plot 绘制显著改变的cytobands,以及改变的样本数和它所包含的基因数的函数。每一个气泡的大小是根据-log10转化q值。
gisticBubblePlot(gistic = laml.gistic)
image.png
#oncoplot  
gisticOncoPlot(gistic = laml.gistic, 
               sortByAnnotation = TRUE, top =20)
image.png

挑选q<0.25的基因

(sig_cytoband <- laml.gistic@cytoband.summary %>% filter(qvalues<0.25) %>% .$Unique_Name)
##  [1] "DP_13:8q24.22"  "DP_17:10q23.1"  "DP_33:22q12.3"  "DP_16:10q21.2" 
##  [5] "DP_11:8p23.2"   "AP_1:4q31.21"   "AP_3:22q11.21"  "DP_18:10q23.31"
##  [9] "DP_25:16q23.2"  "DP_27:17p13.3"  "DP_34:22q13.32" "DP_3:2q37.1"   
## [13] "DP_31:20p12.1"  "DP_26:16q23.3"  "DP_10:7q34"     "DP_12:8q24.11" 
## [17] "DP_14:9q22.32"  "DP_15:10q11.21" "DP_19:12p13.2"  "DP_1:2p23.1"   
## [21] "DP_20:13q14.11" "DP_21:13q22.1"  "DP_22:14q24.2"  "DP_24:16p13.2" 
## [25] "DP_28:18q21.33" "DP_29:19p13.3"  "DP_2:2p11.2"    "DP_30:19p13.12"
## [29] "DP_32:22q12.3"  "DP_4:3p25.3"    "DP_6:5p14.1"    "DP_9:7q22.3"   
## [33] "DP_23:15q25.3"  "DP_8:6q27"      "DP_5:4q31.21"   "AP_2:7q34"     
## [37] "DP_7:6q26"
table(grepl(pattern = "AP",sig_cytoband))
## 
## FALSE  TRUE 
##    34     3
## 挑选q值小于0.25的Amplification cytoband
sig_AP_cytoband <- laml.gistic@cytoband.summary %>% 
  filter(qvalues<0.25) %>% 
  .$Unique_Name %>% 
  .[grepl(pattern = "AP",.)]

length(sig_AP_cytoband)
## [1] 3
## 将Amplification 的基因挑选出来
sig_AP_gene <- laml.gistic@data %>% filter(Cytoband %in% sig_AP_cytoband) %>% .$Hugo_Symbol 
unique(sig_AP_gene)
## [1] "CECR2"        "LOC105377458" "[MTRNR2L6]"
## 挑选q值小于0.15的Deletion cytoband
sig_DP_cytoband <- laml.gistic@cytoband.summary %>% 
  filter(qvalues<0.15) %>% 
  .$Unique_Name %>% 
  .[grepl(pattern = "DP",.)]

length(sig_DP_cytoband)
## [1] 10
## 将Deletion cytoband 的基因挑选出来
sig_DP_gene <- laml.gistic@data %>% filter(Cytoband %in% sig_DP_cytoband) %>% .$Hugo_Symbol 
head(unique(sig_DP_gene))
## [1] "APOL1"   "APOL2"   "APOL4"   "APOL3"   "MIR6819" "FAM19A5"

挑一个基因出来看看

## 看一下经典的抑癌基因在那个Cytoband
"CECR2"%in% sig_AP_gene
## [1] TRUE
laml.gistic@data %>% 
  filter(Hugo_Symbol %in% "CECR2") %>% 
  .$Cytoband %>% 
  unique(.)
## [1] "AP_3:22q11.21"
## 一层一层的剥桔子,让我找到你了,然后再返回去看看Amplification GISTIC plot。真香,对上了。单基因的分析就结束了。

找到了基因之后就可以“为所欲为了….”

# 拷贝数缺失区域的基因那么多,来个GO、kegg、GSEA这些分析是不是来一套了。我就简单copy下代码做做GO分析
# 用Y叔的包玩玩
length(unique(sig_DP_gene))## 因为选择q<0.25的拷贝数区域缺失的基因个数太多了(8000多),所以我修改了下q值,按需修改,没有好的阈值,只有合不合适的结果。
## [1] 842
library(clusterProfiler)
library(org.Hs.eg.db)
library(topGO)
sig_DP_entrezId = mapIds(x = org.Hs.eg.db,
                       keys = unique(sig_DP_gene),
                       keytype = "SYMBOL",
                       column = "ENTREZID")

table(is.na(sig_DP_entrezId))
## 
## FALSE  TRUE 
##   791    51
sig_DP_entrezId <- na.omit(sig_DP_entrezId)

go_ALL <- enrichGO(gene = sig_DP_entrezId,
                       OrgDb = org.Hs.eg.db,
                       keyType = "ENTREZID",
                       ont = "ALL",
                       pvalueCutoff = 0.5,
                       qvalueCutoff = 0.5)

##分析完成后,作图
dotplot(go_ALL, split="ONTOLOGY")+ facet_grid(ONTOLOGY~.,scale="free")
image.png
### 数目太多,我只要各个条目的前6个

BP_df<- go_ALL@result %>% arrange(desc(Count)) %>% filter(ONTOLOGY=="BP") %>% head()
CC_df<- go_ALL@result %>% arrange(desc(Count)) %>% filter(ONTOLOGY=="CC") %>% head()
MF_df<- go_ALL@result %>% arrange(desc(Count)) %>% filter(ONTOLOGY=="MF") %>% head()
go_df <- rbind(BP_df,CC_df,MF_df)

go_df$go_term_order=factor(x = c(1:nrow(go_df)),labels = go_df$Description)
library(ggsci)
ggplot(data=go_df, aes(x=go_term_order,y=Count, fill=ONTOLOGY)) +
  geom_bar(stat="identity", width=0.8)  + 
  scale_fill_jama() + 
  theme_classic() +
  xlab("GO term") + ylab("Number of Genes") + labs(title = "The Most Enriched GO Terms")+ 
  theme(axis.text.x=element_text(face = "bold", color="gray50",angle = 60,vjust = 1, hjust = 1 )) 
image.png
## 顺利完成,里面的细节稍微调调图,按需美化,又可以放文章里面了。
## 拷贝数变化的分析就到暂时到这了,让我们拭目以待下一种类型的数据。
## 以上分析根据个人理解进行分析,如有错误,请批评指正。

TCGA 拷贝数变异(CNV)数据整理(一)
TCGA 拷贝数变异(CNV)--Gistic 2.0在线分析(二)

上一篇下一篇

猜你喜欢

热点阅读