TCGA-15.突变数据分析-mutation signafit

2020-01-29 本文已影响0人小洁忘了怎么分身

花花写于2020-01-29 今天大年初四，在家各种姿势打滚，实在是无聊，遂决定起来学习。经评测，发明了最舒服的边休息边工作姿势，大概就是：

0.准备R包

懒得写安装代码啦，你长大了，要学会自己安装，参考https://www.jianshu.com/p/bba1ad57a144

rm(list=ls())
options(stringsAsFactors = F) 
project='TCGA_KIRC'
library(maftools) 
library(dplyr)
library(pheatmap)
library(ComplexHeatmap)
library(stringr)
require(deconstructSigs)
require(BSgenome)
require(BSgenome.Hsapiens.UCSC.hg38)

1.读取突变数据maf

是和上一篇一样的数据，从gdc下载的maf文件和临床信息，见：
https://www.jianshu.com/p/2d6cb5bd8771

laml = read.maf(maf = 'TCGA.KIRC.mutect.somatic.maf.gz')
#> -Reading
#> -Validating
#> -Silent variants: 8383 
#> -Summarizing
#> --Mutiple centers found
#> BCM;BI--Possible FLAGS among top ten genes:
#>   TTN
#>   MUC16
#>   HMCN1
#> -Processing clinical data
#> --Missing clinical data
#> -Finished in 4.134s elapsed (6.208s cpu)
laml 
#> An object of class  MAF 
#>                         ID summary   Mean Median
#>  1:             NCBI_Build  GRCh38     NA     NA
#>  2:                 Center  BCM;BI     NA     NA
#>  3:                Samples     336     NA     NA
#>  4:                 nGenes    9444     NA     NA
#>  5:        Frame_Shift_Del    1732  5.155      4
#>  6:        Frame_Shift_Ins    1201  3.574      1
#>  7:           In_Frame_Del     238  0.708      0
#>  8:           In_Frame_Ins     350  1.042      0
#>  9:      Missense_Mutation   12997 38.682     36
#> 10:      Nonsense_Mutation    1259  3.747      2
#> 11:       Nonstop_Mutation      18  0.054      0
#> 12:            Splice_Site     490  1.458      1
#> 13: Translation_Start_Site      25  0.074      0
#> 14:                  total   18310 54.494     47

mut=laml@data
mut=mut[mut$Variant_Type=='SNP',]
#每行记录了一个突变，所以统计样本列的行数即为样本的突变数量
plot(table(mut[,16]),las = 2)

可以可视化看每个样本的突变数量

2.制作signatures的输入数据（96频谱）

关于什么是mutation

signature：http://www.bio-info-trainee.com/1619.html

关于96频谱：http://www.biotrainee.com/thread-832-1-1.html

a = mut[,c(16,5,6,12,13)]
colnames(a)=c( "Sample","chr", "pos","ref",  "alt")
a$Sample=as.character(a$Sample)

sigs.input <- mut.to.sigs.input(mut.ref = a, 
                                sample.id = "Sample", 
                                chr = "chr", 
                                pos = "pos", 
                                ref = "ref", 
                                alt = "alt",
                                bsg = BSgenome.Hsapiens.UCSC.hg38)
class(sigs.input)
#> [1] "data.frame"
#第一个样本的突变绘图
barplot(as.numeric(sigs.input[1,])~colnames(sigs.input))

sigs.input[1:4,1:4]
#>                              A[C>A]A A[C>A]C A[C>A]G A[C>A]T
#> TCGA-G6-A8L7-01A-11D-A36X-10       0       1       0       0
#> TCGA-B8-A54G-01A-11D-A25V-10       0       1       0       0
#> TCGA-BP-4995-01A-01D-1462-08       0       0       0       0
#> TCGA-B0-5100-01A-01D-1421-08       1       0       0       0

一顿操作猛如虎，生成signature与样本对应关系的矩阵

if(!file.exists(paste0(project,"w.Rdata"))){
  w=lapply(unique(a$Sample), function(i){
    ## signatures.cosmic signatures.nature2013
    sample_1 = whichSignatures(tumor.ref = sigs.input[,], 
                               signatures.ref = signatures.cosmic, 
                               sample.id =  i, 
                               contexts.needed = TRUE,
                               tri.counts.method = 'default')
    print(c(i,which(unique(a$Sample)==i)))
    return(sample_1$weights)
  })
  w=do.call(rbind,w)
  save(w,file = paste0(project,"w.Rdata"))
}
load(paste0(project,"w.Rdata"))
mat = t(w)

3.可视化

矩阵的可视化--热图

3.1 简单版本的热图

pheatmap(mat,
         cluster_rows = F,
         cluster_cols = T,
         show_colnames = F)

3.2 好看一点的热图

看到了这个图，氮素不知道用什么包画的，反正闲的没事干，不如自己模仿一波。如果你知道用的什么包，可以留言告诉我~感谢

热图上面放直方图，直方图内容是每个样本的突变数量。想到了ComplexHeatmap，可以将直方图作为注释添加上去。

难点是调整顺序，R语言神技能加持。

捋一捋：

直方图横纵坐标是样本和突变数量
热图输入数据mat横纵坐标是样本和signature
注释栏来源于clinical数据，与样本一一对应。
所以需要调整样本顺序，三者统一。
注释栏简化一下，用生死和性别；先排一下序，再按照排好的顺序给mat和直方图输入数据排序。

load("clinical.Rdata")
table( str_sub(colnames(mat),1,12)%in% cl_df$bcr_patient_barcode)
#> 
#> TRUE 
#>  336
#一一对应，只要mat列名对应的临床数据。
cl_df = cl_df[cl_df$bcr_patient_barcode %in% str_sub(colnames(mat),1,12),]
#按照性别和生死排序
cl_df = arrange(cl_df,gender,vital_status)
#mat排序
x = match(cl_df$bcr_patient_barcode,str_sub(colnames(mat),1,12))
mat = mat[,x]
#直方图输入数据排序
for_bar = laml@variants.per.sample$Variants[match(cl_df$bcr_patient_barcode,str_sub(laml@variants.per.sample$Tumor_Sample_Barcode,1,12))]
#确认顺序是否正确
cl_df$bcr_patient_barcode[1] == str_sub(colnames(mat),1,12)[1]
#> [1] TRUE
cl_df$bcr_patient_barcode[1] == for_bar[1]
#> [1] FALSE

annotation = HeatmapAnnotation(bar1 = anno_barplot(for_bar),
  df = data.frame(gender = cl_df$gender,
                                               vital = cl_df$vital_status)
                               )

# top_annotation参数在顶部添加注释信息
ht_list = Heatmap(mat, name = "mat", 
        #top_annotation = column_ha, 
        #right_annotation = row_ha,
        cluster_rows = F,
        cluster_columns = F,
        #show_row_names = F,
        show_column_names = F,
        top_annotation = annotation)

draw(ht_list, heatmap_legend_side = "left", annotation_legend_side = "bottom")