TCGA 数据分析实战 —— CNV 与突变

2021-07-12 本文已影响0人名本无名

前言

在介绍完 TCGAbiolinks 的查询下载和数据分析功能之后，我们简单展示几个示例，来练练手，加深对这个包的理解和使用

我们主要从：

基因组
转录组
表观组

3 个维度分别举例来进行说明

基因组分析

拷贝数变异（CNV）在癌症的发生和发展研究中扮演重要的角色。由于基因组重排（如染色体缺失、重复、插入和易位），导致染色体片段的扩增或删失。

CNV 是大于 1kb 的基因组区域发生拷贝数的扩增和删失。

TCGA 的数据可以分为三个等级：

level 1: 原始的测序数据（fasta、fastq等）
level 2: 比对好的 bam 文件
level 3: 经过标准化处理的数据

我们需要获取到 level 3 的数据来进行分析，来识别癌症基因组变异

1. 数据预处理

我们分析的是 legacy 数据库中 Affymetrix Genome-Wide Human SNP Array 6.0 平台的数据

获取 20 个 GBM 和 LGG 样本的 CNV 数据

query.gbm.nocnv <- GDCquery(
  project = "TCGA-GBM",
  data.category = "Copy number variation",
  legacy = TRUE,
  file.type = "nocnv_hg19.seg",
  sample.type = c("Primary Tumor")
)
# 只挑选 20 个样本
query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,]

GDCdownload(query.gbm.nocnv)

cnvMatrix <- GDCprepare(query.gbm.nocnv)

> head(cnvMatrix）
# A tibble: 6 x 6
  Sample                       Chromosome     Start       End Num_Probes Segment_Mean
  <chr>                        <chr>          <dbl>     <dbl>      <dbl>        <dbl>
1 TCGA-28-2513-01A-01D-0784-01 1            3218610  74709711      40434      -0.0175
2 TCGA-28-2513-01A-01D-0784-01 1           74709724  74715340          3      -1.21  
3 TCGA-28-2513-01A-01D-0784-01 1           74715466 233232858      79323       0.0051
4 TCGA-28-2513-01A-01D-0784-01 1          233233557 233234188          2      -1.56  
5 TCGA-28-2513-01A-01D-0784-01 1          233234531 247813706       9387       0.0168
6 TCGA-28-2513-01A-01D-0784-01 2             484222  20397902      12886       0.0123

2. 识别 recurrent CNV

癌症相关的 CNV 会在许多样本中反复出现，我们使用 GAIA 包来识别这种显著的 recurrent CNV

GAIA 算法基于随机排列的统计检验，同时考虑样本内的显著性和同质性，以重复迭代的方式删除区间内的峰，直至剩下的峰没有显著性为止识别显著的 CNV

该包除了需要输入样本的观测数据，还需要基因组探针元数据，我们可以从 broadinstitute 网站上下载

gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/"
file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt")
if(!file.exists(basename(file))) 
  downloader::download(file, file.path("~/Downloads", basename(file)))

如果下载出错的话，可以手动复制链接到浏览器中下载

markersMatrix <-  readr::read_tsv(
  file.path("~/Downloads", basename(file)), 
  col_names = FALSE, col_types = "ccn", 
  progress = FALSE
)

> head(markersMatrix)
# A tibble: 6 x 3
  X1        X2       X3
  <chr>     <chr> <dbl>
1 CN_473963 1     61735
2 CN_473964 1     61808
3 CN_473965 1     61823
4 CN_477984 1     62152
5 CN_473981 1     62920
6 CN_473982 1     62937

处理 CNV 数据矩阵

cnvMatrix %<>%
  filter(abs(Segment_Mean) > 0.3) %>%
  mutate(label = if_else(Segment_Mean < -0.3, 0, 1)) %>%
  dplyr::select(-Segment_Mean) %>%
  `names<-`(c("Sample.Name", "Chromosome", "Start", "End", "Num.of.Markers", "Aberration")) %>%
  mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
         Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
         Chromosome = as.integer(Chromosome)) %>%
  # 注意要转换为 data.frame 格式
  as.data.frame()

下载一些常见的 CNV 区域数据，需要过滤掉这些区域

file <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/CNV.hg19.bypos.111213.txt"
if(!file.exists(basename(file))) 
  downloader::download(file, file.path("~/Downloads", basename(file)))
  
commonCNV <- readr::read_tsv(
  file.path("~/Downloads", basename(file)), 
  progress = FALSE
  ) %>%
  mutate(markerID = paste(Chromosome, Start, sep = ":"))

处理探针数据

markersMatrix_filtered <- markersMatrix %>%
  `names<-`(c("Probe.Name", "Chromosome", "Start")) %>%
  mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
         Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
         Chromosome = as.integer(Chromosome)) %>%
  mutate(markerID = paste(Chromosome, Start, sep = ":")) %>%
  filter(!duplicated(markerID)) %>%
  # 过滤一些常见的 CNV
  filter(!markerID %in% commonCNV$markerID) %>%
  dplyr::select(-markerID)

运行 gaia，识别 recurrent CNV

library(gaia)

set.seed(200)
markers_obj <- load_markers(as.data.frame(markersMatrix_filtered))
nbsamples <- length(unique(cnvMatrix$Sample.Name))
cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples)
suppressWarnings({
  cancer <- "GBM"
  results <- runGAIA(
    cnv_obj, markers_obj,
    output_file_name = paste0("~/Downloads/GAIA_", cancer, "_flt.txt"),
    # 指定需要分析的变异类型，-1 分析所有的变异
    aberrations = -1,
    # 指定需要分析的染色体，默认为 -1，表示所有染色体
    chromosomes = -1,
    # 使用近似的方式可以加快计算速度
    approximation = TRUE,
    # 设置迭代次数
    num_iterations = 5000,
    threshold = 0.25
  )
})

绘制结果

RecCNV <- as_tibble(results) %>%
  mutate_at(vars("q-value"), as.numeric) %>%
  mutate_at(vars(-"q-value"), as.integer) %>% {
    # 如果 q-value 为 0，则设置其值为数据中的最小非零值
    minval = min(.$`q-value`[which(.$`q-value` != 0)])
    mutate(., `q-value` = ifelse(`q-value` == 0, minval, `q-value`))
  } %>%
  mutate(score = -log10(`q-value`)) %>%
  as.data.frame()

threshold <- 0.3
gaiaCNVplot(RecCNV,threshold)
save(results, RecCNV, threshold, file = paste0("~/Downloads/", cancer,"_CNV_results.rda"))

3. recurrent CNV 基因注释

在使用 GAIA 识别出癌症中重复出现的基因组区域变异之后，我们需要将其注释到对应的基因。

我们使用 biomaRt 来获取所有人类基因的基因组范围信息，然后将显著变异的基因组区域映射到对应的基因上

library(GenomicRanges)

# 使用 biomart 获取 GENCODE 数据库中的基因信息
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19") %>%
  filter(external_gene_name != "" & chromosome_name %in% c(1:22, "X", "Y")) %>%
  mutate(
    chromosome_name = ifelse(
      chromosome_name == "X", 23, ifelse(chromosome_name == "Y", 24, chromosome_name)
    ),
    chromosome_name = as.integer(chromosome_name)
  ) %>%
  arrange(chromosome_name, start_position) %>%
  dplyr::select(c("external_gene_name", "chromosome_name", "start_position","end_position")) %>%
  `names<-`(c("GeneSymbol","Chr","Start","End"))

genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
save(genes_GR, genes, file = "~/Downloads/genes_GR.rda", compress = "xz")

将 CNV 区域注释到基因

# 选择需要的列
sCNV <- dplyr::select(RecCNV, c(1:4, 6)) %>%
  filter(`q-value` <= threshold) %>%
  arrange(1, 3) %>%
  `names<-`(c("Chr","Aberration","Start","End","q-value"))
# 转换为 GenomicRanges 格式
sCNV_GR <- makeGRangesFromDataFrame(sCNV, keep.extra.columns = TRUE)
# 寻找交叠区间
hits <- findOverlaps(genes_GR, sCNV_GR, type = "within")
sCNV_ann <- cbind(sCNV[subjectHits(hits),], genes[queryHits(hits),])

library(glue)
# 格式化输出内容
AmpDel_genes <- as_tibble(sCNV_ann, .name_repair = "universal") %>%
  mutate(
    AberrantRegion = glue("{Chr...1}:{Start...3}-{End...4}"),
    GeneRegion = glue("{Chr...7}:{Start...8}-{End...9}")
  ) %>%
  dplyr::select(c(6, 2, 5, 10, 11)) %>%
  mutate(Aberration = if_else( Aberration == 0, "Del", "Amp"))

save(RecCNV, AmpDel_genes, file = paste0("~/Downloads/", cancer, "_CNV_results.rda"))

输出结果如下

> AmpDel_genes
# A tibble: 317 x 5
   GeneSymbol   Aberration q.value AberrantRegion        GeneRegion           
   <chr>        <chr>        <dbl> <glue>                <glue>               
 1 PIK3C2B      Amp          0.125 1:204382589-204792596 1:204391756-204463852
 2 MDM4         Amp          0.125 1:204382589-204792596 1:204485511-204542871
 3 RP11-430C7.2 Amp          0.125 1:204382589-204792596 1:204497973-204498820
 4 RNA5SP74     Amp          0.125 1:204382589-204792596 1:204531541-204531649
 5 RP11-430C7.4 Amp          0.125 1:204382589-204792596 1:204572163-204585693
 6 LRRN2        Amp          0.125 1:204382589-204792596 1:204586298-204654861
 7 RP11-430C7.5 Amp          0.125 1:204382589-204792596 1:204595903-204598840
 8 AL161793.1   Amp          0.125 1:204382589-204792596 1:204622230-204622314
 9 RP11-23I7.1  Amp          0.125 1:204382589-204792596 1:204633000-204633741
10 RNA5SP75     Amp          0.125 1:204382589-204792596 1:204676448-204676558
# … with 307 more rows

完整代码：https://github.com/dxsbiocc/learn/blob/main/R/TCGA/CNV_analysis.R

4. 基因组变异可视化

4.1 OncoPrint

下面我们展示如何绘制基因组突变数据，首先，我们使用的是之前介绍过的 complexHeatmap 包的 OncoPrint 函数。

GDCquery_maf 函数用于下载突变数据

LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
GBMmut <- GDCquery_Maf(tumor = "GBM", pipelines = "mutect2")
mut <- LGGmut %>% add_row(GBMmut)
save(mut, file ="~/Downloads/mafMutect2LGGGBM.rda")

并提取胶质瘤相关通路中的基因信息

# 提取胶质瘤通路中基因的突变信息
Glioma_signaling <- as_tibble(TCGAbiolinks:::listEA_pathways) %>%
  filter(grepl("glioma", Pathway, ignore.case = TRUE)) %>%
  subset(Pathway == "Glioma Signaling")
# 只取 10 个基因的突变信息
Glioma_signaling_genes <- unlist(str_split(Glioma_signaling$Molecules, ","))[1:10]
mut <- mut[mut$Hugo_Symbol %in% Glioma_signaling_genes,]
# 获取对应的临床信息
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
clinical <- gbm_clin %>% add_row(lgg_clin)

绘制图形

TCGAvisualize_oncoprint(
  mut = mut,
  gene = Glioma_signaling_genes,
  annotation = clinical[, c("bcr_patient_barcode", "disease", "vital_status", "ethnicity")],
  annotation.position = "bottom",
  color=c("background"="#CCCCCC", "DEL"="#fb8072", "INS"="#80b1d3", "SNP"="#fdb462"),
  label.font.size = 16,
  rm.empty.columns = FALSE
)

4.2 circos plot

基因组变异信息也可以使用 circos 图形来展示，我们使用前面介绍的 circlize 包来展示 CNV 和突变信息

CNV 的数据是 GAIA 分析的结果，突变数据使用的是从 GDC 数据库中获取的 LGG 样本的 MAF 文件的结果

首先，处理一下突变数据

LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
# 提取需要的突变类型
mut_type <- c(
  "Missense_Mutation",
  "Nonsense_Mutation",
  "Nonstop_Mutation",
  "Frame_Shift_Del",
  "Frame_Shift_Ins"
)
mut <- filter(LGGmut, Variant_Classification %in% mut_type) %>%
  mutate(
    mut.id = glue("{Chromosome}:{Start_Position}-{End_Position}|{Reference_Allele}"),
    .before = Hugo_Symbol
  ) %>%
  # 只考虑至少在两个样本中发生突变的基因
  filter(duplicated(mut.id)) %>%
  dplyr::select(c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")) %>%
  distinct_all() %>% {
    # 将 typeNames 作为全局变量，后面还要使用
    typeNames <<- unique(.$Variant_Classification)
    type = structure(1:length(typeNames), names = typeNames)
    mutate(., type = type[Variant_Classification])
  } %>%
  dplyr::select(c(1:3,6,4,5))

CNV 数据处理

load("~/Downloads/GBM_CNV_results.rda")

cnv <- as_tibble(RecCNV) %>%
  # 值考虑小于阈值的区域，threshold = 0.3
  filter(`q-value` <= threshold) %>%
  select(c(1, 3, 4, 2)) %>%
  # 将染色体名称转换为 chr 开头的字符串
  mutate(
    Chromosome = ifelse(
      Chromosome == 23, "X", ifelse(Chromosome == 24, "Y", Chromosome)
    ),
    Chromosome = paste0("chr", Chromosome)
  ) %>%
  mutate(CNV = 1) %>%
  `names<-`(c("Chromosome","Start_position","End_position","Aberration_Kind","CNV"))

绘制 circos 图

library(circlize)

par(mar=c(1,1,1,1), cex=1)
circos.initializeWithIdeogram()
# 添加 CNV 结果
colors <- c("forestgreen","firebrick")
circos.genomicTrackPlotRegion(
  cnv, ylim = c(0, 1.2),
  panel.fun = function(region, value, ...) {
    circos.genomicRect(
      region, value, ytop.column = 2,
      ybottom = 0, col = colors[value[[1]] + 1],
      border = "white"
    )
    cell.xlim = get.cell.meta.data("cell.xlim")
    circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
  }
)
# 添加突变结果
colors <- structure(c("blue","green","red", "gold"), names = typeNames[1:4])
circos.genomicTrackPlotRegion(
  mut, ylim = c(1.2, 4.2),
  panel.fun = function(region, value, ...) {
    circos.genomicPoints(
      region, value, cex = 0.8, pch = 16, 
      col = colors[value[[2]]], ...)
  }
)
circos.clear()

# 添加图例
legend(
  -0.2, 0.2, bty = "n", y.intersp = 1,
  c("Amp", "Del"), pch = 15,
  col = c("firebrick", "forestgreen"),
  title = "CNVs", text.font = 1,
  cex = 0.8, title.adj = 0
)
legend(
  -0.2, 0, bty = "n", y.intersp = 1,
  names(colors), pch = 20, col = colors,
  title = "Mutations", text.font = 1,
  cex = 0.8, title.adj = 0
)

4.3 部分区域可视化

我们可以将基因组范围限制在某一条染色体上，来更好地查看该染色体上突变和拷贝数的变异。

可以使用如下图形来展示

par(mar=c(.1, .1, .1, .1),cex=1.5)
circos.par(
  "start.degree" = 90, canvas.xlim = c(0, 1),
  canvas.ylim = c(0, 1), gap.degree = 270,
  cell.padding = c(0, 0, 0, 0), 
  track.margin = c(0.005, 0.005)
)
circos.initializeWithIdeogram(chromosome.index = "chr17")
circos.par(cell.padding = c(0, 0, 0, 0))
# 添加 CNV 结果
colors <- c("forestgreen","firebrick")
names(colors)  <- c(0,1)
circos.genomicTrackPlotRegion(
  cnv, ylim = c(0, 1.2),
  panel.fun = function(region, value, ...) {
    circos.genomicRect(
      region, value, ytop.column = 2,
      ybottom = 0, col = colors[value[[1]]],
      border = "white"
    )
    cell.xlim = get.cell.meta.data("cell.xlim")
    circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
  }
)

# 添加单基因突变结果
gene.mut <- mutate(mut, genes.mut = glue("{Hugo_Symbol}-{type}")) %>%
  filter(!duplicated(genes.mut)) %>% {
    n.mut <- table(.$genes.mut)
    mutate(., num = n.mut[.$genes.mut])
  } %>%
  mutate(
    genes.num = as.character(glue("{Hugo_Symbol}({num})")),
    type = type / 2
  ) %>%
  dplyr::select(-(6:8))

colors <- c("blue","green","red","gold")
names(colors)  <- typeNames[1:4]
circos.genomicTrackPlotRegion(
  gene.mut, ylim = c(0.3, 2.2), track.height = 0.05,
  panel.fun = function(region, value, ...) {
    circos.genomicPoints(
      region, value, cex = 0.4, pch = 16,
      col = colors[value[[2]]], ...
    )
  }
)

circos.genomicTrackPlotRegion(
  gene.mut, ylim = c(0, 1), track.height = 0.1, bg.border = NA
)
i_track = get.cell.meta.data("track.index")

circos.genomicTrackPlotRegion(
  gene.mut, ylim = c(0, 1),
  panel.fun = function(region, value, ...) {
    circos.genomicText(
      region, value, y = 1, labels.column = 3,
      col = colors[value[[2]]], facing = "clockwise",
      adj = c(1, 0.5), posTransform = posTransform.text,
      cex = 0.4, niceFacing = TRUE
    )
  },
  track.height = 0.1,
  bg.border = NA
)

circos.genomicPosTransformLines(
  gene.mut,
  posTransform = function(region, value)
    posTransform.text( 
      region,  y = 1, labels = value[[3]],
      cex = 0.4, track.index = i_track + 1
    ),
  direction = "inside",
  track.index = i_track
)

circos.clear()

legend(
  0, 0.38, bty = "n", y.intersp = 1, c("Amp", "Del"),
  pch = 15, col = c("firebrick", "forestgreen"),
  title = "CNVs", text.font = 1, cex = 0.7, title.adj = 0
)
legend(
  0, 0.2, bty = "n", y.intersp = 1, names(colors),
  pch = 16, col = colors, title = "Mutations",
  text.font = 1, cex = 0.7, title.adj = 0
)

完整代码：https://github.com/dxsbiocc/learn/blob/main/R/TCGA/Mutation_CNV_visualize.R