TCGA 数据分析实战 —— CNV 与突变
前言
在介绍完 TCGAbiolinks
的查询下载和数据分析功能之后,我们简单展示几个示例,来练练手,加深对这个包的理解和使用
我们主要从:
- 基因组
- 转录组
- 表观组
3
个维度分别举例来进行说明
基因组分析
拷贝数变异(CNV
)在癌症的发生和发展研究中扮演重要的角色。由于基因组重排(如染色体缺失、重复、插入和易位),导致染色体片段的扩增或删失。
CNV
是大于 1kb
的基因组区域发生拷贝数的扩增和删失。
TCGA
的数据可以分为三个等级:
-
level 1
: 原始的测序数据(fasta
、fastq等
) -
level 2
: 比对好的bam
文件 -
level 3
: 经过标准化处理的数据
我们需要获取到 level 3
的数据来进行分析,来识别癌症基因组变异
1. 数据预处理
我们分析的是 legacy
数据库中 Affymetrix Genome-Wide Human SNP Array 6.0
平台的数据
获取 20
个 GBM
和 LGG
样本的 CNV
数据
query.gbm.nocnv <- GDCquery(
project = "TCGA-GBM",
data.category = "Copy number variation",
legacy = TRUE,
file.type = "nocnv_hg19.seg",
sample.type = c("Primary Tumor")
)
# 只挑选 20 个样本
query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,]
GDCdownload(query.gbm.nocnv)
cnvMatrix <- GDCprepare(query.gbm.nocnv)
> head(cnvMatrix)
# A tibble: 6 x 6
Sample Chromosome Start End Num_Probes Segment_Mean
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 TCGA-28-2513-01A-01D-0784-01 1 3218610 74709711 40434 -0.0175
2 TCGA-28-2513-01A-01D-0784-01 1 74709724 74715340 3 -1.21
3 TCGA-28-2513-01A-01D-0784-01 1 74715466 233232858 79323 0.0051
4 TCGA-28-2513-01A-01D-0784-01 1 233233557 233234188 2 -1.56
5 TCGA-28-2513-01A-01D-0784-01 1 233234531 247813706 9387 0.0168
6 TCGA-28-2513-01A-01D-0784-01 2 484222 20397902 12886 0.0123
2. 识别 recurrent CNV
癌症相关的 CNV
会在许多样本中反复出现,我们使用 GAIA
包来识别这种显著的 recurrent CNV
GAIA
算法基于随机排列的统计检验,同时考虑样本内的显著性和同质性,以重复迭代的方式删除区间内的峰,直至剩下的峰没有显著性为止识别显著的 CNV
该包除了需要输入样本的观测数据,还需要基因组探针元数据,我们可以从 broadinstitute
网站上下载
gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/"
file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt")
if(!file.exists(basename(file)))
downloader::download(file, file.path("~/Downloads", basename(file)))
如果下载出错的话,可以手动复制链接到浏览器中下载
markersMatrix <- readr::read_tsv(
file.path("~/Downloads", basename(file)),
col_names = FALSE, col_types = "ccn",
progress = FALSE
)
> head(markersMatrix)
# A tibble: 6 x 3
X1 X2 X3
<chr> <chr> <dbl>
1 CN_473963 1 61735
2 CN_473964 1 61808
3 CN_473965 1 61823
4 CN_477984 1 62152
5 CN_473981 1 62920
6 CN_473982 1 62937
处理 CNV
数据矩阵
cnvMatrix %<>%
filter(abs(Segment_Mean) > 0.3) %>%
mutate(label = if_else(Segment_Mean < -0.3, 0, 1)) %>%
dplyr::select(-Segment_Mean) %>%
`names<-`(c("Sample.Name", "Chromosome", "Start", "End", "Num.of.Markers", "Aberration")) %>%
mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
Chromosome = as.integer(Chromosome)) %>%
# 注意要转换为 data.frame 格式
as.data.frame()
下载一些常见的 CNV
区域数据,需要过滤掉这些区域
file <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/CNV.hg19.bypos.111213.txt"
if(!file.exists(basename(file)))
downloader::download(file, file.path("~/Downloads", basename(file)))
commonCNV <- readr::read_tsv(
file.path("~/Downloads", basename(file)),
progress = FALSE
) %>%
mutate(markerID = paste(Chromosome, Start, sep = ":"))
处理探针数据
markersMatrix_filtered <- markersMatrix %>%
`names<-`(c("Probe.Name", "Chromosome", "Start")) %>%
mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
Chromosome = as.integer(Chromosome)) %>%
mutate(markerID = paste(Chromosome, Start, sep = ":")) %>%
filter(!duplicated(markerID)) %>%
# 过滤一些常见的 CNV
filter(!markerID %in% commonCNV$markerID) %>%
dplyr::select(-markerID)
运行 gaia
,识别 recurrent CNV
library(gaia)
set.seed(200)
markers_obj <- load_markers(as.data.frame(markersMatrix_filtered))
nbsamples <- length(unique(cnvMatrix$Sample.Name))
cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples)
suppressWarnings({
cancer <- "GBM"
results <- runGAIA(
cnv_obj, markers_obj,
output_file_name = paste0("~/Downloads/GAIA_", cancer, "_flt.txt"),
# 指定需要分析的变异类型,-1 分析所有的变异
aberrations = -1,
# 指定需要分析的染色体,默认为 -1,表示所有染色体
chromosomes = -1,
# 使用近似的方式可以加快计算速度
approximation = TRUE,
# 设置迭代次数
num_iterations = 5000,
threshold = 0.25
)
})
绘制结果
RecCNV <- as_tibble(results) %>%
mutate_at(vars("q-value"), as.numeric) %>%
mutate_at(vars(-"q-value"), as.integer) %>% {
# 如果 q-value 为 0,则设置其值为数据中的最小非零值
minval = min(.$`q-value`[which(.$`q-value` != 0)])
mutate(., `q-value` = ifelse(`q-value` == 0, minval, `q-value`))
} %>%
mutate(score = -log10(`q-value`)) %>%
as.data.frame()
threshold <- 0.3
gaiaCNVplot(RecCNV,threshold)
save(results, RecCNV, threshold, file = paste0("~/Downloads/", cancer,"_CNV_results.rda"))
3. recurrent CNV 基因注释
在使用 GAIA
识别出癌症中重复出现的基因组区域变异之后,我们需要将其注释到对应的基因。
我们使用 biomaRt
来获取所有人类基因的基因组范围信息,然后将显著变异的基因组区域映射到对应的基因上
library(GenomicRanges)
# 使用 biomart 获取 GENCODE 数据库中的基因信息
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19") %>%
filter(external_gene_name != "" & chromosome_name %in% c(1:22, "X", "Y")) %>%
mutate(
chromosome_name = ifelse(
chromosome_name == "X", 23, ifelse(chromosome_name == "Y", 24, chromosome_name)
),
chromosome_name = as.integer(chromosome_name)
) %>%
arrange(chromosome_name, start_position) %>%
dplyr::select(c("external_gene_name", "chromosome_name", "start_position","end_position")) %>%
`names<-`(c("GeneSymbol","Chr","Start","End"))
genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
save(genes_GR, genes, file = "~/Downloads/genes_GR.rda", compress = "xz")
将 CNV
区域注释到基因
# 选择需要的列
sCNV <- dplyr::select(RecCNV, c(1:4, 6)) %>%
filter(`q-value` <= threshold) %>%
arrange(1, 3) %>%
`names<-`(c("Chr","Aberration","Start","End","q-value"))
# 转换为 GenomicRanges 格式
sCNV_GR <- makeGRangesFromDataFrame(sCNV, keep.extra.columns = TRUE)
# 寻找交叠区间
hits <- findOverlaps(genes_GR, sCNV_GR, type = "within")
sCNV_ann <- cbind(sCNV[subjectHits(hits),], genes[queryHits(hits),])
library(glue)
# 格式化输出内容
AmpDel_genes <- as_tibble(sCNV_ann, .name_repair = "universal") %>%
mutate(
AberrantRegion = glue("{Chr...1}:{Start...3}-{End...4}"),
GeneRegion = glue("{Chr...7}:{Start...8}-{End...9}")
) %>%
dplyr::select(c(6, 2, 5, 10, 11)) %>%
mutate(Aberration = if_else( Aberration == 0, "Del", "Amp"))
save(RecCNV, AmpDel_genes, file = paste0("~/Downloads/", cancer, "_CNV_results.rda"))
输出结果如下
> AmpDel_genes
# A tibble: 317 x 5
GeneSymbol Aberration q.value AberrantRegion GeneRegion
<chr> <chr> <dbl> <glue> <glue>
1 PIK3C2B Amp 0.125 1:204382589-204792596 1:204391756-204463852
2 MDM4 Amp 0.125 1:204382589-204792596 1:204485511-204542871
3 RP11-430C7.2 Amp 0.125 1:204382589-204792596 1:204497973-204498820
4 RNA5SP74 Amp 0.125 1:204382589-204792596 1:204531541-204531649
5 RP11-430C7.4 Amp 0.125 1:204382589-204792596 1:204572163-204585693
6 LRRN2 Amp 0.125 1:204382589-204792596 1:204586298-204654861
7 RP11-430C7.5 Amp 0.125 1:204382589-204792596 1:204595903-204598840
8 AL161793.1 Amp 0.125 1:204382589-204792596 1:204622230-204622314
9 RP11-23I7.1 Amp 0.125 1:204382589-204792596 1:204633000-204633741
10 RNA5SP75 Amp 0.125 1:204382589-204792596 1:204676448-204676558
# … with 307 more rows
完整代码:https://github.com/dxsbiocc/learn/blob/main/R/TCGA/CNV_analysis.R
4. 基因组变异可视化
4.1 OncoPrint
下面我们展示如何绘制基因组突变数据,首先,我们使用的是之前介绍过的 complexHeatmap
包的 OncoPrint
函数。
GDCquery_maf
函数用于下载突变数据
LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
GBMmut <- GDCquery_Maf(tumor = "GBM", pipelines = "mutect2")
mut <- LGGmut %>% add_row(GBMmut)
save(mut, file ="~/Downloads/mafMutect2LGGGBM.rda")
并提取胶质瘤相关通路中的基因信息
# 提取胶质瘤通路中基因的突变信息
Glioma_signaling <- as_tibble(TCGAbiolinks:::listEA_pathways) %>%
filter(grepl("glioma", Pathway, ignore.case = TRUE)) %>%
subset(Pathway == "Glioma Signaling")
# 只取 10 个基因的突变信息
Glioma_signaling_genes <- unlist(str_split(Glioma_signaling$Molecules, ","))[1:10]
mut <- mut[mut$Hugo_Symbol %in% Glioma_signaling_genes,]
# 获取对应的临床信息
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
clinical <- gbm_clin %>% add_row(lgg_clin)
绘制图形
TCGAvisualize_oncoprint(
mut = mut,
gene = Glioma_signaling_genes,
annotation = clinical[, c("bcr_patient_barcode", "disease", "vital_status", "ethnicity")],
annotation.position = "bottom",
color=c("background"="#CCCCCC", "DEL"="#fb8072", "INS"="#80b1d3", "SNP"="#fdb462"),
label.font.size = 16,
rm.empty.columns = FALSE
)
4.2 circos plot
基因组变异信息也可以使用 circos
图形来展示,我们使用前面介绍的 circlize
包来展示 CNV
和突变信息
CNV
的数据是 GAIA
分析的结果,突变数据使用的是从 GDC
数据库中获取的 LGG
样本的 MAF
文件的结果
首先,处理一下突变数据
LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
# 提取需要的突变类型
mut_type <- c(
"Missense_Mutation",
"Nonsense_Mutation",
"Nonstop_Mutation",
"Frame_Shift_Del",
"Frame_Shift_Ins"
)
mut <- filter(LGGmut, Variant_Classification %in% mut_type) %>%
mutate(
mut.id = glue("{Chromosome}:{Start_Position}-{End_Position}|{Reference_Allele}"),
.before = Hugo_Symbol
) %>%
# 只考虑至少在两个样本中发生突变的基因
filter(duplicated(mut.id)) %>%
dplyr::select(c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")) %>%
distinct_all() %>% {
# 将 typeNames 作为全局变量,后面还要使用
typeNames <<- unique(.$Variant_Classification)
type = structure(1:length(typeNames), names = typeNames)
mutate(., type = type[Variant_Classification])
} %>%
dplyr::select(c(1:3,6,4,5))
CNV
数据处理
load("~/Downloads/GBM_CNV_results.rda")
cnv <- as_tibble(RecCNV) %>%
# 值考虑小于阈值的区域,threshold = 0.3
filter(`q-value` <= threshold) %>%
select(c(1, 3, 4, 2)) %>%
# 将染色体名称转换为 chr 开头的字符串
mutate(
Chromosome = ifelse(
Chromosome == 23, "X", ifelse(Chromosome == 24, "Y", Chromosome)
),
Chromosome = paste0("chr", Chromosome)
) %>%
mutate(CNV = 1) %>%
`names<-`(c("Chromosome","Start_position","End_position","Aberration_Kind","CNV"))
绘制 circos
图
library(circlize)
par(mar=c(1,1,1,1), cex=1)
circos.initializeWithIdeogram()
# 添加 CNV 结果
colors <- c("forestgreen","firebrick")
circos.genomicTrackPlotRegion(
cnv, ylim = c(0, 1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(
region, value, ytop.column = 2,
ybottom = 0, col = colors[value[[1]] + 1],
border = "white"
)
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
}
)
# 添加突变结果
colors <- structure(c("blue","green","red", "gold"), names = typeNames[1:4])
circos.genomicTrackPlotRegion(
mut, ylim = c(1.2, 4.2),
panel.fun = function(region, value, ...) {
circos.genomicPoints(
region, value, cex = 0.8, pch = 16,
col = colors[value[[2]]], ...)
}
)
circos.clear()
# 添加图例
legend(
-0.2, 0.2, bty = "n", y.intersp = 1,
c("Amp", "Del"), pch = 15,
col = c("firebrick", "forestgreen"),
title = "CNVs", text.font = 1,
cex = 0.8, title.adj = 0
)
legend(
-0.2, 0, bty = "n", y.intersp = 1,
names(colors), pch = 20, col = colors,
title = "Mutations", text.font = 1,
cex = 0.8, title.adj = 0
)
4.3 部分区域可视化
我们可以将基因组范围限制在某一条染色体上,来更好地查看该染色体上突变和拷贝数的变异。
可以使用如下图形来展示
par(mar=c(.1, .1, .1, .1),cex=1.5)
circos.par(
"start.degree" = 90, canvas.xlim = c(0, 1),
canvas.ylim = c(0, 1), gap.degree = 270,
cell.padding = c(0, 0, 0, 0),
track.margin = c(0.005, 0.005)
)
circos.initializeWithIdeogram(chromosome.index = "chr17")
circos.par(cell.padding = c(0, 0, 0, 0))
# 添加 CNV 结果
colors <- c("forestgreen","firebrick")
names(colors) <- c(0,1)
circos.genomicTrackPlotRegion(
cnv, ylim = c(0, 1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(
region, value, ytop.column = 2,
ybottom = 0, col = colors[value[[1]]],
border = "white"
)
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
}
)
# 添加单基因突变结果
gene.mut <- mutate(mut, genes.mut = glue("{Hugo_Symbol}-{type}")) %>%
filter(!duplicated(genes.mut)) %>% {
n.mut <- table(.$genes.mut)
mutate(., num = n.mut[.$genes.mut])
} %>%
mutate(
genes.num = as.character(glue("{Hugo_Symbol}({num})")),
type = type / 2
) %>%
dplyr::select(-(6:8))
colors <- c("blue","green","red","gold")
names(colors) <- typeNames[1:4]
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0.3, 2.2), track.height = 0.05,
panel.fun = function(region, value, ...) {
circos.genomicPoints(
region, value, cex = 0.4, pch = 16,
col = colors[value[[2]]], ...
)
}
)
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0, 1), track.height = 0.1, bg.border = NA
)
i_track = get.cell.meta.data("track.index")
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0, 1),
panel.fun = function(region, value, ...) {
circos.genomicText(
region, value, y = 1, labels.column = 3,
col = colors[value[[2]]], facing = "clockwise",
adj = c(1, 0.5), posTransform = posTransform.text,
cex = 0.4, niceFacing = TRUE
)
},
track.height = 0.1,
bg.border = NA
)
circos.genomicPosTransformLines(
gene.mut,
posTransform = function(region, value)
posTransform.text(
region, y = 1, labels = value[[3]],
cex = 0.4, track.index = i_track + 1
),
direction = "inside",
track.index = i_track
)
circos.clear()
legend(
0, 0.38, bty = "n", y.intersp = 1, c("Amp", "Del"),
pch = 15, col = c("firebrick", "forestgreen"),
title = "CNVs", text.font = 1, cex = 0.7, title.adj = 0
)
legend(
0, 0.2, bty = "n", y.intersp = 1, names(colors),
pch = 16, col = colors, title = "Mutations",
text.font = 1, cex = 0.7, title.adj = 0
)
完整代码:https://github.com/dxsbiocc/learn/blob/main/R/TCGA/Mutation_CNV_visualize.R