Bioconductor --- TCGA Workflow
今天介绍生信平台Bioconductor上的TCGA Workflow,首先是R包的安装
if (!"BiocManager" %in% rownames(installed.packages()))
install.packages("BiocManager")
BiocManager::install("TCGAWorkflow")
然后载入需要的R包,前者是各种示例数据集的封装,后者用于可视化
library(TCGAWorkflowData)
library(DT)
整体上我只是大致地跑一下,真正用到自己的科研项目上各函数各参数还要再细致雕琢。Bioconductor包的特点就是充满了S4对象,如果想流畅操作Bioconductor上的各种R包,对于S4型面向对象编程的深入理解是必不可少的。
1. Access to the data
1.1 Downloading data from TCGA data portal
R包TCGAbiolinks有3个主函数,GDCquery()
, GDCdownload()
, GDCprepare()
,分别用于查询,下载和将数据作为R对象加载。
R包TCGAbiolinks下载 GDC Legacy Archive 中的甲基化数据和基因表达数据
library(TCGAbiolinks)
library(SummarizedExperiment)
# Obs: The data in the legacy database has been aligned to hg19
query.met.gbm <- GDCquery(project = "TCGA-GBM",
legacy = TRUE,
data.category = "DNA methylation",
platform = "Illumina Human Methylation 450",
barcode = c("TCGA-76-4926-01B-01D-1481-05", "TCGA-28-5211-01C-11D-1844-05"))
GDCdownload(query.met.gbm)
met.gbm.450 <- GDCprepare(query = query.met.gbm,
save = TRUE,
save.filename = "gbmDNAmet450k.rda",
summarizedExperiment = TRUE)
query.met.lgg <- GDCquery(project = "TCGA-LGG",
legacy = TRUE,
data.category = "DNA methylation",
platform = "Illumina Human Methylation 450",
barcode = c("TCGA-HT-7879-01A-11D-2399-05", "TCGA-HT-8113-01A-11D-2399-05"))
GDCdownload(query.met.lgg)
met.lgg.450 <- GDCprepare(query = query.met.lgg,
save = TRUE,
save.filename = "lggDNAmet450k.rda",
summarizedExperiment = TRUE)
met.gbm.lgg <- cbind(assay(met.lgg.450), assay(met.gbm.450))
query.exp.lgg <- GDCquery(project = "TCGA-LGG",
legacy = TRUE,
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "results",
sample.type = "Primary solid Tumor")
GDCdownload(query.exp.lgg)
exp.lgg <- GDCprepare(query = query.exp.lgg, save = TRUE, save.filename = "lggExp.rda")
query.exp.gbm <- GDCquery(project = "TCGA-GBM",
legacy = TRUE,
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "results",
sample.type = "Primary solid Tumor")
GDCdownload(query.exp.gbm)
exp.gbm <- GDCprepare(query = query.exp.gbm, save = TRUE, save.filename = "gbmExp.rda")
exp.gbm.lgg <- cbind(assay(exp.lgg), assay(exp.gbm))
R包TCGAbiolinks下载 GDC data portal 中的拷贝数变异数据
# Data.category: Copy number variation aligned to hg38
query <- GDCquery(project = "TCGA-ACC",
data.category = "Copy Number Variation",
data.type = "Copy Number Segment",
barcode = c( "TCGA-OR-A5KU-01A-11D-A29H-01", "TCGA-OR-A5JK-01A-11D-A29H-01"))
GDCdownload(query)
data <- GDCprepare(query)
query <- GDCquery("TCGA-ACC",
"Copy Number Variation",
data.type = "Masked Copy Number Segment",
sample.type = c("Primary solid Tumor")) # see the barcodes with getResults(query)$cases
GDCdownload(query)
data <- GDCprepare(query)
R包TCGAbiolinks下载临床数据,有两种方法,第一种方法是使用函数GDCquery_clinical()
下载indexed GDC clinical data,第二种方法将下载包含患者所有临床数据的xml文件,并从中检索所需的信息。
The first one will download only the indexed GDC clinical data which includes diagnoses (vital status, days to death, age at diagnosis, days to last follow up, days to recurrence), treatments (days to treatment, treatment id, therapeutic agents, treatment intent type), demographic (gender, race, ethnicity) and exposures (cigarettes per day, weight, height, alcohol history) information.
# get indexed clinical patient data for GBM samples
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
# get indexed clinical patient data for LGG samples
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
# Bind the results, as the columns might not be the same,
# we will will plyr rbind.fill, to have all columns from both files
clinical <- plyr::rbind.fill(gbm_clin,lgg_clin)
# Fetch clinical data directly from the clinical XML files.
# if barcode is not set, it will consider all samples.
# We only set it to make the example faster
query <- GDCquery(project = "TCGA-GBM",
file.type = "xml",
data.category = "Clinical",
barcode = c("TCGA-08-0516","TCGA-02-0317"))
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")
clinical.drug <- GDCprepare_clinic(query, clinical.info = "drug")
clinical.radiation <- GDCprepare_clinic(query, clinical.info = "radiation")
clinical.admin <- GDCprepare_clinic(query, clinical.info = "admin")
R包TCGAbiolinks下载突变数据
mutation <– GDCquery_Maf(tumor = "ACC", pipelines = "mutect2")
访问从TCGA论文中检索到的亚型信息
gbm.subtypes <– TCGAquery_subtype(tumor = "gbm")
brca.subtypes <– TCGAquery_subtype(tumor = "brca")
补充介绍SummarizedExperiment S4对象,可以使用三个不同的accessor访问数据,assay()
获取表达矩阵,rowRanges()
获取基因信息,colData()
获取样本信息。
# R object --- SummarizedExperiment
library(TCGAWorkflowData)
library(SummarizedExperiment)
# Load object from TCGAWorkflowData package
# This object will be created in the further sections,
data(GBMIllumina_HiSeq)
# get expression matrix
data <- assay(gbm.exp)
datatable(data[1:10,],
options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
rownames = TRUE)
# get genes information
genes.info <- rowRanges(gbm.exp)
genes.info
# get sample information
sample.info <- colData(gbm.exp)
1.2 Downloading data from Broad TCGA GDAC
R包RTCGAToolbox下载GDAC数据
library(RTCGAToolbox)
# Get the last run dates
lastRunDate <- getFirehoseRunningDates()[1]
lastAnalyseDate <- getFirehoseAnalyzeDates()[1]
# get DNA methylation data, RNAseq2 and clinical data for LGG
lgg.data <- getFirehoseData(dataset = "LGG",
gistic2_Date = getFirehoseAnalyzeDates(1), runDate = lastRunDate,
Methylation = FALSE, RNAseq2_Gene_Norm = FALSE, Clinic = TRUE, Mutation = TRUE,
fileSizeLimit = 10000)
# get DNA methylation data, RNAseq2 and clinical data for GBM
gbm.data <- getFirehoseData(dataset = "GBM",
runDate = lastDate, gistic2_Date = getFirehoseAnalyzeDates(1),
Methylation = FALSE, Clinic = TRUE, RNAseq2_Gene_Norm = TRUE, Mutation = TRUE,
fileSizeLimit = 10000)
# To access the data you should use the getData function or simply access with @ (for example gbm.data@Clinical)
lgg.mut <- getData(lgg.data, "Mutation")
lgg.clin <- getData(lgg.data, "clinical")
# Download GISTIC results
lastanalyzedate <- getFirehoseAnalyzeDates(1)
gistic <- getFirehoseData("GBM", gistic2Date = lastanalyzedate, clinical = FALSE, GISTIC = TRUE)
# get GISTIC results
gistic.allbygene <- getData(gistic, type = "GISTIC", platform = "AllByGene")
gistic.thresholedbygene <- getData(gistic, type = "GISTIC", platform = "ThresholdedByGene")
2. Genomic analysis
拷贝数变异(CNVs)在癌症的发生和发展中起着重要的作用。作为基因组重排(如缺失,重复,插入,倒位和易位)的结果,染色体片段可以发生缺失或扩增。CNVs是大于1 kb的基因组区域,在两种情况下拷贝数发生变化(例如,肿瘤与正常情况)。
TCGA收集拷贝数数据,并允许对癌症进行CNV分析。利用微阵列和基于测序的技术,对肿瘤和配对正常DNA样本进行分析,探测拷贝数变异。作者将展示如何分析来自TCGA的CNV level 3数据,以确定癌症基因组的反复变异,分析了来自SNP array (Affymetrix Genome-Wide Human SNP Array 6.0) 的GBM和LGG segmented CNV 。
2.1 Pre-Processing Data
library(TCGAbiolinks)
query.lgg.nocnv <- GDCquery(project = "TCGA-LGG",
data.category = "Copy number variation",
legacy = TRUE,
file.type = "nocnv_hg19.seg",
sample.type = c("Primary solid Tumor"))
GDCdownload(query.lgg.nocnv, files.per.chunk = 100)
lgg.nocnv <- GDCprepare(query.lgg.nocnv, save = TRUE, save.filename = "LGGnocnvhg19.rda")
query.gbm.nocnv <- GDCquery(project = "TCGA-GBM",
data.category = "Copy number variation",
legacy = TRUE,
file.type = "nocnv_hg19.seg",
sample.type = c("Primary solid Tumor"))
GDCdownload(query.gbm.nocnv, files.per.chunk = 100)
gbm.nocnv <- GDCprepare(query.gbm.nocnv, save = TRUE, save.filename = "GBMnocnvhg19.rda")
2.2 Identification of recurrent CNV in cancer
在许多分析的基因组中都存在与癌症相关的拷贝数变异。R包GAIA用于识别频发拷贝数变异。
library(TCGAbiolinks)
library(downloader)
library(readr)
library(gaia)
for(cancer in c("LGG","GBM")){
message(paste0("Starting", cancer))
# Prepare CNV matrix
cnvMatrix <- get(load(paste0(cancer, "nocnvhg19.rda")))
# Add label (0 for loss, 1 for gain)
cnvMatrix <- cbind(cnvMatrix, Label = NA)
cnvMatrix[cnvMatrix[, "Segment_Mean"] < -0.3, "Label"] <- 0
cnvMatrix[cnvMatrix[, "Segment_Mean"] > 0.3, "Label"] <- 1
cnvMatrix <- cnvMatrix [!is.na(cnvMatrix$Label),]
# Remove "Segment_Mean" and change col.names
cnvMatrix <- cnvMatrix[,-6]
colnames(cnvMatrix)<- c("Sample.Name", "Chromosome","Start","End","Num.of.Markers", "Aberration")
# Substitute Chromosomes "X" and "Y" with "23" and "24"
xidx <- which(cnvMatrix$Chromosome=="X")
yidx <- which(cnvMatrix$Chromosome=="Y")
cnvMatrix[xidx,"Chromosome"] <- 23
cnvMatrix[yidx,"Chromosome"] <- 24
cnvMatrix$Chromosome <- sapply(cnvMatrix$Chromosome,as.integer)
# Recurrent CNV identification with GAIA
# Retrieve probes meta file from broadinstitute website
# Recurrent CNV identification with GAIA
gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/"
file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt")
# Retrieve probes meta file from broadinstitute website
if(!file.exists(basename(file))) downloader::download(file, basename(file))
markersMatrix <- readr::read_tsv(basename(file), col_names = FALSE, col_types = "ccn", progress = TRUE)
colnames(markersMatrix) <- c("Probe.Name", "Chromosome", "Start")
unique(markersMatrix$Chromosome)
xidx <- which(markersMatrix$Chromosome=="X")
yidx <- which(markersMatrix$Chromosome=="Y")
markersMatrix[xidx,"Chromosome"] <- 23
markersMatrix[yidx,"Chromosome"] <- 24
markersMatrix$Chromosome <- sapply(markersMatrix$Chromosome,as.integer)
markerID <- apply(markersMatrix,1,function(x) paste0(x[2],":",x[3]))
print(table(duplicated(markerID)))
## FALSE TRUE
## 1831041 186
# There are 186 duplicated markers
print(table(duplicated(markersMatrix$Probe.Name)))
## FALSE
## 1831227
# ... with different names!
# Removed duplicates
markersMatrix <- markersMatrix[-which(duplicated(markerID)),]
# Filter markersMatrix for common CNV
markerID <- apply(markersMatrix,1,function(x) paste0(x[2],":",x[3]))
file <- paste0(gdac.root, "CNV.hg19.bypos.111213.txt")
if(!file.exists(basename(file))) download(file, basename(file))
commonCNV <- readr::read_tsv(basename(file), progress = TRUE)
commonID <- apply(commonCNV,1,function(x) paste0(x[2],":",x[3]))
print(table(commonID %in% markerID))
print(table(markerID %in% commonID))
markersMatrix_fil <- markersMatrix[!markerID %in% commonID,]
markers_obj <- load_markers(as.data.frame(markersMatrix_fil))
nbsamples <- length(get(paste0("query.",tolower(cancer),".nocnv"))$results[[1]]$cases)
cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples)
results <- runGAIA(cnv_obj,
markers_obj,
output_file_name = paste0("GAIA_",cancer,"_flt.txt"),
aberrations = -1, # -1 to all aberrations
chromosomes = -1, # -1 to all chromosomes
num_iterations = 10,
threshold = 0.25)
# Set q–value threshold
threshold <- 0.0001
# Plot the results
RecCNV <- t(apply(results,1,as.numeric))
colnames(RecCNV) <- colnames(results)
RecCNV <- cbind(RecCNV, score=0)
minval <- format(min(RecCNV[RecCNV[,"q-value"]!=0,"q-value"]),scientific=FALSE)
minval <- substring(minval,1, nchar(minval)-1)
RecCNV[RecCNV[,"q-value"]==0,"q-value"] <- as.numeric(minval)
RecCNV[,"score"] <- sapply(RecCNV[,"q-value"],function(x) -log10(as.numeric(x)))
RecCNV[RecCNV[,"q-value"]==as.numeric(minval),]
gaiaCNVplot(RecCNV, threshold)
save(results, RecCNV, threshold, file = paste0(cancer,"_CNV_results.rda"))
message(paste0("Results saved as:", cancer,"_CNV_results.rda"))
}
R包TCGAbiolinks的函数gaiaCNVplot()
用于可视化,为了节省时间我只跑了部分样本,得到这样的示意图
经鉴定拷贝数显著改变的基因组区域 (corrected p-value < 10–4) 随后被注释,以报告可能与癌症相关的扩增和缺失基因。
2.3 Identification of recurrent CNV in cancer
library(biomaRt)
library(GenomicRanges)
# Get gene information from GENCODE using biomart
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19")
# mart <– useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
# genes <– getBM(attributes = c("hgnc_symbol", "chromosome_name","start_position","end_position"), mart=mart)
genes <- genes[genes$external_gene_name != "" & genes$chromosome_name %in% c(1:22,"X","Y"),]
genes[genes$chromosome_name == "X", "chromosome_name"] <- 23
genes[genes$chromosome_name == "Y", "chromosome_name"] <- 24
genes$chromosome_name <- sapply(genes$chromosome_name,as.integer)
genes <- genes[order(genes$start_position),]
genes <- genes[order(genes$chromosome_name),]
genes <- genes[,c("external_gene_name", "chromosome_name", "start_position","end_position")]
colnames(genes) <- c("GeneSymbol","Chr","Start","End")
genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
save(genes_GR,genes,file = "genes_GR.rda", compress = "xz")
# Get gene information from GENCODE using biomart
data(genes_GR) # downloaded in the previous step (available in TCGAWorkflowData)
cancer = "LGG"
load(paste0(cancer,"_CNV_results.rda"))
sCNV <- RecCNV[RecCNV[,"q-value"] <= threshold,c(1:4,6)]
sCNV <- sCNV[order(sCNV[,3]),]
sCNV <- sCNV[order(sCNV[,1]),]
colnames(sCNV) <- c("Chr","Aberration","Start","End","q-value")
sCNV_GR <- makeGRangesFromDataFrame(sCNV,keep.extra.columns = TRUE)
hits <- findOverlaps(genes_GR, sCNV_GR, type = "within")
sCNV_ann <- cbind(sCNV[subjectHits(hits),],genes[queryHits(hits),])
AberrantRegion <- paste0(sCNV_ann[,1],":",sCNV_ann[,3],"-",sCNV_ann[,4])
GeneRegion <- paste0(sCNV_ann[,7],":",sCNV_ann[,8],"-",sCNV_ann[,9])
AmpDel_genes <- cbind(sCNV_ann[,c(6,2,5)],AberrantRegion,GeneRegion)
AmpDel_genes[AmpDel_genes[,2] == 0,2] <- "Del"
AmpDel_genes[AmpDel_genes[,2] == 1,2] <- "Amp"
rownames(AmpDel_genes) <- NULL
save(RecCNV, AmpDel_genes, file = paste0(cancer,"_CNV_results.rda"))
2.4 Visualizing multiple genomic alteration events
R包circlize进行Circos圈图的绘制
LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
# Retrieve curated mutations for selected cancer (e.g. "LGG")
data(mafMutect2LGGGBM)
# Select only potentially damaging mutations
LGGmut <- LGGmut[LGGmut$Variant_Classification %in% c("Missense_Mutation",
"Nonsense_Mutation",
"Nonstop_Mutation",
"Frame_Shift_Del",
"Frame_Shift_Ins"),]
# Select recurrent mutations (identified in at least two samples)
mut.id <- paste0(LGGmut$Chromosome,":",LGGmut$Start_Position,"-",LGGmut$End_Position,"|",LGGmut$Reference_Allele)
mut <- cbind(mut.id, LGGmut)
# Prepare selected mutations data for circos plot
s.mut <- mut[mut$mut.id %in% unique(mut.id[duplicated(mut.id)]),]
s.mut <- s.mut[,c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")]
s.mut <- unique(s.mut)
typeNames <- unique(s.mut[,4])
type <- c(4:1)
names(type) <- typeNames[1:4]
Type <- type[s.mut[,4]]
s.mut <- cbind(s.mut,Type)
s.mut <- s.mut[,c(1:3,6,4,5)]
# Load recurrent CNV data for selected cancer (e.g. "LGG")
load("LGG_CNV_results.rda")
# Prepare selected sample CNV data for circos plot
s.cnv <- as.data.frame(RecCNV[RecCNV[,"q-value"] <= 0.0001,c(1:4,6)])
s.cnv <- s.cnv[,c(1,3,4,2)]
s.cnv[s.cnv$Chromosome == 23,"Chromosome"] <- "X"
s.cnv[s.cnv$Chromosome == 24,"Chromosome"] <- "Y"
Chromosome <- paste0("chr",s.cnv[,1])
s.cnv <- cbind(Chromosome, s.cnv[,-1])
s.cnv[,1] <- as.character(s.cnv[,1])
s.cnv[,4] <- as.character(s.cnv[,4])
s.cnv <- cbind(s.cnv,CNV=1)
colnames(s.cnv) <- c("Chromosome","Start_position","End_position","Aberration_Kind","CNV")
library(circlize)
# Draw genomic circos plot
par(mar=c(1,1,1,1), cex=1)
circos.initializeWithIdeogram()
# Add CNV results
colors <- c("forestgreen","firebrick")
names(colors) <- c(0,1)
circos.genomicTrackPlotRegion(s.cnv, ylim = c(0,1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(region, value, ytop.column = 2, ybottom = 0,
col = colors[value[[1]]],
border="white")
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
})
# Add mutation results
colors <- c("blue","green","red","gold")
names(colors) <- typeNames[1:4]
circos.genomicTrackPlotRegion(s.mut, ylim = c(1.2,4.2),
panel.fun = function(region, value, ...) {
circos.genomicPoints(region, value, cex = 0.8, pch = 16, col = colors[value[[2]]], ...)
})
circos.clear()
legend(-0.2, 0.2, bty="n", y.intersp=1, c("Amp","Del"), pch=15,
col=c("firebrick","forestgreen"), title="CNVs", text.font=1, cex=0.4, title.adj=0)
legend(-0.2, 0, bty="n", y.intersp=1, names(colors), pch=16,
col=colors, title="Mutations", text.font=1, cex=0.4, title.adj=0)
par(mar=c(1,1,1,1),cex=1.5)
circos.par("start.degree" = 90, canvas.xlim = c(0, 1), canvas.ylim = c(0, 1),
gap.degree = 270, cell.padding = c(0, 0, 0, 0), track.margin = c(0.005, 0.005))
circos.initializeWithIdeogram(chromosome.index = "chr17")
circos.par(cell.padding = c(0, 0, 0, 0))
# Add CNV results
colors <- c("forestgreen","firebrick")
names(colors) <- c(0,1)
circos.genomicTrackPlotRegion(s.cnv, ylim = c(0,1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(region, value, ytop.column = 2, ybottom = 0,
col = colors[value[[1]]],
border="white")
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
})
# Add mutation results representing single genes
genes.mut <- paste0(s.mut$Hugo_Symbol,"-",s.mut$Type)
s.mutt <- cbind(s.mut,genes.mut)
n.mut <- table(genes.mut)
idx <- !duplicated(s.mutt$genes.mut)
s.mutt <- s.mutt[idx,]
s.mutt <- cbind(s.mutt,num=n.mut[s.mutt$genes.mut])
genes.num <- paste0(s.mutt$Hugo_Symbol," (",s.mutt$num.Freq,")")
s.mutt <- cbind(s.mutt[,-c(6:8)],genes.num)
s.mutt[,6] <- as.character(s.mutt[,6])
s.mutt[,4] <- s.mutt[,4]/2
s.mutt$num.Freq <- NULL
colors <- c("blue","green","red","gold")
names(colors) <- typeNames[1:4]
circos.genomicTrackPlotRegion(s.mutt, ylim = c(0.3,2.2), track.height = 0.05,
panel.fun = function(region, value, ...) {
circos.genomicPoints(region, value, cex = 0.4, pch = 16, col = colors[value[[2]]], ...)
})
circos.genomicTrackPlotRegion(s.mutt, ylim = c(0, 1), track.height = 0.1, bg.border = NA)
i_track = get.cell.meta.data("track.index")
circos.genomicTrackPlotRegion(s.mutt, ylim = c(0,1),
panel.fun = function(region, value, ...) {
circos.genomicText(region, value,
y = 1,
labels.column = 3,
col = colors[value[[2]]],
facing = "clockwise", adj = c(1, 0.5),
posTransform = posTransform.text, cex = 0.4, niceFacing = TRUE)
}, track.height = 0.1, bg.border = NA)
circos.genomicPosTransformLines(s.mutt,
posTransform = function(region, value)
posTransform.text(region,
y = 1,
labels = value[[3]],
cex = 0.4, track.index = i_track+1),
direction = "inside", track.index = i_track)
circos.clear()
legend(0.25, 0.2, bty="n", y.intersp=1, c("Amp","Del"), pch=15,
col=c("firebrick","forestgreen"), title="CNVs", text.font=1, cex=0.4, title.adj=0)
legend(0, 0.2, bty="n", y.intersp=1, names(colors), pch=16,
col=colors, title="Mutations", text.font=1, cex=0.4, title.adj=0)
未完待续……