R语言与科研生信学习

TCGAbiolinks下载TCGA数据(更新版本)

2019-08-19  本文已影响0人  医科研

TCGAbiolinks数据下载

load package

# Install BiocManager from CRAN if it is missing; it is the installer used
# for Bioconductor packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# TCGAbiolinks is distributed through Bioconductor, not CRAN, so the guard
# must test for TCGAbiolinks itself and install it via BiocManager::install().
if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
  BiocManager::install("TCGAbiolinks")
}

# Packages used throughout the rest of this walkthrough.
library(TCGAbiolinks)
library(DT)
library(dplyr)
library(SummarizedExperiment)

数据来源-根据TCGAbiolinks的官方说明

数据检索

# Query the harmonized GDC database for TCGA-COAD 450k methylation files.
# `legacy = FALSE` is the default and is omitted so the call also runs on
# TCGAbiolinks releases that no longer accept a `legacy` argument.
query.met <- GDCquery(
  project = "TCGA-COAD",
  data.category = "DNA Methylation",
  platform = "Illumina Human Methylation 450"
)

# Query TCGA-COAD gene expression quantification (FPKM-UQ).
# NOTE(review): GDC has since retired the HTSeq workflows; on current data
# releases this probably needs workflow.type = "STAR - Counts" -- confirm
# against the GDC release you are querying.
query.exp <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - FPKM-UQ"
)

# The object returned by GDCquery() is a summary of the query; the per-file
# table is obtained with getResults(), so preview that table instead of
# indexing the query object directly.
getResults(query.exp)[1:5, 1:5]
getResults(query.met)[1:5, 1:5]

# Interactive, filterable table of the expression query results.
datatable(
  getResults(query.exp, cols = c("data_type", "cases")),
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

mRNA Expression

# Query raw HTSeq read counts for the TCGA-BRCA project.
query <- GDCquery(
  project = "TCGA-BRCA",                        # cancer type
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - Counts"              # raw counts
)

# GDCdownload() and GDCprepare() must be given the same directory: prepare
# reads the files that download wrote, and its default location ("GDCdata")
# differs from the custom folder used here.
download_dir <- "./project"
GDCdownload(
  query,
  directory = download_dir,
  method = "api",
  files.per.chunk = 100  # download in chunks of 100 files
)
data <- GDCprepare(query, directory = download_dir)

# Extract the count matrix from the prepared object and inspect it.
count_data <- assay(data)
count_data[1:5, 1:5]
dim(count_data)  # 56537 x 1222 when the article was written

# Clinical information stored alongside the counts.
colData(data)[1:5, 1:5]

# Save the count matrix for later use.
# save(count_data, file = "BRCA_count.Rdata")

clinical data

# Fetch the clinical table for the TCGA-BRCA project.
clinical <- GDCquery_clinic(
  project = "TCGA-BRCA",
  type = "clinical"
)
clinical[seq_len(5), seq_len(5)]  # preview the first 5 rows and columns
dim(clinical)                     # 1097 x 68 when the article was written

# Optionally persist the clinical table.
# save(clinical, file = "BRCA_clinical.Rdata")
# write.csv(clinical, file = "TCGAbiolinks-BRCA-clinical.csv")

# Alternative source of clinical annotation: the column metadata of `data`,
# which must already exist in the session.
clinical_2 <- colData(data)
# write.csv(as.data.frame(clinical_2), file = "TCGAbiolinks-BRCA-clinical-2.csv")

## 获取所有TCGA的临床信息
# Collect the clinical tables of every TCGA project into one table.
library(data.table)
library(dplyr)
library(regexPipes)

clinical <- TCGAbiolinks::getGDCprojects()$project_id %>%
  regexPipes::grep("TCGA", value = TRUE) %>%  # keep TCGA projects only
  sort() %>%
  # One GDCquery_clinic() call per project id, with a text progress bar.
  plyr::alply(1, GDCquery_clinic, .progress = "text") %>%
  # Match columns by name and pad missing ones with NA so that projects with
  # differing clinical columns do not abort the bind.
  rbindlist(use.names = TRUE, fill = TRUE)

dim(clinical)
clinical[1:5, 1:5]

# Optionally write the combined table to disk.
# readr::write_csv(clinical, "TCGA_all_clin_indexed.csv")

miRNA data

# Query miRNA expression quantification for the TCGA-BRCA project.
query <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification",
  workflow.type = "BCGSC miRNA Profiling"
)

# Inspect the query results.
results <- getResults(query)
dim(results)
results[1:5, 1:5]
colnames(results)

# Download the files; small chunks reduce the risk of a failed download.
GDCdownload(
  query,
  method = "api",
  files.per.chunk = 20
)

# Prepare the data without wrapping it in a SummarizedExperiment.
mir_exp <- GDCprepare(
  query = query,
  summarizedExperiment = FALSE
)

# Structure of the prepared data.
dim(mir_exp)
mir_exp[1:5, 1:5]

# Save the raw miRNA data for later use.
# save(mir_exp, file = "BRCA_miRNA_raw.Rdata")

总结

除了下载功能以外,TCGAbiolinks还包括了一些数据分析挖掘功能,例如差异表达分析(TCGAanalyze_DEA)、生存分析(TCGAanalyze_survival)以及可视化(TCGAvisualize 系列函数)。

参考资料

  1. TCGAbiolinks Vignettes
  2. TCGAbiolinks文章1
  3. TCGAbiolinks文章2

广而告之

说一个事,鉴于简书平台在信息传播方面有不足之处,应粉丝要求,白介素2的个人微信平台已经开启,继续聊临床与科研的故事,R语言,数据挖掘,文献阅读等内容。当然也不要期望过高,微信平台目前的定位是作为自己的读书笔记,如果对大家有帮助最好。如果感兴趣, 可以扫码关注下。


qrcode_for_gh_9eaa04438675_258.jpg
上一篇 下一篇

猜你喜欢

热点阅读