TCGA常见数据库下载方式
2021-07-22 本文已影响0人
医只蜗牛
TCGA常见下载方式:
【01】直接复制链接,在线下载解压
来源:
> getwd()
[1] "D:/R_code/follow_practice/xuetu_GEO_follow/week_practise/01_follow_practise/01_TP53_BRCA"
# Step1 download TCGA dataset ---------------------------------------------
# Build the TCGA-BRCA HTSeq count table `raw_data`, or reload it from the
# cached ./data/TCGA-BRCA.htseq_counts.Rdata when that file already exists.
# NOTE(review): rm(list = ls()) wipes the whole workspace of whoever runs
# this script; it is kept only to preserve the original behaviour.
rm(list = ls())
if (!file.exists("./data/TCGA-BRCA.htseq_counts.Rdata")) {
  # Both folders are written to below; create them up front so a fresh
  # checkout does not fail inside download.file() / save().
  dir.create("./raw_data", showWarnings = FALSE, recursive = TRUE)
  dir.create("./data", showWarnings = FALSE, recursive = TRUE)

  gzfile <- "./raw_data/TCGA-BRCA.htseq_counts.tsv.gz"
  download.file(
    "https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-BRCA.htseq_counts.tsv.gz",
    destfile = gzfile,
    mode = "wb" # the archive is binary; never let it be written in text mode
  )

  library(R.utils)
  # Keep the .gz, and overwrite a stale .tsv left behind by an aborted run.
  gunzip(gzfile, remove = FALSE, overwrite = TRUE)

  library(data.table)
  # The file is tab-separated; any other separator leaves every row
  # unparsed in a single column.
  raw_data <- fread(
    "./raw_data/TCGA-BRCA.htseq_counts.tsv",
    sep = "\t",
    header = TRUE
  )
  raw_data <- as.data.frame(raw_data)
  raw_data[1:5, 1:6] # preview (only prints when run line by line)

  # Move the first column (the row identifiers) into the row names.
  rownames(raw_data) <- raw_data[, 1]
  raw_data <- raw_data[, -1]
  raw_data[1:5, 1:6]

  # Invert a log2(x + 1) transform and round up to whole counts.
  # NOTE(review): assumes the hub file stores log2(count + 1) -- confirm.
  raw_data <- 2^raw_data - 1
  raw_data <- ceiling(raw_data)
  raw_data[1:5, 1:6]

  # Keep only rows that are zero in fewer than 10 columns.
  pick_row <- rowSums(raw_data == 0) < 10
  raw_data <- raw_data[pick_row, ]
  dim(raw_data)

  save(raw_data, file = "./data/TCGA-BRCA.htseq_counts.Rdata")
} else {
  load("./data/TCGA-BRCA.htseq_counts.Rdata")
}
# Step2 Grouping by special clinical information --------------------------
# Load the TCGA-BRCA phenotype table into `phenoData`, then list the columns
# that could serve as a grouping variable (2 to 4 distinct values).
# The cache test looks at the .Rdata file because that is what the else
# branch loads; testing the .gz instead makes load() fail whenever the
# archive exists but the .Rdata was never written.
if (!file.exists("./data/TCGA-BRCA.GDC_phenotype.Rdata")) {
  dir.create("./raw_data", showWarnings = FALSE, recursive = TRUE)
  dir.create("./data", showWarnings = FALSE, recursive = TRUE)

  gzfile <- "./raw_data/TCGA-BRCA.GDC_phenotype.tsv.gz"
  download.file(
    "https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-BRCA.GDC_phenotype.tsv.gz",
    destfile = gzfile,
    mode = "wb"
  )
  # read.table() reads the gzip-compressed file directly. The table is
  # tab-separated and quoting is disabled so quotes in text are kept as-is.
  phenoData <- read.table(
    gzfile,
    header = TRUE,
    sep = "\t",
    quote = ""
  )
  save(phenoData, file = "./data/TCGA-BRCA.GDC_phenotype.Rdata")
} else {
  load("./data/TCGA-BRCA.GDC_phenotype.Rdata")
}

# Indices of the columns whose number of distinct non-missing values is
# between 2 and 4 (table() ignores NA by default).
n_levels <- vapply(
  phenoData,
  function(column) length(table(column)),
  integer(1)
)
pheno_num <- unname(which(n_levels > 1 & n_levels < 5))

# View() needs an interactive session; skip it when run through Rscript.
if (interactive()) {
  View(phenoData[, pheno_num, drop = FALSE])
}
# Index the names directly so a single matching column is still reported.
names(phenoData)[pheno_num]
## Category 3: TP53
# Load the mutation table into `mutype_file`, then split the tumor samples
# of `raw_data` into those with and without a TP53 record.
# The cache test looks at the .Rdata file because that is what the else
# branch loads.
if (!file.exists("./data/TCGA-BRCA.mutect2_snv.Rdata")) {
  dir.create("./raw_data", showWarnings = FALSE, recursive = TRUE)
  dir.create("./data", showWarnings = FALSE, recursive = TRUE)

  gzfile <- "./raw_data/TCGA-BRCA.mutect2_snv.tsv.gz"
  download.file(
    "https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-BRCA.mutect2_snv.tsv.gz",
    destfile = gzfile,
    mode = "wb"
  )
  # Tab-separated table, read straight from the gzip file.
  mutype_file <- read.table(
    gzfile,
    header = TRUE,
    sep = "\t",
    quote = ""
  )
  save(mutype_file, file = "./data/TCGA-BRCA.mutect2_snv.Rdata")
} else {
  load("./data/TCGA-BRCA.mutect2_snv.Rdata")
}

## Pick rows whose gene is TP53 (either spelling); %in% never yields NA,
## so rows with a missing gene are simply dropped.
TP53 <- mutype_file[mutype_file$gene %in% c("tp53", "TP53"), ]
TP53_sample <- sort(unique(TP53$Sample_ID))

# Characters 14-15 of each column name are read as a numeric sample-type
# code; codes below 10 are treated as tumor samples.
# NOTE(review): assumes every column name is a TCGA barcode -- confirm.
sample_type <- as.integer(substr(colnames(raw_data), 14, 15))
tumor_sample <- colnames(raw_data)[!is.na(sample_type) & sample_type < 10]

TP53_sample <- intersect(tumor_sample, TP53_sample) # tumor samples with TP53
noTP53_sample <- setdiff(tumor_sample, TP53_sample) # the remaining tumors
save(TP53_sample, noTP53_sample, file = "./data/sample_by_TP53.Rdata")
# Step3 Filter samples ----------------------------------------------
# Reload the cached count table, keep only the grouped tumor samples
# (TP53 first, then non-TP53) and store the matching group labels.
load("./data/TCGA-BRCA.htseq_counts.Rdata")
tp53_sample <- c(TP53_sample, noTP53_sample)
AssayData <- raw_data[, tp53_sample]
dim(AssayData)
# One label per selected column, in the same order as `tp53_sample`.
group_list <- rep(
  c("TP53", "NO_TP53"),
  times = c(length(TP53_sample), length(noTP53_sample))
)
save(
  list = c("AssayData", "group_list"),
  file = "./data/tnbc_tumor_TP53_AssayData.Rdata"
)
【02】UCSCXenaTools包下载
来源:
UCSCXenaTools包下载,下载好之后直接是可操作形式,省去fread()函数读取和处理。
getwd()
[1] "D:/R_code/follow_practice/xuetu_GEO_follow/week_practise/01_follow_practise/03_TCGA-BRCA"
这里需要注意:UCSCXenaTools 下载的 TCGA-BRCA.mutect2_snv.tsv 这类文件可以直接使用;而 TCGA-BRCA.htseq_counts.tsv 我原以为不行(读出来的结果不一样)。【修正】其实它也可以直接用,只需把结果赋值给一个新变量并转换为 data.frame(见下方代码)。
需要注意,临床信息可能不一样。临床信息中TCGAbiolinks包下载的更佳。
# Copy the downloaded object as a plain data.frame; from here on it is
# used the same way as before.
a <- as.data.frame(raw_data)
也就是说,用前面那种下载解压的方法,读入之后可以直接保存成 .Rdata 文件;上面 fread() 读出来的结果和从 .Rdata 加载的结果是一样的。但后面需要进行的处理步骤不能少。
结合这个看,两者结合。后面有筛选的部分。
## Equivalent to downloading the file yourself from the official website.
# NOTE(review): rm(list = ls()) wipes the whole workspace of whoever runs
# this script; it is kept only to preserve the original behaviour.
rm(list = ls())
library("UCSCXenaTools")
raw_data <- XenaGenerate(subset = XenaCohorts == "GDC TCGA Breast Cancer (BRCA)") %>%
  XenaFilter(filterDatasets = "TCGA-BRCA.htseq_counts.tsv") %>%
  XenaQuery() %>%
  XenaDownload() %>%
  XenaPrepare() # load the downloaded data into R
head(raw_data)
## To search for specific datasets instead:
# stad_set4 <- XenaScan(pattern = 'stomach cancer ')
# stad_set5 <- stad_set4 %>%
# XenaGenerate()
## Hand the result back to the first (manual download) workflow.
# save() always writes R's binary format, so calling it with a ".tsv" file
# name produces a file no TSV reader can open. Write real tab-separated
# text for the .tsv and keep save() for the .Rdata copy.
write.table(
  raw_data,
  file = "TCGA-BRCA.htseq_counts.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
save(raw_data, file = "TCGA-BRCA.htseq_counts.Rdata")
# Download the TCGA-BRCA phenotype table through UCSCXenaTools and store
# it both as tab-separated text and as an .Rdata file.
library("UCSCXenaTools")
phenotype_file <- XenaGenerate(subset = XenaCohorts == "GDC TCGA Breast Cancer (BRCA)") %>%
  XenaFilter(filterDatasets = "TCGA-BRCA.GDC_phenotype.tsv") %>%
  XenaQuery() %>%
  XenaDownload() %>%
  XenaPrepare() # load the downloaded data into R
head(phenotype_file)
# save() always writes R's binary format, so calling it with a ".tsv" file
# name produces a file no TSV reader can open. Write real tab-separated
# text for the .tsv and keep save() for the .Rdata copy.
write.table(
  phenotype_file,
  file = "TCGA-BRCA.GDC_phenotype_file.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
save(phenotype_file, file = "TCGA-BRCA.GDC_phenotype_file.Rdata")