第一步:TCGA数据提取
2020-05-28 本文已影响0人
碌碌无为的杰少
下载数据
利用 TCGA 官网的 GDC 数据门户下载 RNA 和 clinical 数据。这是胃腺癌的代码,可以全程套用;本例共 340 例癌、33 例癌旁。
# Session setup for the TCGA download / tidy-up workflow.
# Keep character columns as plain strings (only matters on R < 4.0, where
# data.frame() still converted strings to factors by default).
options(stringsAsFactors = FALSE)
library(stringr)

# Label that is pasted into the name of the .Rdata file saved at the end.
# NOTE(review): the GDC project id for stomach adenocarcinoma is "TCGA-STAD";
# "TCGA-stam" looks like a typo, but it determines the output file name, so
# confirm with downstream scripts before changing it.
cancer_type <- "TCGA-stam"

# Create the two download target folders on the first run.
if (!dir.exists("clinical")) {
  dir.create("clinical")
}
if (!dir.exists("mrna")) {
  dir.create("mrna")
}
list.files()

# The next two commands are run in a terminal, not in R:
# ./gdc-client.exe download -m gdc_manifest.2020-05-19clinical.txt -d clinical
# ./gdc-client.exe download -m gdc_manifest.2020-05-27rna.txt -d mrna

# Sanity check: number of entries downloaded into each folder.
length(list.files("./clinical/"))
length(list.files("./mrna/"))
整理临床信息
library(XML)

# Exploratory snippet kept for reference: parse a single clinical XML by hand
# and inspect its structure before batch-processing every file.
#result <- xmlParse("./clinical/24e29974-c2e9-44a9-bd10-007b21c94d26/nationwidechildrens.org_clinical.TCGA-VQ-A91E.xml")
#rootnode <- xmlRoot(result)
#rootsize <- xmlSize(rootnode)
#print(rootnode[1])
#print(rootnode[2])
#xmldataframe <- xmlToDataFrame(rootnode[2])
#head(t(xmlToDataFrame(rootnode[2])))

# Collect every clinical XML below clinical/ (paths are returned relative to
# that folder). `pattern` is a regular expression, not a shell glob, so the
# extension is written as an escaped literal ".xml" anchored at the end of
# the file name.
xmls <- dir("clinical/", pattern = "\\.xml$", recursive = TRUE)
# Parse one clinical XML file into a transposed table.
#
# x: path of the XML file, relative to the "clinical/" folder.
# Returns the transpose of the data frame that xmlToDataFrame() builds from
# the second child of the document root.
# NOTE(review): presumably the second child is the patient record - confirm
# against the XML layout.
td <- function(x) {
  xml_path <- file.path("clinical/", x)
  root <- xmlRoot(xmlParse(xml_path))
  t(xmlToDataFrame(root[2]))
}
# Parse every clinical XML, bind the per-file results side by side, then flip
# the combined table back so that the parsed fields end up as columns.
cl <- lapply(xmls, td)
cl_df <- t(do.call(cbind, cl))
cl_df[1:3, 1:3]  # peek at the top-left corner

# Final clinical table as a data frame.
clinical <- data.frame(cl_df)
clinical[1:4, 1:4]
image.png
整理表达矩阵
options(stringsAsFactors = FALSE)

# Exploratory snippet kept for reference: read two count files by hand and
# check that they list the same, non-duplicated identifiers in the same order.
#x = read.table("mrna/01ebbb59-370a-439f-a2b2-3046055c43d4/14a248c3-0189-424d-8be1-8d8b39c705f0.htseq.counts.gz")
#x2 = read.table("mrna/0397a0ce-2224-4c97-a23b-9a3f767d7efc/bb3f1370-3234-4328-a9fb-8fa92da0d071.htseq.counts.gz")
#identical(x$V1,x2$V1)
#table(duplicated(x$V1))

# Collect every HTSeq count file below mrna/ (paths are returned relative to
# that folder). `pattern` is a regular expression, not a shell glob: the dots
# are escaped so they match literally and the suffix is anchored at the end.
count_files <- dir("mrna/", pattern = "\\.htseq\\.counts\\.gz$", recursive = TRUE)
# Read one tab-separated count file into a data frame.
#
# x: path of the file, relative to the "mrna/" folder.
# Returns a data frame whose row names come from the first column of the
# file; the remaining column(s) hold the values.
ex <- function(x) {
  counts_path <- file.path("mrna/", x)
  read.table(counts_path, row.names = 1, sep = "\t")
}
exp = lapply(count_files,ex)
exp <- do.call(cbind,exp)
dim(exp)
exp[1:4,1:4]
image.png
获得列名
# Rename the columns of `exp` using the identifiers stored in the GDC cart
# metadata, which links every downloaded file name to its associated entity.
meta <- jsonlite::fromJSON("metadata.cart.2020-05-27.json")
colnames(meta)
temp <- meta$associated_entities[[1]]
ids <- meta$associated_entities
class(ids)
ids[[1]][, 2]
class(ids[[1]][, 2])

# One identifier per file. The column is looked up by name when available,
# because its position inside `associated_entities` is not guaranteed;
# position 2 is kept as the fallback used originally. vapply() fails loudly
# if a file does not map to exactly one identifier, instead of silently
# returning a list the way sapply() would.
ID <- vapply(ids, function(x) {
  id_col <- if ("entity_submitter_id" %in% colnames(x)) {
    "entity_submitter_id"
  } else {
    2L
  }
  as.character(x[[id_col]])
}, character(1))

file2id <- data.frame(file_name = meta$file_name,
                      ID = ID)
head(file2id$file_name)
head(count_files)

# count_files holds "<folder>/<file name>"; the metadata is keyed by the bare
# file name.
count_files2 <- basename(count_files)
count_files2[1] %in% file2id$file_name

# Every count file must be described by the metadata and must correspond to
# one column of exp; otherwise match() would yield NA column names.
stopifnot(all(count_files2 %in% file2id$file_name))
stopifnot(length(count_files2) == ncol(exp))

# Reorder the lookup table to follow the column order of exp, then rename.
file2id <- file2id[match(count_files2, file2id$file_name), ]
colnames(exp) <- file2id$ID
exp[1:4, 1:4]
image.png
过滤
dim(exp)

# htseq-count appends summary counters (rows whose name starts with "__",
# e.g. "__no_feature", "__ambiguous") after the gene rows. They are not genes
# and carry large values, so they would pass the expression filter below
# unless removed first.
exp <- exp[!startsWith(rownames(exp), "__"), , drop = FALSE]

# Keep rows with a count above 1 in more than 30 samples. rowSums() on the
# logical comparison gives the same per-row tally as apply(), without the
# row-by-row function calls.
exp <- exp[rowSums(exp > 1) > 30, , drop = FALSE]
dim(exp)
exp[1:4, 1:4]
获得分组
# Characters 14-15 of each column name are read as a numeric sample-type code
# (presumably the sample-type field of the TCGA barcode): codes below 10 are
# labelled "tumor", everything else "normal".
table(str_sub(colnames(exp), 14, 15))
group_list <- factor(
  ifelse(as.numeric(str_sub(colnames(exp), 14, 15)) < 10, "tumor", "normal"),
  levels = c("normal", "tumor")
)
table(group_list)

# Persist everything later steps need in a single .Rdata file named after
# the cancer type.
save(exp, clinical, group_list, cancer_type,
     file = paste0(cancer_type, "gdc.Rdata"))
image.png