TCGA数据挖掘

第一步:TCGA数据提取

2020-05-28  本文已影响0人  碌碌无为的杰少

下载数据

利用 TCGA 官网的 GDC 下载 RNA 表达数据和 clinical(临床)数据。下面是胃腺癌的代码,其他癌种可以全程套用;本例共 340 例癌组织、33 例癌旁组织。

# Session setup: keep strings as character vectors and load stringr, which
# the later steps use for substring extraction.
options(stringsAsFactors = FALSE)
library(stringr)

# Label that is pasted into the name of the .Rdata file saved at the end.
# NOTE(review): the GDC project id for stomach adenocarcinoma is normally
# written "TCGA-STAD"; "TCGA-stam" is kept as-is because it determines the
# output file name -- confirm whether it is intentional.
cancer_type <- "TCGA-stam"

# Create the two download folders on first run only.
if (!dir.exists("clinical")) {
  dir.create("clinical")
}
if (!dir.exists("mrna")) {
  dir.create("mrna")
}
list.files()

# Run the next two commands in a terminal (not in R) to download the data:
#./gdc-client.exe download -m gdc_manifest.2020-05-19clinical.txt -d clinical
#./gdc-client.exe download -m gdc_manifest.2020-05-27rna.txt -d mrna

# Number of entries downloaded into each folder.
length(list.files("./clinical/"))
length(list.files("./mrna/"))

整理临床信息

# Load the XML package, which provides xmlParse(), xmlRoot() and
# xmlToDataFrame() used below.
library(XML)

# Exploratory code kept for reference: inspect a single clinical XML by hand.
#result <- xmlParse("./clinical/24e29974-c2e9-44a9-bd10-007b21c94d26/nationwidechildrens.org_clinical.TCGA-VQ-A91E.xml")
#rootnode <- xmlRoot(result)
#rootsize <- xmlSize(rootnode)
#print(rootnode[1])
#print(rootnode[2])
#xmldataframe <- xmlToDataFrame(rootnode[2])
#head(t(xmlToDataFrame(rootnode[2])))

# List every XML file below clinical/ (paths are relative to that folder).
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched with an escaped dot and an end anchor instead of "*.xml$".
xmls <- dir("clinical/", pattern = "\\.xml$", recursive = TRUE)

# Parse one clinical XML file and return its contents transposed.
#
# x: path of the XML file, relative to the "clinical/" folder.
# Returns: the transpose of the data frame that xmlToDataFrame() builds from
#   the second child of the XML root node (presumably the patient section;
#   the first child is skipped -- confirm against the file layout).
td <- function(x) {
  xml_doc <- xmlParse(file.path("clinical/", x))
  second_child <- xmlRoot(xml_doc)[2]
  t(xmlToDataFrame(second_child))
}

# Parse every clinical file; each element of `cl` is one transposed record.
cl <- lapply(xmls, td)

# Bind the records side by side, then transpose so that each row comes from
# one XML file and each column is one clinical field.
cl_df <- t(do.call(cbind, cl))
cl_df[1:3, 1:3]

# Convert the matrix into a data frame for downstream use.
clinical <- data.frame(cl_df)
clinical[1:4, 1:4]
image.png

整理表达矩阵

options(stringsAsFactors = FALSE)

# Exploratory code kept for reference: check that two count files list the
# same, non-duplicated identifiers in their first column.
#x = read.table("mrna/01ebbb59-370a-439f-a2b2-3046055c43d4/14a248c3-0189-424d-8be1-8d8b39c705f0.htseq.counts.gz")
#x2 = read.table("mrna/0397a0ce-2224-4c97-a23b-9a3f767d7efc/bb3f1370-3234-4328-a9fb-8fa92da0d071.htseq.counts.gz")
#identical(x$V1,x2$V1)
#table(duplicated(x$V1))

# List every HTSeq count file below mrna/ (paths are relative to that folder).
# `pattern` is a regular expression, not a shell glob: the dots are escaped
# and the leading "*" is dropped so that only names ending in
# ".htseq.counts.gz" match.
count_files <- dir(
  "mrna/",
  pattern = "\\.htseq\\.counts\\.gz$",
  recursive = TRUE
)

# Read one tab-separated count file from the "mrna/" folder.
#
# x: path of the file, relative to "mrna/".
# Returns: a data frame whose row names are taken from the first column of
#   the file; the remaining column(s) hold the values.
ex <- function(x) {
  read.table(file.path("mrna/", x), row.names = 1, sep = "\t")
}
# Read every count file and bind the per-file columns into one table:
# one row per identifier, one column per count file.
# Note that `exp` shadows base::exp() for the rest of the script.
exp <- do.call(cbind, lapply(count_files, ex))
dim(exp)
exp[1:4, 1:4]
image.png

获得列名

# Map every downloaded count file to an identifier taken from the GDC
# metadata JSON, then use those identifiers as the column names of `exp`.
meta <- jsonlite::fromJSON("metadata.cart.2020-05-27.json")
colnames(meta)

# Look at the structure of the associated_entities entries.
temp <- meta$associated_entities[[1]]
ids <- meta$associated_entities
class(ids)
ids[[1]][, 2]
class(ids[[1]][, 2])

# One identifier per file, taken from the second column of each entry.
# vapply() fails loudly if an entry does not hold exactly one character
# value, whereas sapply() would silently return a list in that case.
# NOTE(review): column 2 is presumably the sample barcode
# (entity_submitter_id) -- confirm with colnames(temp).
ID <- vapply(ids, function(x) x[, 2], character(1))
file2id <- data.frame(file_name = meta$file_name, ID = ID)
head(file2id$file_name)
head(count_files)

# count_files holds "<folder>/<file>" paths, while the metadata lists bare
# file names; basename() keeps the last path component whatever the depth.
count_files2 <- basename(count_files)
count_files2[1] %in% file2id$file_name

# Reorder the lookup table to follow the column order of `exp`. Stop if any
# file is missing from the metadata instead of silently producing NA names.
row_idx <- match(count_files2, file2id$file_name)
if (anyNA(row_idx)) {
  stop(
    "No metadata entry for count file(s): ",
    paste(head(count_files2[is.na(row_idx)]), collapse = ", "),
    call. = FALSE
  )
}
file2id <- file2id[row_idx, ]
colnames(exp) <- file2id$ID
exp[1:4, 1:4]
image.png

过滤

# Filter the count table.
dim(exp)

# htseq-count appends summary counters (e.g. "__no_feature", "__ambiguous")
# whose names start with "__". They are not genes and would otherwise pass
# the expression filter below, so drop them first.
exp <- exp[!startsWith(rownames(exp), "__"), , drop = FALSE]

# Keep rows with a count above 1 in more than 30 samples. rowSums() on the
# logical comparison gives the same per-row tally as apply() without
# coercing the data frame row by row.
exp <- exp[rowSums(exp > 1) > 30, , drop = FALSE]
dim(exp)
exp[1:4, 1:4]

获得分组

# Characters 14-15 of each column name hold a two-digit code; tabulate them.
table(str_sub(colnames(exp), 14, 15))

# Codes below 10 are labelled "tumor", all others "normal". "normal" is the
# first factor level, i.e. the reference group.
group_list <- factor(
  ifelse(as.numeric(str_sub(colnames(exp), 14, 15)) < 10, "tumor", "normal"),
  levels = c("normal", "tumor")
)
table(group_list)

# Store the objects needed by later steps in "<cancer_type>gdc.Rdata".
save(
  exp, clinical, group_list, cancer_type,
  file = paste0(cancer_type, "gdc.Rdata")
)
image.png
上一篇下一篇

猜你喜欢

热点阅读