TCGA / TCGA数据挖掘

第三步:TCGA miRNA数据提取

2020-06-02  本文已影响0人  碌碌无为的杰少

创建文件夹

# Session-wide settings and the label used later when saving the results.
options(stringsAsFactors = FALSE)
library(stringr)
cancer_type <- "TCGA-stam"

# Create the two download folders if they are not there yet.
invisible(lapply(c("clinical", "miRNA"), function(folder) {
  if (!dir.exists(folder)) dir.create(folder)
}))
list.files()

# Run the next two commands in a terminal (not in R) to download the data:
#./gdc-client.exe download -m gdc_manifest.2020-05-19clinical.txt -d clinical
#./gdc-client.exe download -m gdc_manifest.2020-06-01.txt -d miRNA

# Number of entries found in each download folder.
length(list.files("./clinical/"))
length(list.files("./miRNA/"))

建立临床信息

# Inspect a single clinical XML file to see its layout before processing
# all of them.
library(XML)
result <- xmlParse("./clinical/015a247e-0ff0-4261-84a2-59dc75184386/nationwidechildrens.org_clinical.TCGA-VQ-A8PS.xml")
rootnode <- xmlRoot(result)
rootsize <- xmlSize(rootnode)
print(rootnode[1])
print(rootnode[2])
# Convert the second child of the root to a data frame and preview it
# transposed (fields as rows).
xmldataframe <- xmlToDataFrame(rootnode[2])
head(t(xmldataframe))

# List every clinical XML file below "clinical/". `pattern` is a regular
# expression, not a shell glob: the dot is escaped and the leading "*" is
# dropped so that only names really ending in ".xml" are matched.
xmls <- dir("clinical/", pattern = "\\.xml$", recursive = TRUE)

# Parse one clinical XML file and return its second root child as a
# transposed table.
#
# x: path of the XML file relative to the "clinical/" folder.
# Returns the transpose of the data frame built by xmlToDataFrame().
td <- function(x) {
  doc <- xmlParse(file.path("clinical/", x))
  second_child <- xmlRoot(doc)[2]
  t(xmlToDataFrame(second_child))
}

# Parse every clinical XML file with td(), bind the results side by side,
# then transpose so each parsed file contributes rows instead of columns.
cl <- lapply(xmls, td)
cl_df <- do.call(cbind, cl)
cl_df <- t(cl_df)
cl_df[1:3, 1:3]

# Data-frame version of the combined clinical table.
clinical <- data.frame(cl_df)
clinical[1:4, 1:4]

建立表达矩阵

options(stringsAsFactors = FALSE)

# Read two quantification files without a header and check that their first
# columns are identical and contain no duplicated entries.
x <- read.table("miRNA/00a47351-5052-4cb1-a38b-00da7d37f5a2/2790a83c-c46b-4363-9180-4d0997d004ba.mirbase21.mirnas.quantification.txt")
x2 <- read.table("miRNA/05a9d2d2-e288-4dc8-97c6-c5e6b08dc6dc/c65a4303-17b1-49fb-82f2-7161baf895b9.mirbase21.mirnas.quantification.txt")
identical(x$V1, x2$V1)
table(duplicated(x$V1))

# List every miRNA quantification file below "miRNA/". `pattern` is a regular
# expression, not a shell glob, so the dots are escaped and the leading "*"
# is dropped.
count_files <- dir(
  "miRNA/",
  pattern = "\\.mirnas\\.quantification\\.txt$",
  recursive = TRUE
)

# Read one tab-separated quantification file and keep its first data column.
#
# x: path of the file relative to the "miRNA/" folder.
# Returns a one-column data frame; row names come from the file's first
# column and the header line supplies the column name.
ex <- function(x) {
  quant_path <- file.path("miRNA/", x)
  quant <- read.table(quant_path, header = TRUE, sep = "\t", row.names = 1)
  quant[1]
}
# Preview the result of ex() on a single file.
dd1 <- head(ex("0099ddb3-a514-476c-88e1-bf790c067223/a9c5301f-c110-4579-afd2-e3a8052b68c8.mirbase21.mirnas.quantification.txt"))

# Read the first data column of every quantification file and bind them
# side by side, one column per file.
exp <- lapply(count_files, ex)
exp <- do.call(cbind, exp)
dim(exp)
exp[1:4, 1:4]

# Load the GDC metadata export and inspect its associated_entities field.
meta <- jsonlite::fromJSON("metadata.cart.2020-06-01.json")
colnames(meta)
temp <- meta$associated_entities[[1]]
ids <- meta$associated_entities
class(ids)
ids[[1]][, 2]
class(ids[[1]][, 2])

# One ID per file. The column is taken by name when present because column
# order inside associated_entities is not guaranteed; otherwise fall back to
# the second column as before.
# NOTE(review): assumes column 2 of this export is entity_submitter_id --
# confirm with colnames(temp).
# vapply() fails loudly if a file does not have exactly one ID, instead of
# silently returning a list the way sapply() would.
ID <- vapply(ids, function(entity) {
  if ("entity_submitter_id" %in% colnames(entity)) {
    entity[["entity_submitter_id"]]
  } else {
    entity[, 2]
  }
}, character(1))
file2id <- data.frame(file_name = meta$file_name,
                      ID = ID)
head(file2id$file_name)
head(count_files)

# Strip the folder part so the names can be matched against the metadata.
count_files2 <- basename(count_files)

# Every file must be described in the metadata; otherwise match() yields NA
# and the expression matrix would silently get NA column names.
stopifnot(all(count_files2 %in% file2id$file_name))
file2id <- file2id[match(count_files2, file2id$file_name), ]
colnames(exp) <- file2id$ID
exp[1:4, 1:4]

过滤

# Keep only the rows whose value is greater than 1 in more than 100 columns.
dim(exp)
exp <- exp[rowSums(exp > 1) > 100, ]
dim(exp)
exp[1:4, 1:4]

分组信息

# Tabulate characters 14-15 of each column name, then label a column "tumor"
# when that two-digit code is below 10 and "normal" otherwise.
table(str_sub(colnames(exp), 14, 15))
group_list <- factor(
  ifelse(as.numeric(str_sub(colnames(exp), 14, 15)) < 10, 'tumor', 'normal'),
  levels = c("normal", "tumor")
)
table(group_list)

# Store the expression matrix, clinical table, grouping and label together.
save(exp, clinical, group_list, cancer_type,
     file = paste0(cancer_type, "gdc.Rdata"))
上一篇 | 下一篇

猜你喜欢

热点阅读