TCGA数据挖掘

第五步:TCGA mRNA和lncRNA区分

2020-06-02  本文已影响0人  碌碌无为的杰少

mRNA的提取

# Extract the mRNA rows from the expression matrix and relabel them with
# gene symbols.
options(stringsAsFactors = FALSE)
# NOTE(review): presumably anno.Rdata provides mRNA_anno / lnc_anno and
# TCGA-stamgdc.Rdata provides exp (and possibly cancer_type) -- confirm
# against the scripts that wrote these files.
load("anno.Rdata")
# Step 4: split the expression matrix and annotate it.
load("TCGA-stamgdc.Rdata")

# How many rows of the expression matrix are annotated as mRNA.
sum(rownames(exp) %in% mRNA_anno$gene_id)
# drop = FALSE keeps the two-dimensional shape even if a single row matches.
mRNA_exp <- exp[rownames(exp) %in% mRNA_anno$gene_id, , drop = FALSE]

# Annotation rows for the matched genes, in the row order of exp.
tmp <- data.frame(gene_id = rownames(exp))
x <- dplyr::inner_join(tmp, mRNA_anno, by = "gene_id")

# The annotation must line up row-for-row with mRNA_exp, because the mask
# below is computed on x but applied to mRNA_exp, and x$gene_name becomes
# the row names. Comparing tmp$gene_id with rownames(exp) is always TRUE
# and proves nothing; this check fails loudly when the annotation contains
# duplicated gene_id values (join fan-out) or the order differs.
stopifnot(identical(x$gene_id, rownames(mRNA_exp)))

# Row names must be unique, so when several Ensembl IDs share one gene
# symbol only the first occurrence is kept.
keep <- !duplicated(x$gene_name)
table(keep)
mRNA_exp <- mRNA_exp[keep, , drop = FALSE]
x <- x[keep, ]
rownames(mRNA_exp) <- x$gene_name

# Quick look at the top-left corner, then drop rows containing NA.
mRNA_exp[1:4, 1:4]
mRNA_exp <- na.omit(mRNA_exp)

lncRNA的提取

# Extract the lncRNA rows from the expression matrix and relabel them with
# gene symbols, then save both matrices.
# drop = FALSE keeps the two-dimensional shape even if a single row matches.
lnc_exp <- exp[rownames(exp) %in% lnc_anno$gene_id, , drop = FALSE]

# Annotation rows for the matched genes, in the row order of exp.
tmp <- data.frame(gene_id = rownames(exp))
x <- dplyr::inner_join(tmp, lnc_anno, by = "gene_id")

# The annotation must line up row-for-row with lnc_exp, because the mask
# below is computed on x but applied to lnc_exp, and x$gene_name becomes
# the row names. Comparing tmp$gene_id with rownames(exp) is always TRUE
# and proves nothing; this check fails loudly when the annotation contains
# duplicated gene_id values (join fan-out) or the order differs.
stopifnot(identical(x$gene_id, rownames(lnc_exp)))

# Row names must be unique, so when several Ensembl IDs share one gene
# symbol only the first occurrence is kept.
keep <- !duplicated(x$gene_name)
table(keep)
lnc_exp <- lnc_exp[keep, , drop = FALSE]
x <- x[keep, ]
rownames(lnc_exp) <- x$gene_name

# Quick look at the top-left corner, then drop rows containing NA.
lnc_exp[1:4, 1:4]
lnc_exp <- na.omit(lnc_exp)

# NOTE(review): cancer_type is not defined in the visible code; it is
# presumably already in the workspace (e.g. restored by an earlier load())
# -- confirm before running this section on its own.
save(lnc_exp, mRNA_exp, file = paste0(cancer_type, "deg_before.Rdata"))
上一篇下一篇

猜你喜欢

热点阅读