ID转换 从gtf做 还有基因长度的提取
使用R包的时候,经常会很多匹配不到
从做数据的gtf来构建ID转换列表
下载相应的gtf文件后读入R
library(stringr)
library(readr)
gtf_in <- read_delim("Mus_musculus.GRCm38.102.gtf",
"\t", escape_double = FALSE, col_names = FALSE,
comment = "#", trim_ws = TRUE)
gtf_in <- gtf_in[gtf_in$X3=="transcript",]#第三列feature是gene的保留下来
gtf_in <- gtf_in[,-c(2,6,7,8)]
gene_info <- data.frame(str_split_fixed(unique(gtf_in$X9),";",8))
gene_info2 <- unique(data.frame("gene_name"=str_split_fixed(gene_info$X5,'"',3)[,2],
"transcript_id"=str_split_fixed(gene_info$X3,'"',3)[,2],
"gene_biotype"=str_split_fixed(gene_info$X7,'"',3)[,2]))
write.csv(gene_info2,"Mus_musculus.GRCm38.102.gene.csv")
在服务器上
awk '{if ($3=="exon") print $0}' genomic.gtf > /exon.gtf
sed -i 's/"//g' ./length/exon.gtf
https://www.jianshu.com/p/d53416962fa8