fasta文件更改header
2021-04-30 本文已影响0人
落寞的橙子
把它转化为data.frame后面问题就迎刃而解
gtf文件是为了提取chr, start , end 等信息,gtf文件读取需要安装rtracklayer包
rm(list = ls())
modify_fa<-function(fa_dir,gtf_dir,out_dir){
suppressMessages(library(Biostrings))
suppressMessages(library(tidyverse))
gtf <- rtracklayer::import(gtf_dir)
gtf_df=as.data.frame(gtf)
gtf_df$seq_name<-paste0(gtf_df$transcript_id,"_",gtf_df$gene_id)
gtf_df<- gtf_df %>% filter(type=="transcript")
gtf_df$new_strand<-as.factor(ifelse(gtf_df$strand=="+",1,-1))
gtf_df$new_name<-paste0(gtf_df$seq_name," ","transcript chromosome:GRCh38:",
gtf_df$seqnames,":",gtf_df$start,":",gtf_df$end,":",gtf_df$new_strand,
" ","gene:",gtf_df$gene_id)
gtf_df<-gtf_df %>% select(seq_name,new_name)
fa<-readDNAStringSet(fa_dir)
seq_name = names(fa)
sequence = paste(fa)
df <- data.frame(seq_name, sequence)
rt<-df %>% left_join(gtf_df,by="seq_name") %>% select(new_name,sequence)
rt$new_name<-paste0(">",rt$new_name)
D <- do.call(rbind, lapply(seq(nrow(rt)), function(i) t(rt[i, ])))
write.table(D,out_dir, row.names = FALSE, col.names = FALSE, quote = FALSE)
}
human_fa_dir="all.human.isoforms.fa"
human_gtf_dir="all.human.isoforms.gtf"
human_out_dir="all.human.isoforms.new.fa"
modify_fa(fa_dir=human_fa_dir,gtf_dir=human_gtf_dir,out_dir = human_out_dir)