IRscope代码拆解一

2020-05-30  本文已影响0人  小明的数据分析笔记本

IRScope 是用来可视化叶绿体基因组边界扩张收缩的一个shiny应用。正好自己最近在学习R语言里shiny相关的知识,就准备看看它的代码是怎么写的。目前看懂了一小部分,记录在这里。

他开头定义了很多函数

1、读入genbank文件的函数

# Read a GenBank flat file from disk.
#
# file: path (or connection) accepted by readLines().
# Returns a character vector holding one element per line of the file.
read.gb <- function(file) {
  gb.lines <- readLines(file)
  gb.lines
}

readLines()函数读入文本文件,返回一个字符向量,文件中的每一行是向量中的一个元素。

2、提取读入的genbank文件的fasta序列

这个代码稍微有点长,逻辑还有点没看懂

# Extract the nucleotide sequence from the ORIGIN section of a GenBank file.
#
# gb: character vector of GenBank lines, as returned by read.gb().
# Returns a character vector with one element per base, in file order and in
# the case used by the file.
#
# The sequence is taken from the line after ORIGIN up to the "//" terminator
# (or the end of the input when no terminator is present). Position numbers
# and whitespace are removed; every other character is kept, so blocks made
# only of ambiguity codes (e.g. "nnnnnnnnnn") and gap characters survive.
# A warning is raised when a sequence line other than the last one does not
# contain six blocks.
FasExtract <- function(gb) {
  # Prefer a line that starts with ORIGIN; fall back to any line mentioning it.
  origin <- grep("^ORIGIN", gb)
  if (length(origin) == 0) {
    origin <- grep("ORIGIN", gb)
  }
  if (length(origin) == 0) {
    stop("Check the gb file; no ORIGIN line was found")
  }

  # Everything after the ORIGIN line, cut at the first record terminator.
  seq.lines <- gb[-seq_len(origin[1])]
  terminator <- which(grepl("^[[:space:]]*//[[:space:]]*$", seq.lines))
  if (length(terminator) > 0) {
    seq.lines <- seq.lines[seq_len(terminator[1] - 1)]
  }
  seq.lines <- seq.lines[!grepl("^[[:space:]]*$", seq.lines)]

  # Sanity check on the layout: only the final line may be short.
  stripped <- sub("^[[:space:]]*[[:digit:]]*[[:space:]]*", "", seq.lines)
  n.blocks <- vapply(strsplit(stripped, "[[:space:]]+"), length, integer(1))
  short <- which(n.blocks != 6)
  short <- short[short != length(seq.lines)]
  if (length(short) > 0) {
    warning("Check the gb file; the columns of ORIGIN should be 6")
  }

  # Drop position numbers and spacing, then split into single bases.
  fas <- paste(gsub("[[:digit:][:space:]]", "", seq.lines), collapse = "")
  strsplit(fas, "")[[1]]
}

3、计算fasta序列GC含量

# Fraction of G and C bases in a sequence.
#
# fas: character vector with one base per element (as produced by
#      FasExtract()).
# Returns (number of g + number of c) / length(fas). Matching is
# case-insensitive, so upper-case sequences are counted as well. An empty
# input yields NaN (0 / 0).
GC.content <- function(fas) {
  bases <- tolower(fas)
  sum(bases == "g" | bases == "c") / length(bases)
}

4、提取genbank文件中的物种拉丁名

# Build a two-word name from the second line of a GenBank file.
#
# gb: character vector of GenBank lines.
# Returns the third and fourth space-separated fields of gb[2], joined by a
# single space.
# NOTE(review): presumably gb[2] is the DEFINITION line, where the double
# space after the keyword makes the genus and species fields 3 and 4 -
# confirm against the files this is used on.
sp.name <- function(gb) {
  fields <- strsplit(gb[2], " ")[[1]]
  paste(fields[3], fields[4], sep = " ")
}
这四个函数连在一起用就是
> gbfile<-read.gb("sequence.gb")
> fas<-FasExtract(gbfile)
> fas
  [1] "a" "t" "g" "a" "t" "t" "g" "a" "a" "g" "t" "t" "t" "t" "t" "c" "t" "a"
 [19] "t" "t" "t" "g" "g" "a" "a" "t" "t" "g" "t" "c" "t" "t" "a" "g" "g" "t"
 [37] "c" "t" "a" "a" "t" "t" "c" "c" "t" "a" "t" "t" "a" "c" "t" "t" "t" "a"
 [55] "g" "c" "t" "g" "g" "a" "t" "t" "a" "t" "t" "t" "g" "t" "a" "a" "c" "t"
 [73] "g" "c" "a" "t" "a" "t" "t" "t" "a" "c" "a" "a" "t" "a" "c" "a" "g" "a"
 [91] "c" "g" "t" "g" "g" "t" "g" "a" "t" "c" "a" "g" "t" "t" "g" "g" "a" "c"
[109] "t" "t" "t" "t" "g" "a"
> GC.content(fas)
[1] 0.3157895
> sp.name(gbfile)
[1] "Punica granatum"

5、替换fasta序列中的非ATCG的字符

# Replace ambiguity codes in the sequence of a GenBank file with randomly
# drawn concrete bases.
#
# gb: character vector of GenBank lines; the sequence is obtained through
#     FasExtract(gb).
# Returns the sequence as a character vector of single bases in which every
# listed code has been replaced by a base sampled (with replacement) from the
# set given for it below. Results depend on the state of the random number
# generator.
rdnFixer <- function(gb) {
  bases <- FasExtract(gb)

  # Code -> candidate bases. The order is kept as-is so that the sequence of
  # sample() calls is unchanged.
  replacements <- list(
    "u" = c("t"),
    "r" = c("a", "g"),
    "y" = c("c", "t"),
    "s" = c("c", "g"),
    "w" = c("a", "t"),
    "k" = c("g", "t"),
    "m" = c("c", "a"),
    "b" = c("c", "g", "t"),
    "d" = c("a", "g", "t"),
    "h" = c("c", "a", "t"),
    "v" = c("c", "a", "g"),
    "n" = c("c", "g", "t", "a"),
    "-" = c("c", "g", "t", "a")
  )

  for (code in names(replacements)) {
    hits <- which(bases == code)
    bases[hits] <- sample(replacements[[code]], length(hits), TRUE)
  }
  return(bases)
}

5、鉴定叶绿体基因组的四个区域边界信息
这个代码就更长了,代码看起来就更吃力了。如何鉴定四个区域的边界位置逻辑有点看不懂呀!

# Locate the two inverted-repeat (IR) copies on a genome given as a vector of
# single lower-case bases.
#
# genome:   character vector, one base per element.
# parallel: when TRUE (default) the phase search is done by p.d(), which uses
#           the sf* functions (sfInit, sfExport, sfLapply, sfStop); otherwise
#           the serial phase.detector() is used.
# Returns c(IR1 start, IR2 start, IR length, length(genome)).
#
# Side effects: several helpers and intermediate values are written to the
# global environment with `<<-` (cirtick, genome.comp.rev, p.d, fun, genome,
# cir.genome, a, ini.forw, ini.back, mm).
IRinfo<- function(genome, parallel=TRUE){
  
  #' IR information
  #' 
  #' Detecting the starts and the length of the IR regions on the chloroplast genome
  #' 
  #' @param genome The plastid genome of a species as a simple vector of nucleotides 
  #' @return a vector of four elements as the start of the first and second IR region, their length and the total length of the genome, respectively
  #' @export 
  
  ###Preliminary functions 
  
  # Rotate `vector` to the left by `tick` positions: elements tick+1..end come
  # first, followed by elements 1..tick. A tick outside 1..(length-1) returns
  # the vector unchanged. Assigned globally.
  cirtick<<- function(tick, vector){#circular rotative function
    if(tick > length(vector)-1 || tick < 1){
      return(vector)
    }
    else {
      return(c(vector[(tick+1):length(vector)], vector[1:tick]))
    }
  }
  
  # Reverse the sequence and complement a/t/g/c. The substitutions go through
  # upper-case letters so that a base that was already replaced is not
  # replaced again; the result is lower-cased at the end. Assigned globally.
  genome.comp.rev<<- function(genome){#reverse complement function
    gcr<-genome[length(genome):1]
    gcr<-gsub("a", "T", gcr)
    gcr<-gsub("t", "A", gcr)
    gcr<-gsub("g", "C", gcr)
    gcr<-gsub("c", "G", gcr)
    return(tolower(gcr))
  }
  
  #Checked
  # Serial search for the rotation at which the genome best matches its own
  # reverse complement. The genome is first rotated by `shifter`; then, for
  # i = 1..l, track[i] counts the positions where the genome rotated by i
  # equals the reverse complement. track[l+1] holds round(l/4) as a baseline,
  # and the scan stops as soon as a count exceeds the baseline by more than
  # 10% of l. The returned value is the index of the maximum of `track` plus
  # `shifter`, reduced by l when it exceeds l.
  # NOTE(review): which(track==max(track)) can return more than one index, in
  # which case `if (a > l)` receives a vector - confirm this cannot happen.
  phase.detector<- function(genome){#detecting the phase difference of the two inverted genomes
    shifter=84000 #this shifter is faster mode, to be sure set the value to 80000,
    gcr<- genome.comp.rev(genome)
    genome<-cirtick(shifter, genome) 
    l<-length(genome)
    track<- numeric(l+1)
    track[l+1]<- round(l/4)
    for (i in 1:l){
      track[i]<- sum(cirtick(i, genome)==gcr)
      if ((track[i] - track[l+1])/l > 0.1) {###stable version with 0.07 but changing it for the Guizotia abyssinica, for the parallel p.d function as well
        break
      }
    }
    # `a` is written to the global environment.
    a<<- which(track==max(track))+shifter
    if ( a > l){
      return(a - l)
    }
    else {
      return(a)
    }
  }
  #the parallel version of the phase.detector function. Set with default 4
  # NOTE(review): the comment above says 4 but the default below is nCPU=2.
  # Runs the same search as phase.detector() from several starting shifters at
  # once. Starting shifters begin at 84000 and move outwards in steps of 1000
  # in both directions, nCPU/2 per direction per round, until at least one
  # worker returns a non-NA value. Returns the unique non-NA results.
  p.d<<- function(genome, nCPU=2){#tested in the GlobEnv, it might fail when embedded in the bigger function
    # Copied to the global environment so it can be exported by name below.
    genome<<-genome
    # Worker: same scan as phase.detector() for one starting shifter, but it
    # gives up and returns NA once the elapsed time exceeds the limit below.
    fun<<- function(shifter){#the function to be passed to the slaves for the parallel computing, the out put is with max 10 second either NA or the phase.detector
      gcr<- genome.comp.rev(genome)
      cir.genome<<-cirtick(shifter, genome)
      l<-length(genome)
      track<- numeric(l+1)
      track[l+1]<- round(l/4)
      s.time<- Sys.time()
      no.value<- FALSE
      for (i in 1:l){
        track[i]<- sum(cirtick(i, cir.genome)==gcr)
        i.time<- Sys.time()
        if ((track[i] - track[l+1])/l > 0.1) {
          break
        }
        # NOTE(review): the difference of two Sys.time() values picks its own
        # units, so 11 is presumably seconds here - confirm.
        if (i.time - s.time > 11){
          no.value<- TRUE
          break
        }
      }
      if (no.value) {
        a<<- NA
        return(a)
      }
      else {
        a<<- which(track==max(track))+shifter
        if ( a > l){
          return(a - l)
        }
        else {
          return(a)
        } 
      }
    }
    # Search window state, kept in the global environment.
    ini.forw<<- 84000
    ini.back<<- ini.forw
    mm<<- rep(NA, nCPU)
    # Stop any cluster that is already running, then start a fresh one.
    sfStop()
    sfInit(parallel=TRUE, cpus=nCPU)
    # Repeat with shifters further from the start until some worker succeeds.
    while(sum(is.na(mm))==length(mm)){
      sfExport("genome", "cirtick", "genome.comp.rev", "fun", "mm", "ini.forw", "ini.back")
      mm<-unlist(sfLapply(c(seq(ini.forw, ini.forw+nCPU/2*1000-1, 1000), seq(ini.back, ini.back- nCPU/2*1000, -1000)[-1]),  fun))
      ini.forw<<- ini.forw+nCPU/2*1000
      ini.back<<-ini.back-nCPU/2*1000
    }
    sfStop()
    return(unique(mm)[which((is.na(unique(mm))==FALSE))])
  }
  
  #Checked
  # Compare the genome rotated by `phase.difference` with its reverse
  # complement and return the first index i at which round(length/100)
  # consecutive positions all match. The `break` after `return` is never
  # reached.
  # NOTE(review): windows that run past the end of the vector contain NA, and
  # nothing is returned explicitly when no window matches - confirm callers
  # never hit either case.
  True.sequence.finder<- function(genome, phase.difference){#finding the coordinate of the IR region
    #phase.difference<-phase.detector(genome)
    true.search<-cirtick(phase.difference, genome)==genome.comp.rev(genome)
    true.arm<- round(length(genome)/100)
    for (i in 1:length(genome)){
      if (sum(true.search[i:(true.arm+i-1)])==true.arm) {
        return(i); break
      }
    }
  }
  
  #Checked
  # Start of the first IR: the phase difference plus the offset returned by
  # True.sequence.finder().
  IR1start<-function(phase.difference,True.sequence.finder){
    return(phase.difference+True.sequence.finder)
  }
  
  #Checked
  # Length of the IR: walks along the match vector between the genome rotated
  # by IR1start and the reverse complement rotated by True.sequence.finder,
  # and returns the position `count` reached after a fixed series of
  # extension steps.
  #   T          match vector for the base alignment (note: this local name
  #              masks the built-in shorthand T for TRUE inside this function)
  #   Tl[[i]]    i = 1..50: match vector with the genome rotated i further
  #   Tl[[i+50]] i = 1..50: match vector with the reverse complement rotated
  #              i further
  # Two kinds of step are repeated below in a hard-coded sequence:
  #   * "jump from INDEL": for each extra offset i, if ten consecutive
  #     positions match in the shifted alignment (starting at count, or at
  #     count+1 .. count+5 in the longer variants), count is advanced in
  #     steps of i while the shifted alignment matches;
  #   * mismatch skip: if T matches on all of count+10 .. count+25 (or
  #     count+10 .. count+75 in the later steps), count jumps ahead by 10 and
  #     keeps advancing while T matches.
  IR.length<- function(IR1start, True.sequence.finder, genome){
    s<-IR1start
    t<-True.sequence.finder
    r<- genome.comp.rev(genome)
    T<-cirtick(s, genome)==cirtick(t, r)
    Tl<- list()
    for (i in 1:50){
      Tl[[i]]<- cirtick((s + i), genome)==cirtick(t, r)
    }
    for (i in 1:50){
      Tl[[i+50]]<- cirtick(s , genome)==cirtick((t+i), r)
    }
    # Advance to the first mismatch of the base alignment.
    count<-1
    while(T[count]){
      count<- count+1
    }
    ###jump from INDEL
    for (i in 1:50){
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
      if(sum(Tl[[i]][(count+1):(count+10)])==10){
        while (Tl[[i]][(count+1)]){
          count<- ((count+1) + i)
        }
      }
      if(sum(Tl[[i]][(count+2):(count+11)])==10){
        while (Tl[[i]][(count+2)]){
          count<- ((count+2) + i)
        }
      }
      if(sum(Tl[[i]][(count+3):(count+12)])==10){
        while (Tl[[i]][(count+3)]){
          count<- ((count+3) + i)
        }
      }
      if(sum(Tl[[i]][(count+4):(count+13)])==10){
        while (Tl[[i]][(count+4)]){
          count<- ((count+4) + i)
        }
      }
      if(sum(Tl[[i]][(count+5):(count+14)])==10){
        while (Tl[[i]][(count+5)]){
          count<- ((count+5) + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
      if(sum(Tl[[i+50]][(count+1):(count+10)])==10){
        while (Tl[[i+50]][(count+1)]){
          count<- ((count+1) + i)
        }
      }
      if(sum(Tl[[i+50]][(count+2):(count+11)])==10){
        while (Tl[[i+50]][(count+2)]){
          count<- ((count+2) + i)
        }
      }
      if(sum(Tl[[i+50]][(count+3):(count+12)])==10){
        while (Tl[[i+50]][(count+3)]){
          count<- ((count+3) + i)
        }
      }
      # NOTE(review): in the next two branches the condition tests Tl[[i+50]]
      # but the loop tests Tl[[i]], unlike the four branches above - looks
      # like a copy/paste slip, confirm.
      if(sum(Tl[[i+50]][(count+4):(count+13)])==10){
        while (Tl[[i]][(count+4)]){
          count<- ((count+4) + i)
        }
      }
      if(sum(Tl[[i+50]][(count+5):(count+14)])==10){
        while (Tl[[i]][(count+5)]){
          count<- ((count+5) + i)
        }
      }
    }
    ##end of jump indel
    # Mismatch skip (16-position window), applied twice.
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    ###jump from INDEL
    for (i in 1:50){
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
      if(sum(Tl[[i]][(count+1):(count+10)])==10){
        while (Tl[[i]][(count+1)]){
          count<- ((count+1) + i)
        }
      }
      if(sum(Tl[[i]][(count+2):(count+11)])==10){
        while (Tl[[i]][(count+2)]){
          count<- ((count+2) + i)
        }
      }
      if(sum(Tl[[i]][(count+3):(count+12)])==10){
        while (Tl[[i]][(count+3)]){
          count<- ((count+3) + i)
        }
      }
      if(sum(Tl[[i]][(count+4):(count+13)])==10){
        while (Tl[[i]][(count+4)]){
          count<- ((count+4) + i)
        }
      }
      if(sum(Tl[[i]][(count+5):(count+14)])==10){
        while (Tl[[i]][(count+5)]){
          count<- ((count+5) + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
      if(sum(Tl[[i+50]][(count+1):(count+10)])==10){
        while (Tl[[i+50]][(count+1)]){
          count<- ((count+1) + i)
        }
      }
      if(sum(Tl[[i+50]][(count+2):(count+11)])==10){
        while (Tl[[i+50]][(count+2)]){
          count<- ((count+2) + i)
        }
      }
      if(sum(Tl[[i+50]][(count+3):(count+12)])==10){
        while (Tl[[i+50]][(count+3)]){
          count<- ((count+3) + i)
        }
      }
      # NOTE(review): same Tl[[i+50]] / Tl[[i]] mismatch as in the first pass
      # - confirm.
      if(sum(Tl[[i+50]][(count+4):(count+13)])==10){
        while (Tl[[i]][(count+4)]){
          count<- ((count+4) + i)
        }
      }
      if(sum(Tl[[i+50]][(count+5):(count+14)])==10){
        while (Tl[[i]][(count+5)]){
          count<- ((count+5) + i)
        }
      }
    }
    ##end of jump indel
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    # From here on the INDEL jump only tests the window starting at count.
    for (i in 1:50){###jump from INDEL
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
    }##end of jump indel
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    for (i in 1:50){###jump from INDEL
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
    }##end of jump indel
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    for (i in 1:50){###jump from INDEL
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
    }##end of jump indel
    # Ten consecutive mismatch skips with the 16-position window.
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    if (sum(T[(count+10):(count+25)]==rep(TRUE, 16))==16){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    # Mismatch skip with a wider, 66-position window.
    if (sum(T[(count+10):(count+75)]==rep(TRUE, 66))==66){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    for (i in 1:50){###jump from INDEL
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
    }##end of jump indel
    if (sum(T[(count+10):(count+75)]==rep(TRUE, 66))==66){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    for (i in 1:50){###jump from INDEL
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
    }##end of jump indel
    if (sum(T[(count+10):(count+75)]==rep(TRUE, 66))==66){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    for (i in 1:50){###jump from INDEL
      if(sum(Tl[[i]][count:(count+9)])==10){
        while (Tl[[i]][count]){
          count<- (count + i)
        }
      }
    }
    for (i in 1:50){
      if(sum(Tl[[i+50]][count:(count+9)])==10){
        while (Tl[[i+50]][count]){
          count<- (count + i)
        }
      }
    }##end of jump indel
    # NOTE(review): this window has 166 positions but the sum is compared
    # with 66, unlike the 16/16 and 66/66 checks above - confirm whether 166
    # was intended.
    if (sum(T[(count+10):(count+175)]==rep(TRUE, 166))==66){
      count<- count+10
      while(T[count]){
        count<- count+1
      } 
    }
    count
  }
  
  # Start of the second IR, computed from the genome length, the offset
  # returned by True.sequence.finder() and the IR length.
  IR2start<- function(True.sequence.finder, IR.length, genome){
    return(length(genome)-(True.sequence.finder+IR.length-2))
  }
  
  #declaration
  if (parallel){
    phase.difference<- p.d(genome)
  }
  else{
    phase.difference<- phase.detector(genome)
  }
  Trsf<- True.sequence.finder(genome, phase.difference)
  IR1s<- IR1start(phase.difference, Trsf)
  IR.l<- IR.length(IR1s, Trsf, genome)
  IR2s<- IR2start(Trsf, IR.l, genome)
  
  #calculation
  return(c(IR1s, IR2s, IR.l, length(genome)))#returning the start of IR one and two followed by their length and the length of the genome
  #gives a vector with four elements as the start of the IRb and IRa and their length and the length of the genome sequence
}

如果要单独用这个函数,需要先加载 snowfall 这个包,因为这里用到了并行计算

输入是 fasta 序列(FasExtract() 返回的单碱基字符向量)
输出是一个包含四个元素的向量:第一个反向重复区的起始位置、第二个反向重复区的起始位置、反向重复区的长度、基因组总长

> gbfile<-read.gb("Taishanhong_CP_genome.gb")
> fas<-FasExtract(gbfile)
> IRinfo(fas)
snowfall 1.84-6.1 initialized (using snow 0.4-3): parallel execution on 2 CPUs.


Stopping cluster

[1]  89022 133173  25466 158638

6、获取genbank文件中所有的基因名

# Extract the names of all features of one kind from a GenBank file.
#
# gb:   character vector of GenBank lines, as returned by read.gb().
# type: feature key to look for; one of "gene", "tRNA", "rRNA" or "mRNA".
#
# Returns a character vector with one name per feature, in file order.
# Features for which no name could be read are dropped, with one warning
# each. When the file has no feature of the requested kind the result is
# character(0).
#
# For every feature line the four following lines are inspected and the first
# one containing "=" is used (the qualifier may be pushed down when the
# location wraps over several lines). The name is the text after "=" in the
# first space-separated field of that line, with double quotes removed.
gene.name <- function(gb, type) {
  if (!(type %in% c("gene", "tRNA", "rRNA", "mRNA"))) {
    stop("The type should be defined as either gene, tRNA, rRNA or mRNA")
  }

  # A feature line is the key at the start of the line (after optional
  # indentation) followed by at least two blanks and the location. The same
  # pattern is used for every look-ahead so that indices cannot drift apart.
  key.pattern <- paste0(
    "^[[:space:]]*", type, "[[:space:]]{2,}[^[:space:]]"
  )
  feature.lines <- grep(key.pattern, gb)

  found <- rep(NA_character_, length(feature.lines))
  for (k in seq_along(feature.lines)) {
    lookahead <- gb[feature.lines[k] + 1:4]
    lookahead <- lookahead[!is.na(lookahead)]
    with.value <- grep("=", lookahead, fixed = TRUE)
    if (length(with.value) == 0) {
      next
    }
    fields <- strsplit(lookahead[with.value[1]], " ")[[1]]
    fields <- fields[fields != ""]
    # Element 2 is NA when the first field holds no "=" or no value.
    found[k] <- gsub("\"", "", strsplit(fields[1], "=")[[1]][2])
  }

  for (k in which(is.na(found))) {
    warning(sprintf(
      paste(
        "Gene No. %d is not properly named and is deleted from the list.",
        "Check the gb file on line %d"
      ),
      k, feature.lines[k]
    ))
  }

  found[!is.na(found)]
}

可以获得所有的基因名称,tRNA或者rRNA的名称,但是不能够获得蛋白编码基因的名称,但是应该可以改,把代码里的mRNA统一换成CDS应该就可以了

> gbfile<-read.gb("Taishanhong_CP_genome.gb")
> gene.name(gbfile,'rRNA')
[1] "rrn16"  "rrn23"  "rrn4.5" "rrn5"   "rrn5"   "rrn4.5" "rrn23"  "rrn16" 

还有好多,今天就到这里了。
重点是画图函数,但是他的画图函数好长好长啊!

欢迎大家关注我的公众号
小明的数据分析笔记本

公众号二维码.jpg
上一篇下一篇

猜你喜欢

热点阅读