叶绿体基因组基因、外显子、内含子、基因间隔区提取
Bioinformatic_Scripts/extract_sequences_from_gb_files
一、用途
从注释好的gb格式(GenBank Flat File)的叶绿体基因组文件中批量提取基因(包括内含子)、编码基因(不包括内含子,外显子合并)、基因编码区(外显子单独提取)、基因非编码区(即基因外显子间隔区,包括嵌套基因外显子之间区域、以及内含子)、基因间隔区(不包括嵌套基因外显子之间的区域),以及各种自由组合,具体脚本在文件夹extract_sequences_from_gb_files里面。
二、脚本
1、文件夹extract_coding_noncoding,包括组合提取脚本
(1)提取基因和基因间隔区
基因:包括内含子。
基因间隔区:不包括嵌套基因外显子之间的区域,例如trnK-UUU-2-matK、matK-trnK-UUU-1。
1_extract_bed_gene_and_IGS.pl
#!/usr/bin/perl -w
use strict;
use File::Find;
use Data::Dumper;
$|=1;
print "Please type your input directory:";
my $dirname=<STDIN>;
chomp $dirname;
my $pattern=".gb";
my @filenames;
find(\&target,$dirname);
sub target{
if (/$pattern/){
push @filenames,"$File::Find::name";
}
return;
}
while (@filenames) {
my $filename_gb=shift @filenames;
my $filename_base=$filename_gb;
$filename_base=~ s/(.*).gb/$1/g;
open(my $in_gb,"<",$filename_gb);
open(my $out_gb,">","$filename_base\_temp1");
while (<$in_gb>){
$_=~ s/\r\n/\n/g;
if ($_=~ /\),\n/){
$_=~ s/\),\n/\),/g;
}elsif($_=~ /,\n/){
$_=~ s/,\n/,/g;
}
print $out_gb $_;
}
close $in_gb;
close $out_gb;
open(my $in_gbk,"<","$filename_base\_temp1");
open(my $out_gbk,">","$filename_base\_temp2");
while (<$in_gbk>){
$_=~ s/,\s+/,/g;
print $out_gbk $_;
}
close $in_gbk;
close $out_gbk;
#generate_bed_file
my (@row_array,$species_name,$length,$element,@genearray,@output1);
my $mark=0;
open (my $in_genbank,"<","$filename_base\_temp2");
while (<$in_genbank>){
chomp;
@row_array=split /\s+/,$_;
if (/^LOCUS/i){
$species_name=$row_array[1];
$length=$row_array[2];
}elsif(/ {5}gene {12}/){
if ($row_array[2]=~ /^\d+..\d+$/){
$row_array[2]="\+\t$row_array[2]\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~/^complement\((\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),complement\((\d+..\d+)\)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}
$element=$row_array[2];
$mark=1;
}elsif(/^\s+\/gene="(.*)"/ and $mark == 1){
$element=$species_name.":".$1.":".$element;
push @genearray,$element;
$element=();
$mark=0;
}
}
close $in_genbank;
foreach (@genearray){
my @array=split /:/,$_;
push @output1,"$array[0]\t$array[1]\t$array[2]\n";
}
@row_array=();
@genearray=();
#put_fasta_sequence_in_array
my $flag=0;
my @sequence;
my (@fas1,@fas2);
open(my $in_genebank,"<","$filename_base\_temp2");
while (<$in_genebank>){
if ($_=~ /ORIGIN/){
$flag=1;
}
if ($_=~ /\/\//){
$flag=2;
}
if ($flag==1){
next if ($_=~ /ORIGIN/);
push @sequence,$_;
}
}
close $in_genebank;
foreach (@sequence){
chomp;
$_=~ s/\s*//g;
$_=~ s/\d+//g;
push @fas1,$_;
}
my $fas1=join "",@fas1;
my (@fasta1,@fasta2);
push @fasta1,$species_name,$fas1;
@fasta2=@fasta1;
unlink "$filename_base\_temp1";
unlink "$filename_base\_temp2";
#edit_bed_file
my (%SP1,%GENE1,%STRAND1,%START1,%END1,%TYPE1,%STRAND2,%START2,%END2,%TYPE2,%STRAND3,%START3,%END3,%TYPE3,@output2);
my $cnt1=0;
foreach (@output1) {
chomp;
$cnt1++;
($SP1{$cnt1},$GENE1{$cnt1},$STRAND1{$cnt1},$START1{$cnt1},$END1{$cnt1},$TYPE1{$cnt1},$STRAND2{$cnt1},$START2{$cnt1},$END2{$cnt1},$TYPE2{$cnt1})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9];
}
foreach (1..$cnt1) {
if (defined $STRAND2{$_} eq "") {
push @output2,($SP1{$_}."\t".$GENE1{$_}."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
}elsif (defined $STRAND2{$_} ne "") {
if (($STRAND1{$_} eq "-") and ($STRAND2{$_} eq "-") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($STRAND2{$_} eq "-") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($STRAND2{$_} eq "+") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($STRAND2{$_} eq "+") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($STRAND2{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($GENE1{$_} ne "rps12")){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($STRAND2{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($GENE1{$_} eq "rps12")){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}
}
}
@output1=();
#sort_bed_file
my $col=3;
my (%sort,@output3);
foreach (@output2){
my @row=split /\t/,$_;
$sort{$_}=$row[$col];
}
foreach (sort {$sort{$a} <=> $sort{$b}} keys %sort){
push @output3,"$_\n";
}
@output2=();
#output_bed_file
open (my $out_bed,">","$filename_base\_bed_gene_IGS.txt");
foreach (@output3){
print $out_bed $_;
}
close $out_bed;
#extract_gene
my (%SP2,%GENE2,%STRAND4,%START4,%END4,%TYPE4,$seq1);
my $cnt2=0;
open(my $out_coding,">","$filename_base\_gene.fasta");
while (@fasta1){
my $header=shift @fasta1;
$seq1=shift @fasta1;
}
foreach (@output3){
chomp;
$cnt2++;
($SP2{$cnt2},$GENE2{$cnt2},$STRAND4{$cnt2},$START4{$cnt2},$END4{$cnt2},$TYPE4{$cnt2})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
if ($STRAND4{$cnt2} eq "-") {
my $rev_com=reverse $str;
$rev_com=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com."\n";
}elsif($STRAND4{$cnt2} eq "+"){
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str."\n";
}
}
close $out_coding;
#generate_IGS_ranges
my (%SP3,%GENE3,%STRAND5,%START5,%END5,%TYPE5,$last0,$last1,$last2,@output4);
my $cnt3=0;
foreach (@output3){
chomp;
$cnt3++;
($SP3{$cnt3},$GENE3{$cnt3},$STRAND5{$cnt3},$START5{$cnt3},$END5{$cnt3},$TYPE5{$cnt3})=(split /\s+/,$_)[0,1,2,3,4,5];
}
foreach (keys %SP3){
if ($_==1 and $START5{$_}!=1){
unshift @output4,$SP3{$_}."\t"."start".'-'.$GENE3{$_}."\t"."?"."/".$STRAND5{$_}."\t"."1"."\t".($START5{$_}-1)."\t"."?"."/".$TYPE5{$_}."\n";
}
}
foreach (1..($cnt3-1)) {
$last0=$_-1;
$last1=$_+1;
$last2=$_+2;
next if ((($END5{$_}+1) >= ($START5{$last1}-1)) and (($END5{$_}+1) < ($END5{$last1}-1)));
next if (($_ > 1) and (($END5{$_}+1) < ($END5{$last0}-1)) and (($END5{$_}+1) < ($START5{$last2}-1)));
if ((($END5{$_}+1) >= ($START5{$last1}-1)) and (($END5{$_}+1) >= ($END5{$last1}-1))){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last2}."\t".$STRAND5{$_}."/".$STRAND5{$last2}."\t".($END5{$_}+1)."\t".($START5{$last2}-1)."\t".$TYPE5{$_}."/".$TYPE5{$last2}."\n";
}else{
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last1}."\t".$STRAND5{$_}."/".$STRAND5{$last1}."\t".($END5{$_}+1)."\t".($START5{$last1}-1)."\t".$TYPE5{$_}."/".$TYPE5{$last1}."\n";
}
}
foreach (keys %SP3){
if ($_==$cnt3){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'."end"."\t".$STRAND5{$_}."/"."?"."\t".($END5{$_}+1)."\t".$length."\t".$TYPE5{$_}."/"."?"."\n";
}
}
@output3=();
#extract_IGS
my (%SP4,%GENE4,%STRAND6,%START6,%END6,%TYPE6,$seq2);
my $cnt4=0;
open(my $out_noncoding,">","$filename_base\_IGS.fasta");
while (@fasta2){
my $header=shift @fasta2;
$seq2=shift @fasta2;
}
foreach (@output4){
chomp;
$cnt4++;
($SP4{$cnt4},$GENE4{$cnt4},$STRAND6{$cnt4},$START6{$cnt4},$END6{$cnt4},$TYPE6{$cnt4})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq2,($START6{$cnt4}-1),($END6{$cnt4}-$START6{$cnt4}+1));
print $out_noncoding ">".$GENE4{$cnt4}."_".$SP4{$cnt4}."\n".$str."\n";
}
@output4=();
close $out_noncoding;
#unlink "$filename_base\_IGS.fasta";
}
(2)提取编码基因和基因外显子间隔区
编码基因:不包括内含子,外显子合并。
基因外显子间隔区:包括嵌套基因外显子之间的区域,例如trnK-UUU-2-matK、matK-trnK-UUU-1;包括内含子。
2_extract_bed_CDS_RNA_and_intergenic.pl
#!/usr/bin/perl -w
use strict;
use File::Find;
use Data::Dumper;
$|=1;
print "Please type your input directory:";
my $dirname=<STDIN>;
chomp $dirname;
my $pattern=".gb";
my @filenames;
find(\&target,$dirname);
sub target{
if (/$pattern/){
push @filenames,"$File::Find::name";
}
return;
}
while (@filenames) {
my $filename_gb=shift @filenames;
my $filename_base=$filename_gb;
$filename_base=~ s/(.*).gb/$1/g;
open(my $in_gb,"<",$filename_gb);
open(my $out_gb,">","$filename_base\_temp1");
while (<$in_gb>){
$_=~ s/\r\n/\n/g;
if ($_=~ /\),\n/){
$_=~ s/\),\n/\),/g;
}elsif($_=~ /,\n/){
$_=~ s/,\n/,/g;
}
print $out_gb $_;
}
close $in_gb;
close $out_gb;
open(my $in_gbk,"<","$filename_base\_temp1");
open(my $out_gbk,">","$filename_base\_temp2");
while (<$in_gbk>){
$_=~ s/,\s+/,/g;
print $out_gbk $_;
}
close $in_gbk;
close $out_gbk;
#generate_bed_file
my (@row_array,$species_name,$length,$element,@genearray,@output1);
my $mark=0;
open (my $in_genbank,"<","$filename_base\_temp2");
while (<$in_genbank>){
chomp;
@row_array=split /\s+/,$_;
if (/^LOCUS/i){
$species_name=$row_array[1];
$length=$row_array[2];
}elsif(/ {5}CDS {13}/ or / {5}tRNA {12}/ or / {5}rRNA {12}/){
if ($row_array[2]=~ /^\d+..\d+$/){
$row_array[2]="\+\t$row_array[2]\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~/^complement\((\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),complement\((\d+..\d+)\)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}
$element=$row_array[2];
$mark=1;
}elsif(/^\s+\/gene="(.*)"/ and $mark == 1){
$element=$species_name.":".$1.":".$element;
push @genearray,$element;
$element=();
$mark=0;
}
}
close $in_genbank;
foreach (@genearray){
my @array=split /:/,$_;
push @output1,"$array[0]\t$array[1]\t$array[2]\n";
}
@row_array=();
@genearray=();
#put_fasta_sequence_in_array
my $flag=0;
my @sequence;
my (@fas1,@fas2);
open(my $in_genebank,"<","$filename_base\_temp2");
while (<$in_genebank>){
if ($_=~ /ORIGIN/){
$flag=1;
}
if ($_=~ /\/\//){
$flag=2;
}
if ($flag==1){
next if ($_=~ /ORIGIN/);
push @sequence,$_;
}
}
close $in_genebank;
foreach (@sequence){
chomp;
$_=~ s/\s*//g;
$_=~ s/\d+//g;
push @fas1,$_;
}
my $fas1=join "",@fas1;
my (@fasta1,@fasta2);
push @fasta1,$species_name,$fas1;
@fasta2=@fasta1;
unlink "$filename_base\_temp1";
unlink "$filename_base\_temp2";
#edit_bed_file
my (%SP1,%GENE1,%STRAND1,%START1,%END1,%TYPE1,%STRAND2,%START2,%END2,%TYPE2,%STRAND3,%START3,%END3,%TYPE3,@output2);
my $cnt1=0;
foreach (@output1) {
chomp;
$cnt1++;
($SP1{$cnt1},$GENE1{$cnt1},$STRAND1{$cnt1},$START1{$cnt1},$END1{$cnt1},$TYPE1{$cnt1},$STRAND2{$cnt1},$START2{$cnt1},$END2{$cnt1},$TYPE2{$cnt1},$STRAND3{$cnt1},$START3{$cnt1},$END3{$cnt1},$TYPE3{$cnt1})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
}
foreach (1..$cnt1) {
if (defined $STRAND2{$_} eq "") {
push @output2,($SP1{$_}."\t".$GENE1{$_}."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} eq "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} ne "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}
}
}
#sort_bed_file
my $col=3;
my (%sort,@output3);
foreach (@output2){
my @row=split /\t/,$_;
$sort{$_}=$row[$col];
}
foreach (sort {$sort{$a} <=> $sort{$b}} keys %sort){
push @output3,"$_\n";
}
@output2=();
#output_bed_file
open (my $out_bed,">","$filename_base\_bed_CDS_RNA_intergenic.txt");
foreach (@output3){
print $out_bed $_;
}
close $out_bed;
#extract_gene
my (%SP2,%GENE2,%STRAND4,%START4,%END4,%TYPE4,%STRAND5,%START5,%END5,%TYPE5,%STRAND6,%START6,%END6,%TYPE6,$seq1);
my $cnt2=0;
open(my $out_coding,">","$filename_base\_CDS_RNA.fasta");
while (@fasta1){
my $header=shift @fasta1;
$seq1=shift @fasta1;
}
foreach (@output1){
chomp;
$cnt2++;
($SP2{$cnt2},$GENE2{$cnt2},$STRAND4{$cnt2},$START4{$cnt2},$END4{$cnt2},$TYPE4{$cnt2},$STRAND5{$cnt2},$START5{$cnt2},$END5{$cnt2},$TYPE5{$cnt2},$STRAND6{$cnt2},$START6{$cnt2},$END6{$cnt2},$TYPE6{$cnt2})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
if (defined $STRAND5{$cnt2} eq "") {
my $str1=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
if ($STRAND4{$cnt2} eq "-") {
my $rev_com1=reverse $str1;
$rev_com1=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com1."\n";
}elsif($STRAND4{$cnt2} eq "+"){
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str1."\n";
}
}elsif((defined $STRAND5{$cnt2} ne "") and (defined $STRAND6{$cnt2} eq "")) {
if (($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} < $START5{$cnt2})) {
my $str2=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1));
my $rev_com2=reverse $str2;
$rev_com2=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com2."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START5{$cnt2})) {
my $str3=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $rev_com3=reverse $str3;
$rev_com3=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com3."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} < $START5{$cnt2})){
my $str4=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str4."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START5{$cnt2})){
my $str5=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str5."\n";
}
}elsif ((defined $STRAND5{$cnt2} ne "") and (defined $STRAND6{$cnt2} ne "")) {
if (($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} < $START5{$cnt2}) and ($START5{$cnt2} < $START6{$cnt2})) {
my $str6=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
my $rev_com4=reverse $str6;
$rev_com4=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com4."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START5{$cnt2}) and ($START5{$cnt2} > $START6{$cnt2})) {
my $str7=substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $rev_com5=reverse $str7;
$rev_com5=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com5."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START6{$cnt2}) and ($START6{$cnt2} > $START5{$cnt2})) {
my $str8=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $str9=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
my $rev_com6=reverse $str8;
$rev_com6=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com6.$str9."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} < $START5{$cnt2}) and ($START5{$cnt2} < $START6{$cnt2})){
my $str10=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str10."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START5{$cnt2}) and ($START5{$cnt2} > $START6{$cnt2})){
my $str11=substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str11."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START6{$cnt2}) and ($START6{$cnt2} > $START5{$cnt2})) {
my $str12=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str12."\n";
}
}
}
close $out_coding;
@output1=();
#generate_IGS_ranges
my (%SP3,%GENE3,%STRAND7,%START7,%END7,%TYPE7,$last0,$last1,$last2,@output4);
my $cnt3=0;
foreach (@output3){
chomp;
$cnt3++;
($SP3{$cnt3},$GENE3{$cnt3},$STRAND7{$cnt3},$START7{$cnt3},$END7{$cnt3},$TYPE7{$cnt3})=(split /\s+/,$_)[0,1,2,3,4,5];
}
foreach (keys %SP3){
if ($_==1 and $START7{$_}!=1){
unshift @output4,$SP3{$_}."\t"."start".'-'.$GENE3{$_}."\t"."?"."/".$STRAND7{$_}."\t"."1"."\t".($START7{$_}-1)."\t"."?"."/".$TYPE7{$_}."\n";
}
}
foreach (1..($cnt3-1)) {
$last0=$_-1;
$last1=$_+1;
$last2=$_+2;
next if ((($END7{$_}+1) >= ($START7{$last1}-1)) and (($END7{$_}+1) < ($END7{$last1}-1)));
next if (($_ > 1) and (($END7{$_}+1) < ($END7{$last0}-1)) and (($END7{$_}+1) < ($START7{$last2}-1)));
if ((($END7{$_}+1) >= ($START7{$last1}-1)) and (($END7{$_}+1) >= ($END7{$last1}-1))){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last2}."\t".$STRAND7{$_}."/".$STRAND7{$last2}."\t".($END7{$_}+1)."\t".($START7{$last2}-1)."\t".$TYPE7{$_}."/".$TYPE7{$last2}."\n";
}else{
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last1}."\t".$STRAND7{$_}."/".$STRAND7{$last1}."\t".($END7{$_}+1)."\t".($START7{$last1}-1)."\t".$TYPE7{$_}."/".$TYPE7{$last1}."\n";
}
}
foreach (keys %SP3){
if ($_==$cnt3){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'."end"."\t".$STRAND7{$_}."/"."?"."\t".($END7{$_}+1)."\t".$length."\t".$TYPE7{$_}."/"."?"."\n";
}
}
@output3=();
#extract_IGS
my (%SP4,%GENE4,%STRAND8,%START8,%END8,%TYPE8,$seq2);
my $cnt4=0;
open(my $out_noncoding,">","$filename_base\_intergenic.fasta");
while (@fasta2){
my $header=shift @fasta2;
$seq2=shift @fasta2;
}
foreach (@output4){
chomp;
$cnt4++;
($SP4{$cnt4},$GENE4{$cnt4},$STRAND8{$cnt4},$START8{$cnt4},$END8{$cnt4},$TYPE8{$cnt4})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq2,($START8{$cnt4}-1),($END8{$cnt4}-$START8{$cnt4}+1));
print $out_noncoding ">".$GENE4{$cnt4}."_".$SP4{$cnt4}."\n".$str."\n";
}
@output4=();
close $out_noncoding;
#unlink "$filename_base\_intergenic.fasta";
}
(3)提取基因编码区和基因非编码区
基因编码区:外显子单独提取,例如trnK-UUU-2、trnK-UUU-1。
基因非编码区:包括嵌套基因外显子之间的区域,例如trnK-UUU-2-matK、matK-trnK-UUU-1;包括内含子。同脚本2_extract_bed_CDS_RNA_and_intergenic.pl基因外显子间隔区。
3_extract_bed_coding_and_noncoding.pl
#!/usr/bin/perl -w
use strict;
use File::Find;
use Data::Dumper;
$|=1;
print "Please type your input directory:";
my $dirname=<STDIN>;
chomp $dirname;
my $pattern=".gb";
my @filenames;
find(\&target,$dirname);
sub target{
if (/$pattern/){
push @filenames,"$File::Find::name";
}
return;
}
while (@filenames) {
my $filename_gb=shift @filenames;
my $filename_base=$filename_gb;
$filename_base=~ s/(.*).gb/$1/g;
open(my $in_gb,"<",$filename_gb);
open(my $out_gb,">","$filename_base\_temp1");
while (<$in_gb>){
$_=~ s/\r\n/\n/g;
if ($_=~ /\),\n/){
$_=~ s/\),\n/\),/g;
}elsif($_=~ /,\n/){
$_=~ s/,\n/,/g;
}
print $out_gb $_;
}
close $in_gb;
close $out_gb;
open(my $in_gbk,"<","$filename_base\_temp1");
open(my $out_gbk,">","$filename_base\_temp2");
while (<$in_gbk>){
$_=~ s/,\s+/,/g;
print $out_gbk $_;
}
close $in_gbk;
close $out_gbk;
#generate_bed_file
my (@row_array,$species_name,$length,$element,@genearray,@output1);
my $mark=0;
open (my $in_genbank,"<","$filename_base\_temp2");
while (<$in_genbank>){
chomp;
@row_array=split /\s+/,$_;
if (/^LOCUS/i){
$species_name=$row_array[1];
$length=$row_array[2];
}elsif(/ {5}CDS {13}/ or / {5}tRNA {12}/ or / {5}rRNA {12}/){
if ($row_array[2]=~ /^\d+..\d+$/){
$row_array[2]="\+\t$row_array[2]\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~/^complement\((\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),complement\((\d+..\d+)\)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}
$element=$row_array[2];
$mark=1;
}elsif(/^\s+\/gene="(.*)"/ and $mark == 1){
$element=$species_name.":".$1.":".$element;
push @genearray,$element;
$element=();
$mark=0;
}
}
close $in_genbank;
foreach (@genearray){
my @array=split /:/,$_;
push @output1,"$array[0]\t$array[1]\t$array[2]\n";
}
@row_array=();
@genearray=();
#put_fasta_sequence_in_array
my $flag=0;
my @sequence;
my (@fas1,@fas2);
open(my $in_genebank,"<","$filename_base\_temp2");
while (<$in_genebank>){
if ($_=~ /ORIGIN/){
$flag=1;
}
if ($_=~ /\/\//){
$flag=2;
}
if ($flag==1){
next if ($_=~ /ORIGIN/);
push @sequence,$_;
}
}
close $in_genebank;
foreach (@sequence){
chomp;
$_=~ s/\s*//g;
$_=~ s/\d+//g;
push @fas1,$_;
}
my $fas1=join "",@fas1;
my (@fasta1,@fasta2);
push @fasta1,$species_name,$fas1;
@fasta2=@fasta1;
unlink "$filename_base\_temp1";
unlink "$filename_base\_temp2";
#edit_bed_file
my (%SP1,%GENE1,%STRAND1,%START1,%END1,%TYPE1,%STRAND2,%START2,%END2,%TYPE2,%STRAND3,%START3,%END3,%TYPE3,@output2);
my $cnt1=0;
foreach (@output1) {
chomp;
$cnt1++;
($SP1{$cnt1},$GENE1{$cnt1},$STRAND1{$cnt1},$START1{$cnt1},$END1{$cnt1},$TYPE1{$cnt1},$STRAND2{$cnt1},$START2{$cnt1},$END2{$cnt1},$TYPE2{$cnt1},$STRAND3{$cnt1},$START3{$cnt1},$END3{$cnt1},$TYPE3{$cnt1})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
}
foreach (1..$cnt1) {
if (defined $STRAND2{$_} eq "") {
push @output2,($SP1{$_}."\t".$GENE1{$_}."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} eq "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} ne "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}
}
}
@output1=();
#sort_bed_file
my $col=3;
my (%sort,@output3);
foreach (@output2){
my @row=split /\t/,$_;
$sort{$_}=$row[$col];
}
foreach (sort {$sort{$a} <=> $sort{$b}} keys %sort){
push @output3,"$_\n";
}
@output2=();
#output_bed_file
open (my $out_bed,">","$filename_base\_bed_coding_noncoding.txt");
foreach (@output3){
print $out_bed $_;
}
close $out_bed;
#extract_coding_regions
my (%SP2,%GENE2,%STRAND4,%START4,%END4,%TYPE4,$seq1);
my $cnt2=0;
open(my $out_coding,">","$filename_base\_coding.fasta");
while (@fasta1){
my $header=shift @fasta1;
$seq1=shift @fasta1;
}
foreach (@output3){
chomp;
$cnt2++;
($SP2{$cnt2},$GENE2{$cnt2},$STRAND4{$cnt2},$START4{$cnt2},$END4{$cnt2},$TYPE4{$cnt2})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
if ($STRAND4{$cnt2} eq "-") {
my $rev_com=reverse $str;
$rev_com=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com."\n";
}elsif($STRAND4{$cnt2} eq "+"){
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str."\n";
}
}
close $out_coding;
#generate_noncoding_ranges
my (%SP3,%GENE3,%STRAND5,%START5,%END5,%TYPE5,$last,@output4);
my $cnt3=0;
foreach (@output3){
chomp;
$cnt3++;
($SP3{$cnt3},$GENE3{$cnt3},$STRAND5{$cnt3},$START5{$cnt3},$END5{$cnt3},$TYPE5{$cnt3})=(split /\s+/,$_)[0,1,2,3,4,5];
}
foreach (keys %SP3){
if ($_==1 and $START5{$_}!=1){
unshift @output4,$SP3{$_}."\t"."start".'-'.$GENE3{$_}."\t"."?"."/".$STRAND5{$_}."\t"."1"."\t".($START5{$_}-1)."\t"."?"."/".$TYPE5{$_}."\n";
}
}
foreach (1..($cnt3-1)) {
$last=$_+1;
next if ($END5{$_}+1) >= ($START5{$last}-1);
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last}."\t".$STRAND5{$_}."/".$STRAND5{$last}."\t".($END5{$_}+1)."\t".($START5{$last}-1)."\t".$TYPE5{$_}."/".$TYPE5{$last}."\n";
}
foreach (keys %SP3){
if ($_==$cnt3){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'."end"."\t".$STRAND5{$_}."/"."?"."\t".($END5{$_}+1)."\t".$length."\t".$TYPE5{$_}."/"."?"."\n";
}
}
@output3=();
#extract_noncoding_regions
my (%SP4,%GENE4,%STRAND6,%START6,%END6,%TYPE6,$seq2);
my $cnt4=0;
open(my $out_noncoding,">","$filename_base\_noncoding.fasta");
while (@fasta2){
my $header=shift @fasta2;
$seq2=shift @fasta2;
}
foreach (@output4){
chomp;
$cnt4++;
($SP4{$cnt4},$GENE4{$cnt4},$STRAND6{$cnt4},$START6{$cnt4},$END6{$cnt4},$TYPE6{$cnt4})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq2,($START6{$cnt4}-1),($END6{$cnt4}-$START6{$cnt4}+1));
print $out_noncoding ">".$GENE4{$cnt4}."_".$SP4{$cnt4}."\n".$str."\n";
}
@output4=();
close $out_noncoding;
#unlink "$filename_base\_noncoding.fasta";
}
2、文件夹extract_gene_coding_CDS_RNA,包括分别提取脚本
(1)提取基因
基因:包括内含子。
1_extract_bed_gene.pl
#!/usr/bin/perl -w
use strict;
use File::Find;
use Data::Dumper;
$|=1;
print "Please type your input directory:";
my $dirname=<STDIN>;
chomp $dirname;
my $pattern=".gb";
my @filenames;
find(\&target,$dirname);
sub target{
if (/$pattern/){
push @filenames,"$File::Find::name";
}
return;
}
while (@filenames) {
my $filename_gb=shift @filenames;
my $filename_base=$filename_gb;
$filename_base=~ s/(.*).gb/$1/g;
my $latin_name=substr ($filename_base,rindex($filename_base,"\/")+1);
open(my $in_gb,"<",$filename_gb);
open(my $out_gb,">","$filename_base\_temp1");
while (<$in_gb>){
$_=~ s/\r\n/\n/g;
if ($_=~ /\),\n/){
$_=~ s/\),\n/\),/g;
}elsif($_=~ /,\n/){
$_=~ s/,\n/,/g;
}
print $out_gb $_;
}
close $in_gb;
close $out_gb;
open(my $in_gbk,"<","$filename_base\_temp1");
open(my $out_gbk,">","$filename_base\_temp2");
while (<$in_gbk>){
$_=~ s/,\s+/,/g;
print $out_gbk $_;
}
close $in_gbk;
close $out_gbk;
#generate_bed_file
my (@row_array,$species_name,$length,$element,@genearray,@output1);
my $mark=0;
open (my $in_genbank,"<","$filename_base\_temp2");
while (<$in_genbank>){
chomp;
@row_array=split /\s+/,$_;
if (/^LOCUS/i){
$species_name=$latin_name;
$length=$row_array[2];
}elsif(/ {5}gene {12}/){
if ($row_array[2]=~ /^\d+..\d+$/){
$row_array[2]="\+\t$row_array[2]\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~/^complement\((\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),complement\((\d+..\d+)\)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}
$element=$row_array[2];
$mark=1;
}elsif(/^\s+\/gene="(.*)"/ and $mark == 1){
$element=$species_name.":".$1.":".$element;
push @genearray,$element;
$element=();
$mark=0;
}
}
close $in_genbank;
foreach (@genearray){
my @array=split /:/,$_;
push @output1,"$array[0]\t$array[1]\t$array[2]\n";
}
@row_array=();
@genearray=();
#put_fasta_sequence_in_array
my $flag=0;
my @sequence;
my (@fas1,@fas2);
open(my $in_genebank,"<","$filename_base\_temp2");
while (<$in_genebank>){
if ($_=~ /ORIGIN/){
$flag=1;
}
if ($_=~ /\/\//){
$flag=2;
}
if ($flag==1){
next if ($_=~ /ORIGIN/);
push @sequence,$_;
}
}
close $in_genebank;
foreach (@sequence){
chomp;
$_=~ s/\s*//g;
$_=~ s/\d+//g;
push @fas1,$_;
}
my $fas1=join "",@fas1;
my (@fasta1,@fasta2);
push @fasta1,$species_name,$fas1;
@fasta2=@fasta1;
unlink "$filename_base\_temp1";
unlink "$filename_base\_temp2";
#edit_bed_file
my (%SP1,%GENE1,%STRAND1,%START1,%END1,%TYPE1,%STRAND2,%START2,%END2,%TYPE2,%STRAND3,%START3,%END3,%TYPE3,@output2);
my $cnt1=0;
foreach (@output1) {
chomp;
$cnt1++;
($SP1{$cnt1},$GENE1{$cnt1},$STRAND1{$cnt1},$START1{$cnt1},$END1{$cnt1},$TYPE1{$cnt1},$STRAND2{$cnt1},$START2{$cnt1},$END2{$cnt1},$TYPE2{$cnt1})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9];
}
foreach (1..$cnt1) {
if (defined $STRAND2{$_} eq "") {
push @output2,($SP1{$_}."\t".$GENE1{$_}."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
}elsif (defined $STRAND2{$_} ne "") {
if (($STRAND1{$_} eq "-") and ($STRAND2{$_} eq "-") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($STRAND2{$_} eq "-") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($STRAND2{$_} eq "+") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($STRAND2{$_} eq "+") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($STRAND2{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($GENE1{$_} ne "rps12")){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($STRAND2{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($GENE1{$_} eq "rps12")){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}
}
}
@output1=();
#sort_bed_file
my $col=3;
my (%sort,@output3);
foreach (@output2){
my @row=split /\t/,$_;
$sort{$_}=$row[$col];
}
foreach (sort {$sort{$a} <=> $sort{$b}} keys %sort){
push @output3,"$_\n";
}
@output2=();
#output_bed_file
open (my $out_bed,">","$filename_base\_gene.bed");
foreach (@output3){
print $out_bed $_;
}
close $out_bed;
#extract_gene
my (%SP2,%GENE2,%STRAND4,%START4,%END4,%TYPE4,$seq1);
my $cnt2=0;
open(my $out_coding,">","$filename_base\_gene.fasta");
while (@fasta1){
my $header=shift @fasta1;
$seq1=shift @fasta1;
}
foreach (@output3){
chomp;
$cnt2++;
($SP2{$cnt2},$GENE2{$cnt2},$STRAND4{$cnt2},$START4{$cnt2},$END4{$cnt2},$TYPE4{$cnt2})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
if ($STRAND4{$cnt2} eq "-") {
my $rev_com=reverse $str;
$rev_com=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com."\n";
}elsif($STRAND4{$cnt2} eq "+"){
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str."\n";
}
}
close $out_coding;
#generate_IGS_ranges
my (%SP3,%GENE3,%STRAND5,%START5,%END5,%TYPE5,$last0,$last1,$last2,@output4);
my $cnt3=0;
foreach (@output3){
chomp;
$cnt3++;
($SP3{$cnt3},$GENE3{$cnt3},$STRAND5{$cnt3},$START5{$cnt3},$END5{$cnt3},$TYPE5{$cnt3})=(split /\s+/,$_)[0,1,2,3,4,5];
}
foreach (keys %SP3){
if ($_==1 and $START5{$_}!=1){
unshift @output4,$SP3{$_}."\t"."start".'-'.$GENE3{$_}."\t"."?"."/".$STRAND5{$_}."\t"."1"."\t".($START5{$_}-1)."\t"."?"."/".$TYPE5{$_}."\n";
}
}
foreach (1..($cnt3-1)) {
$last0=$_-1;
$last1=$_+1;
$last2=$_+2;
next if ((($END5{$_}+1) >= ($START5{$last1}-1)) and (($END5{$_}+1) < ($END5{$last1}-1)));
next if (($_ > 1) and (($END5{$_}+1) < ($END5{$last0}-1)) and (($END5{$_}+1) < ($START5{$last2}-1)));
if ((($END5{$_}+1) >= ($START5{$last1}-1)) and (($END5{$_}+1) >= ($END5{$last1}-1))){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last2}."\t".$STRAND5{$_}."/".$STRAND5{$last2}."\t".($END5{$_}+1)."\t".($START5{$last2}-1)."\t".$TYPE5{$_}."/".$TYPE5{$last2}."\n";
}else{
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last1}."\t".$STRAND5{$_}."/".$STRAND5{$last1}."\t".($END5{$_}+1)."\t".($START5{$last1}-1)."\t".$TYPE5{$_}."/".$TYPE5{$last1}."\n";
}
}
foreach (keys %SP3){
if ($_==$cnt3){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'."end"."\t".$STRAND5{$_}."/"."?"."\t".($END5{$_}+1)."\t".$length."\t".$TYPE5{$_}."/"."?"."\n";
}
}
@output3=();
#extract_IGS
my (%SP4,%GENE4,%STRAND6,%START6,%END6,%TYPE6,$seq2);
my $cnt4=0;
open(my $out_noncoding,">","$filename_base\_IGS.fasta");
while (@fasta2){
my $header=shift @fasta2;
$seq2=shift @fasta2;
}
foreach (@output4){
chomp;
$cnt4++;
($SP4{$cnt4},$GENE4{$cnt4},$STRAND6{$cnt4},$START6{$cnt4},$END6{$cnt4},$TYPE6{$cnt4})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq2,($START6{$cnt4}-1),($END6{$cnt4}-$START6{$cnt4}+1));
print $out_noncoding ">".$GENE4{$cnt4}."_".$SP4{$cnt4}."\n".$str."\n";
}
@output4=();
close $out_noncoding;
unlink "$filename_base\_IGS.fasta";
}
(2)提取基因编码区
基因编码区:外显子单独提取,例如trnK-UUU-2、trnK-UUU-1。
2_extract_bed_coding.pl
#!/usr/bin/perl -w
use strict;
use File::Find;
use Data::Dumper;
$|=1;
print "Please type your input directory:";
my $dirname=<STDIN>;
chomp $dirname;
my $pattern=".gb";
my @filenames;
find(\&target,$dirname);
sub target{
if (/$pattern/){
push @filenames,"$File::Find::name";
}
return;
}
while (@filenames) {
my $filename_gb=shift @filenames;
my $filename_base=$filename_gb;
$filename_base=~ s/(.*).gb/$1/g;
my $latin_name=substr ($filename_base,rindex($filename_base,"\/")+1);
open(my $in_gb,"<",$filename_gb);
open(my $out_gb,">","$filename_base\_temp1");
while (<$in_gb>){
$_=~ s/\r\n/\n/g;
if ($_=~ /\),\n/){
$_=~ s/\),\n/\),/g;
}elsif($_=~ /,\n/){
$_=~ s/,\n/,/g;
}
print $out_gb $_;
}
close $in_gb;
close $out_gb;
open(my $in_gbk,"<","$filename_base\_temp1");
open(my $out_gbk,">","$filename_base\_temp2");
while (<$in_gbk>){
$_=~ s/,\s+/,/g;
print $out_gbk $_;
}
close $in_gbk;
close $out_gbk;
#generate_bed_file
my (@row_array,$species_name,$length,$element,@genearray,@output1);
my $mark=0;
open (my $in_genbank,"<","$filename_base\_temp2");
while (<$in_genbank>){
chomp;
@row_array=split /\s+/,$_;
if (/^LOCUS/i){
$species_name=$latin_name;
$length=$row_array[2];
}elsif(/ {5}CDS {13}/ or / {5}tRNA {12}/ or / {5}rRNA {12}/){
if ($row_array[2]=~ /^\d+..\d+$/){
$row_array[2]="\+\t$row_array[2]\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~/^complement\((\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),complement\((\d+..\d+)\)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}
$element=$row_array[2];
$mark=1;
}elsif(/^\s+\/gene="(.*)"/ and $mark == 1){
$element=$species_name.":".$1.":".$element;
push @genearray,$element;
$element=();
$mark=0;
}
}
close $in_genbank;
foreach (@genearray){
my @array=split /:/,$_;
push @output1,"$array[0]\t$array[1]\t$array[2]\n";
}
@row_array=();
@genearray=();
#put_fasta_sequence_in_array
my $flag=0;
my @sequence;
my (@fas1,@fas2);
open(my $in_genebank,"<","$filename_base\_temp2");
while (<$in_genebank>){
if ($_=~ /ORIGIN/){
$flag=1;
}
if ($_=~ /\/\//){
$flag=2;
}
if ($flag==1){
next if ($_=~ /ORIGIN/);
push @sequence,$_;
}
}
close $in_genebank;
foreach (@sequence){
chomp;
$_=~ s/\s*//g;
$_=~ s/\d+//g;
push @fas1,$_;
}
my $fas1=join "",@fas1;
my (@fasta1,@fasta2);
push @fasta1,$species_name,$fas1;
@fasta2=@fasta1;
unlink "$filename_base\_temp1";
unlink "$filename_base\_temp2";
#edit_bed_file
my (%SP1,%GENE1,%STRAND1,%START1,%END1,%TYPE1,%STRAND2,%START2,%END2,%TYPE2,%STRAND3,%START3,%END3,%TYPE3,@output2);
my $cnt1=0;
foreach (@output1) {
chomp;
$cnt1++;
($SP1{$cnt1},$GENE1{$cnt1},$STRAND1{$cnt1},$START1{$cnt1},$END1{$cnt1},$TYPE1{$cnt1},$STRAND2{$cnt1},$START2{$cnt1},$END2{$cnt1},$TYPE2{$cnt1},$STRAND3{$cnt1},$START3{$cnt1},$END3{$cnt1},$TYPE3{$cnt1})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
}
foreach (1..$cnt1) {
if (defined $STRAND2{$_} eq "") {
push @output2,($SP1{$_}."\t".$GENE1{$_}."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} eq "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} ne "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}
}
}
@output1=();
#sort_bed_file
my $col=3;
my (%sort,@output3);
foreach (@output2){
my @row=split /\t/,$_;
$sort{$_}=$row[$col];
}
foreach (sort {$sort{$a} <=> $sort{$b}} keys %sort){
push @output3,"$_\n";
}
@output2=();
#output_bed_file
open (my $out_bed,">","$filename_base\_coding.bed");
foreach (@output3){
print $out_bed $_;
}
close $out_bed;
#extract_coding_regions
my (%SP2,%GENE2,%STRAND4,%START4,%END4,%TYPE4,$seq1);
my $cnt2=0;
open(my $out_coding,">","$filename_base\_coding.fasta");
while (@fasta1){
my $header=shift @fasta1;
$seq1=shift @fasta1;
}
foreach (@output3){
chomp;
$cnt2++;
($SP2{$cnt2},$GENE2{$cnt2},$STRAND4{$cnt2},$START4{$cnt2},$END4{$cnt2},$TYPE4{$cnt2})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
if ($STRAND4{$cnt2} eq "-") {
my $rev_com=reverse $str;
$rev_com=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com."\n";
}elsif($STRAND4{$cnt2} eq "+"){
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str."\n";
}
}
close $out_coding;
#generate_noncoding_ranges
my (%SP3,%GENE3,%STRAND5,%START5,%END5,%TYPE5,$last,@output4);
my $cnt3=0;
foreach (@output3){
chomp;
$cnt3++;
($SP3{$cnt3},$GENE3{$cnt3},$STRAND5{$cnt3},$START5{$cnt3},$END5{$cnt3},$TYPE5{$cnt3})=(split /\s+/,$_)[0,1,2,3,4,5];
}
foreach (keys %SP3){
if ($_==1 and $START5{$_}!=1){
unshift @output4,$SP3{$_}."\t"."start".'-'.$GENE3{$_}."\t"."?"."/".$STRAND5{$_}."\t"."1"."\t".($START5{$_}-1)."\t"."?"."/".$TYPE5{$_}."\n";
}
}
foreach (1..($cnt3-1)) {
$last=$_+1;
next if ($END5{$_}+1) >= ($START5{$last}-1);
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last}."\t".$STRAND5{$_}."/".$STRAND5{$last}."\t".($END5{$_}+1)."\t".($START5{$last}-1)."\t".$TYPE5{$_}."/".$TYPE5{$last}."\n";
}
foreach (keys %SP3){
if ($_==$cnt3){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'."end"."\t".$STRAND5{$_}."/"."?"."\t".($END5{$_}+1)."\t".$length."\t".$TYPE5{$_}."/"."?"."\n";
}
}
@output3=();
#extract_noncoding_regions
my (%SP4,%GENE4,%STRAND6,%START6,%END6,%TYPE6,$seq2);
my $cnt4=0;
open(my $out_noncoding,">","$filename_base\_noncoding.fasta");
while (@fasta2){
my $header=shift @fasta2;
$seq2=shift @fasta2;
}
foreach (@output4){
chomp;
$cnt4++;
($SP4{$cnt4},$GENE4{$cnt4},$STRAND6{$cnt4},$START6{$cnt4},$END6{$cnt4},$TYPE6{$cnt4})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq2,($START6{$cnt4}-1),($END6{$cnt4}-$START6{$cnt4}+1));
print $out_noncoding ">".$GENE4{$cnt4}."_".$SP4{$cnt4}."\n".$str."\n";
}
@output4=();
close $out_noncoding;
unlink "$filename_base\_noncoding.fasta";
}
(3)提取CDS(蛋白)编码基因
CDS(蛋白)编码基因:不包括内含子,外显子合并。
3_extract_bed_CDS.pl
#!/usr/bin/perl -w
use strict;
use File::Find;
use Data::Dumper;
$|=1;
print "Please type your input directory:";
my $dirname=<STDIN>;
chomp $dirname;
my $pattern=".gb";
my @filenames;
find(\&target,$dirname);
sub target{
if (/$pattern/){
push @filenames,"$File::Find::name";
}
return;
}
while (@filenames) {
my $filename_gb=shift @filenames;
my $filename_base=$filename_gb;
$filename_base=~ s/(.*).gb/$1/g;
my $latin_name=substr ($filename_base,rindex($filename_base,"\/")+1);
open(my $in_gb,"<",$filename_gb);
open(my $out_gb,">","$filename_base\_temp1");
while (<$in_gb>){
$_=~ s/\r\n/\n/g;
if ($_=~ /\),\n/){
$_=~ s/\),\n/\),/g;
}elsif($_=~ /,\n/){
$_=~ s/,\n/,/g;
}
print $out_gb $_;
}
close $in_gb;
close $out_gb;
open(my $in_gbk,"<","$filename_base\_temp1");
open(my $out_gbk,">","$filename_base\_temp2");
while (<$in_gbk>){
$_=~ s/,\s+/,/g;
print $out_gbk $_;
}
close $in_gbk;
close $out_gbk;
#generate_bed_file
my (@row_array,$species_name,$length,$element,@genearray,@output1);
my $mark=0;
open (my $in_genbank,"<","$filename_base\_temp2");
while (<$in_genbank>){
chomp;
@row_array=split /\s+/,$_;
if (/^LOCUS/i){
$species_name=$latin_name;
$length=$row_array[2];
}elsif(/ {5}CDS {13}/){
if ($row_array[2]=~ /^\d+..\d+$/){
$row_array[2]="\+\t$row_array[2]\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~/^complement\((\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),complement\((\d+..\d+)\)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}
$element=$row_array[2];
$mark=1;
}elsif(/^\s+\/gene="(.*)"/ and $mark == 1){
$element=$species_name.":".$1.":".$element;
push @genearray,$element;
$element=();
$mark=0;
}
}
close $in_genbank;
foreach (@genearray){
my @array=split /:/,$_;
push @output1,"$array[0]\t$array[1]\t$array[2]\n";
}
@row_array=();
@genearray=();
#put_fasta_sequence_in_array
my $flag=0;
my @sequence;
my (@fas1,@fas2);
open(my $in_genebank,"<","$filename_base\_temp2");
while (<$in_genebank>){
if ($_=~ /ORIGIN/){
$flag=1;
}
if ($_=~ /\/\//){
$flag=2;
}
if ($flag==1){
next if ($_=~ /ORIGIN/);
push @sequence,$_;
}
}
close $in_genebank;
foreach (@sequence){
chomp;
$_=~ s/\s*//g;
$_=~ s/\d+//g;
push @fas1,$_;
}
my $fas1=join "",@fas1;
my (@fasta1,@fasta2);
push @fasta1,$species_name,$fas1;
@fasta2=@fasta1;
unlink "$filename_base\_temp1";
unlink "$filename_base\_temp2";
#edit_bed_file
my (%SP1,%GENE1,%STRAND1,%START1,%END1,%TYPE1,%STRAND2,%START2,%END2,%TYPE2,%STRAND3,%START3,%END3,%TYPE3,@output2);
my $cnt1=0;
foreach (@output1) {
chomp;
$cnt1++;
($SP1{$cnt1},$GENE1{$cnt1},$STRAND1{$cnt1},$START1{$cnt1},$END1{$cnt1},$TYPE1{$cnt1},$STRAND2{$cnt1},$START2{$cnt1},$END2{$cnt1},$TYPE2{$cnt1},$STRAND3{$cnt1},$START3{$cnt1},$END3{$cnt1},$TYPE3{$cnt1})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
}
foreach (1..$cnt1) {
if (defined $STRAND2{$_} eq "") {
push @output2,($SP1{$_}."\t".$GENE1{$_}."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} eq "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} ne "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}
}
}
#sort_bed_file
my $col=3;
my (%sort,@output3);
foreach (@output2){
my @row=split /\t/,$_;
$sort{$_}=$row[$col];
}
foreach (sort {$sort{$a} <=> $sort{$b}} keys %sort){
push @output3,"$_\n";
}
@output2=();
#output_bed_file
open (my $out_bed,">","$filename_base\_CDS.bed");
foreach (@output3){
print $out_bed $_;
}
close $out_bed;
#extract_CDS
my (%SP2,%GENE2,%STRAND4,%START4,%END4,%TYPE4,%STRAND5,%START5,%END5,%TYPE5,%STRAND6,%START6,%END6,%TYPE6,$seq1);
my $cnt2=0;
open(my $out_coding,">","$filename_base\_CDS.fasta");
while (@fasta1){
my $header=shift @fasta1;
$seq1=shift @fasta1;
}
foreach (@output1){
chomp;
$cnt2++;
($SP2{$cnt2},$GENE2{$cnt2},$STRAND4{$cnt2},$START4{$cnt2},$END4{$cnt2},$TYPE4{$cnt2},$STRAND5{$cnt2},$START5{$cnt2},$END5{$cnt2},$TYPE5{$cnt2},$STRAND6{$cnt2},$START6{$cnt2},$END6{$cnt2},$TYPE6{$cnt2})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
if (defined $STRAND5{$cnt2} eq "") {
my $str1=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
if ($STRAND4{$cnt2} eq "-") {
my $rev_com1=reverse $str1;
$rev_com1=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com1."\n";
}elsif($STRAND4{$cnt2} eq "+"){
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str1."\n";
}
}elsif((defined $STRAND5{$cnt2} ne "") and (defined $STRAND6{$cnt2} eq "")) {
if (($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} < $START5{$cnt2})) {
my $str2=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1));
my $rev_com2=reverse $str2;
$rev_com2=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com2."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START5{$cnt2})) {
my $str3=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $rev_com3=reverse $str3;
$rev_com3=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com3."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} < $START5{$cnt2})){
my $str4=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str4."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START5{$cnt2})){
my $str5=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str5."\n";
}
}elsif ((defined $STRAND5{$cnt2} ne "") and (defined $STRAND6{$cnt2} ne "")) {
if (($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} < $START5{$cnt2}) and ($START5{$cnt2} < $START6{$cnt2})) {
my $str6=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
my $rev_com4=reverse $str6;
$rev_com4=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com4."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START5{$cnt2}) and ($START5{$cnt2} > $START6{$cnt2})) {
my $str7=substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $rev_com5=reverse $str7;
$rev_com5=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com5."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START6{$cnt2}) and ($START6{$cnt2} > $START5{$cnt2})) {
my $str8=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $str9=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
my $rev_com6=reverse $str8;
$rev_com6=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com6.$str9."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} < $START5{$cnt2}) and ($START5{$cnt2} < $START6{$cnt2})){
my $str10=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str10."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START5{$cnt2}) and ($START5{$cnt2} > $START6{$cnt2})){
my $str11=substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str11."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START6{$cnt2}) and ($START6{$cnt2} > $START5{$cnt2})) {
my $str12=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str12."\n";
}
}
}
close $out_coding;
@output1=();
#generate_intergenic_ranges
my (%SP3,%GENE3,%STRAND7,%START7,%END7,%TYPE7,$last0,$last1,$last2,@output4);
my $cnt3=0;
foreach (@output3){
chomp;
$cnt3++;
($SP3{$cnt3},$GENE3{$cnt3},$STRAND7{$cnt3},$START7{$cnt3},$END7{$cnt3},$TYPE7{$cnt3})=(split /\s+/,$_)[0,1,2,3,4,5];
}
foreach (keys %SP3){
if ($_==1 and $START7{$_}!=1){
unshift @output4,$SP3{$_}."\t"."start".'-'.$GENE3{$_}."\t"."?"."/".$STRAND7{$_}."\t"."1"."\t".($START7{$_}-1)."\t"."?"."/".$TYPE7{$_}."\n";
}
}
foreach (1..($cnt3-1)) {
$last0=$_-1;
$last1=$_+1;
$last2=$_+2;
next if ((($END7{$_}+1) >= ($START7{$last1}-1)) and (($END7{$_}+1) < ($END7{$last1}-1)));
next if (($_ > 1) and (($END7{$_}+1) < ($END7{$last0}-1)) and (($END7{$_}+1) < ($START7{$last2}-1)));
if ((($END7{$_}+1) >= ($START7{$last1}-1)) and (($END7{$_}+1) >= ($END7{$last1}-1))){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last2}."\t".$STRAND7{$_}."/".$STRAND7{$last2}."\t".($END7{$_}+1)."\t".($START7{$last2}-1)."\t".$TYPE7{$_}."/".$TYPE7{$last2}."\n";
}else{
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last1}."\t".$STRAND7{$_}."/".$STRAND7{$last1}."\t".($END7{$_}+1)."\t".($START7{$last1}-1)."\t".$TYPE7{$_}."/".$TYPE7{$last1}."\n";
}
}
foreach (keys %SP3){
if ($_==$cnt3){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'."end"."\t".$STRAND7{$_}."/"."?"."\t".($END7{$_}+1)."\t".$length."\t".$TYPE7{$_}."/"."?"."\n";
}
}
@output3=();
#extract_intergenic
my (%SP4,%GENE4,%STRAND8,%START8,%END8,%TYPE8,$seq2);
my $cnt4=0;
open(my $out_noncoding,">","$filename_base\_intergenic.fasta");
while (@fasta2){
my $header=shift @fasta2;
$seq2=shift @fasta2;
}
foreach (@output4){
chomp;
$cnt4++;
($SP4{$cnt4},$GENE4{$cnt4},$STRAND8{$cnt4},$START8{$cnt4},$END8{$cnt4},$TYPE8{$cnt4})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq2,($START8{$cnt4}-1),($END8{$cnt4}-$START8{$cnt4}+1));
print $out_noncoding ">".$GENE4{$cnt4}."_".$SP4{$cnt4}."\n".$str."\n";
}
@output4=();
close $out_noncoding;
unlink "$filename_base\_intergenic.fasta";
}
(4)提取RNA编码基因
RNA编码基因:不包括内含子,外显子合并。
4_extract_bed_RNA.pl
#!/usr/bin/perl -w
use strict;
use File::Find;
use Data::Dumper;
$|=1;
print "Please type your input directory:";
my $dirname=<STDIN>;
chomp $dirname;
my $pattern=".gb";
my @filenames;
find(\&target,$dirname);
sub target{
if (/$pattern/){
push @filenames,"$File::Find::name";
}
return;
}
while (@filenames) {
my $filename_gb=shift @filenames;
my $filename_base=$filename_gb;
$filename_base=~ s/(.*).gb/$1/g;
my $latin_name=substr ($filename_base,rindex($filename_base,"\/")+1);
open(my $in_gb,"<",$filename_gb);
open(my $out_gb,">","$filename_base\_temp1");
while (<$in_gb>){
$_=~ s/\r\n/\n/g;
if ($_=~ /\),\n/){
$_=~ s/\),\n/\),/g;
}elsif($_=~ /,\n/){
$_=~ s/,\n/,/g;
}
print $out_gb $_;
}
close $in_gb;
close $out_gb;
open(my $in_gbk,"<","$filename_base\_temp1");
open(my $out_gbk,">","$filename_base\_temp2");
while (<$in_gbk>){
$_=~ s/,\s+/,/g;
print $out_gbk $_;
}
close $in_gbk;
close $out_gbk;
#generate_bed_file
my (@row_array,$species_name,$length,$element,@genearray,@output1);
my $mark=0;
open (my $in_genbank,"<","$filename_base\_temp2");
while (<$in_genbank>){
chomp;
@row_array=split /\s+/,$_;
if (/^LOCUS/i){
$species_name=$latin_name;
$length=$row_array[2];
}elsif(/ {5}tRNA {12}/ or / {5}rRNA {12}/){
if ($row_array[2]=~ /^\d+..\d+$/){
$row_array[2]="\+\t$row_array[2]\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~/^complement\((\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^complement\(join\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\((\d+..\d+),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),complement\((\d+..\d+)\),complement\((\d+..\d+)\)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t-\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^order\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\(complement\((\d+..\d+)\),(\d+..\d+),(\d+..\d+)\)$/){
$row_array[2]="-\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t+\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}elsif($row_array[2]=~ /^join\((\d+..\d+),(\d+..\d+),complement\((\d+..\d+)\)\)$/) {
$row_array[2]="+\t$1\t$row_array[1]\t+\t$2\t$row_array[1]\t-\t$3\t$row_array[1]";
$row_array[2]=~ s/\../\t/g;
}
$element=$row_array[2];
$mark=1;
}elsif(/^\s+\/gene="(.*)"/ and $mark == 1){
$element=$species_name.":".$1.":".$element;
push @genearray,$element;
$element=();
$mark=0;
}
}
close $in_genbank;
foreach (@genearray){
my @array=split /:/,$_;
push @output1,"$array[0]\t$array[1]\t$array[2]\n";
}
@row_array=();
@genearray=();
#put_fasta_sequence_in_array
my $flag=0;
my @sequence;
my (@fas1,@fas2);
open(my $in_genebank,"<","$filename_base\_temp2");
while (<$in_genebank>){
if ($_=~ /ORIGIN/){
$flag=1;
}
if ($_=~ /\/\//){
$flag=2;
}
if ($flag==1){
next if ($_=~ /ORIGIN/);
push @sequence,$_;
}
}
close $in_genebank;
foreach (@sequence){
chomp;
$_=~ s/\s*//g;
$_=~ s/\d+//g;
push @fas1,$_;
}
my $fas1=join "",@fas1;
my (@fasta1,@fasta2);
push @fasta1,$species_name,$fas1;
@fasta2=@fasta1;
unlink "$filename_base\_temp1";
unlink "$filename_base\_temp2";
#edit_bed_file
my (%SP1,%GENE1,%STRAND1,%START1,%END1,%TYPE1,%STRAND2,%START2,%END2,%TYPE2,%STRAND3,%START3,%END3,%TYPE3,@output2);
my $cnt1=0;
foreach (@output1) {
chomp;
$cnt1++;
($SP1{$cnt1},$GENE1{$cnt1},$STRAND1{$cnt1},$START1{$cnt1},$END1{$cnt1},$TYPE1{$cnt1},$STRAND2{$cnt1},$START2{$cnt1},$END2{$cnt1},$TYPE2{$cnt1},$STRAND3{$cnt1},$START3{$cnt1},$END3{$cnt1},$TYPE3{$cnt1})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
}
foreach (1..$cnt1) {
if (defined $STRAND2{$_} eq "") {
push @output2,($SP1{$_}."\t".$GENE1{$_}."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} eq "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
}
}elsif ((defined $STRAND2{$_} ne "") and (defined $STRAND3{$_} ne "")) {
if (($STRAND1{$_} eq "-") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "-") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} < $START2{$_}) and ($START2{$_} < $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START2{$_}) and ($START2{$_} > $START3{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}elsif(($STRAND1{$_} eq "+") and ($START1{$_} > $START3{$_}) and ($START3{$_} > $START2{$_})){
push @output2,($SP1{$_}."\t".$GENE1{$_}."-1"."\t".$STRAND1{$_}."\t".$START1{$_}."\t".$END1{$_}."\t".$TYPE1{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-2"."\t".$STRAND2{$_}."\t".$START2{$_}."\t".$END2{$_}."\t".$TYPE2{$_});
push @output2,($SP1{$_}."\t".$GENE1{$_}."-3"."\t".$STRAND3{$_}."\t".$START3{$_}."\t".$END3{$_}."\t".$TYPE3{$_});
}
}
}
#sort_bed_file
my $col=3;
my (%sort,@output3);
foreach (@output2){
my @row=split /\t/,$_;
$sort{$_}=$row[$col];
}
foreach (sort {$sort{$a} <=> $sort{$b}} keys %sort){
push @output3,"$_\n";
}
@output2=();
#output_bed_file
open (my $out_bed,">","$filename_base\_RNA.bed");
foreach (@output3){
print $out_bed $_;
}
close $out_bed;
#extract_gene
my (%SP2,%GENE2,%STRAND4,%START4,%END4,%TYPE4,%STRAND5,%START5,%END5,%TYPE5,%STRAND6,%START6,%END6,%TYPE6,$seq1);
my $cnt2=0;
open(my $out_coding,">","$filename_base\_RNA.fasta");
while (@fasta1){
my $header=shift @fasta1;
$seq1=shift @fasta1;
}
foreach (@output1){
chomp;
$cnt2++;
($SP2{$cnt2},$GENE2{$cnt2},$STRAND4{$cnt2},$START4{$cnt2},$END4{$cnt2},$TYPE4{$cnt2},$STRAND5{$cnt2},$START5{$cnt2},$END5{$cnt2},$TYPE5{$cnt2},$STRAND6{$cnt2},$START6{$cnt2},$END6{$cnt2},$TYPE6{$cnt2})=(split /\s+/,$_)[0,1,2,3,4,5,6,7,8,9,10,11,12,13];
if (defined $STRAND5{$cnt2} eq "") {
my $str1=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
if ($STRAND4{$cnt2} eq "-") {
my $rev_com1=reverse $str1;
$rev_com1=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com1."\n";
}elsif($STRAND4{$cnt2} eq "+"){
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str1."\n";
}
}elsif((defined $STRAND5{$cnt2} ne "") and (defined $STRAND6{$cnt2} eq "")) {
if (($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} < $START5{$cnt2})) {
my $str2=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1));
my $rev_com2=reverse $str2;
$rev_com2=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com2."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START5{$cnt2})) {
my $str3=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $rev_com3=reverse $str3;
$rev_com3=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com3."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} < $START5{$cnt2})){
my $str4=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str4."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START5{$cnt2})){
my $str5=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str5."\n";
}
}elsif ((defined $STRAND5{$cnt2} ne "") and (defined $STRAND6{$cnt2} ne "")) {
if (($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} < $START5{$cnt2}) and ($START5{$cnt2} < $START6{$cnt2})) {
my $str6=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
my $rev_com4=reverse $str6;
$rev_com4=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com4."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START5{$cnt2}) and ($START5{$cnt2} > $START6{$cnt2})) {
my $str7=substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $rev_com5=reverse $str7;
$rev_com5=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com5."\n";
}elsif(($STRAND4{$cnt2} eq "-") and ($START4{$cnt2} > $START6{$cnt2}) and ($START6{$cnt2} > $START5{$cnt2})) {
my $str8=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
my $str9=substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
my $rev_com6=reverse $str8;
$rev_com6=~ tr/ACGTacgt/TGCAtgca/;
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$rev_com6.$str9."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} < $START5{$cnt2}) and ($START5{$cnt2} < $START6{$cnt2})){
my $str10=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str10."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START5{$cnt2}) and ($START5{$cnt2} > $START6{$cnt2})){
my $str11=substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str11."\n";
}elsif(($STRAND4{$cnt2} eq "+") and ($START4{$cnt2} > $START6{$cnt2}) and ($START6{$cnt2} > $START5{$cnt2})) {
my $str12=substr($seq1,($START4{$cnt2}-1),($END4{$cnt2}-$START4{$cnt2}+1)).substr($seq1,($START5{$cnt2}-1),($END5{$cnt2}-$START5{$cnt2}+1)).substr($seq1,($START6{$cnt2}-1),($END6{$cnt2}-$START6{$cnt2}+1));
print $out_coding ">".$GENE2{$cnt2}."_".$SP2{$cnt2}."\n".$str12."\n";
}
}
}
close $out_coding;
@output1=();
#generate_IGS_ranges
my (%SP3,%GENE3,%STRAND7,%START7,%END7,%TYPE7,$last0,$last1,$last2,@output4);
my $cnt3=0;
foreach (@output3){
chomp;
$cnt3++;
($SP3{$cnt3},$GENE3{$cnt3},$STRAND7{$cnt3},$START7{$cnt3},$END7{$cnt3},$TYPE7{$cnt3})=(split /\s+/,$_)[0,1,2,3,4,5];
}
foreach (keys %SP3){
if ($_==1 and $START7{$_}!=1){
unshift @output4,$SP3{$_}."\t"."start".'-'.$GENE3{$_}."\t"."?"."/".$STRAND7{$_}."\t"."1"."\t".($START7{$_}-1)."\t"."?"."/".$TYPE7{$_}."\n";
}
}
foreach (1..($cnt3-1)) {
$last0=$_-1;
$last1=$_+1;
$last2=$_+2;
next if ((($END7{$_}+1) >= ($START7{$last1}-1)) and (($END7{$_}+1) < ($END7{$last1}-1)));
next if (($_ > 1) and (($END7{$_}+1) < ($END7{$last0}-1)) and (($END7{$_}+1) < ($START7{$last2}-1)));
if ((($END7{$_}+1) >= ($START7{$last1}-1)) and (($END7{$_}+1) >= ($END7{$last1}-1))){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last2}."\t".$STRAND7{$_}."/".$STRAND7{$last2}."\t".($END7{$_}+1)."\t".($START7{$last2}-1)."\t".$TYPE7{$_}."/".$TYPE7{$last2}."\n";
}else{
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'.$GENE3{$last1}."\t".$STRAND7{$_}."/".$STRAND7{$last1}."\t".($END7{$_}+1)."\t".($START7{$last1}-1)."\t".$TYPE7{$_}."/".$TYPE7{$last1}."\n";
}
}
foreach (keys %SP3){
if ($_==$cnt3){
push @output4,$SP3{$_}."\t".$GENE3{$_}.'-'."end"."\t".$STRAND7{$_}."/"."?"."\t".($END7{$_}+1)."\t".$length."\t".$TYPE7{$_}."/"."?"."\n";
}
}
@output3=();
#extract_IGS
my (%SP4,%GENE4,%STRAND8,%START8,%END8,%TYPE8,$seq2);
my $cnt4=0;
open(my $out_noncoding,">","$filename_base\_intergenic.fasta");
while (@fasta2){
my $header=shift @fasta2;
$seq2=shift @fasta2;
}
foreach (@output4){
chomp;
$cnt4++;
($SP4{$cnt4},$GENE4{$cnt4},$STRAND8{$cnt4},$START8{$cnt4},$END8{$cnt4},$TYPE8{$cnt4})=(split /\s+/,$_)[0,1,2,3,4,5];
my $str=substr($seq2,($START8{$cnt4}-1),($END8{$cnt4}-$START8{$cnt4}+1));
print $out_noncoding ">".$GENE4{$cnt4}."_".$SP4{$cnt4}."\n".$str."\n";
}
@output4=();
close $out_noncoding;
unlink "$filename_base\_intergenic.fasta";
}