perlPerl小推车

计算fa文件的条数,总长,N50,GC和CpG位点数目

2018-08-03  本文已影响14人  线断木偶人
#!/usr/bin/perl -w
use strict;
use Data::Dumper;

my $file='genome.fasta.gz';
open A,"gunzip -dc $file|" or die;
#$/=">";<A>;
my $num=0;
my %hash;
my $name;
while(<A>){
    chomp;
    if (/>(.+)/){
     $num ++;
     $name=$1;
    }else{
    tr/atcg/ATCG/;
    $hash{$name}.= $_;
    }

}
#print Dumper(\%hash);
close A;

my $len; my $all_len;
my $countC;my $countG;my $countCpG;
my %hash_len;
foreach my $key (sort keys  %hash){
    $len=length $hash{$key};
    $hash_len{$key}=$len;

    $all_len += $len;
    $countC +=$hash{$key}=~s/C/C/g;
    $countG +=$hash{$key}=~s/G/G/g;
    $countCpG +=$hash{$key}=~s/CG/CG/g;
}

my $half_all_len=$all_len/2;

my $N50=0;
my $in_len;
for my $len_num (sort{$b <=> $a} values %hash_len  ){
    $in_len += $len_num;
    if ($in_len >= $half_all_len){
       $N50 = $len_num;
       last;
    }
}

my $countCG = sprintf"%0.3f",($countC+$countG)/$all_len;
print "序列条数:$num\n总长:$all_len\n";
print "N50:$N50\n";
print "GC含量:$countCG\nCpG位点数:$countCpG\n";

输出结果:

序列条数:942
总长:95269455
N50:192515
GC含量:0.46
CpG位点数:1000

2.只计算每条的长度

#!/usr/bin/perl -w
use strict;
use Data::Dumper;
my $file=shift;
open A,$file or die;
$/=">";<A>;
while(<A>){
    my $id=$1 if(/^(\S+)/);
    $/=">";
    chomp;
    tr/atcg/ATCG/;
    s/^.+?\n//;
    s/\s//g;
    my $a = length $_;
    print "$id\t$a\n";
}
$/="\n";
close A;

结果

>111    200
>2222   40

或者是这样:

#!/usr/bin/perl -w
use strict;
use Data::Dumper;

my $file=shift;
open IN,$file or die;

$/='>';<IN>;$/="\n";
while(<IN>){
    my $id=$1 if(/^(\S+)/);
    $/='>';
    my $seq=<IN>;
    chomp($seq);
    $seq=~s/\s+//g;
    $seq=~tr/atcg/ATCG/;
    my $length=length($seq);
    $/="\n";
    print "$length\n";
}
close IN;
上一篇 下一篇

猜你喜欢

热点阅读