生信

基因家族 基于 blast比对 筛选串联重复基因

2021-03-22  本文已影响0人  夹竹桃的下午
#定义%identity>70并且aligment length>70 认为是串联重复基因
makeblastdb -in bhlh.cds -dbtype nucl -title bhlh
blastn -num_threads 24 -db bhlh.cds -query bhlh.cds -evalue 1e-20 -outfmt 7 >blastn.out
samtools faidx bhlh.cds

vi  KAKS_SHAIXUAN.pl 
use Getopt::Long;
my %opts;
use Data::Dumper;
GetOptions (\%opts,"in1=s","in2=s","out=s","h"); 
if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){
    &USAGE;
}
open (IN1,"$opts{in1}") || die "open $opts{in} failed\n";
open (IN2,"$opts{in2}") || die "open $opts{ina} failed\n";
open (OUT,">$opts{out}") || die "open $opts{out} failed\n";
 my %cds_length;
while(<IN1>){
    chomp;
    my @line = split("\t",$_);
    $cds_length{$line[0]}= $line[1];
    #print "$cds_length{$line[0]}\n";
} 

while( <IN2>){
    
        chomp($_);
        my @line1 = split ("\t",$_);
        #print @line1;
        #print "\n";
        my $max_length = $cds_length{$line1[0]} > $cds_length{$line1[1]} ? $cds_length{$line1[0]}:$cds_length{$line1[1]};
        if(($line1[0] ne $line1[1]) && ($line1[2] > 70 )&& ($line1[3] > 0.70*$max_length)){
            print OUT $_."\t$max_length\n";
        
        }
        #print $cds_length1{$line1[0]};
    
}


close(IN1);
close(IN2);
close(OUT);
sub USAGE {
       print "usage: perl $0 -in1 cds_length   -in2 result.txt -out shaixuan_result.txt";
    exit;
}

perl  KAKS_SHAIXUAN.pl -in1 bhlh.cds.fai -in2 blastn.out -out shaixuan.result

vi gettangm.pl
#!/usr/bin/perl -w
use strict;
my %hash;
open IN, "shaixuan.result";
open OU, ">test.txt1";
while(<IN>){
        chomp;
        my @line=split/\s+/,$_;
        my $a=shift@line;
        my $b=shift@line;
        my @n=($a,$b);
        my @h = sort(@n);
        my $l="@h";
        $l=~s/\s/\t/g;
        next if exists $hash{$l};
        $hash{$l}= 1;
        print OU "$l\n";

}
perl   gettangm.pl   
得到test.txt1 即是串联重复ID 后面就可以用MCSCAN画图 
在 .anchors.simple里面对串联基因上色即可
上一篇下一篇

猜你喜欢

热点阅读