基因家族 基于 blast比对 筛选串联重复基因
2021-03-22 本文已影响0人
夹竹桃的下午
#定义%identity>70并且aligment length>70 认为是串联重复基因
makeblastdb -in bhlh.cds -dbtype nucl -title bhlh
blastn -num_threads 24 -db bhlh.cds -query bhlh.cds -evalue 1e-20 -outfmt 7 >blastn.out
samtools faidx bhlh.cds
vi KAKS_SHAIXUAN.pl
use Getopt::Long;
my %opts;
use Data::Dumper;
GetOptions (\%opts,"in1=s","in2=s","out=s","h");
if (! defined($opts{in1}) ||! defined($opts{in2})||! defined($opts{out}) || defined($opts{h})){
&USAGE;
}
open (IN1,"$opts{in1}") || die "open $opts{in} failed\n";
open (IN2,"$opts{in2}") || die "open $opts{ina} failed\n";
open (OUT,">$opts{out}") || die "open $opts{out} failed\n";
my %cds_length;
while(<IN1>){
chomp;
my @line = split("\t",$_);
$cds_length{$line[0]}= $line[1];
#print "$cds_length{$line[0]}\n";
}
while( <IN2>){
chomp($_);
my @line1 = split ("\t",$_);
#print @line1;
#print "\n";
my $max_length = $cds_length{$line1[0]} > $cds_length{$line1[1]} ? $cds_length{$line1[0]}:$cds_length{$line1[1]};
if(($line1[0] ne $line1[1]) && ($line1[2] > 70 )&& ($line1[3] > 0.70*$max_length)){
print OUT $_."\t$max_length\n";
}
#print $cds_length1{$line1[0]};
}
close(IN1);
close(IN2);
close(OUT);
sub USAGE {
print "usage: perl $0 -in1 cds_length -in2 result.txt -out shaixuan_result.txt";
exit;
}
perl KAKS_SHAIXUAN.pl -in1 bhlh.cds.fai -in2 blastn.out -out shaixuan.result
vi gettangm.pl
#!/usr/bin/perl -w
use strict;
my %hash;
open IN, "shaixuan.result";
open OU, ">test.txt1";
while(<IN>){
chomp;
my @line=split/\s+/,$_;
my $a=shift@line;
my $b=shift@line;
my @n=($a,$b);
my @h = sort(@n);
my $l="@h";
$l=~s/\s/\t/g;
next if exists $hash{$l};
$hash{$l}= 1;
print OU "$l\n";
}
perl gettangm.pl
得到test.txt1 即是串联重复ID 后面就可以用MCSCAN画图
在 .anchors.simple里面对串联基因上色即可