基因组暑期培训分析方法

「生信」物种演化树构建——Muscle→Pal2nal→Gblo

2019-04-08  本文已影响238人  bioinfo_boy

目录

  • 前言
  • muscle序列比对
  • pal2nal蛋白比对文件转换
  • Gblocks筛选
  • RAxML建树
  • 最后

前言

muscle序列比对

这里用OrthoMCL直系同源基因的聚类结果进行序列比对

提取蛋白之序列

$perl 01.extract_ortholog.pep.pl  OrthoMCL_out allpepfiles outputdir
#!/usr/bin/perl -w
use strict;
use Bio::SeqIO;
#用于提取orthomcl结果中的一比一直系同源蛋白

my $orthomclfile=shift;
open (I,"<$orthomclfile");
my $allpepfiles=shift;       #把所有蛋白序列cat到了一个文件里,scr
my $outdir=shift;
system("mkdir $outdir");

my %pep;
my $fa=Bio::SeqIO->new(-file=>$allpepfiles,-format=>'fasta');
while (my $seq_obj=$fa->next_seq){
    my $pep_name=$seq_obj->display_name;
    my $seq=$seq_obj->seq;
    $pep{$pep_name}=$seq;
}
print "pep.fa has done\n";


my %family;
while (<I>){
    chomp;
    my $line=$_;
    my @inf=split/\s+/,$line;
#    my $genefamily=$inf[0];
    my $genefamily=shift(@inf);
    $genefamily=~s/://;
    foreach my $gene (@inf){
        $gene=~/^(\w+)\|(.+)/;
        my $species=$1;
        my $id=$gene;
        $family{species}{$genefamily}{$species}++;
        $family{id}{$genefamily}{$species}=$gene;#########这个id与compliantfasta文件夹中的文件里的id一致 ##change,scr

    }
}
close I;

my @familyes=keys %{$family{species}};
foreach my $genefamily (@familyes){
    my @species=sort keys %{$family{species}{$genefamily}};
    my $count=0;
    foreach my $species (@species){
        $count+=$family{species}{$genefamily}{$species};
    }
    my $mark=scalar(@species);
    if ($count==$mark && $mark==9){###$mark==9这个9的设定是因为物种有9个,但如果物种总数变化了,这个数值也要变动 ##change,scr

        my $output="$outdir/$genefamily";
        open (O,">$output");
        foreach my $species (@species){
            my $id=$family{id}{$genefamily}{$species};
            my $ortholog=$pep{$id};
            print O ">$id\n$ortholog\n";
        }
        close O;
    }
}

批量生成命令行

需要对每个1:1直系同源基因家族进行序列比对

$perl 02.create_sh.muscle.pl 1:1家族提取目录 outputtxt
#!/user/bin/perl -w
use strict;
my $homolog_dir=shift;
my @files=<$homolog_dir/*>;
my $output_sh=shift;
open (O,">$output_sh");

my $outdir=shift;
system("mkdir $outdir");
my $muscle_path="muscle3.8.31_i86linux64";
foreach my $file (@files){
    $file=~/all_(\d+)/;
    my $sign=$1;
    my $outfile="$outdir/muscle$1";
    print O "$muscle_path -in $file -out $outfile\n";
}
close O;

运行muscal

上一步生成的sh脚本

muscle3.8.31_i86linux64 -in /home/Dai_XG/Analysis/Pdel_genome/05.evolution_analysis/02.phlogeny/01.muscle/out_01_pep/outdir_2/all_11457 -out /home/Dai_XG/Analysis/Pdel_genome/05.evolution_analysis/02.phlogeny/01.muscle/out_03_2/muscle11457

pal2nal蛋白比对文件转换

目的是根据蛋白比对结果, 获得对应比对情况的核酸比对结果, 相比于核酸比对, 差异会小很多

cds 提取

这里需要讲一下cds和mrna的区别. cds 前三个碱基对应蛋白质第一个氨基酸, 后三个碱基对应蛋白质最后一个氨基酸, 即 cds序列与蛋白质序列是一一对应的; mrna的起始大多数包含5'UTR, 末尾会带有poly-A的尾巴, 即是在外显子[0]和外显子[-1]上跟cds序列有区别

$perl 01.extract_ortholog.cds.pl OrthoMCL_out allcdsfiles outputdir

以下脚本对应的是已提取过cds的基因组, 未提取的提取过程单列章节记录

#!/usr/bin/perl.old -w
use strict;
use Bio::SeqIO;
#用于提取orthomcl结果中的一比一直系同源蛋白的cds

my $orthomclfile=shift;
open (I,"<$orthomclfile");
my $allnucledir=shift;
my @nuclefiles=<$allnucledir/*>;
my $outdir=shift;
system("mkdir $outdir");
my $genefamilylength_check=shift;
open (F,">$genefamilylength_check");

my %nucle;
foreach my $nuclefile (@nuclefiles){
    my $fa=Bio::SeqIO->new(-file=>$nuclefile,-format=>'fasta');
    while (my $seq_obj=$fa->next_seq){
    my $nucle_name=$seq_obj->display_name;
    my $seq=$seq_obj->seq;
    $nucle{$nucle_name}=$seq;
    }
    print "$nuclefile\n";
}

my %family;
while (<I>){
    chomp;
    my $line=$_;
    my @inf=split/\s+/,$line;
    my $genefamily=shift(@inf);
    $genefamily=~s/://;
    foreach my $gene (@inf){
    $gene=~/^(\w+)\|(.+)/;
    my $species=$1;
    my $id=$2;
    $family{species}{$genefamily}{$species}++;
    $family{id}{$genefamily}{$species}=$gene;#########这个id与compliantfasta文件夹中的文件里的id一致 ##change,scr

    }
}
close I;

my %familylength;
my @familyes=keys %{$family{species}};
foreach my $genefamily (@familyes){
    my @species=sort keys %{$family{species}{$genefamily}};
    my $count=0;
    foreach my $species (@species){
    $count+=$family{species}{$genefamily}{$species};
    }
    my $mark=scalar(@species);     ############必须9个物种同时存在
    if ($count==$mark && $mark==9){ ##########$mark==9,9这个数值可以根据总物种数的变动而变动  ##change,scr
    my $output="$outdir/$genefamily";
    open (O,">$output");
    foreach my $species (@species){
        my $id=$family{id}{$genefamily}{$species};
        my $ortholog=$nucle{$id};
        my $length=length($ortholog);
        if ($length<150){
        $familylength{$genefamily}++;
        }
        print O ">$id\n$ortholog\n";   ##change,scr
    }
    close O;
    }
}

my @familylength=keys %familylength;
foreach my $familylength (@familylength){
    print F "$familylength\n";
}
close F;

cds结果文件排序

按照muscle结果文件中各物种的先后顺序对cds文件中序列进行排序

$perl 01.sort_cds_muscle.pl muscleoutdir cdsoutdir outputdir
#!/usr/bin/perl -w
use strict;
#用于对muscle的结果与cds文件中的fasta进行排序
use Bio::SeqIO;
my $muscledir=shift;
my @pepfiles=<$muscledir/*>;
my $cdsdir=shift;
my $outdir=shift;
system ("mkdir $outdir");

foreach my $pepfile (@pepfiles){
    $pepfile=~/muscle(\d+)/;
    my $sign=$1;
    my $cdsfile="$cdsdir/all_$sign";

    my %pep;
    my $count=0;
    my $fa_P=Bio::SeqIO->new(-file=>$pepfile,-format=>'fasta');
    while (my $seq_obj=$fa_P->next_seq){
    my $pep_id=$seq_obj->display_name;
    $pep_id=~/^(\w+)\|(.+)/;
    my $species=$1;
    my $pepid=$2;
    $count++;
    $pep{$count}=$species;
    }

    my %nucle;
    my $fa_c=Bio::SeqIO->new(-file=>$cdsfile,-format=>'fasta');
    while (my $seq_obj=$fa_c->next_seq){
    my $nuc_id=$seq_obj->display_name;
    my $seq=$seq_obj->seq;
    $nuc_id=~/^(\w+)\|(.+)/;
    my $species=$1;
    my $nucid=$2;
    $nucle{$species}{$nuc_id}=$seq;
    }

    my $output="$outdir/paltoaln$sign";
    open(O,">$output");
    my @order=sort {$a<=>$b} keys %pep;
    foreach my $order (@order){
    my $species=$pep{$order};
    my @id=keys %{$nucle{$species}};           ##change,scr  当老子弱智?
    foreach my $id (@id){
        print O ">$id\n$nucle{$species}{$id}\n";
    }
    }
    close O;
}

批量生成命令行

逐个对muscle结果转换, 没问题吧

-脚本使用

$perl 02.create_pal2aln.pl pepoutdir cdsoutdir outputdir outputtxt
#!/usr/bin/perl -w
use strict;
my $pepdir=shift;
my @musclefiles=<$pepdir/*>;
my $cdsdir=shift;
my $outdir=shift;
system ("mkdir $outdir");
my $output_sh=shift;
open(O,">$output_sh");

my $pal2alnpath="/home/Dai_XG/software/pal2nal.v14/pal2nal.pl";
foreach my $pepfile (@musclefiles){
    $pepfile=~/muscle(\d+)/;
    my $sign=$1;
    print O "perl $pal2alnpath $pepfile $cdsdir/paltoaln$sign -output fasta >$outdir/pal2aln_$sign\n";
}
close O;

运行pal2aln

上一步生成的sh脚本

perl /home/Dai_XG/software/pal2nal.v14/pal2nal.pl /home/Dai_XG/Analysis/Pdel_genome/05.evolution_analysis/02.phlogeny/01.muscle/out_03_2/muscle10776 /home/Dai_XG/Analysis/Pdel_genome/05.evolution_analysis/02.phlogeny/02.pal2aln/out_01/outdir_2/paltoaln10776 -output fasta >../out_03/oudir_2/pal2aln_10776

Gblocks筛选

Gblocks(在线说明文档)用于从多序列比对结果中提取保守位点,以利于下一步的进化分析

批量生成命令行

对pal2aln的结果逐一处理

$perl 01.create_sh.gblock.pl pal2alnoutdir logdir outputtxt
#!/usr/bin/perl -w
use strict;
use File::Basename;
my $muscledir=shift;
my @musclefiles=<$muscledir/*>;
my $logdir=shift;
system("mkdir $logdir");
my $output=shift;
open (O,">$output");

my $gblockpath="Gblocks";
my $i=0;
foreach my $file (@musclefiles){
    my $basename=basename $file;
    if ($basename=~/pal2aln_\d+$/){
    my $logfile="$logdir/$basename.log";
#   print O "$gblockpath $file -b1=11 -b2=9 -b3=6 -b4=10 -b5=h -t=c -e=.gb >$logfile\n";
#   print O "$gblockpath $file -b5=h -t=c -e=.gb2 >$logfile\n";
    print O "$gblockpath $file -t=c -e=.gb1 >$logfile\n";

    }
}
close O;

运行Gblocks

上一步生成的sh脚本

Gblocks /home/Dai_XG/Analysis/Pdel_genome/05.evolution_analysis/02.phlogeny/02.pal2aln/out_03/oudir_2/pal2aln_10776 -t=c -e=.gb1 >/home/Dai_XG/Analysis/Pdel_genome/05.evolution_analysis/02.phlogeny/03.gblock/out_02/Gblocks_pal2aln_2.log/pal2aln_10776.log

RAxML建树

RAxML (Random Axelerated Maximum Likelikhood) 能使用多线程或并行化使用最大似然法构建进化树
安装和使用说明可以参考:
陈连福老师的博客
用RAxML构建极大似然进化树

合并cds序列

建树用的是核酸序列, 将以上步骤得到的1:1直系同源基因合并为一个supergene

$perl 01.cat_cds.pl pal2naloutdir outputdir
#!/usr/bin/perl -w
use strict;
use Bio::SeqIO;
my $trimmeddir=shift;##trimmed后的蛋白文件夹
my @pepfiles=<$trimmeddir/*.gb1>;
my $pepdir=shift;
system("mkdir $pepdir");

my %pep;
my $count=0;
foreach my $file (sort @pepfiles){
    my $fa=Bio::SeqIO->new(-file=>$file,-format=>'fasta');
    while (my $seq_obj=$fa->next_seq){
        my $pepid=$seq_obj->display_name;
        my $pepseq=$seq_obj->seq;
    $pepid=~/(\w+)\|(.+)/;
    my $species=$1;
    $pep{$species}.=$pepseq;
    }
    $count++;
}
print "total files:$count\n";

my @species=sort keys %pep;
foreach my $species (@species){
    my $output="$pepdir/$species.fa";
    open (O,">$output");
    print O ">$species\n$pep{$species}\n";
    close O;
}

生成Phylip文件

RAxML的输入文件为Phylip格式, 还好它配了fasta转Phylip的工具

#逐个转换并合并
$sh /home/Dai_XG/software/standard-RAxML/usefulScripts/convertFasta2Phylip.sh outdir_2/Vvin.fa >phy/Vvin
$sh /home/Dai_XG/software/standard-RAxML/usefulScripts/convertFasta2Phylip.sh outdir_2/Swil.fa >phy/Swil
.
.
.

运行RAxML

-脚本使用

#关于怎么获得多核运行版本的信息, 参考之前两个链接
$raxmlHPC-PTHREADS-AVX -T 30 -f a -x 12345 -p 12345 -# 100 -m GTRGAMMA -s 01.pal2aln_gblock.cds.phy -n gbolck.tre

最后

上一篇 下一篇

猜你喜欢

热点阅读