对目标基因进行注释 - 基于同源注释和从头注释

2024-03-24 本文已影响0人深山夕照深秋雨OvO

需求是想在一大堆基因组中鉴定目标基因比如叫 EPAS1

最简单的肯定是基于已有的结构注释的结果，进行功能注释，看能不能注释到EPAS1
参考https://www.jianshu.com/p/e4c1cc3ab347

但是实际情况中会遇到有的结构注释找不到EPAS1，也就是结构注释有问题
这个时候就需要另起炉灶了

软件安装 - GeMoMa / AUGUSTUS / EVidenceModeler
略
思路是做同源注释 + 从头注释并合并二者的结果
用同源注释抓取一些基因组片段，对被抓取下来的基因组片段进行从头注释以作补充
因为基因组数量太多了，就不考虑转录组注释
这个方法主要参考文章
首先在NCBI或者Uniprot上下载近缘已知物种的EPAS1的氨基酸序列 - 得到EPAS1.pep.fa

#2.1. GeMoMa
java -jar GeMoMa-1.9.jar CLI GeMoMaPipeline threads=14 s=pre-extracted GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO o=true t=input.fa c=EPAS1.pep.fa

#然后会输出以下两个文件
#final_annotation.gff
#unfiltered_predictions_from_species_0.gff

#结果比较遗憾，我基于同源注释的注释结果没有一个可以通过GeMoMa的质量过滤
#所以后续使用unfiltered_predictions_from_species_0.gff

grep 'mRNA' unfiltered_predictions_from_species_0.gff | awk '{print $1"\t"$4-1"\t"$5}' | grep -v '#' | sort -u > GeMoMa.bed
bedtools getfasta -fi genome.fa -bed GeMoMa.bed -fo GeMoMa.fna


#2.2. AUGUSTUS
augustus --species=chicken GeMoMa.fna > GeMoMa.to.AUGUST.tmp

python 1.adjust.august.gff.py GeMoMa.to.AUGUST.tmp GeMoMa.to.AUGUST.gff
#因为是抓取了一部分基因组区域下来，所以和基因组原坐标也不同，需要用脚本进行坐标转换

perl ConvertFormat_augustus.pl GeMoMa.to.AUGUST.gff.tmp GeMoMa.to.AUGUST.gff.clean

3.EVM合并

#创建权重文件 weights，我这个方法做的同源注释其实要写成从头注释
#cat  weights.txt

#ABINITIO_PREDICTION    GeMoMa  5
#ABINITIO_PREDICTION    AUGUSTUS    5

cat GeMoMa.to.AUGUST.gff.clean unfiltered_predictions_from_species_0.gff > EVM.input.gff

/beegfs/suhan/zhuoran/software/EVidenceModeler-v2.1.0/EVidenceModeler --sample_id test --genome GeMoMa.fna --gene_predictions EVM.input.gff --segmentSize 100000 --overlapSize 10000 --weights weights.txt
#输出文件test.EVM.gff3就是注释结果




#这样的做法是把坐标对齐到了原来的基因组上，缺陷在于EVidenceModeler-v2.1.0没办法只对目标基因组区域运行
#所以EVidenceModeler仍然会读取全基因组，这在我的目的中明显是浪费时间和资源
#为了更快的得到结果，略微变动一下，见4.

4.这里我们就不获取原基因组的坐标了，直接将目标fasta文件抓出来
抓出来以后，再在此基础上运行再次运行同源注释和从头注释
这样最后EVidenceModeler会运行的非常快

grep 'mRNA' unfiltered_predictions_from_species_0.gff | awk '{print $1"\t"$4-1"\t"$5}' | grep -v '#' | sort -u > GeMoMa.bed
bedtools getfasta -fi genome.fa -bed GeMoMa.bed -fo GeMoMa.fna
java -jar GeMoMa-1.9.jar CLI GeMoMaPipeline threads=14 s=pre-extracted GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO o=true t=GeMoMa.fna c=EPAS1.pep.fa

augustus --species=chicken GeMoMa.fna > GeMoMa.to.AUGUST.tmp
perl ConvertFormat_augustus.pl GeMoMa.to.AUGUST.tmp GeMoMa.to.AUGUST.gff
cat 
cat GeMoMa.to.AUGUST.gff unfiltered_predictions_from_species_0_2.gff > EVM.input.gff

/beegfs/suhan/zhuoran/software/EVidenceModeler-v2.1.0/EVidenceModeler --sample_id test --genome GeMoMa.fna --gene_predictions EVM.input.gff --segmentSize 100000 --overlapSize 10000 --weights weights.txt
#输出文件test.EVM.gff3就是注释结果


python 1.adjust.august.gff.py   test.EVM.gff3   test.EVM.gff3.tmp
#把坐标转换为原基因组的坐标

#pip install gff3tool
gff3_sort -g test.EVM.gff3.tmp -og test.EVM.gff3.tmp.sort
#对gff文件进行排序

#去除一些重复的注释结果
perl 2.uniq.gff.pl test.EVM.gff3.tmp.sort > EVM.gff.clean
#EVM.gff.clean这个就是最后的注释结果

用到的一些脚本

cat 1.adjust.august.gff.py

import sys

def adjust_coordinates(augustus_gff, adjusted_gff):
    with open(augustus_gff, "r") as infile, open(adjusted_gff, "w") as outfile:
        for line in infile:
            if line.startswith("#"):
                outfile.write(line)
                continue
            
            parts = line.strip().split("\t")
            if len(parts) < 9:
                print(f"Skipping incomplete line: {line.strip()}")
                continue  # 忽略不完整的行
            
            # 无论seqid的格式如何，我们只取冒号前的部分（如果有的话）
            seqid_parts = parts[0].split(":")
            chromosome_number = seqid_parts[0]
            
            start_position = None
            if len(seqid_parts) > 1:
                try:
                    # 尝试解析起始位置，格式假定为"染色体号:起始bp-终止bp"
                    positions = seqid_parts[1].split("-")
                    start_position = int(positions[0])
                except ValueError:
                    print(f"Warning: Unable to parse start position from seqid '{parts[0]}'. Skipping this entry.")
                    continue  # 如果无法解析起始位置，忽略这行

            if start_position is not None:
                try:
                    # 调整起始和结束坐标
                    parts[3] = str(int(parts[3]) + start_position)
                    parts[4] = str(int(parts[4]) + start_position)
                except ValueError:
                    print(f"Skipping line due to error in coordinate conversion: {line.strip()}")
                    continue  # 忽略无法处理的行
            
            # 更新seqid字段为染色体号
            parts[0] = chromosome_number
            outfile.write("\t".join(parts) + "\n")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py augustus_output.gff3 adjusted_augustus_output.gff3")
        sys.exit(1)
    
    augustus_gff = sys.argv[1]
    adjusted_gff = sys.argv[2]
    
    adjust_coordinates(augustus_gff, adjusted_gff)

cat ConvertFormat_augustus.pl

#! /usr/bin/env perl
use strict;
use warnings;

my ($input,$output)=@ARGV;
die "Usage: $0 <input gff> <output gff>" if(@ARGV<2);

my %gff;

my %exon;
my $geneid;
my $start;
open(I,"< $input") or die "Cannot open $input\n";
while (<I>) {
    chomp;
    next if(/^#/);
    next if /^$/;
    my @a=split(/\t/);

    if($a[3]>$a[4]){
        my $tmp=$a[4];
        $a[4]=$a[3];
        $a[3]=$tmp;
    }
    $a[1]=lc($a[1]);
    if ($a[2] eq 'gene'){
        $geneid=$a[8];
        $start=$a[3];
        $a[8]="ID=$a[8]";
        $gff{$a[0]}{$start}{$geneid}{gff} .= join("\t",@a)."\n";
    }elsif($a[2] eq 'transcript'){
        $a[8]="ID=$a[8];Parent=$geneid";
        $a[2]="mRNA";
        $gff{$a[0]}{$start}{$geneid}{gff} .= join("\t",@a)."\n";
    }elsif($a[2] eq 'exon'){
        $a[8]=~/transcript_id\s+\"([^\"]+)\";\s+gene_id\s+\"([^\"]+)\";/ or die "$_\n";
        $exon{$1}++;
        $a[8]="ID=$1.exon$exon{$1};Parent=$1";
        $gff{$a[0]}{$start}{$geneid}{gff} .= join("\t",@a)."\n";
    }elsif($a[2] eq 'CDS'){
        $a[8]=~/transcript_id\s+\"([^\"]+)\";\s+gene_id\s+\"([^\"]+)\";/ or die "$_\n";
        $a[8]="ID=cds.$1;Parent=$1";
        $gff{$a[0]}{$start}{$geneid}{gff} .= join("\t",@a)."\n";
        $gff{$a[0]}{$start}{$geneid}{len} += abs($a[4]-$a[3])+1;
    }
}
close I;

open(O,"> $output") or die "Cannot create $output\n";
foreach my $chr(sort keys %gff){
    for my $pos (sort{$a<=>$b} keys %{$gff{$chr}}){
        for my $gene (sort keys %{$gff{$chr}{$pos}}){
            next if $gff{$chr}{$pos}{$gene}{len} < 150;
            print O "$gff{$chr}{$pos}{$gene}{gff}";
        }
    }
}
close O;

cat 2.uniq.gff.pl

#!/usr/bin/perl
use strict;
use warnings;

# 检查命令行参数
my $usage = "Usage: $0 <input.gff>\n";
my $infile = shift or die $usage;

# 打开GFF文件进行读取
open(my $fh, '<', $infile) or die "Cannot open file $infile: $!";

# 使用哈希表来跟踪前八列的唯一组合
my %seen;

while (my $line = <$fh>) {
    chomp $line;
    # 跳过注释行
    next if $line =~ /^\s*#/;

    # 分割行以获取各列
    my @cols = split(/\t/, $line);

    # 检查是否至少有八列
    if (@cols < 8) {
        warn "Ignoring line with less than 8 columns: $line\n";
        next;
    }

    # 构造一个键，由前八列通过特定分隔符连接而成，用于判断是否已经见过这个组合
    my $key = join("::", @cols[0..7]);

    # 如果这个键还没有见过，就打印这一行，并标记为已见
    unless ($seen{$key}++) {
        print "$line\n";
    }
}

close($fh);

对目标基因进行注释 - 基于同源注释和从头注释

用到的一些脚本

猜你喜欢

热点阅读