生物信息学习

fastq2fasta 转换小脚本

2017-12-08  本文已影响27人  正踪大米饭儿

一个小工具，主要用于小 RNA 数据处理：将 fastq reads 转换为 fasta，同时合并重复序列（序列 ID 中记录该序列出现的次数），并输出 total / unique reads 的统计文件。

#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Basename;

# Core module used to create the output directory without going through a
# shell, so unusual characters in the -out path cannot be interpreted as
# shell syntax.
use File::Path qw(make_path);

my ( $fastq, $out, $pre, $type, $help );

# An unrecognised or malformed option shows the usage text and stops instead
# of being silently ignored.
GetOptions(
    "fq=s"   => \$fastq,
    "out:s"  => \$out,
    "pre:s"  => \$pre,
    "type:s" => \$type,
    "help|h!"=> \$help
) or usage();

# usage() prints the help text and exits, so nothing below runs afterwards.
usage() if !defined $fastq || $help;

# The ":s" option specs accept an empty value, so the defaults apply both
# when an option is absent and when it is given without a value.
$out  = "Result/out" unless defined $out  && length $out;
$pre  = "seq"        unless defined $pre  && length $pre;
$type = "1"          unless defined $type && length $type;

# Reject a bad -type up front: before the input is read and before any
# output file is created or truncated.
if ( $type !~ /\A[12]\z/ ) {
    print STDERR "ERROR! -type option must be 1 or 2. ";
    print STDERR "1 for [${pre}_00002_x345] 2 for [${pre}00002\t345]\n";
    exit 1;
}

my $outdir = dirname( $out );
make_path( $outdir );

## convert fastq to fasta and remove redundancy reads
## seq_id: seq_0000001_x345

my ( $seq, $total, $unique ) = reads_counter( $fastq );

# Zero-pad the running number to as many digits as the unique-read count has.
my $id_format = "%0" . length( $unique ) . "d";

# Most abundant reads first, ties broken by sequence, so the same input
# always yields the same ids (plain hash order changes from run to run).
my @ordered_reads =
    sort { $seq->{$b} <=> $seq->{$a} || $a cmp $b } keys %$seq;

my $fasta_file = "$out.convert.fa";
open my $fa_fh, '>', $fasta_file
    or die "Cannot write '$fasta_file': $!\n";

my $serial = 0;
foreach my $read ( @ordered_reads ) {
    $serial++;
    my $cnt    = $seq->{$read};
    my $number = sprintf $id_format, $serial;

    # type 1: <pre>_<number>_x<count>    type 2: <pre><number><TAB><count>
    my $id = $type == 1
        ? $pre . "_" . $number . "_x$cnt"
        : $pre . $number . "\t$cnt";

    print {$fa_fh} ">$id\n$read\n";
}
# Buffered write errors (e.g. a full disk) only surface at close.
close $fa_fh or die "Cannot finish writing '$fasta_file': $!\n";

my $stat_file = "$out.Reads.stat";
open my $stat_fh, '>', $stat_file
    or die "Cannot write '$stat_file': $!\n";
print {$stat_fh} "total_reads\tunique_reads\n";
print {$stat_fh} "$total\t$unique\n";
close $stat_fh or die "Cannot finish writing '$stat_file': $!\n";

## =============================== SUB MODULE ============================ ##

# reads_counter($infile)
#
# Tally the sequence lines of a FASTQ file.
#
# Parameters:
#   $infile - path to the FASTQ file; a name ending in ".gz" is read through
#             the PerlIO::gzip layer, anything else as plain text.
#
# Returns a three-element list:
#   1. hash reference mapping each distinct sequence line to its count
#   2. total number of sequence lines read
#   3. number of distinct sequences
#
# Dies if the input cannot be opened (or, for ".gz" input, if PerlIO::gzip
# cannot be loaded), rather than quietly reporting zero reads.
sub reads_counter{

    my ( $infile ) = @_;

    my $fq_fh;
    if ( $infile =~ /\.gz$/ ) {
        # Loaded on demand so that plain-text input works on systems
        # where the non-core PerlIO::gzip module is not installed.
        require PerlIO::gzip;
        open $fq_fh, "<:gzip", $infile
            or die "Cannot open gzip fastq '$infile': $!\n";
    } else {
        open $fq_fh, "<", $infile
            or die "Cannot open fastq '$infile': $!\n";
    }

    my $line_no     = 0;
    my $total_reads = 0;
    my %count_of;

    while ( my $record = <$fq_fh> ){
        # Only the 2nd line of every group of four lines is counted; the
        # other three lines of each group are skipped.
        next unless ++$line_no % 4 == 2;

        chomp $record;
        $total_reads++;
        $count_of{$record}++;
    }
    close $fq_fh;

    my $unique_reads = scalar ( keys %count_of );

    return ( \%count_of, $total_reads, $unique_reads );
}


# usage()
#
# Print the command-line help to STDERR and terminate the program with exit
# status 1. Takes no arguments and never returns to the caller.
#
# The help text describes -out as an output prefix because the script writes
# "<out>.convert.fa" and "<out>.Reads.stat"; -out is not a single file name.
sub usage{

    my $name = basename($0);

    print STDERR <<USAGE;
===============================================================================
Name:
    $name

Usage:
    perl $name [options]

Options:
    -fq      input fastq [.fq|.fastq|.fq.gz|.fastq.gz]
    -out     output prefix [default: Result/out]
             writes <prefix>.convert.fa and <prefix>.Reads.stat
    -pre     prefix of sequence id. [default: seq]
    -type    type of output id format. [1 or 2, default: 1]
             1: seq_000002_x234  2: seq000002<TAB>234
    --help   print this help information.
    -h

e.g:
    perl $name -fq tomA.fq.gz -out Result/out -pre seq

===============================================================================
USAGE
    exit 1;
}

__END__

Author :    Liupeng@genebang.com
Date   :    2017-06-04

上一篇下一篇

猜你喜欢

热点阅读