扩增子分析:基于qiime2平台处理raw data的简单perl脚本

2020-12-29  本文已影响0人  生信学习者2

为什么我要干这样的傻事,为了让以后自己不干这样的傻事,记录一下这无语的代码,我也是醉了。再也不想用perl了,我要学习基于python的snakemake和nextflow语言组织我的流程。更多知识分享请到 https://zouhua.top/

输入文件

find /data/user/zouhua/pipeline/Amplicon/RawData/ -name "*gz" | grep 'R1' | sort | perl -e 'print"sampleid\tforward-absolute-filepath\treverse-absolute-filepath\n"; while(<>){chomp; $name=(split("\/", $_))[-1]; $name1=$name; $name2=$_; $name1=~s/_R1.fq.gz//g; $name2=~s/R1/R2/; print "$name1\t$_\t$name2\n";}' > pe-33-manifest-trimmed.tsv

perl脚本

#!/usr/bin/perl
# Generate a set of QIIME2 shell scripts for a paired-end amplicon pipeline:
# parse command-line options, lay out the result/ directory tree, and emit one
# bash script per analysis step plus a master script ($out) chaining them all.

use warnings;
use strict;
use Getopt::Long;
use Cwd 'abs_path';


my ($file_fq, $file_meta, $primer_f, $primer_r, $threads, $mode, $classifier, $depth, $column, $out, $help, $version);
GetOptions(
    "f1|file_fq:s"    =>  \$file_fq,
    "f2|file_meta:s"  =>  \$file_meta,
    "pf|primer_f:s"   =>  \$primer_f,
    "pr|primer_r:s"   =>  \$primer_r,
    "t|threads:s"     =>  \$threads,
    "m|mode:s"        =>  \$mode,
    "c|classifier:s"  =>  \$classifier,
    "d|depth:s"       =>  \$depth,
    "co|column:s"     =>  \$column,
    "o|out:s"   =>  \$out,
    "h|help"    =>  \$help,      # boolean flag: -h takes no value (was ":s" by mistake)
    "v|version" =>  \$version
);
# Dispatch -h/-v before complaining about missing mandatory arguments;
# previously both options were parsed but never acted upon.
usage() if defined $help;
if (defined $version) { version(); exit; }
usage() if !defined $out;
my $cwd = abs_path;

# Top-level output layout: results under result/, generated per-step shell
# scripts under result/script/.
my $dir        = "$cwd/result/";
my $dir_script = "$cwd/result/script/";
for my $d ($dir, $dir_script) {
    system "mkdir -p $d" unless -d $d;
}

########## output #########################################
# Per-step bash scripts (s1..s9); the master script later runs them in order.
my ($qza_s, $primer_s, $ASV_s, $filter_s, $taxo_s,
    $diversity_s, $ANCOM_s, $LEfse_s, $picrust_s)
    = map { $dir_script . $_ } qw(
        Run.s1.import.sh
        Run.s2.primer.sh
        Run.s3.ASV.sh
        Run.s4.filter.sh
        Run.s5.taxo.sh
        Run.s6.diversity.sh
        Run.s7.ANCOM.sh
        Run.s8.LEfse.sh
        Run.s9.picrust.sh
    );

# One result directory per pipeline step, created up front.
my ($qza_d, $primer_d, $ASV_d, $filter_d, $taxo_d,
    $diversity_d, $ANCOM_d, $LEfse_d, $picrust_d)
    = map { $dir . $_ } qw(
        01.qza
        02.remove_primer
        03.ASV
        04.filter
        05.taxonomy
        06.diversity
        07.ANCOM
        08.LEfse
        09.Picrust2
    );
for my $d ($qza_d, $primer_d, $ASV_d, $filter_d, $taxo_d,
           $diversity_d, $ANCOM_d, $LEfse_d, $picrust_d) {
    system "mkdir -p $d" unless -d $d;
}

###################################
# Detect the conda environment names for qiime2 and picrust2.
# `head -n 1` guards against several matching environments, and chomp strips
# the trailing newline left by the backticks -- without it every generated
# "source activate $qiime" line carried an embedded newline, so any text after
# it on the same generated line would run as a separate shell command.
my $qiime   = `conda env list | grep "qiime"   | head -n 1 | cut -d " " -f 1`;
my $picrust = `conda env list | grep "picrust" | head -n 1 | cut -d " " -f 1`;
chomp($qiime, $picrust);

###################################
# step1: import the paired-end fastq manifest into a demultiplexed qza
# artifact and render a read-quality summary visualization.
my $qza_out = "$qza_d/paired-end-demux.qza";
my $qzv_out = "$qza_d/paired-end-demux-summary.qzv";

open(my $fh1, '>', $qza_s) or die "can't open $qza_s \n";
print {$fh1} "source activate $qiime\n";
print {$fh1} "qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path $file_fq --output-path $qza_out --input-format PairedEndFastqManifestPhred33V2\n";
print {$fh1} "qiime demux summarize --i-data $qza_out --o-visualization $qzv_out\n";
print {$fh1} "echo \"step1: successfully convert fastq into qza\\n\" \n";
close($fh1);


###################################
# step2: trim the amplicon primers off both reads with cutadapt, then
# summarize the trimmed reads.
my $primer_zout = "$primer_d/paired-end-demux-trim.qza";
my $primer_vout = "$primer_d/paired-end-demux-trim-summary.qzv";

open(my $fh2, '>', $primer_s) or die "can't open $primer_s \n";
print {$fh2} "source activate $qiime\n";
print {$fh2} "qiime cutadapt trim-paired --i-demultiplexed-sequences $qza_out --p-front-f $primer_f --p-front-r $primer_r --p-cores $threads --o-trimmed-sequences $primer_zout\n";
print {$fh2} "qiime demux summarize --i-data $primer_zout --o-visualization $primer_vout\n";
print {$fh2} "echo \"step2: successfully remove primer sequence\\n\" \n";
close($fh2);


###################################
# step3: denoise reads into ASVs with either dada2 (default) or deblur.
$mode ||= "dada2";
open(OT3, "> $ASV_s") or die "can't open $ASV_s \n";
print OT3 "source activate $qiime\n";
my $ASV_table = join("/", $ASV_d, "table.qza");
my $ASV_vtable = join("/", $ASV_d, "table.qzv");
my $ASV_seq = join("/", $ASV_d, "rep-seqs.qza");
my $ASV_stats = join("/", $ASV_d, "stats.qza");
if($mode eq "dada2"){
    my $ASV_zshell = "qiime dada2 denoise-paired --i-demultiplexed-seqs $primer_zout --p-trunc-len-f 0 --p-trunc-len-r 0 --p-n-threads $threads --o-table $ASV_table --o-representative-sequences $ASV_seq --o-denoising-stats $ASV_stats";
    my $ASV_vshell = "qiime feature-table summarize --i-table $ASV_table --o-visualization $ASV_vtable --m-sample-metadata-file $file_meta";
    print OT3 "$ASV_zshell\n$ASV_vshell\n";
}elsif($mode eq "deblur"){
    my $ASV_join = join("/", $ASV_d, "paired-end-demux-trim-join.qza");
    my $ASV_vjoin = join("/", $ASV_d, "paired-end-demux-trim-join.qzv");
    my $ASV_filter = join("/", $ASV_d, "paired-end-demux-trim-join-filtered.qza");
    # BUGFIX: this variable used to be a second `my $ASV_stats`, shadowing the
    # outer one, so deblur's --o-stats later overwrote the quality-filter
    # stats artifact. Keep the two stats outputs in distinct files.
    my $ASV_filter_stats = join("/", $ASV_d, "paired-end-demux-trim-join-filter-stats.qza");

    my $ASV_merge_shell = "qiime vsearch join-pairs --i-demultiplexed-seqs $primer_zout --o-joined-sequences $ASV_join";
    my $ASV_sum_shell = "qiime demux summarize --i-data $ASV_join --o-visualization $ASV_vjoin";
    my $ASV_quality_shell = "qiime quality-filter q-score --i-demux $ASV_join --o-filtered-sequences $ASV_filter --o-filter-stats $ASV_filter_stats";
    my $ASV_denoise_shell = "qiime deblur denoise-16S --i-demultiplexed-seqs $ASV_filter --p-trim-length 300 --o-representative-sequences $ASV_seq --o-table $ASV_table --p-sample-stats --o-stats $ASV_stats";
    my $ASV_vshell = "qiime feature-table summarize --i-table $ASV_table --o-visualization $ASV_vtable --m-sample-metadata-file $file_meta";
    # BUGFIX: "source activate" is already printed once above; the deblur
    # branch used to print a duplicate activation line into the script.
    print OT3 "$ASV_merge_shell\n$ASV_sum_shell\n$ASV_quality_shell\n$ASV_denoise_shell\n$ASV_vshell\n";
}
print OT3 "echo \"step3: successfully get ASV table in $mode \\n\" \n";
close(OT3);


###################################
# step4: filter the ASV table:
#   1. drop low-occurrence ASVs (frequency < 10)
#   2. remove mitochondria/chloroplast sequences
#   3. drop samples with total frequency < 4000
# then subset the rep seqs to the surviving features.
open(my $fh4, '>', $filter_s) or die "can't open $filter_s \n";
my $filter_low    = "$filter_d/table_filter_low_freq.qza";
my $taxo          = "$taxo_d/taxonomy.qza";
my $filter_contam = "$filter_d/table_filter_low_freq_contam.qza";
my $filter_sample = "$filter_d/final_table.qza";
my $filter_req    = "$filter_d/final_rep_seqs.qza";

my @filter_cmds = (
    "qiime feature-classifier classify-sklearn --i-classifier $classifier --i-reads $ASV_seq --o-classification $taxo --p-n-jobs $threads",
    "qiime feature-table filter-features --i-table $ASV_table --p-min-frequency 10 --p-min-samples 1 --o-filtered-table $filter_low",
    "qiime taxa filter-table --i-table $filter_low --i-taxonomy $taxo --p-exclude mitochondria,chloroplast --o-filtered-table  $filter_contam",
    "qiime feature-table filter-samples --i-table  $filter_contam --p-min-frequency 4000 --o-filtered-table  $filter_sample",
    "qiime feature-table filter-seqs --i-data $ASV_seq --i-table $filter_sample --o-filtered-data $filter_req",
);
print {$fh4} "source activate $qiime\n";
print {$fh4} join("\n", @filter_cmds), "\n";
print {$fh4} "echo \"step4: successfully Filter the unsuitable ASV \\n\" \n";
close($fh4);


##################################
# step5: re-run taxonomic classification on the filtered rep seqs and
# draw taxa barplots against the sample metadata.
open(my $fh5, '>', $taxo_s) or die "can't open $taxo_s\n";
my $taxo_final   = "$taxo_d/taxonomy-final.qza";
my $taxo_barplot = "$taxo_d/taxonomy-final-barplots.qzv";

print {$fh5} "source activate $qiime\n";
print {$fh5} "qiime feature-classifier classify-sklearn --i-classifier $classifier  --i-reads $filter_req --o-classification $taxo_final --p-n-jobs $threads\n";
print {$fh5} "qiime taxa barplot --i-table $filter_sample --i-taxonomy $taxo_final --m-metadata-file $file_meta --o-visualization $taxo_barplot\n";
print {$fh5} "echo \"step5: successfully Taxonomic annotation \\n\" \n";
close($fh5);


##################################
# step6: build the phylogenetic tree (mafft alignment -> fasttree), then
# step7: rarefaction curve and core diversity metrics. Both steps are
# written into the same step-6 shell script.
open(my $fh6, '>', $diversity_s) or die "can't open $diversity_s\n";
my $div_align  = "$diversity_d/final_rep_seqs_aligned.qza";
my $div_mask   = "$diversity_d/final_rep_seqs_masked.qza";
my $div_urtree = "$diversity_d/unrooted-tree.qza";
my $div_rtree  = "$diversity_d/rooted-tree.qza";

print {$fh6} "source activate $qiime\n";
print {$fh6} "qiime phylogeny align-to-tree-mafft-fasttree --i-sequences $filter_req --o-alignment $div_align --o-masked-alignment $div_mask --p-n-threads $threads --o-tree $div_urtree --o-rooted-tree $div_rtree\n";

# Rarefaction/diversity outputs are tagged with the chosen sampling depth.
my $div_rare    = "$diversity_d/depth-$depth-alpha-rarefaction.qzv";
my $div_metrics = "$diversity_d/depth-$depth-core-metrics";

print {$fh6} "qiime diversity alpha-rarefaction --i-table $filter_sample --i-phylogeny $div_rtree --p-max-depth $depth --m-metadata-file $file_meta --o-visualization $div_rare\n";
print {$fh6} "qiime diversity core-metrics-phylogenetic --i-phylogeny $div_rtree --i-table $filter_sample --p-sampling-depth $depth --m-metadata-file $file_meta --output-dir $div_metrics\n";
print {$fh6} "echo \"step6: successfully Constructe phylogenetic tree \\n\" \n";
close($fh6);


#################################
# step8: ANCOM (analysis of composition of microbiomes) on the filtered
# table, grouped by the metadata column $column. ANCOM log-transforms the
# counts, so a pseudocount of 1 is added first to avoid log(0).
open(OT7, "> $ANCOM_s") or die "can't open $ANCOM_s\n";
my $ancom_add = join("/", $ANCOM_d, "final_table_pseudocount.qza");
my $ancom_out = join("/", $ANCOM_d, "ancom_output");

my $ancom_add_shell = "qiime composition add-pseudocount --i-table $filter_sample --p-pseudocount 1 --o-composition-table $ancom_add";
my $ancom_out_shell = "qiime composition ancom --i-table $ancom_add --m-metadata-file $file_meta --m-metadata-column $column --output-dir $ancom_out";

print OT7 "source activate $qiime\n";
print OT7 "$ancom_add_shell\n$ancom_out_shell\n";
# BUGFIX: the echo used to repeat the step-6 message ("Constructe
# phylogenetic tree") instead of describing the ANCOM step.
print OT7 "echo \"step7: successfully run ANCOM composition analysis \\n\" \n";
close(OT7);


#################################
# step9: export a genus-level (level 6) relative-frequency table as TSV,
# suitable as input for LEfSe (LDA effect size).
open(my $fh8, '>', $LEfse_s) or die "can't open $LEfse_s\n";
my $LEfse_collapse = "$LEfse_d/collapse.table.qza";
my $LEfse_freq     = "$LEfse_d/collapse.frequency.table.qza";
my $LEfse_tsv      = "$LEfse_d/collapse.frequency.table.tsv";

my @LEfse_cmds = (
    "qiime taxa collapse --i-table $filter_sample --o-collapsed-table $LEfse_collapse --p-level 6 --i-taxonomy $taxo_final",
    "qiime feature-table relative-frequency --i-table $LEfse_collapse --o-relative-frequency-table $LEfse_freq --output-dir $LEfse_d/output",
    "qiime tools export --input-path $LEfse_freq --output-path  $LEfse_d/output",
    "biom convert -i $LEfse_d/output/feature-table.biom -o $LEfse_tsv --header-key \"taxonomy\" --to-tsv",
);
print {$fh8} "source activate $qiime\n";
print {$fh8} join("\n", @LEfse_cmds), "\n";
print {$fh8} "echo \"step8: successfully LDA Effect Size \\n\" \n";
close($fh8);


#################################
# step10: functional prediction with picrust2. The rep seqs are exported
# inside the qiime env, then picrust2_pipeline.py runs in the picrust env.
open(my $fh9, '>', $picrust_s) or die "can't open $picrust_s\n";
my $picrust_out = "$picrust_d/picrust2_out_pipeline";

print {$fh9} "source activate $qiime\n";
print {$fh9} "qiime tools export --input-path $filter_req --output-path $picrust_d\n";
print {$fh9} "source deactivate\n";
print {$fh9} "source activate $picrust\n";
print {$fh9} "picrust2_pipeline.py -s $picrust_d/dna-sequences.fasta -i $LEfse_d/output/feature-table.biom -o $picrust_out  -p 30\n";
print {$fh9} "echo \"step9: successfully Functional prediction \\n\" \n";
close($fh9);

###################################################
# Master script: invoke every per-step script in pipeline order.
open(my $fh_all, '>', $out) or die "can't open $out\n";
print {$fh_all} "sh $_\n"
    for ($qza_s, $primer_s, $ASV_s, $filter_s, $taxo_s,
         $diversity_s, $ANCOM_s, $LEfse_s, $picrust_s);
close($fh_all);

# Print the command-line usage text and terminate the program.
# Invoked when mandatory arguments are missing (see the check after
# GetOptions at the top of the script).
sub usage{
    # Heredoc is emitted verbatim to STDOUT; keep its indentation as-is.
    print <<USAGE;
usage:
    perl $0 -f1 <fastq file> -f2 <sample group> -pf <forward primer> -pr <reverse primer> -t <threads> -m <dada2 or deblur> -c <classifier> -d <depth> -co <column> -o <out> 
options:
    -f1|fastq file       :[essential]. 
    -f2|sample group     :[essential].
    -pf|forward          :[essential].
    -pr|reverse          :[essential].
    -t|threads           :[essential].
    -m|denoise           :[essential] dada2 or deblur. 
    -c|classifier        :[essential].
    -d|depth             :[essential].
    -co|column           :[essential].  
    -o|out               :[essential].
USAGE
    exit;
};

# Print version/author information to STDOUT. Does not exit.
# NOTE(review): -v|--version is parsed by GetOptions; make sure the option
# actually dispatches to this sub -- verify at the option-handling block.
sub version {
    print <<VERSION;
    version:    v1.0
    update:     20201228 - 20201228
    author:     zouhua1\@outlook.com
VERSION
};

运行

perl qiime2_v0.pl -f1 pe-33-manifest-trimmed.tsv -f2 sample-metadata.tsv -pf CCTAYGGGRBGCASCAG -pr GGACTACNNGGGTATCTAAT -t 20 -c silva-138-99-nb-classifier.qza -d 60000 -co Group -o Run.all.sh

最终目录结构

Amplicon_analysis
├── RawData-> /data/user/zouhua/rawdata/
├── pe-33-manifest-trimmed.tsv
├── qiime2_v0.pl
├── result
│   ├── 01.qza
│   ├── 02.remove_primer
│   ├── 03.ASV
│   ├── 04.filter
│   ├── 05.taxonomy
│   ├── 06.diversity
│   ├── 07.ANCOM
│   ├── 08.LEfse
│   ├── 09.Picrust2
│   └── script
│       ├── Run.s1.import.sh
│       ├── Run.s2.primer.sh
│       ├── Run.s3.ASV.sh
│       ├── Run.s4.filter.sh
│       ├── Run.s5.taxo.sh
│       ├── Run.s6.diversity.sh
│       ├── Run.s7.ANCOM.sh
│       ├── Run.s8.LEfse.sh
│       └── Run.s9.picrust.sh
├── Run.all.sh
├── sample-metadata.tsv
├── silva-138-99-nb-classifier.qza -> /data/share/database/qiime2_db/silva/silva-138-99-nb-classifier.qza
└── work.sh

引用

参考文章如引起任何侵权问题,可以与我联系,谢谢。

上一篇下一篇

猜你喜欢

热点阅读