GATK4检测体系突变(SNV&Indel)
2021-06-30 本文已影响0人
evolisgreat

以肿瘤数据为例进行演示。
Step 1. Alignment and sorting
--- | --- |
---|---|
Tool | BWA--MEM, SAMtools |
Input | fastq files, reference genome ;$id.sorted.bam |
Output | $id.sorted.bam ; |
Notes | After geting clean fastq data. Read group info is provided with the -R flag. This information is key for downstream GATK functionality. GATK will not work without a read group tag. BamSortIing step can be done using the SAMtools |
Command | bwa.sh, bamqc.sh |
## bwa.sh
INDEX=/mnt/J_DRIVE/Project_WD/0.Reference/index/bwa/hg38
cat /home/allen/Bio_Project/tumor/2-clean/sam_config | while read id
do
echo "start bwa for ${id}" `date`
fq1=/home/allen/Bio_Project/tumor/2-clean/subsam100K/${id}_1_100k.fq
fq2=/home/allen/Bio_Project/tumor/2-clean/subsam100K/${id}_2_100k.fq
bwa mem -M -t 2 -R "@RG\tID:${id}\tSM:${id}\tLB:WXS\tPL:Illumina" ${INDEX} ${fq1} ${fq2} | samtools sort -@ 2 -m 1G -o /home/allen/Bio_Project/tumor/3-align/${id}.sorted.bam \
1>/home/allen/Bio_Project/tumor/3-align/bwa_sort.log 2>&1
echo "end bwa for ${id}" `date`
done
## bamqc.sh
cat /home/allen/Bio_Project/tumor/2-clean/sam_config | while read id
do
bam=/home/allen/Bio_Project/tumor/3-align/${id}.sorted.bam
samtools stats -@ 2 --reference /mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Homo_sapiens_assembly38.fasta ${bam} > /home/allen/Bio_Project/tumor/3-align/stats/${id}.bam.stat
plot-bamstats -p /home/allen/Bio_Project/tumor/3-align/stats/${id}} /home/allen/Bio_Project/tumor/3-align/stats/${id}.bam.stat
done
Step 2. Mark Duplicates
--- | --- |
---|---|
Tool | GATK4-MarkDuplicates, SAMtools |
Input | $id.sorted.bam |
Output | $id.marked.bam, ……bai |
Notes | After the MarkDuplicates step, bam metrics file will be given by -M, .bai file can be done using the samtools index command. |
Command | markdup.sh |
#markdup.sh
GATK=gatk
cat /home/allen/Bio_Project/tumor/2-clean/sam_config | while read id
do
BAM=/home/allen/Bio_Project/tumor/3-align/${id}.sorted.bam
if [ ! -f /home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/ok.${id}_marked.status ]
then
echo "start MarkDuplicates for ${id}" `date`
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \
-I ${BAM} \
--REMOVE_DUPLICATES true \
-O /home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/${id}_marked.bam \
-M /home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/${id}.metrics \
1>/home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/${id}_markdup.log 2>&1
if [ $? -eq 0 ]
then
touch /home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/ok.${id}_marked.status
fi
echo "end MarkDuplicates for ${id}" `date`
samtools index -@ 2 -m 2G -b /home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/${id}_marked.bam \
/home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/${id}_marked.bai
fi
done
Step 3. Base Quality Score Recalibration (BQSR)
--- | --- |
---|---|
Tool | GATK4-BaseRecalibrator and ApplyBQSR |
Input | ${id}.marked.bam, reference genome, known snp, known indel |
Output | ${id}_bqsr.bam |
Notes | |
Command | BQSR.sh |
#BQSR.sh
GATK=gatk
snp=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/dbsnp_146.hg38.vcf.gz
indel=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
ref=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Homo_sapiens_assembly38.fasta
cat /home/allen/Bio_Project/tumor/2-clean/sam_config | while read id
do
if [ ! -f /home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${id}_bqsr.bam ]
then
echo "start BQSR for ${id}" `date`
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \
-R ${ref} \
-I /home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/${id}_marked.bam \
--known-sites ${snp} \
--known-sites ${indel} \
-O /home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${id}_recal.table \
1>/home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${id}_recal.log 2>&1
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
-R ${ref} \
-I /home/allen/Bio_Project/tumor/4-gatk4/1-Markdup/${id}_marked.bam \
-bqsr /home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${id}_recal.table \
-O /home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${id}_bqsr.bam \
1>/home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${id}_ApplyBQSR.log 2>&1
echo "end BQSR for ${id}" `date`
fi
done
Step 4. Call Variants
--- | --- |
---|---|
Tool | GATK4 |
Input | ${id}_bqsr.bam, reference genome, known snp, known indel, bed |
Output | merge.vcf |
Notes | Variant calling with GenomicsDBImport method. VQSR的时候可能会报错,注意-an参数 |
Command | germline.sh, mutect2.sh |
找胚系突变。
#germline.sh
gatk=gatk
snp=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/dbsnp_146.hg38.vcf.gz
indel=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
ref=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Homo_sapiens_assembly38.fasta
bed=/home/allen/Bio_Project/tumor/4-gatk4/3-Germline/hglft-hg38.bed
cat germline_config | while read id
do
echo "start HC for ${id}" 'date'
$gatk --java-options "-Xmx4g -Djava.io.temdir=/home" HaplotypeCaller -ERC GVCF \
-R ${ref} \
-I /home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${id}_bqsr.bam \
--dbsnp ${snp} \
-L ${bed} \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/${id}.raw.vcf \
1>/home/allen/Bio_Project/tumor/4-gatk4/3-Germline/${id}.HC.log 2>&1
echo "finish HC for ${id}" 'date'
done
cd /home/allen/Bio_Project/tumor/4-gatk4/3-Germline
for chr in chr{1..22} chrX chrY chrM
do
time $gatk --java-options "-Xmx4g -Djava.io.temdir=/home" GenomicsDBImport \
-R ${ref} \
$(ls /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/*raw.vcf | awk '{print "-V "$0" "}') \
-L ${chr} \
--genomicsdb-workspace-path /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/chr_database/gvcf_${chr}.db
time $gatk --java-options "-Xmx4g -Djava.io.temdir=/home" GenotypeGVCFs \
-R ${ref} \
-V gendb:///home/allen/Bio_Project/tumor/4-gatk4/3-Germline/chr_database/gvcf_${chr}.db \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/genotypegvcfs/gvcfs_${chr}.vcf
done
$gatk --java-options "-Xmx4g -Djava.io.temdir=/home" GatherVcfs \
$(for i in {1..22}; do echo "-I /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/genotypegvcfs/gvcfs_chr${i}.vcf"; done) \
-O merge.vcf \
1>/home/allen/Bio_Project/tumor/4-gatk4/3-Germline/${id}.GatherVcfs.log 2>&1
#VQSR.sh
#先对Indels进行过滤,再对snp进行过滤
gatk=gatk
hapmap=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/hapmap_3.3.hg38.vcf.gz
omni=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/1000G_omni2.5.hg38.vcf.gz
snp=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz
dbsnp=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/dbsnp_146.hg38.vcf.gz
ref=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Homo_sapiens_assembly38.fasta
indel=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
##Indel Mode
time $gatk --java-options "-Xmx4g -Xms4g" VariantRecalibrator \
-R $ref \
-V /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/merge.vcf \
-resource:mills,known=true,training=true,truth=true,prior=12.0 $indel \
-resource:dbsnp,known=true,training=false,truth=false,prior=2.0 $dbsnp \
-an QD -an SOR \
--mode INDEL \
--max-gaussians 4 \
--rscript-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.plots.R \
--tranches-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.tranches \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.recal \
1>/home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.indel.vqsr.log 2>&1
time $gatk --java-options "-Xmx5g -Xms5g" ApplyVQSR \
-R $ref \
-V /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/merge.vcf \
--truth-sensitivity-filter-level 99.0 \
--tranches-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.tranches \
--recal-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.recal \
--mode INDEL \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.VQSR.vcf \
1>/home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.indels.ApplyVQSR.log 2>&1
echo "** Indels VQSR done **"
##SNP Mode
time $gatk --java-options "-Xmx4g -Xms4g" VariantRecalibrator \
-R $ref \
-V /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.VQSR.vcf \
-resource:hapmap,known=false,training=true,truth=true,prior=15.0 $hapmap \
-resource:omini,known=false,training=true,truth=false,prior=12.0 $omni \
-resource:1000G,known=false,training=true,truth=false,prior=10.0 $snp \
-resource:dbsnp,known=true,training=false,truth=false,prior=2.0 $dbsnp \
-an QD -an MQ \
--mode SNP \
--max-gaussians 4 \
-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 95.0 -tranche 90.0 \
--rscript-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.snps.plots.R \
--tranches-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.snps.tranches \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.snps.recal \
1>/home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.indels.snps.vqsr.log 2>&1
time $gatk --java-options "-Xmx5g -Xms5g" ApplyVQSR \
-R $ref \
-V /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.VQSR.vcf \
--truth-sensitivity-filter-level 99.0 \
--tranches-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.snps.tranches \
--recal-file /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.snps.recal \
--mode SNP \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.HC.indels.snps.VQSR.vcf \
1>/home/allen/Bio_Project/tumor/4-gatk4/3-Germline/vqsr/merge.indels.snps.ApplyVQSR.log 2>&1
echo "** SNPs VQSR done **"
找体系突变
## mutect2.sh
GATK=gatk
ref=/mnt/J_DRIVE/Project_WD/0.Reference/GATK/hg38/Homo_sapiens_assembly38.fasta
bed=/home/allen/Bio_Project/tumor/4-gatk4/3-Somatic/hglft-hg38.bed
cat /home/allen/Bio_Project/tumor/4-gatk4/3-Somatic/sam_config2 | while read id
do
arr=(${id})
sample=${arr[1]}
T=/home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${arr[1]}_bqsr.bam
N=/home/allen/Bio_Project/tumor/4-gatk4/2-BQSR/${arr[0]}_bqsr.bam
echo "start Mutect2 for ${id}" `date`
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" Mutect2 \
-R ${ref} \
-I ${T} -tumor $(basename "$T" _bqsr.bam) \
-I ${N} -normal $(basename "$N" _bqsr.bam) \
-L ${bed} \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Somatic/${sample}_mutect2.vcf
$GATK FilterMutectCalls \
-R ${ref} \
-V /home/allen/Bio_Project/tumor/4-gatk4/3-Somatic/${sample}_mutect2.vcf \
-O /home/allen/Bio_Project/tumor/4-gatk4/3-Somatic/${sample}_somatic.vcf
echo "end Mutect2 for ${id}" `date`
cat /home/allen/Bio_Project/tumor/4-gatk4/3-Somatic/${sample}_somatic.vcf | perl -alne '{if(/^#/){print}else{next unless $F[6] eq "PASS";next if $F[0] =~/_/;print } }' > /home/allen/Bio_Project/tumor/4-gatk4/3-Somatic/${sample}_filter.vcf
done