11.BWA重新跑-qualimap-GATK
2022-07-18 本文已影响0人
jiarf
重新跑的脚本
## bwa.sh
cd /data1/jiarongf/learning/cancer-WES/3.clean
cat ../data/case |while read id
do
fq1=${id}_1_val_1.fq.gz
fq2=${id}_2_val_2.fq.gz
echo $fq1
echo $fq2
#bwa mem -M -t 11 -R "@RG ID:$id SM:$id LB:WXS PL:Illumina" /nas01/Genome/bwa_Homo_sapiens.GRCh38_index/Homo_sapiens_index $fq1 $fq2 \
#bwa mem -M -t 11 -R "@RG\tID:$id\tSM:$id\tLB:WXS\tPL:Illumina" /nas01/Genome/bwa_Homo_sapiens.GRCh38_index/Homo_sapiens_index $fq1 $fq2 \
# | samtools sort -@ 11 -m 1G --reference /nas01/Genome/fasta_gtf/Homo_sapiens.GRCh38.dna.toplevel.fa -o ../4.align/${id}.bam -
bwa mem -M -t 11 -R "@RG\tID:$id\tSM:$id\tLB:WXS\tPL:Illumina" /data1/jiarongf/learning/cancer-WES/bwa/Homo_sapiens_assembly38 $fq1 $fq2 \
| samtools sort -@ 11 -m 1G --reference /data1/jiarongf/learning/cancer-WES/bwa/Homo_sapiens_assembly38.fa -o ../4.bwaout/${id}.bam -
done
#bwa mem -M -t 11 -R "@RG\tID:$id\tSM:$id\tLB:WXS\tPL:Illumina" /nas01/Genome/bwa_Homo_sapiens.GRCh38_index/Homo_sapiens_index case5_germline_WES_1_val_1.fq.gz case5_germline_WES_2_val_2.fq.gz > case5_germline_WES.sam
#samtools sort -@ 11 -m 1G --reference /nas01/Genome/fasta_gtf/Homo_sapiens.GRCh38.dna.toplevel.fa -o ../4.align/case5_germline_WES.bam case5_germline_WES.sam
跑完之后的就有chr了
case5_biorep_C_WES.4916 147 chr1 451229 0 75M = 451196 -108 GGTCCAGGCAACAGCCAGAAATGAAAGGCACATTCTTGGGCTCATAATGGTCAGATAGTGGAGGGGCTTACATAG ??=@>@BDDA@DCECDBEA@?CE@@DCDD@D?BACACBCDD@D@@?A@BCAD>D@?CACBD@BBAAB@?>?=:?? NM:i:0 MD:Z:75 MC:Z:76M AS:i:75 XS:i:75 RG:Z:case5_biorep_C_WES
qualimap
#cd /data1/jiarongf/learning/cancer-WES/4.align
#ls /data1/jiarongf/learning/cancer-WES/4.align/*bam | while read id
cd /data1/jiarongf/learning/cancer-WES/4.bwaout
ls /data1/jiarongf/learning/cancer-WES/4.bwaout/*bam | while read id
do
file=$(basename ${id} .bam)
#qualimap bamqc --java-mem-size=40G -gff /data1/jiarongf/learning/cancer-WES/4.align/hg38.exon2.bed -nr 100000 -nw 500 -nt 16 -bam $id -outdir /data1/jiarongf/learning/cancer-WES/4.align/qualimap/${file}
qualimap bamqc --java-mem-size=40G -gff /data1/jiarongf/learning/cancer-WES/4.align/hg38.exon.bed -nr 100000 -nw 500 -nt 16 -bam $id -outdir /data1/jiarongf/learning/cancer-WES/4.bwaout/qualimap/${file}
done
#cat CCDS.current.txt |grep "Public" | perl -alne '{/\[(.*?)\]/;next unless $1;$gene=$F[2];$exons=$1;$exons=~s/\s//g;$exons=~s/-/\t/g;print "$F[0]\t$_\t$gene" foreach split/,/,$exons;}' | sort -u | bedtools sort -i |awk '{print $0"\t0\t+"}' > hg38.exon2.bed
#cat CCDS.current.txt |grep "Public" | perl -alne '{/\[(.*?)\]/;next unless $1;$gene=$F[2];$exons=$1;$exons=~s/\s//g;$exons=~s/-/\t/g;print "$F[0]\t$_\t$gene" foreach split/,/,$exons;}' | sort -u | bedtools sort -i |awk '{print "chr"$0"\t0\t+"}' > hg38.exon.bed
#samtools view test_marked.bam | awk '$3="chr"$3' | awk -F ' ' '$7=($7=="=" || $7=="*"?$7:sprintf("chr%s",$7))' | tr ' ' '\t' | samtools view -b > output.bam
# [E::sam_parse1] missing SAM header
# [W::sam_read1] Parse error at line 199
# [main_samview] truncated file.
#samtools view -H test_marked.bam | sed -e 's/SN:\([0-9XY]\)/SN:chr\1/' -e 's/SN:MT/SN:chrM/' | samtools reheader - test_marked.bam > output.bam
#[E::cram_get_ref] Failed to populate reference for id 0
#samtools view -T test_marked.bam | sed -e 's/SN:\([0-9XY]\)/SN:chr\1/' -e 's/SN:MT/SN:chrM/' | samtools reheader - test_marked.bam > test.CHR.bam
#Segmentation fault (core dumped)
#samtools view -h test_marked.bam | sed -e '/^@SQ/s/SN\:/SN\:chr/' -e '/^[^@]/s/\t/\tchr/2'|awk -F ' ' '$7=($7=="=" || $7=="*"?$7:sprintf("chr%s",$7))' |tr " " "\t" | samtools view -h -b -@ 10 -S - > test.chr.bam
#成功!
(wes) 09:11:09 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
bash qualimap.sh > ../log/qualimap.sh2.log 2>&1
qualimap.sh2.log
log的一部分截图如上面,跑成功了
跑了一整天,从早上九点,到晚上十一点
GATK
1.1 markduplicate
GATK=/data1/jiarongf/learning/cancer-WES/run/gatk-4.2.6.1/gatk
cd /data1/jiarongf/learning/cancer-WES/4.bwaout
#cd /data1/jiarongf/learning/cancer-WES/4.align
cat ../data/case | while read id
do
BAM=./${id}.bam
if [ ! -f ${id}_marked.bam ]
then
echo "start MarkDuplicates for ${id}" `date`
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \
-I ${BAM} \
-O ${id}_marked.bam \
-M ${id}.metrics \
1>../log/${id}_log2.mark 2>&1
echo "end MarkDuplicates for ${id}" `date`
samtools index -@ 16 -m 4G -b ${id}_marked.bam
fi
done
# gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \
# -I case6_techrep_2_WES.bam \
# -O case6_techrep_2_WES_marked.bam \
# -M case6_techrep_2_WES.metrics \
# 1>../log/case6_techrep_2_WES_log.mark 2>&1
(wes) 08:56:21 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
bash markDuplicates.sh > ../log/markDuplicates2.sh.log 2>&1
(wes) 21:11:13 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
cat ../log/markDuplicates2.sh.log | tail
start MarkDuplicates for case6_techrep_2_WES Tue May 10 19:17:49 CST 2022
end MarkDuplicates for case6_techrep_2_WES Tue May 10 19:42:06 CST 2022
start MarkDuplicates for case1_techrep_2_WES Tue May 10 19:42:35 CST 2022
end MarkDuplicates for case1_techrep_2_WES Tue May 10 20:08:32 CST 2022
start MarkDuplicates for case2_techrep_2_WES Tue May 10 20:09:05 CST 2022
end MarkDuplicates for case2_techrep_2_WES Tue May 10 20:31:39 CST 2022
start MarkDuplicates for case5_techrep_2_WES Tue May 10 20:32:05 CST 2022
end MarkDuplicates for case5_techrep_2_WES Tue May 10 20:58:15 CST 2022
start MarkDuplicates for case5_biorep_C_WES Tue May 10 20:59:28 CST 2022
end MarkDuplicates for case5_biorep_C_WES Tue May 10 21:10:59 CST 2022
跑完了
1.2 bqsr
GATK=/data1/jiarongf/learning/cancer-WES/run/gatk-4.2.6.1/gatk
snp=/data1/jiarongf/learning/cancer-WES/run/gatk_resource/dbsnp_146.hg38.vcf.gz
indel=/data1/jiarongf/learning/cancer-WES/run/gatk_resource/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
ref=/data1/jiarongf/learning/cancer-WES/run/gatk_resource/Homo_sapiens_assembly38.fasta
cd /data1/jiarongf/learning/cancer-WES/4.bwaout
#cd /data1/jiarongf/learning/cancer-WES/5.GATK
cat ../data/case | while read id
#cat ../data/small.case.txt | while read id
do
if [ ! -f ${id}_bqsr.bam ]
then
echo "start BQSR for ${id}" `date`
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./tmp/" BaseRecalibrator \
-R $ref \
-I ${id}_marked.bam \
--known-sites $snp \
--known-sites $indel \
-O ${id}_recal.table \
1>../log/${id}_log2.recal 2>&1
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./tmp" ApplyBQSR \
-R $ref \
-I ${id}_marked.bam \
-bqsr ${id}_recal.table \
-O ${id}_bqsr.bam \
1>../log/${id}_log2.ApplyBQSR 2>&1
echo "end BQSR for ${id}" `date`
fi
done
# gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \
# -R /data1/jiarongf/learning/cancer-WES/run/gatk_resource/Homo_sapiens_assembly38.fasta \
# -I case1_biorep_A_techrep_WES_marked.chr.sed.bam \
# --known-sites /data1/jiarongf/learning/cancer-WES/run/gatk_resource/dbsnp_146.hg38.vcf.gz \
# --known-sites /data1/jiarongf/learning/cancer-WES/run/gatk_resource/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
# -O case1_biorep_A_techrep_WES_recal.table \
(wes) 14:31:27 jiarongf@172.16.10.223:/data1/jiarongf/learning/cancer-WES/run
$
bash bqsr.sh > ../log/bqsr.sh2.log 2>&1