WES 分析(从下载数据到IGV)
2019-01-27 本文已影响62人
dandanwu90
1. 数据下载
SRP077971 选取其中三个wes下载
cat >list.txt
SRR5943131
SRR5943132
SRR5943133
下载sra数据,每个耗时~25min
cat >download.sh
cat list.txt |while read id;do (prefetch ${id});done
nohup bash download.sh &
figure1 sra
sra转为fastq.gz(每个耗时~30min)
#--gzip,输出文件为gzip的压缩格式
#--split-3,将一个样本初始的3文件以配对的形式拆分为_1.fastq,_2.fastq;若只有一个fastq,则忽略拆分,保留一个fastq
#-O 输出文件
#1>${id}.sra.log 日志信息保存
#2>&1 报错信息保存在日志1中
mkdir sra && cd sra
mv /home/ncbi/project/sra/SRR* ./
ls *|while read id; do (fastq-dump --gzip --split-3 -O ${id}) 1>${id}.sra.log 2>&1;done
figure2 fastq
3. 质控
3.1 序列质量
# xargs 将ls的内容用管道符传递给fastqc,
#-t 为线程数,运行速度更快
cd ../
mkdir qc && cd qc
ls ../sra/*gz|xargs fastqc -t 3 -o ./
multiqc ./
figure3 qc.png
figure4 multiqc
figure4.1序列中有N figure4.2有overlap,有adaptor
3.2 Quality control (qc) 每个样本耗时~30min
出错:输出路径-o ../clean 以及../clean/ ,log日志 1>${sample}.log 2>&1的路径
#ls *_1.fastq.gz >1将fq1样本名称放在一个文本里
#ls *_2.fastq.gz >2 将fq2样本名称放在一个文本里
#paste将同一个样本的fq1,fq2 放在同一行,进行处理
#-q 25 去除序列中的接头及低质量的序列,默认值为
#--phred33 质控去除接头时(Sanger/Illumina 1.9+ encoding)以Phred 为ASCII+33质量值进行
# --stringency 3 3‘端adaptor重叠为3bp时,对序列进行过滤
mkdir raw && cd raw
mv ../*fastq.gz ./
ls *_1.fastq.gz >1
ls *_2.fastq.gz >2
paste 1 2 >config
cat >qc.sh
vim qc.sh
cat config|while read sample
do
arr=(${sample})
fq1=${arr[0]}
fq2=${arr[1]}
trim_galore -q 25 --phred33 --length 36 -e 0.1 --stringency 3 --paired -o ../clean $fq1 $fq2
done
nohup nash qc.sh &
figure4 clean
4. mapping (每个样品1h10min)
bwa从sam到排序好的bam:将打断测序的reads重新mapping到参考基因组上得到sam文件,然后用samtools对文件进行排序
#文件路径home/vip11/project/wes/test/clean/SRR5943131_1_trimmed.fq.gz
#不用传参数或是文件,cut -d"/"指定分割符为"/"; -f 8 1 取第8列的第一个元素
#cut -d"_" -f 1 指定分隔符为"_" 取第一列,
#得到只含样本名称,超级超级好用!!!
ls *1.fq.gz >1
ls *2.fq.gz >2
cut -d"/" -f 8 1 |cut -d"_" -f 1 > 0
paste 0 1 2 > config
cat >mapping.sh
INDEX=“/home/qmcui/database/reference/index/bwa/hg38”
cat config |while read sample
do
arr=($sample)
fq1=${arr[1]}
fq2=${arr[2]}
sample=${arr[0]}
echo $sample $fq1 $fq2
bwa mem -t 2 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" $INDEX $fq1 $fq2 | samtools sort -@ 2 -o $sample.bam -
done
#bwa mem 算法,bwa mem [options] <idxbase> <in1.fq> [in2.fq]
#-t 2 两个线程
#—R 读区时显示的头文件信息;如:'@RG\tID:foo\tSM:bar' [null]
#"@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" 出现在bam文件前文件信息:包括样本ID,样本名称,样本测序类型WGS以及测序平台信息。
# bam统计
ls *.bam | while read id ;do (samtools flagstat $id > $(basename $id ".bam").stat);done
figure5 stat
5. GATK4 流程
5.1 markduplicate (去除建库过程中起始和终止完全相同的序列)
#gatk MarkDuplicates 必须参数-I:inputfile, -O:file, -M:file to write duplication metrics
mkdir gatk_markdup && cd gatk_markdup
cut -d"/" -f 8 1|cut -d"_" -f 1 >sample.txt
cat sample.txt|while read sample;
do
gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" MarkDuplicates -I ../${sample}.sort.bam -O ${sample}_marked.bam -M ${sample}.metrics 1>${sample}_log.mark 2>&1
done
#5.2 fixmateinformation
mkdir fixbam && cd fixbam
cp ../gatk_markdup/sample.txt ./
cat >fixbam.sh
vim fixbam.sh
cat sample.txt|while read sample
do
gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" FixMateInformation
-I ../gatk_markdup/${sample}_markdu.bam
-O ${sample}_mkfixed.bam -SO coordinate 1>${sample}_log.fix 2>&1
done
#samtools index 子命令建索引
cat >idex.sh
vim idex.sh
cat sample.txt|while read sample
do
samtools index ${sample}_mkfixed.bam
done
#5.3 每30min一个样本
cat >recal.sh
vim recal.sh
ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
indel="/home/qmcui/tmp/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
cat sample.txt|while read sample
do
gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" BaseRecalibrator -R $ref -I ${sample}_mkfixed.bam --known-sites $snp --known-sites $indel -O ${sample}_recal.table
done
#5.4 每10min一个样本
cat >bqsr.sh
vim bqsr.sh
ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
cat sample.txt|while read sample
do
gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" ApplyBQSR -R $ref -I ${sample}_mkfixed.bam -bqsr ${sample}_recal.table -O ${sample}_bqsr.bam 1>${sample}_log.ApplyBQSR 2>&1
done
#5.5 每1h一个样本
cat >rawvcf.sh
vim rawvcf.sh
ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
cat sample.txt|while read sample
do
gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" HaplotypeCaller -R $ref -I ${sample}_bqsr.bam --dbsnp $snp -O ${sample}_raw.vcf 1>${sample}_log.HC 2>&1
done
6 按照区域call variation
6.1
#每个样本2h40min
cat >gvcf.sh
vim gvcf.sh
ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
cat sample.txt|while read sample
do
gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" HaplotypeCaller -ERC GVCF -R $ref -I ${sample}.bqsr.bam --dbsnp $snp -O ${sample}_raw.gvcf 1>${sample}.gvcf.log 2>&1
done
6.2 按区域找变异(依据不同染色体,3min/chr)
seq 22|while read id;do echo chr*{id};done >bed.txt
cat >> bed.txt
chrX
chrY
chrM
cat >final_vcf.sh
vim final_vcf.sh
ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
cat bed.txt|while read bed;do gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" GenomicsDBImport -L $bed -R $ref -V SRR5943131_raw.gvcf -V SRR5943132_raw.gvcf -V SRR5943133_raw.gvcf --genomicsdb-workspace-path gvcfs_${bed}.db
gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" GenotypeGVCFs -R $ref -V gendb://gvcfs_${bed}.db --dbsnp $snp -O final_${bed}.vcf
done
nohup bash gvcf.sh &
6.3 将区域变异合并(1-2min)
cat bed.txt|while read bed;do (gatk MergeVcfs -I final_${bed}.vcf -O raw.combine.vcf.gz);done
image.png
7 raw vcf filter,按照snp和indel分别过滤
**容易错**raw vcf 过滤 可以新建文件夹,运动代码时,输入文件夹上绝对路径,或者要将输入文件移动到新的文件夹时,一定要移动所有相关的文件!!!
mkdir vcf_filter && cd vcf_filter
mv ../rgvcf/raw.combine.vcf.gz ./
mv ../rgvcf/raw.combine.vcf.gz.tbi ./
7.1 snp filter
gatk SelectVariants -select-type SNP -V raw.combine.vcf.gz -O raw.snp.vcf.gz
gatk VariantFiltration -V raw.snp.vcf.gz --filter-expression "QUAL < 30.0 || QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "Filter" -O filter.snp.vcf.gz
gatk SelectVariants --exclude-filtered true -V filter.snp.vcf.gz -O filtered.snp.vcf.gz
7.2 indel filter
gatk SelectVariants -select-type INDEL -V raw.combine.vcf.gz -O raw.indel.vcf.gz
gatk VariantFiltration -V raw.indel.vcf.gz --filter-expression "QUAL < 30.0 | QD < 2.0 || FS > 200.0 || SOR > 10.0 || Inbreeding Coeff < -0.8 || ReadPosRankSum < -20.0" --filter-name "Filter" -O filter.indel.vcf.gz
gatk SelectVariants --exclude-filtered true -V filter.indel.vcf.gz -O filtered.indel.vcf.gz