五、数据比对
2021-02-27 本文已影响0人
白米饭睡不醒
1.参考基因组准备
(1)参考基因组数据库




🍀 参考基因组下载
## Reference genome preparation: keep track of the genome release being used
# Download site, Ensembl: http://asia.ensembl.org/index.html
# FTP directory: ftp://ftp.ensembl.org/pub/release-95/fasta/homo_sapiens/dna/
# Enter the reference genome directory. The downloads run only when the cd
# succeeds, so the files never land in whatever directory happened to be current.
cd /teach/database/genome/Ensembl/Homo_sapiens/GRCh38_release95 && {
  # Download the genome sequence
  wget -c ftp://ftp.ensembl.org/pub/release-95/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
  # Download the genome annotation file
  wget -c ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz
}
(2)数据文件格式







2.数据比对
(1) Hisat2比对
# Enter the reference genome directory first so that both steps below act on
# the files stored there; nothing runs if the directory cannot be entered.
cd /teach/database/genome/Ensembl/Homo_sapiens/GRCh38_release95 && {
  # Decompress the genome FASTA
  gzip -d Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
  # gzip -d yields Homo_sapiens.GRCh38.dna.primary_assembly.fa, but the index
  # build below reads Homo_sapiens.GRCh38_release95.genome.fa. Link the expected
  # name to the decompressed file unless a file with that name already exists.
  [ -e Homo_sapiens.GRCh38_release95.genome.fa ] ||
    ln -s Homo_sapiens.GRCh38.dna.primary_assembly.fa Homo_sapiens.GRCh38_release95.genome.fa
  # Build the HISAT2 index (second argument is the index base name)
  hisat2-build Homo_sapiens.GRCh38_release95.genome.fa Homo_sapiens.GRCh38_release95.genome
}
# Enter the HISAT2 working directory. The sort and index commands below use
# relative file names, so they must run from the directory the SAM is written to.
cd ~/project/Human-16-Asthma-Trans/mapping/Hisat2
# Index base name, input directory and output directory
# NOTE(review): this index path repeats GRCh38_release95 and lives under
# ~/database, while the index is built under
# /teach/database/genome/Ensembl/Homo_sapiens/GRCh38_release95 - confirm the path.
index=~/database/genome/Ensembl/Homo_sapiens/GRCh38_release95/GRCh38_release95/Homo_sapiens.GRCh38_release95.genome
inputdir=~/project/Human-16-Asthma-Trans/data/cleandata/trim_galore
# No trailing slash, so "${outdir}/file" does not produce a double slash
outdir=~/project/Human-16-Asthma-Trans/mapping/Hisat2
# Align a single paired-end sample and write SAM
hisat2 -p 10 -x "${index}" \
  -1 "${inputdir}/SRR1039510_1_val_1.fq.gz" \
  -2 "${inputdir}/SRR1039510_2_val_2.fq.gz" \
  -S "${outdir}/SRR1039510.Hisat_aln.sam"
# Sort the SAM and write it as BAM
samtools sort -@ 15 -o SRR1039510.Hisat_aln.sorted.bam SRR1039510.Hisat_aln.sam
# Index the sorted BAM
samtools index SRR1039510.Hisat_aln.sorted.bam SRR1039510.Hisat_aln.sorted.bam.bai
# Batch alignment, sorting and indexing for all samples.
# The loop writes one command line per sample ID into Hisat.sh. ${index},
# ${inputdir} and ${outdir} are expanded now, while the script is generated,
# so they must already be set in this shell.
# `|| [ -n "${id}" ]` keeps a last ID that has no trailing newline.
while read -r id || [ -n "${id}" ]; do
  # Skip blank lines so no command with an empty sample ID is generated
  [ -n "${id}" ] || continue
  echo "hisat2 -p 10 -x ${index} -1 ${inputdir}/${id}_1_val_1.fq.gz -2 ${inputdir}/${id}_2_val_2.fq.gz 2>${id}.log | samtools sort -@ 5 -o ${outdir}/${id}.Hisat_aln.sorted.bam - && samtools index ${outdir}/${id}.Hisat_aln.sorted.bam ${outdir}/${id}.Hisat_aln.sorted.bam.bai"
done </teach/project/Human-16-Asthma-Trans/data/rawdata/sra/sampleId.txt >Hisat.sh
# Run in the background, keeping stdout and stderr together in Hisat.log
nohup sh Hisat.sh >Hisat.log 2>&1 &
# Summarise the alignment statistics from the per-sample hisat2 logs.
# Run this only after the background job has finished; until then the
# SRR*log files are incomplete.
multiqc -o ./ SRR*log






(2) subjunc 比对
- subjunc参考文档:http://subread.sourceforge.net/
# Enter the reference genome directory and build the Subread index used by
# subjunc. The build runs only when the cd succeeds, so index files are never
# written into an unrelated directory.
cd /teach/database/genome/Ensembl/Homo_sapiens/GRCh38_release95 &&
  subread-buildindex -o Homo_sapiens.GRCh38_release95.genome Homo_sapiens.GRCh38_release95.genome.fa
# Locations used by the subjunc runs: index base name, trimmed reads, output directory
index=/teach/database/genome/Ensembl/Homo_sapiens/GRCh38_release95/Homo_sapiens.GRCh38_release95.genome
inputdir=~/project/Human-16-Asthma-Trans/data/cleandata/trim_galore
outdir=/trainee/Last11/project/Human-16-Asthma-Trans/mapping/subjunc
# Align one paired-end sample; stdout and stderr both go to a per-sample log
subjunc -T 10 \
  -i ${index} \
  -r ${inputdir}/SRR1039510_1_val_1.fq.gz \
  -R ${inputdir}/SRR1039510_2_val_2.fq.gz \
  -o ${outdir}/SRR1039510.Subjunc.bam \
  >${outdir}/SRR1039510.Subjunc.log 2>&1
# subjunc alignment for all samples.
# Record the start time as seconds since the epoch. The original line
# `start=getlocatime()` is not valid shell syntax.
start=$(date +%s)
# Write one align + sort + index command line per sample ID into subjunc.sh.
# ${index}, ${inputdir} and ${outdir} are expanded now, while the script is
# generated, so they must already be set in this shell.
# NOTE(review): the HISAT2 batch step reads its IDs from
# /teach/project/Human-16-Asthma-Trans/data/rawdata/sra/sampleId.txt - confirm
# that the list below holds the same samples.
while read -r id || [ -n "${id}" ]; do
  # Skip blank lines so no command with an empty sample ID is generated
  [ -n "${id}" ] || continue
  echo "subjunc -T 10 -i ${index} -r ${inputdir}/${id}_1_val_1.fq.gz -R ${inputdir}/${id}_2_val_2.fq.gz -o ${outdir}/${id}.Subjunc.bam 1>${outdir}/${id}.Subjunc.log 2>&1 && samtools sort -@ 6 -o ${outdir}/${id}.Subjunc.sorted.bam ${outdir}/${id}.Subjunc.bam && samtools index ${outdir}/${id}.Subjunc.sorted.bam ${outdir}/${id}.Subjunc.sorted.bam.bai "
done </teach/data/airway/sra/sampleId.txt >subjunc.sh
# Run in the background, keeping stdout and stderr together in subjunc.log
nohup sh subjunc.sh >subjunc.log 2>&1 &

3.sam/bam格式








🐬 samtools工具使用
# view: print the alignment records of a BAM file
samtools view SRR1039510.Hisat_aln.sorted.bam
# -H: print the header only
samtools view -H SRR1039510.Hisat_aln.sorted.bam
# -h: print the header followed by the alignment records
samtools view -h SRR1039510.Hisat_aln.sorted.bam
# index: build an index for the BAM file
samtools index SRR1039510.Hisat_aln.sorted.bam SRR1039510.Hisat_aln.sorted.bam.bai
# flagstat: summarise the alignment results
samtools flagstat -@ 3 SRR1039510.Hisat_aln.sorted.bam
# sort: sort the SAM and write it as BAM
samtools sort -@ 3 -o SRR1039510.Hisat_aln.sorted.bam SRR1039510.Hisat_aln.sam
# depth: report sequencing depth.
# The output has three tab-separated columns: chromosome name, position, depth.
samtools depth SRR1039510.Hisat_aln.sorted.bam >SRR1039510.Hisat_aln.sorted.bam.depth.txt
# Sequencing depth of a single gene.
# Look up the gene's coordinates in the GFF3 annotation and write them as BED.
# GFF3 coordinates are 1-based and inclusive while a BED start is 0-based, so
# the start column is written as $4-1; otherwise the gene's first base is lost.
# NOTE(review): the download step fetches Homo_sapiens.GRCh38.95.gtf.gz only;
# the GFF3 file read here has to be obtained separately.
gzip -dc Homo_sapiens.GRCh38.95.gff3.gz |
  awk -F'\t' -v OFS='\t' \
    '$3 == "gene" && /ID=gene:ENSG00000186092/ { print $1, $4 - 1, $5 }' \
    >ENSG00000186092.bed
samtools depth -b ENSG00000186092.bed SRR1039510.Hisat_aln.sorted.bam >ENSG00000186092.bed.depth
# Finding multi-mapped reads via the FLAG field:
# 0x100 marks a secondary alignment, which is how additional hits of a
# multi-mapped read are recorded, so `samtools view -f 0x100` extracts them.
🐳 统计比对结果
# Enter the alignment directory
# NOTE(review): the HISAT2 commands write to .../mapping/Hisat2, while this
# path is .../Mapping/Hisat - confirm which directory holds the BAM files.
cd /teach/project/Human-16-Asthma-Trans/Mapping/Hisat
# Single sample
samtools flagstat -@ 3 SRR1039510.Hisat_aln.sorted.bam
# All samples: write one flagstat command per sorted BAM into flagstat.sh.
# The glob is expanded by the shell directly instead of parsing `ls` output.
for bam in *.sorted.bam; do
  # With no matching file the pattern stays literal; skip it
  [ -e "${bam}" ] || continue
  # Strip only the trailing .bam, so x.sorted.bam is summarised in x.sorted.flagstat
  echo "samtools flagstat -@ 10 ${bam} > ${bam%.bam}.flagstat "
done >flagstat.sh
# Run in the background, keeping stdout and stderr together in flagstat.log
nohup sh flagstat.sh >flagstat.log 2>&1 &
# Aggregate the reports; run this only after the background job has finished
multiqc -o ./ *.flagstat



