转录组 | 参考基因组
2022-03-12 本文已影响0人
生信师姐
三个常用参考基因组数据库:
- Ensembl:https://www.ensembl.org
- NCBI:https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/index.shtml
- UCSC:http://www.genome.ucsc.edu/
-
Ensembl官网
image
data:image/s3,"s3://crabby-images/f5219/f521993eba25a7be9d0f9849addcce1a9a0adca5" alt=""
data:image/s3,"s3://crabby-images/c1223/c1223be50ad5cdafa58da4ebf482b6a2b74bd628" alt=""
data:image/s3,"s3://crabby-images/66972/66972d98f46012328be784c5b91f8615c162dbef" alt=""
data:image/s3,"s3://crabby-images/2e246/2e2466b6b17308a9abd3d77ae356e95b000695ac" alt=""
-
参考基因组注释文件
image
data:image/s3,"s3://crabby-images/692e1/692e148045aaebf2267f478f5ddd220023b5cece" alt=""
data:image/s3,"s3://crabby-images/b4aaf/b4aaf9269c4beb7335e2458d64a138b421304da9" alt=""
data:image/s3,"s3://crabby-images/02550/02550e016ea0ad46ec43612f8e360783aca0f888" alt=""
习题
1.fastq与fasta文件转换
应用:加深对两种文件格式的理解,并且fa有后续应用。
# Run this from the directory that holds the FASTQ files.
# A FASTQ record is 4 lines; `paste - - - -` joins each record into one
# tab-separated row: header, sequence, "+", quality.
# Splitting on tabs keeps the sequence in field 2 no matter how many
# space-separated words the header contains; only the read ID (the first
# word of the header) is kept as the FASTA record name.
# gzip -dc is used because `less` only decompresses .gz input when a
# lesspipe preprocessor is configured.
gzip -dc SRR1039510_1.fastq.gz \
  | paste - - - - \
  | awk -F'\t' '{ split($1, hdr, " "); print hdr[1] "\n" $2 }' >SRR1039510_1.fa
# Turn the leading @ of each header line into > (anchored, so nothing else is touched)
sed -i 's/^@/>/' SRR1039510_1.fa
data:image/s3,"s3://crabby-images/55ca0/55ca0abca597fea08b4ed6995d7a18e7669db188" alt=""
2.使用fa文件做NT比对,得到测序样本的NT比对信息
应用:随机抽取2000条reads做NT比对,可以发现测序样本是否含有其他物种污染等。
3.从gff或者gtf文件中获取基因的ID与symbol对应关系,以及biotype类型
应用:ID与symbol转换本地化,不依赖于第三方工具和软件包,并可以根据biotype类型区分mRNA,lncRNA以及miRNA等信息。
hisat2比对
data:image/s3,"s3://crabby-images/872fc/872fc80a139dd8129b9a40f53ca1add63ab7c3e7" alt=""
data:image/s3,"s3://crabby-images/e7b01/e7b0181d78646601a30b85f88901d3928292bf67" alt=""
# Enter the reference genome directory
cd ~/database/genome/Ensembl/Homo_sapiens/GRCh38_release95
# Build the HISAT2 index; the second argument is the index basename
hisat2-build Homo_sapiens.GRCh38_release95.genome.fa Homo_sapiens.GRCh38_release95.genome
# Index basename plus input/output directories.
# The basename must be exactly the one given to hisat2-build (no trailing dot):
# hisat2 appends the .N.ht2 suffixes itself.
index=~/database/genome/Ensembl/Homo_sapiens/GRCh38_release95/Homo_sapiens.GRCh38_release95.genome
inputdir=~/project/Human_16-Asthma-Trans/data/cleandata/trim_galore
outdir=~/project/Human_16-Asthma-Trans/Mapping/Hisat2
data:image/s3,"s3://crabby-images/aeb3c/aeb3cf5a4ea550bb671968711db101f928030a7f" alt=""
# Align a single paired-end sample with HISAT2 and write the result as SAM.
# Make sure the output directory exists before hisat2 writes into it.
mkdir -p "${outdir}"
hisat2 -p 3 -x "${index}" \
  -1 "${inputdir}/SRR1039510_1_val_1.fq.gz" \
  -2 "${inputdir}/SRR1039510_2_val_2.fq.gz" \
  -S "${outdir}/SRR1039510.Hisat_aln.sam"
data:image/s3,"s3://crabby-images/84f28/84f28004bcccd4208371f8c42788f22db2edef15" alt=""
data:image/s3,"s3://crabby-images/030bd/030bd189b22a57ded16a6d07051780045905c628" alt=""
data:image/s3,"s3://crabby-images/503c2/503c258351fc9d466b6c2ebc8e9d00f1bc0a3d87" alt=""
data:image/s3,"s3://crabby-images/1ba40/1ba405ad2492ae319544e887217e2b07a245dae6" alt=""
# Convert the SAM to a sorted BAM. The files are addressed through ${outdir},
# which is where the single-sample alignment step wrote the SAM.
samtools sort -@ 3 \
  -o "${outdir}/SRR1039510.Hisat_aln.sorted.bam" \
  "${outdir}/SRR1039510.Hisat_aln.sam"
# Index the sorted BAM
samtools index \
  "${outdir}/SRR1039510.Hisat_aln.sorted.bam" \
  "${outdir}/SRR1039510.Hisat_aln.sorted.bam.bai"
# Batch mode: align, sort and index every sample.
# Generate Hisat.sh with one "hisat2 | samtools sort && samtools index"
# command line per sample ID. ${index}, ${inputdir}, ${outdir} and ${id} are
# expanded now, while the script is being generated.
# `|| [ -n "${id}" ]` keeps the last ID when the list has no trailing newline;
# blank lines are skipped so they do not produce bogus commands.
while read -r id || [ -n "${id}" ]; do
  [ -n "${id}" ] || continue
  printf '%s\n' "hisat2 -p 3 -x ${index} -1 ${inputdir}/${id}_1_val_1.fq.gz -2 ${inputdir}/${id}_2_val_2.fq.gz 2>${id}.log | samtools sort -@ 5 -o ${outdir}/${id}.Hisat_aln.sorted.bam - && samtools index ${outdir}/${id}.Hisat_aln.sorted.bam ${outdir}/${id}.Hisat_aln.sorted.bam.bai"
done </teach/project/Human-16-Asthma-Trans/data/rawdata/sra/sampleId.txt >Hisat.sh
# Run the generated script in the background
nohup sh Hisat.sh >Hisat.log &
# Summarise the per-sample hisat2 logs (run this once the background job has finished)
multiqc -o ./ SRR*log
subjunc比对
data:image/s3,"s3://crabby-images/4b71a/4b71ae6cc41348fdde890235d5b6c34aa9dd5805" alt=""
# Move into the reference genome directory
cd "/teach/database/genome/Ensembl/Homo_sapiens/GRCh38_release95"
# Build the index used by subjunc; -o sets the index basename
subread-buildindex \
  -o Homo_sapiens.GRCh38_release95.genome \
  Homo_sapiens.GRCh38_release95.genome.fa
# Index basename, input directory and output directory
index="/teach/database/genome/Ensembl/Homo_sapiens/GRCh38_release95/Homo_sapiens.GRCh38_release95.genome"
inputdir="/trainee2/Oct12/project/project_test/data/cleandata/trim_galore"
outdir="/trainee2/Oct12/project/project_test/Mapping/subjunc"
# Align one paired-end sample with subjunc; stdout and stderr both go to the log file
subjunc -T 3 -i "${index}" \
  -r "${inputdir}/SRR1039510_1_val_1.fq.gz" \
  -R "${inputdir}/SRR1039510_2_val_2.fq.gz" \
  -o "${outdir}/SRR1039510.Subjunc.bam" \
  >"${outdir}/SRR1039510.Subjunc.log" 2>&1
# Batch mode for subjunc: generate subjunc.sh with one
# "subjunc && samtools sort && samtools index" command line per sample ID.
# ${index}, ${inputdir}, ${outdir} and ${id} are expanded now, while the
# script is being generated.
# `|| [ -n "${id}" ]` keeps the last ID when the list has no trailing newline;
# blank lines are skipped so they do not produce bogus commands.
while read -r id || [ -n "${id}" ]; do
  [ -n "${id}" ] || continue
  printf '%s\n' "subjunc -T 1 -i ${index} -r ${inputdir}/${id}_1_val_1.fq.gz -R ${inputdir}/${id}_2_val_2.fq.gz -o ${outdir}/${id}.Subjunc.bam 1>${outdir}/${id}.Subjunc.log 2>&1 && samtools sort -@ 6 -o ${outdir}/${id}.Subjunc.sorted.bam ${outdir}/${id}.Subjunc.bam && samtools index ${outdir}/${id}.Subjunc.sorted.bam ${outdir}/${id}.Subjunc.sorted.bam.bai"
done </teach/data/airway/sra/sampleId.txt >subjunc.sh
# Run the generated script in the background
nohup sh subjunc.sh >subjunc.log &
SAM/BAM格式
data:image/s3,"s3://crabby-images/dba62/dba6204fbbdfdac2f98e396094fb274e5294e108" alt=""
data:image/s3,"s3://crabby-images/0abce/0abceb0e9138712fa9b2dedb3cfc8bf75622657e" alt=""
data:image/s3,"s3://crabby-images/1fcb0/1fcb0028776b7633376440e63683dfa9a7682ef7" alt=""
data:image/s3,"s3://crabby-images/f20f0/f20f0628f946577beb5b936dc2c943d994872832" alt=""
-
统计比对结果
image
# Enter the alignment directory
# NOTE(review): this path differs from the output directory defined for the
# hisat2 step (~/project/Human_16-Asthma-Trans/Mapping/Hisat2) — confirm which
# one is correct for your setup.
cd ~/project/Human-16-Asthma-Trans/Mapping/Hisat
# Single sample
samtools flagstat -@ 3 SRR1039510.Hisat_aln.sorted.bam
# All samples: write one flagstat command per sorted BAM into flagstat.sh.
# The glob is iterated directly instead of parsing `ls`, and only the trailing
# .bam extension is swapped for .flagstat.
for bam in *.sorted.bam; do
  [ -e "${bam}" ] || continue # the glob matched nothing
  printf '%s\n' "samtools flagstat -@ 1 ${bam} > ${bam%.bam}.flagstat"
done >flagstat.sh
# Run the generated script in the background
nohup sh flagstat.sh >flagstat.log &
# QC summary (run this once the background job has finished)
multiqc -o ./ *.flagstat