RNAseq006 转录组入门(6):reads计数
2020-09-13 本文已影响0人
caoqiansheng
传送门:
RNAseq005 转录组入门(5):序列比对
RNAseq004 转录组入门(4):参考基因组下载
RNAseq003 转录组入门(3):了解fastq测序数据
RNAseq002 转录组入门(2):数据下载
RNAseq001 转录组入门(1):资源准备
前面的五章我们分析的是人类mRNA-Seq测序的结果,一般而言RNA-Seq数据分析都要有重复,而文章中有一个样本缺少配对数据,所以还是选用小鼠的数据把流程再来一遍
1.数据下载及质控见前文
2.比对
# HISAT2比对
for i in {59..62};do hisat2 -t -x /mnt/e/Work/bioinfo/public/index/mouse/hisat2/grcm38/genome -1 /mnt/e/Work/bioinfo/project/202009_RNAseq/data/SRR35899${i}_1.fastq.gz -2 /mnt/e/Work/bioinfo/project/202009_RNAseq/data/SRR35899${i}_2.fastq.gz -S /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899${i}.sam;done
3.SAM2BAM
# SAM文件转换为BAM
for i in `seq 59 62`
do
samtools view -S SRR35899${i}.sam -b > SRR35899${i}.bam
done
4.bam flag统计
# 对排序后的bam统计flagstat
# basename命令用于获取路径中的文件名或路径名,可以对末尾的扩展名进行删除
ls *.bam |while read id ;do (samtools flagstat -@ 1 $id > $(basename ${id} ".bam").flagstat );done
mkdir flagstat && mv *.flagstat flagstat && cd flagstat
multiqc ./
4.1 用一个小脚本把统计信息转换为csv文件
# 创建脚本
cat > stat.sh
### 将以下内容写入stat.sh
#!/bin/bash
cat *.flagstat | awk '{print $1}' | paste - - - - - - - - - - - - - > file1
# 77607517 16671207 0 0 75387881 60936310 30468155 30468155 56502696 57494864 1221810 832364 530657
# 134310379 28365145 0 0 130964009 105945234 52972617 52972617 98979648 100621038 1977826 1398380 907493
# 94264829 20737377 0 0 91921243 73527452 36763726 36763726 68525830 69723750 1460116 1023854 644490
# 111681106 24075844 0 0 109169544 87605262 43802631 43802631 82145504 83390620 1703080 1013088 643888
# 取行名
cut -d"+" -f 2 SRR3589959.flagstat | cut -d" " -f 3-90 > file2
# in total (QC-passed reads
# secondary
# supplementary
# duplicates
# mapped (97.14% : N/A)
# paired in sequencing
# read1
# read2
# properly paired (92.72% : N/A)
# with itself and mate mapped
# singletons (2.01% : N/A)
# with mate mapped to a different chr
# with mate mapped to a different chr (mapQ>=5)
# 取列名
ls *.flagstat | while read id ;do echo $(basename ${id} ".flagstat") ;done > file3
# SRR3589959
# SRR3589960
# SRR3589961
# SRR3589962
paste file3 file1 > file4
# 将file4行列转置
awk '{
for (i=1;i<=NF;i++){
if (NR==1){
res[i]=$i
}
else{
res[i]=res[i]" "$i
}
}
}END{
for(j=1;j<=NF;j++){
print res[j]
}
}' file4 > file5
# 在file2首行加入内容
sed '1i Index' file2 > file6
paste file6 file5 > stat.txt
cat stat.txt > stat.csv
rm file*
# 脚本内容截止
# ==========================================================================
# 退出脚本编辑Enter,ctrl+c
# 运行脚本
bash stat.sh
4.2 csv文件处理
image.png-
csv文件打开后是这个样子:
image.png -
选中第一列→数据→分列→分隔符→选择tab分隔
image.png -
选中第二列→数据→分列→分隔符→选择空格分隔
image.png -
转换完成
image.png
5.bam排序,索引
# 排序,索引
for i in `seq 59 62`
do
samtools sort SRR35899${i}.bam -o SRR35899${i}_sorted.bam
samtools index SRR35899${i}_sorted.bam
done
# 将SAM转换为BAM,并排序构建索引,随后删除SAM文件
# for i in `seq 59 62`
# do
# samtools view -S SRR35899${i}.sam -b > SRR35899${i}.bam
# samtools sort SRR35899${i}.bam -o SRR35899${i}_sorted.bam
# samtools index SRR35899${i}_sorted.bam
# done
# rm *.sam
6 注释
# 注释
for i in {59..62}
do
htseq-count -s no -f bam -r pos /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899${i}_sorted.bam /mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3 > /mnt/e/Work/bioinfo/project/202009_RNAseq/result/annotation/SRR35899${i}.count
done
# 代码运行报错
# Please Install PySam to use the BAM_Reader Class (http://code.google.com/p/pysam/)Error occured when reading beginning of BAM file.
# No module named pysam
# [Exception type: ImportError, raised in __init__.py:1086]
# 解决办法
# 下载pysam源代码
# 下载地址:https://pypi.org/project/pysam/#files
# 复制下载链接放入迅雷:https://files.pythonhosted.org/packages/99/5a/fc440eb5fffb5346e61a38b49991aa552e4b8b31e8493a101d2833ed1e19/pysam-0.16.0.1.tar.gz
cd ~/biosoft
mkdir pysam && cd pysam
wget https://files.pythonhosted.org/packages/99/5a/fc440eb5fffb5346e61a38b49991aa552e4b8b31e8493a101d2833ed1e19/pysam-0.16.0.1.tar.gz
tar zxvf pysam-0.16.0.1.tar.gz
cd pysam-0.16.0.1
python setup.py install
# 报错
# Traceback (most recent call last):
# File "setup.py", line 24, in <module>
# from setuptools import setup, find_packages
# ImportError: No module named setuptools
# python2环境下安装setuptools
sudo apt-get install python-setuptools
# python3环境下安装setuptools
sudo apt-get install python3-setuptools
# 再次执行安装
sudo python setup.py install
# 再次运行注释
# 构建脚本
cat > annotation.sh
#### 输入以下内容
#!/bin/bash
for i in {59..62}
do
# .sorted.bam地址
input="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899${i}_sorted.bam"
# .gtf地址
annotation="/mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3"
# 输出文件地址
output="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/annotation"
htseq-count -s no -f bam -r pos ${input} ${annotation} > ${output}/SRR35899${i}.count
done
# ===============================
# 运行
bash annotation.sh
7 featureCounts统计
# featureCounts计数
featureCounts -p -t exon -g gene_id -a /mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3 -o /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899{59..62}_sorted.bam
# 运行后报错
# featurecounts segmentation fault (core dumped)
# 解决办法
# 下载二进制版本subread
rm -rf ~/biosoft/subread
mkdir -p ~/biosoft/subread && cd ~/biosoft/subread
wget https://nchc.dl.sourceforge.net/project/subread/subread-2.0.1/subread-2.0.1-Linux-x86_64.tar.gz
tar zxvf subread-2.0.1-Linux-x86_64.tar.gz
cd subread-2.0.1-Linux-x86_64
cd ~/biosoft/subread/subread-2.0.1-Linux-x86_64/bin
./featureCounts
echo "export PATH=\$PATH:/home/cqs/biosoft/subread/subread-2.0.1-Linux-x86_64/bin" >> ~/.bashrc
source ~/.bashrc
featureCounts
# 再次运行代码
featureCounts -p -t exon -g gene_id -a /mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3 -o /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt /mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899{59..62}_sorted.bam
# 对all.id.txt.summary进行multiqc,查看Counts质控
multiqc ./all.id.txt.summary
# [INFO ] multiqc : This is MultiQC v1.9
# [INFO ] multiqc : Template : default
# [INFO ] multiqc : Searching : /mnt/e/Work/bioinfo/project/202009_RNAseq/result/count/all.id.txt.summary
# Searching 1 files.. [####################################] 100%
# [INFO ] feature_counts : Found 4 reports
# [INFO ] multiqc : Compressing plot data
# [INFO ] multiqc : Report : multiqc_report.html
# [INFO ] multiqc : Data : multiqc_data
# [INFO ] multiqc : MultiQC complete
image.png
8.htseq-count统计
cat > htseq-count.sh
### 输入以下内容
#!/bin/bash
for i in {59..62}
do
# .sorted.bam地址
input="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/align/20200910mouse/SRR35899${i}_sorted.bam"
# .gtf地址
annotation="/mnt/e/Work/bioinfo/public/Annotation/mouse/gencode/gencode.vM25.annotation.gff3"
# 输出文件地址
output="/mnt/e/Work/bioinfo/project/202009_RNAseq/result/annotation"
htseq-count -s no -f bam -r pos ${input} ${annotation} > ${output}/SRR35899${i}.count
echo "SRR35899${i}.count is completed"
done
#==========================
# 运行脚本
bash htseq-count.sh
htseq-count.sh运行结果