RNAseq01(hisat2-stringtie-ballgo
2020-05-28 本文已影响0人
木石以东
1. Hisat2-featureCounts流程
- 1.1 shell-script001<hisat2_featureCounts.sh>
#/bin/bash
INPUT_DIR=$1 # fq.gz所在文件夹
NAME_FILE1=$2 # R1.fq.gz
NAME_FILE2=$3 # R2.fq.gz
INDEX=$4 # index文件夹/index文件名共同部分
GTF=$5 # GTF文件
OUT_DIR=$6 # 输出文件夹,各参数末位不能有/
file1=($(find $INPUT_DIR -name "$NAME_FILE1"))
file2=($(find $INPUT_DIR -name "$NAME_FILE2"))
echo ${#file1[*]} 'files found...'
NUM=${#file1[*]}
for ((i=0;i<NUM;i=i+1))
do
echo '====='$i'====='
echo ${file1[i]} .
echo ${file2[i]} !
PRE=${file1[i]##*/}
PRE=${PRE:0:2}
echo $PRE
done
for ((i=0;i<NUM;i=i+1))
do
echo $i '|' ${file1[i]} '<==>' ${file2[i]}
echo "=================================="
echo '**1 Map to the reference genome'
hisat2 -p 10 --dta -x ${INDEX} -1 ${file1[i]} -2 ${file2[i]} -S ${OUT_DIR}/${PRE}.sam
echo '**2 Sort and convert SAM to BAM'
samtools sort -@ 10 -o ${OUT_DIR}/${PRE}.bam ${OUT_DIR}/${PRE}.sam
echo '**3 featureCounts'
featureCounts -T 10 -p -t exon -g gene_id -a ${GTF} -o ${OUT_DIR}/featurecounts_${PRE}.txt ${OUT_DIR}/${PRE}.bam
done
- 1.2. 运行脚本(conda 环境下,目录为X_data)
./hisat2_featureCounts.sh cleandata *clean_R1.fq.gz *clean_R2.fq.gz indexes/hg38/genome genes/gencode.v34.annotation.gtf results
2. Stringtie-ballgown流程(承接脚本1的Hisat2-featureCounts流程)
- 2.1 shell-script002<stringtie_ballgown.sh>
#/bin/bash
BAM_DIR=$1 # bam文件所在文件夹
GTF=$2 # GTF文件
OUT_DIR=$3 # 输出文件夹,,,各变量末尾不能带/
BAM=$(find ${BAM_DIR} -name "*bam")
for b in $BAM
do
echo "=================="
echo $b
PRE=${b##*/}
PRE=${PRE:0:2}
echo $PRE
stringtie -p 10 -G ${GTF} -o ${OUT_DIR}/${PRE}.gtf -l ${PRE} ${OUT_DIR}/${PRE}.bam
done
echo -------------
echo 'merge transcripts'
ls ${OUT_DIR}/*.gtf > ${OUT_DIR}/mergelist.txt
cat ${OUT_DIR}/mergelist.txt
stringtie --merge -p 10 -G ${GTF} -o ${OUT_DIR}/stringtie_merged.gtf ${OUT_DIR}/mergelist.txt
echo -------------
echo 'estimate transcript and prepare for Ballgown'
for b in $BAM
do
echo "..........."
echo $b
PRE=${b##*/}
PRE=${PRE:0:2}
echo $PRE
stringtie -e -B -p 10 -G ${OUT_DIR}/stringtie_merged.gtf -o ${OUT_DIR}/ballgown/${PRE}/${PRE}.gtf ${OUT_DIR}/${PRE}.bam
done
date
- 2.2. 运行脚本(conda 环境下,目录为X_data)
./stringtie_ballgown.sh results genes/gencode.v34.annotation.gtf results
附录1. (项目文件目录)
X_data$ tree ./
./
├── cleandata
│ ├── A2
│ │ ├── A2_clean_R1.fq.gz
│ │ ├── A2_clean_R2.fq.gz
│ │ ├── A2_unpaired_R1.fq.gz
│ │ └── A2_unpaired.R2.fq.gz
│ ├── A3
│ │ ├── A3_clean_R1.fq.gz
│ │ ├── A3_clean_R2.fq.gz
│ │ ├── A3_unpaired_R1.fq.gz
│ │ └── A3_unpaired.R2.fq.gz
│ └── ######### 略
├── genes
│ └── gencode.v34.annotation.gtf
├── indexes
│ └── hg38
│ ├── genome.1.ht2
│ ├── genome.2.ht2
│ ├── genome.3.ht2
│ ├── genome.4.ht2
│ ├── genome.5.ht2
│ ├── genome.6.ht2
│ ├── genome.7.ht2
│ ├── genome.8.ht2
│ └── make_hg38.sh
├── QC
│ ├── A2_clean_R1_fastqc.html
│ ├── A2_clean_R1_fastqc.zip
│ ├── A2_clean_R2_fastqc.html
│ ├── ########## 略
│ ├── multiqc_data
│ │ ├── multiqc_fastqc.txt
│ │ ├── multiqc_general_stats.txt
│ │ ├── multiqc.log
│ │ └── multiqc_sources.txt
│ └── multiqc_report.html
├── rawdata
│ ├── A2
│ │ ├── A2_combined_R1.fastq.gz
│ │ └── A2_combined_R2.fastq.gz
│ ├── A3
│ │ ├── A3_combined_R1.fastq.gz
│ │ └── A3_combined_R2.fastq.gz
│ ├── B2
│ │ ├── B2_combined_R1.fastq.gz
│ │ └── B2_combined_R2.fastq.gz
│ ├── C2
│ │ ├── C2_combined_R1.fastq.gz
│ │ └── C2_combined_R2.fastq.gz
│ ├── D2
│ │ ├── D2_combined_R1.fastq.gz
│ │ └── D2_combined_R2.fastq.gz
│ └── md5.md5
├── results
│ ├── ######## 略
├── qc.sh # 质控
├── trim.sh # 去街头
├── hisat2_featureCounts.sh
├── stringtie_ballgown.sh
└── makeIndex.sh # 去街头
附录2. (质量控制、去接头、hisat2自建索引脚本)
# shell_1 质量控制===========================
$ cat qc.sh
#!/bin/bash
# $1 clean_data所在文件夹
# $2 正则表达式
# $3 输出文件夹
INPUT_DIR=$1
NAME_FILE=$2
OUT_DIR=$3
find ${INPUT_DIR} -name "${NAME_FILE}" | xargs fastqc -t 10 -o ${OUT_DIR} # samples目录下,QC目录需要提前建好
cd ${OUT_DIR} && multiqc .
# shell_2 去接头===================================================
$ cat trim.sh
#!/bin/bash
# rawdata所在目录 $1
RAW_DIR=$1
File1=($(find $RAW_DIR -name '*R1.fastq.gz'))
File2=($(find $RAW_DIR -name '*R2.fastq.gz'))
for ((i=0;i<5;i++))
do
echo $i '||' ${File1[i]} '<--->' ${File2[i]}
PRE=${File1[i]##*/}
PRE=${PRE:0:2}
echo $PRE
done
echo '================'
date
for ((i=0;i<5;i++))
do
echo $i '||' ${File1[i]} '<--->' ${File2[i]}
PRE=${File1[i]##*/}
PRE=${PRE:0:2}
echo $PRE
trimmomatic PE -threads 8 ${File1[i]} ${File2[i]} ${PRE}_clean_R1.fq.gz ${PRE}_unpaired_R1.fq.gz ${PRE}_clean_R2.fq.gz ${PRE}_unpaired.R2.fq.gz ILLUMINACLIP:/home/jun/miniconda3/envs/RNAseq/share/trimmomatic/adapters/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
done
date
# shell_3. make index ==========================================
cat makeIndex.sh
#!/bin/bash
# 注意gtf,fa文件所在路径,输出文件夹index下要提前创建好目录结构(共同文件名要写)
hisat2_extract_splice_sites.py ./genes/gencode.v32.annotation.gtf > genome.ss
hisat2_extract_exons.py genes/gencode.v32.annotation.gtf > genome.exon
hisat2-build --ss genome.ss --exon genome.exon genome/GRCh38.p13.genome.fa indexes/genecodeV32/genome