生信

RNAseq01(hisat2-stringtie-ballgo

2020-05-28  本文已影响0人  木石以东

1. Hisat2-featureCounts流程


#/bin/bash

INPUT_DIR=$1    # fq.gz所在文件夹
NAME_FILE1=$2   # R1.fq.gz
NAME_FILE2=$3   # R2.fq.gz
INDEX=$4        # index文件夹/index文件名共同部分
GTF=$5          # GTF文件
OUT_DIR=$6      # 输出文件夹,各参数末位不能有/

file1=($(find $INPUT_DIR -name "$NAME_FILE1"))
file2=($(find $INPUT_DIR -name "$NAME_FILE2"))

echo ${#file1[*]} 'files found...'
NUM=${#file1[*]} 

for ((i=0;i<NUM;i=i+1))
do
        echo '====='$i'====='
        echo ${file1[i]} .
        echo ${file2[i]} !

        PRE=${file1[i]##*/}
        PRE=${PRE:0:2}
        echo $PRE
done

for ((i=0;i<NUM;i=i+1))
do
        echo $i '|'  ${file1[i]} '<==>' ${file2[i]}
        echo "=================================="
        echo '**1 Map to the reference genome'
        hisat2 -p 10 --dta -x ${INDEX} -1 ${file1[i]} -2 ${file2[i]} -S ${OUT_DIR}/${PRE}.sam
        echo '**2 Sort and convert SAM to BAM'
        samtools sort -@ 10 -o ${OUT_DIR}/${PRE}.bam ${OUT_DIR}/${PRE}.sam
        echo '**3 featureCounts'
        featureCounts -T 10 -p -t exon -g gene_id -a ${GTF} -o ${OUT_DIR}/featurecounts_${PRE}.txt ${OUT_DIR}/${PRE}.bam
done
./hisat2_featureCounts.sh cleandata *clean_R1.fq.gz *clean_R2.fq.gz indexes/hg38/genome genes/gencode.v34.annotation.gtf results

2. Stringtie-ballgown流程(承接脚本1的Hisat2-featureCounts流程)


#/bin/bash

BAM_DIR=$1  # bam文件所在文件夹
GTF=$2      # GTF文件
OUT_DIR=$3  # 输出文件夹,,,各变量末尾不能带/


BAM=$(find ${BAM_DIR} -name "*bam")


for b in $BAM
do
        echo  "=================="
        echo $b
        PRE=${b##*/}
        PRE=${PRE:0:2}
        echo $PRE

        stringtie -p 10 -G ${GTF} -o ${OUT_DIR}/${PRE}.gtf -l ${PRE} ${OUT_DIR}/${PRE}.bam
done


echo -------------
echo 'merge transcripts'
ls ${OUT_DIR}/*.gtf > ${OUT_DIR}/mergelist.txt
cat ${OUT_DIR}/mergelist.txt
stringtie --merge -p 10 -G ${GTF} -o ${OUT_DIR}/stringtie_merged.gtf ${OUT_DIR}/mergelist.txt

echo -------------
echo 'estimate transcript and prepare for Ballgown'

for b in $BAM
do
        echo "..........."
        echo $b
        PRE=${b##*/}
        PRE=${PRE:0:2}

        echo $PRE
        stringtie -e -B -p 10 -G ${OUT_DIR}/stringtie_merged.gtf -o ${OUT_DIR}/ballgown/${PRE}/${PRE}.gtf ${OUT_DIR}/${PRE}.bam
done

date
./stringtie_ballgown.sh results genes/gencode.v34.annotation.gtf results

附录1. (项目文件目录)

X_data$ tree ./
./
├── cleandata
│   ├── A2
│   │   ├── A2_clean_R1.fq.gz
│   │   ├── A2_clean_R2.fq.gz
│   │   ├── A2_unpaired_R1.fq.gz
│   │   └── A2_unpaired.R2.fq.gz
│   ├── A3
│   │   ├── A3_clean_R1.fq.gz
│   │   ├── A3_clean_R2.fq.gz
│   │   ├── A3_unpaired_R1.fq.gz
│   │   └── A3_unpaired.R2.fq.gz   
│   └── ######### 略
├── genes
│   └── gencode.v34.annotation.gtf
├── indexes
│   └── hg38
│       ├── genome.1.ht2
│       ├── genome.2.ht2
│       ├── genome.3.ht2
│       ├── genome.4.ht2
│       ├── genome.5.ht2
│       ├── genome.6.ht2
│       ├── genome.7.ht2
│       ├── genome.8.ht2
│       └── make_hg38.sh
├── QC
│   ├── A2_clean_R1_fastqc.html
│   ├── A2_clean_R1_fastqc.zip
│   ├── A2_clean_R2_fastqc.html
│   ├── ########## 略
│   ├── multiqc_data
│   │   ├── multiqc_fastqc.txt
│   │   ├── multiqc_general_stats.txt
│   │   ├── multiqc.log
│   │   └── multiqc_sources.txt
│   └── multiqc_report.html
├── rawdata
│   ├── A2
│   │   ├── A2_combined_R1.fastq.gz
│   │   └── A2_combined_R2.fastq.gz
│   ├── A3
│   │   ├── A3_combined_R1.fastq.gz
│   │   └── A3_combined_R2.fastq.gz
│   ├── B2
│   │   ├── B2_combined_R1.fastq.gz
│   │   └── B2_combined_R2.fastq.gz
│   ├── C2
│   │   ├── C2_combined_R1.fastq.gz
│   │   └── C2_combined_R2.fastq.gz
│   ├── D2
│   │   ├── D2_combined_R1.fastq.gz
│   │   └── D2_combined_R2.fastq.gz
│   └── md5.md5
├── results
│   ├── ######## 略
├── qc.sh # 质控
├── trim.sh # 去街头
├── hisat2_featureCounts.sh
├── stringtie_ballgown.sh
└── makeIndex.sh # 去街头

附录2. (质量控制、去接头、hisat2自建索引脚本)

# shell_1 质量控制===========================
$ cat qc.sh
#!/bin/bash

# $1 clean_data所在文件夹
# $2 正则表达式
# $3 输出文件夹

INPUT_DIR=$1
NAME_FILE=$2
OUT_DIR=$3

find ${INPUT_DIR} -name "${NAME_FILE}" | xargs fastqc -t 10 -o ${OUT_DIR} # samples目录下,QC目录需要提前建好
cd ${OUT_DIR} && multiqc .

# shell_2 去接头===================================================
$ cat trim.sh 
#!/bin/bash

# rawdata所在目录 $1
RAW_DIR=$1

File1=($(find $RAW_DIR -name '*R1.fastq.gz'))
File2=($(find $RAW_DIR -name '*R2.fastq.gz'))

for ((i=0;i<5;i++))
do
        echo $i '||' ${File1[i]} '<--->' ${File2[i]}
        PRE=${File1[i]##*/}
        PRE=${PRE:0:2}
        echo $PRE
done

echo '================'
date


for ((i=0;i<5;i++))
do
        echo $i '||' ${File1[i]} '<--->' ${File2[i]}
        PRE=${File1[i]##*/}
        PRE=${PRE:0:2}
        echo $PRE
        trimmomatic PE -threads 8 ${File1[i]} ${File2[i]} ${PRE}_clean_R1.fq.gz ${PRE}_unpaired_R1.fq.gz ${PRE}_clean_R2.fq.gz ${PRE}_unpaired.R2.fq.gz ILLUMINACLIP:/home/jun/miniconda3/envs/RNAseq/share/trimmomatic/adapters/TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36
done

date

# shell_3. make index ==========================================
cat makeIndex.sh 
#!/bin/bash

# 注意gtf,fa文件所在路径,输出文件夹index下要提前创建好目录结构(共同文件名要写)
hisat2_extract_splice_sites.py ./genes/gencode.v32.annotation.gtf > genome.ss
hisat2_extract_exons.py genes/gencode.v32.annotation.gtf > genome.exon
hisat2-build --ss genome.ss --exon genome.exon genome/GRCh38.p13.genome.fa indexes/genecodeV32/genome
上一篇 下一篇

猜你喜欢

热点阅读