2020-06-11 单细胞测序DropSeq比对实战

2020-06-11 本文已影响0人王子威PtaYoth
#!/bin/bash

# 正式开始之前，先准备一系列所需文件

# create_Drop-seq_reference_metadata.sh
# 该脚本生成DropSeq Alignment所需文件
# fasta 参考基因组序列
# dict picard软件所需文件
# gtf 基因组注释文件
# refFlat picard所钟爱的类似gtf的文件格式
# gene.intervals 用于方便查看bam文件中比对至基因的读段
# exon.intervals 用于方便查看bam文件中比对至外显子的读段
# rRNA.intervals 用于方便查看bam文件中比对至rRNA读段的比例
# reduced.gtf gtf文件的子集 可读性更好

# create_Drop-seq_reference_metadata.sh \
# -n hg19 \
# -r hg19.fa.gz \
# -g gencode.v34lift37.annotation.gtf.gz \
# -s Human \
# -d /root/Drop-seq_tools-2.3.0/ \

# ls *.gz | awk 'BEGIN {FS = "."}{print $1}' | uniq | while read samplename; do
for samplename in CHP2N_2 CHP2T_2; do
    
    mkdir $samplename
    cd $samplename
    mkdir tmp

    echo "============FastqToSam============"
    # https://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups
    # More information about the definition of "read group". 
    # Generally, reads sequenced from a single lane are considered as a read group. 
    # We can use command:
    # ls *.gz | awk 'BEGIN {FS = "_"}{print $1"_"$2}'
    # to get sample infomation, 
    # and:
    # ls *.gz | awk 'BEGIN {FS = "_"}{print $3}'
    # to get lane infomation.
    picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp FastqToSam \
    F1=../$samplename\.raw_1.fq.gz \
    F2=../$samplename\.raw_2.fq.gz \
    O=unaligned_data.bam \
    SM=$samplename \
    RG=$samplename

    echo "============TagBamWithReadSequenceExtended============"
    # 这一步发现部分reads的第3位碱基为N，碱基质量Phred33值为#，对应0.63的错误率，会导致cell barcode和molecular barcode的移位
    # BASE_QUALITY=10这一参数会过滤掉这些reads
    # 1.Tag cell barcodes
    TagBamWithReadSequenceExtended \
    INPUT=unaligned_data.bam \
    OUTPUT=unaligned_tagged_Cell.bam \
    SUMMARY=unaligned_tagged_Cellular.bam_summary.txt \
    BASE_RANGE=1-12 \
    BASE_QUALITY=10 \
    BARCODED_READ=1 \
    DISCARD_READ=False \
    TAG_NAME=XC \
    NUM_BASES_BELOW_QUALITY=1

    echo "rm -f unaligned_data.bam"
    rm -f unaligned_data.bam

    # 2. Tag molecular barcodes
    TagBamWithReadSequenceExtended \
    INPUT=unaligned_tagged_Cell.bam \
    OUTPUT=unaligned_tagged_CellMolecular.bam \
    SUMMARY=unaligned_tagged_Molecular.bam_summary.txt \
    BASE_RANGE=13-20 \
    BASE_QUALITY=10 \
    BARCODED_READ=1 \
    DISCARD_READ=True \
    TAG_NAME=XM \
    NUM_BASES_BELOW_QUALITY=1

    echo "rm -f unaligned_tagged_Cell.bam"
    rm -f unaligned_tagged_Cell.bam

    echo "============FilterBam============"
    FilterBam \
    TAG_REJECT=XQ \
    INPUT=unaligned_tagged_CellMolecular.bam \
    OUTPUT=unaligned_tagged_filtered.bam

    echo "rm -f unaligned_tagged_CellMolecular.bam"
    rm -f unaligned_tagged_CellMolecular.bam

    # 查看保留的reads数
    # samtools view unaligned_tagged_filtered.bam | wc -l 

    echo "============TrimStartingSequence============"
    # 3. Trim 5’ primer sequence

    # https://support.illumina.com/bulletins/2016/04/adapter-trimming-why-are-adapter-sequences-trimmed-from-only-the--ends-of-reads.html
    # 5' adaptor sequence won't be found in reads using illumina seqtech

    # TrimStartingSequence \
    # INPUT=unaligned_tagged_filtered.bam \
    # OUTPUT=unaligned_tagged_trimmed_smart.bam \
    # OUTPUT_SUMMARY=adapter_trimming_report.txt \
    # SEQUENCE=AAGCAGTGGTATCAACGCAGAGTGAATGGG \
    # MISMATCHES=0 \
    # NUM_BASES=5

    echo "============PolyATrimmer============"
    # 4. Trim 3’ polyA sequence

    PolyATrimmer \
    INPUT=unaligned_tagged_filtered.bam \
    OUTPUT=unaligned_mc_tagged_polyA_filtered.bam \
    OUTPUT_SUMMARY=polyA_trimming_report.txt \
    # MISMATCHES=0 \
    # NUM_BASES=6 \
    # USE_NEW_TRIMMER=true

    echo "rm -f unaligned_tagged_filtered.bam"
    rm -f unaligned_tagged_filtered.bam

    # unaligned_mc_tagged_polyA_filtered.bam在retrieve tag infomation时仍需用到

    echo "============SamToFastq============"
    # 5. SamToFastq

    picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp SamToFastq \
    INPUT=unaligned_mc_tagged_polyA_filtered.bam \
    FASTQ=unaligned_mc_tagged_polyA_filtered.fastq

    echo "============STAR★alignment============"
    # 6. STAR★alignment

    STAR \
    --genomeDir /data/wangzw/dropseqMetadata/STAR \
    --readFilesIn unaligned_mc_tagged_polyA_filtered.fastq \
    --outFileNamePrefix star

    echo "rm -f unaligned_mc_tagged_polyA_filtered.bam"
    rm -f unaligned_mc_tagged_polyA_filtered.fastq

    echo "============SortSam============"
    # 7.Sort STAR alignment in queryname order

    picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp SortSam \
    I=starAligned.out.sam \
    O=aligned.sorted.bam \
    SO=queryname

    echo "rm -f starAligned.out.sam"
    rm -f starAligned.out.sam

    echo "============MergeBamAlignment============"
    # 8. Merge STAR alignment tagged SAM to recover cell/molecular barcodes

    picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp MergeBamAlignment \
    REFERENCE_SEQUENCE=/data/wangzw/dropseqMetadata/hg19.fasta.gz \
    UNMAPPED_BAM=unaligned_mc_tagged_polyA_filtered.bam \
    ALIGNED_BAM=aligned.sorted.bam \
    OUTPUT=merged.bam \
    INCLUDE_SECONDARY_ALIGNMENTS=false \
    PAIRED_RUN=false

    echo "rm -f unaligned_mc_tagged_polyA_filtered.bam aligned.sorted.bam"
    rm -f unaligned_mc_tagged_polyA_filtered.bam aligned.sorted.bam

    echo "============TagReadWithGeneExon============"
    # 9.Add gene/exon and other annotation tags
    TagReadWithGeneFunction \
    I=merged.bam \
    O=star_gene_exon_tagged.bam \
    ANNOTATIONS_FILE=/data/wangzw/dropseqMetadata/hg19.refFlat

    echo "rm -f merged.bam"
    rm -f merged.bam

    # echo "============DetectBeadSubstitutionErrors============"
    # 10.Barcode Repair
    # this step is based on multiple experimental observations, while the underlying theory is unclear.
    ## 10.1 Repair substitution errors (DetectBeadSubstitutionErrors)

    # DetectBeadSubstitutionErrors \
    # I=my.bam \
    # O=my_clean_subtitution.bam \
    # OUTPUT_REPORT=my_clean.substitution_report.txt

    # echo "============DetectBeadSynthesisErrors============"
    # 10.Barcode Repair
    ## 10.2 Repair indel errors (DetectBeadSynthesisErrors)
    ### SYNTHESIS_MISSING_BASE
    ### SINGLE_UMI_ERROR
    ### PRIMER_MATCH
    ### OTHER

    # DetectBeadSynthesisErrors \
    # I=star_gene_exon_tagged.bam \
    # O=clean.bam \
    # REPORT=clean.indel_report.txt \
    # OUTPUT_STATS=synthesis_stats.txt \
    # SUMMARY=synthesis_stats.summary.txt

    # DigitalExpression函数的Filter Option只支持单个
    DigitalExpression \
    I=star_gene_exon_tagged.bam \
    O=$samplename\_out_gene_exon_tagged.dge.txt.gz \
    SUMMARY=$samplename\_out_gene_exon_tagged.dge.summary.txt \
    MIN_NUM_TRANSCRIPTS_PER_CELL=500
    

    cd ..
    # linux命令替换
    currentTime=`date`
    echo "$samplename has finished. $currentTime."
done
参考内容：http://mccarrolllab.org/dropseq/
2020-06-11 单细胞测序DropSeq比对实战

猜你喜欢

热点阅读