Linux-NGS-二代测序分析流程ATACSeq 开放染色质分析

ATAC-seq上游分析

2019-09-27  本文已影响0人  大吉岭猹

1. 环境配置及软件安装

1.1. 安装conda并配置镜像

conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda
conda config --set show_channel_urls yes
cat ~/.condarc

1.2. 创建小环境并安装软件

conda create -n atac python=2
conda install -y sra-tools trim-galore samtools bedtools deeptools homer meme macs2 bowtie bowtie2 multiqc sambamba

1.3. 软件环境可以移植

conda activate target_env
conda env export > environment.yml
#换用户/服务器
conda env create -f exvironment.yml

2. (若使用公共数据)下载数据

mkdir {ssr_public,fq_ATAC,clean_ATAC,qc_ATAC,align_ATAC,peaks_ATAC,motif_ATAC}
cd ssr_public
conda activate atac
cat SRR_Acc_List.txt | while read id; do (nohup prefetch $id -O ./ &); done #先跑单个数据,再跑循环

3. 转fastq

ls SRR* | while read id; do (nohup fastq-dump --gzip --split-3 -O ../fq_ATAC $id &); done

4. 测序数据的过滤和质控

cd ../fq_ATAC
#vim config.raw
cat config.raw | while read id;
    do echo $id
    arr=($id)
    fq2=${arr[2]}
    fq1=${arr[1]}
    nohup trim_galore -q 25 --phred33 --length 35 -e 0.1 --stringency 4 --paired -o clean_ATAC/ $fq1 $fq2 &
    done
cd ../fq_ATAC
fastqc -t 5 *gz -o ./qc
multiqc ./qc #整合结果
cd ../clean_ATAC
fastqc -t 5 *gz -o ./qc
multiqc ./qc #整合结果

5. 比对

bowtie2_index=/trainee/huangjing/project/reference/index/bowtie2/mm10
#vim config.clean
#自行构建config.clean,和前面的一样
cat config.clean|while read id;
do
    echo $id
    arr=($id)
    fq2=${arr[2]}
    fq1=${arr[1]}
    tmp=${fq1##*/}
    tmpp=${tmp%%.*}
    sample=${tmpp%_*} #去头去尾
    bowtie2 -p 5 --very-sensitive -X 2000 -x $bowtie2_index -1 $fq1 -2 $fq2 | samtools sort -O bam -@ 5 -o - > ${sample}.raw.bam
    samtools index ${sample}.raw.bam #构建索引
    samtools flagstat ${sample}.raw.bam > ${sample}.raw.stat #评估比对率
    sambamba markdup --overflow-list-size 600000 --tmpdir='./' -r ${sample}.raw.bam  ${sample}.sambamba.rmdup.bam #去除PCR重复
    samtools index ${sample}.sambamba.rmdup.bam #构建索引
    samtools flagstat ${sample}.sambamba.rmdup.bam > ${sample}.sambamba.rmdup.stat #评估比对率
    samtools view -h -f 2 -q 30 ${sample}.sambamba.rmdup.bam | grep -v chrM | samtools sort -O bam -@ 5 -o - > ${sample}.last.bam #接下来只保留两条reads要比对到同一条染色体(Proper paired) ,还有高质量的比对结果(Mapping quality>=30),顺便去除线粒体DNA
    samtools index ${sample}.last.bam #构建索引
    samtools flagstat ${sample}.last.bam > ${sample}.last.stat #评估比对率
    bedtools bamtobed -i ${sample}.last.bam  > ${sample}.bed #转bed文件
done

6. 自定义分箱count

# 首先拿到基因组坐标染色体坐标
pip install pyfaidx
faidx mm10.fa -i chromsizes > sizes_mm10.genome
# 然后使用 bedtools 工具基因组染色体坐标进行窗口划分
bedtools makewindows -g sizes_mm10.genome -w 700 > 700.bed
# 对bam文件进行计算
ls ../align_ATAC/*.last.bam | while read id; do(bedtools multicov -bams $id -bed 700.bed > $(basename $id .last.bam)_700_counts.txt);done

7. peaks calling

ls *.last.bam | while read id; do (macs2 callpeak -t $id  -g mm --nomodel --shift -100 --extsize 200 -n ${id%%.*} --outdir ../peaks_ATAC/); done

友情宣传

上一篇下一篇

猜你喜欢

热点阅读