3D基因组科研信息学HiC分析

hic分析之分步工作原理

2019-08-21  本文已影响3人  Ray钱

参考之前的文章 hic分析之 hic-pro 介绍与安装

分步工作原理:

先看test文件

/PATH/TO/HiC-Pro_2.11.1/test-op/test_data
.
|-- dixon_2M
|   |-- SRR400264_00_R1.fastq.gz
|   `-- SRR400264_00_R2.fastq.gz
`-- dixon_2M_2
    |-- SRR400264_01_R1.fastq.gz
    `-- SRR400264_01_R2.fastq.gz

再看看opt文件

tree
.
|-- bowtie_results
|   |-- bwt2
|   |   |-- dixon_2M
|   |   |   |-- SRR400264_00_R1_hg19.bwt2merged.bam
|   |   |   |-- SRR400264_00_R1_hg19.mapstat
|   |   |   |-- SRR400264_00_R2_hg19.bwt2merged.bam
|   |   |   |-- SRR400264_00_R2_hg19.mapstat
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.bam
|   |   |   `-- SRR400264_00_hg19.bwt2pairs.pairstat
|   |   `-- dixon_2M_2
|   |       |-- SRR400264_01_R1_hg19.bwt2merged.bam
|   |       |-- SRR400264_01_R1_hg19.mapstat
|   |       |-- SRR400264_01_R2_hg19.bwt2merged.bam
|   |       |-- SRR400264_01_R2_hg19.mapstat
|   |       |-- SRR400264_01_hg19.bwt2pairs.bam
|   |       |-- SRR400264_01_hg19.bwt2pairs.pairstat
|   |-- bwt2_global
|   |   |-- dixon_2M
|   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.bam
|   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap.fastq
|   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.bam
|   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap.fastq
|   |   |-- dixon_2M_2
|   |       |-- SRR400264_01_R1_hg19.bwt2glob.bam
|   |       |-- SRR400264_01_R1_hg19.bwt2glob.unmap.fastq
|   |       |-- SRR400264_01_R2_hg19.bwt2glob.bam
|   |       |-- SRR400264_01_R2_hg19.bwt2glob.unmap.fastq
|   |-- bwt2_local
|       |-- dixon_2M
|       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
|       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq
|       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
|       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq
|       |-- dixon_2M_2
|           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bwt2loc.bam
|           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_trimmed.fastq
|           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bwt2loc.bam
|           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_trimmed.fastq
|-- config_test_latest.txt
|-- hic_results
|   |-- data
|   |   |-- dixon_2M
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.DEPairs
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.DumpPairs
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.FiltPairs
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.REPairs
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.RSstat
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.SCPairs
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.SinglePairs
|   |   |   |-- SRR400264_00_hg19.bwt2pairs.validPairs
|   |   |   |-- SRR400264_00_hg19.bwt2pairs_interaction.bam
|   |   |   `-- dixon_2M.allValidPairs
|   |   `-- dixon_2M_2
|   |       |-- SRR400264_01_hg19.bwt2pairs.DEPairs
|   |       |-- SRR400264_01_hg19.bwt2pairs.DumpPairs
|   |       |-- SRR400264_01_hg19.bwt2pairs.FiltPairs
|   |       |-- SRR400264_01_hg19.bwt2pairs.REPairs
|   |       |-- SRR400264_01_hg19.bwt2pairs.RSstat
|   |       |-- SRR400264_01_hg19.bwt2pairs.SCPairs
|   |       |-- SRR400264_01_hg19.bwt2pairs.SinglePairs
|   |       |-- SRR400264_01_hg19.bwt2pairs.validPairs
|   |       |-- SRR400264_01_hg19.bwt2pairs_interaction.bam
|   |       \-- dixon_2M_2.allValidPairs
|   |-- matrix
|   |   |-- dixon_2M
|   |   |   |-- iced
|   |   |   |   |-- 500000
|   |   |   |-- raw
|   |   |       |-- 1000000
|   |   |       |   |-- dixon_2M_1000000.matrix
|   |   |       |   |-- dixon_2M_1000000_abs.bed
|   |   |       |-- 500000
|   |   |           |-- dixon_2M_500000.matrix
|   |   |           |-- dixon_2M_500000_abs.bed
|   |   |-- dixon_2M_2
|   |       |-- raw
|   |           |-- 1000000
|   |           |   |-- dixon_2M_2_1000000.matrix
|   |           |   |-- dixon_2M_2_1000000_abs.bed
|   |           |-- 500000
|   |               |-- dixon_2M_2_500000.matrix
|   |               |-- dixon_2M_2_500000_abs.bed
|   |-- pic
|   |   |-- dixon_2M
|   |   |   |-- plotHiCContactRanges_dixon_2M.pdf
|   |   |   |-- plotHiCFragmentSize_dixon_2M.pdf
|   |   |   |-- plotHiCFragment_dixon_2M.pdf
|   |   |   |-- plotMappingPairing_dixon_2M.pdf
|   |   |   |-- plotMapping_dixon_2M.pdf
|   |   |-- dixon_2M_2
|   |       |-- plotHiCContactRanges_dixon_2M_2.pdf
|   |       |-- plotHiCFragmentSize_dixon_2M_2.pdf
|   |       |-- plotHiCFragment_dixon_2M_2.pdf
|   |       |-- plotMappingPairing_dixon_2M_2.pdf
|   |       |-- plotMapping_dixon_2M_2.pdf
|   |-- stats
|       |-- dixon_2M
|       |   |-- dixon_2M.mRSstat
|       |   |-- dixon_2M.mpairstat
|       |   |-- dixon_2M_R1.mmapstat
|       |   |-- dixon_2M_R2.mmapstat
|       |   |-- dixon_2M_allValidPairs.mergestat
|       |-- dixon_2M_2
|           |-- dixon_2M_2.mRSstat
|           |-- dixon_2M_2.mpairstat
|           |-- dixon_2M_2_R1.mmapstat
|           |-- dixon_2M_2_R2.mmapstat
|           |-- dixon_2M_2_allValidPairs.mergestat
|-- logs
|   |-- dixon_2M
|   |   |-- SRR400264_00_R1_bowtie2.log
|   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bowtie2.log
|   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_readsTrimming.log
|   |   |-- SRR400264_00_R2_bowtie2.log
|   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bowtie2.log
|   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_readsTrimming.log
|   |   |-- build_raw_maps.log
|   |   |-- ice_500000.log
|   |   |-- make_Rplots.log
|   |   |-- mapped_2hic_fragments.log
|   |   |-- mapping_combine.log
|   |   |-- mapping_stats.log
|   |   |-- mapping_step1.log
|   |   |-- mapping_step2.log
|   |   |-- mergeSAM.log
|   |   |-- merge_stats.log
|   |   |-- merge_valid_interactions.log
|   |   |-- plot_hic_contacts.Rout
|   |   |-- plot_hic_fragment.Rout
|   |   |-- plot_mapping_portion.Rout
|   |   |-- plot_pairing_portion.Rout
|   |-- dixon_2M_2
|       |-- SRR400264_01_R1_bowtie2.log
|       |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bowtie2.log
|       |-- SRR400264_01_R1_hg19.bwt2glob.unmap_readsTrimming.log
|       |-- SRR400264_01_R2_bowtie2.log
|       |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bowtie2.log
|       |-- SRR400264_01_R2_hg19.bwt2glob.unmap_readsTrimming.log
|       |-- build_raw_maps.log
|       |-- make_Rplots.log
|       |-- mapped_2hic_fragments.log
|       |-- mapping_combine.log
|       |-- mapping_stats.log
|       |-- mapping_step1.log
|       |-- mapping_step2.log
|       |-- mergeSAM.log
|       |-- merge_stats.log
|       |-- merge_valid_interactions.log
|       |-- plot_hic_contacts.Rout
|       |-- plot_hic_fragment.Rout
|       |-- plot_mapping_portion.Rout
|       |-- plot_pairing_portion.Rout
|-- rawdata -> /research/labs/molebio/ordog/shared/HiC-Pro/HiC-Pro_2.11.1/test-op/test_data
|-- tmp


工作原理介绍:


image.png

1.Reads Mapping

将fastq文件用bowtie2 mapping一次,看log:

cat mapping_step1.log

###mapping fastq to reference genome , 将unmapping的read 另存为 unmap.fastq
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder --un bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap.fastq --rg-id BMG --rg SM:SRR400264_00_R2 -p 20 -x /PATH/TO/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U rawdata/dixon_2M/SRR400264_00_R2.fastq.gz 2>> logs/dixon_2M/SRR400264_00_R2_bowtie2.log| /PATH/TO/samtools/samtools-1.6/bin/samtools view -F 4 -bS - > bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.bam
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder --un bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap.fastq --rg-id BMG --rg SM:SRR400264_00_R1 -p 20 -x /PATH/TO/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U rawdata/dixon_2M/SRR400264_00_R1.fastq.gz 2>> logs/dixon_2M/SRR400264_00_R1_bowtie2.log| /PATH/TO/samtools/samtools-1.6/bin/samtools view -F 4 -bS - > bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.bam

得到这样的文件:

|   |-- bwt2_global
|   |   |-- dixon_2M
|   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.bam
|   |   |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap.fastq
|   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.bam
|   |   |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap.fastq
|   |   `-- dixon_2M_2
|   |       |-- SRR400264_01_R1_hg19.bwt2glob.bam
|   |       |-- SRR400264_01_R1_hg19.bwt2glob.unmap.fastq
|   |       |-- SRR400264_01_R2_hg19.bwt2glob.bam
|   |       |-- SRR400264_01_R2_hg19.bwt2glob.unmap.fastq

这里有mapping的,和unmapping的。因为read是嵌合基因。mapping上的是背景基因组。unmapping的是需要下一步处理的。

2.片段分配和过滤

先看log

cat mapping_step2.log
### 酶切
/PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/cutsite_trimming --fastq bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap.fastq --cutsite AAGCTAGCTT --out bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq > logs/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_readsTrimming.log 2>&1
/PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/cutsite_trimming --fastq bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap.fastq --cutsite AAGCTAGCTT --out bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq > logs/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_readsTrimming.log 2>&1
### 再用bowtie2 mapping一次
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder --rg-id BML --rg SM:SRR400264_00_R1_hg19.bwt2glob.unmap -p 20 -x /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq 2>> logs/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bowtie2.log | /PATH/TO/samtools/samtools-1.6/bin/samtools view -bS - > bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
/PATH/TO/bowtie2/bowtie2-2.3.3.1/bowtie2 --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder --rg-id BML --rg SM:SRR400264_00_R2_hg19.bwt2glob.unmap -p 20 -x /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/Bowtie2Index/hg19 -U bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq 2>> logs/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bowtie2.log | /PATH/TO/samtools/samtools-1.6/bin/samtools view -bS - > bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam

这一步是将unmapping的结果,用酶切位点分割,然后将分割下来的结果,再mapping一次。得到以下结果

|   |-- bwt2_local
|       |-- dixon_2M
|       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
|       |   |-- SRR400264_00_R1_hg19.bwt2glob.unmap_trimmed.fastq
|       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam
|       |   |-- SRR400264_00_R2_hg19.bwt2glob.unmap_trimmed.fastq
|       |-- dixon_2M_2
|           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_bwt2loc.bam
|           |-- SRR400264_01_R1_hg19.bwt2glob.unmap_trimmed.fastq
|           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_bwt2loc.bam
|           |-- SRR400264_01_R2_hg19.bwt2glob.unmap_trimmed.fastq

3、mapping_combine.log

cat mapping_combine.log

### 将global的bam与local的bam用samtools merge
/PATH/TO/samtools/samtools-1.6/bin/samtools merge -@ 20 -n -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.bam bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R1_hg19.bwt2glob.unmap_bwt2loc.bam
/PATH/TO/samtools/samtools-1.6/bin/samtools merge -@ 20 -n -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam bowtie_results/bwt2_global/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.bam bowtie_results/bwt2_local/dixon_2M/SRR400264_00_R2_hg19.bwt2glob.unmap_bwt2loc.bam

### sort
/PATH/TO/samtools/samtools-1.6/bin/samtools sort -@ 20 -n -T tmp/SRR400264_00_R2_hg19 -o bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
/PATH/TO/samtools/samtools-1.6/bin/samtools sort -@ 20 -n -T tmp/SRR400264_00_R1_hg19 -o bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam

[bam_sort_core] merging from 0 files and 20 in-memory blocks...
[bam_sort_core] merging from 0 files and 20 in-memory blocks...

### 改名
mv bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
mv bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.sorted.bam bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam

4、merge

### 将R1与R2 merge为一个文件。
cat mergeSAM.log
/PATH/TO/python/python-2.7.10/bin/python /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/mergeSAM.py -q 0 -t -v -f bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam -r bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam -o bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
## mergeBAM.py
## forward= bowtie_results/bwt2/dixon_2M/SRR400264_00_R1_hg19.bwt2merged.bam
## reverse= bowtie_results/bwt2/dixon_2M/SRR400264_00_R2_hg19.bwt2merged.bam
## output= bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
## min mapq= 0
## report_single= False
## report_multi= False
## verbose= True
## Merging forward and reverse tags ...

5、利用HiCPro的mapped_2hic_fragments.py程序将比对结果转化为Hi-C片段信息

cat mapped_2hic_fragments.log
/PATH/TO/python /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/scripts/mapped_2hic_fragments.py -v -S -t 100 -m 100000 -s 100 -l 600 -a -f /PATH/TO/HiC-Pro_2.11.1/annotation/HindIII_resfrag_hg19.bed -r bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam -o hic_results/data/dixon_2M
## overlapMapped2HiCFragments.py
## mappedReadsFile= bowtie_results/bwt2/dixon_2M/SRR400264_00_hg19.bwt2pairs.bam
## fragmentFile= /PATH/TO/HiC-Pro/HiC-Pro_2.11.1/annotation/HindIII_resfrag_hg19.bed
## minInsertSize= 100
## maxInsertSize= 600
## minFragSize= 100
## maxFragSize= 100000
## allOuput= True
## SAM ouput= True
## verbose= True

再对输出的valid pairs文件进行排序:

LANG=en; sort -T tmp -k2,2V -k3,3n -k5,5V -k6,6n -o hic_results/data/sample/sample_sample_genome_ref.bwt2pairs.validPairs hic_results/data/sample/sample_sample_genome_ref.bwt2pairs.validPairs

6 对所有的valid pairs进行合并

LANG=en; sort -T tmp -S 50% -k2,2V -k3,3n -k5,5V -k6,6n -m hic_results/data/dixon_2M_2/SRR400264_01_hg19.bwt2pairs.validPairs | awk -F"\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=$2 || c2!=$5 || s1!=$3 || s2!=$6){print;c1=$2;c2=$5;s1=$3;s2=$6}' > hic_results/data/dixon_2M_2/dixon_2M_2.allValidPairs

这样就成了我们最终的文件,dixon_2M_2.allValidPairs。

这个文件还不能可视化,如果要可视化,一般在jucebox里面查看。我们需要用配套的hicpro2juicebox.sh来转化:

## e.g.1
HICPRO_PATH/bin/utils/hicpro2juicebox.sh -i hicpro_res/hic_results/data/dixon_2M/dixon_2M_allValidPairs -g hg19 -j /usr/local/juicebox/juicebox_clt_1.4.jar

很多人问,这个-j的 jar文件在哪里?
事实上用这个就可以。

/PATH/TO/juicer/juicer/UGER/scripts/juicer_tools.jar

下次,我们再讨论juicer与juicebox的用法。

上一篇 下一篇

猜你喜欢

热点阅读