生物信息学与算法ChipSeq数据分析R

systemPipeR 包中文翻译之 ChIP-seq 分析

2019-06-08  本文已影响54人  热衷组培的二货潜

原文英文链接:ChIP-Seq Workflow Template
作者:Daniela Cassol (danielac@ucr.edu) and Thomas Girke (thomas.girke@ucr.edu)

R包

内容目录(我自己加在前面的)

1 简单介绍

1.1 背景和目标
1.2 实验设计
2 工作环境

2.1 生成工作流程环境
library(systemPipeRdata)
genWorkenvir(workflow = "chipseq")
setwd("chipseq")
Rscript -e "systemPipeRdata::genWorkenvir(workflow='chipseq')"
2.2 运行工作流程
2.2.1 在计算节点运行 R
q("no") # 关闭节点上运行的R
srun --x11 --partition=short --mem=2gb --cpus-per-task 4 --ntasks 1 --time 2:00:00 --pty bash -l
module load R/3.4.2
R
system("hostname")  # 返回以 i 或 c 开头的计算节点的名称
getwd()  # 查看当前 R 的工作路径
dir()  # 返回当前工作路径下的文件
library(systemPipeR)
source("systemPipeChIPseq_Fct.R")
3 Read 预处理

3.1 targets 文件提供的实验定义
targetspath <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
targets <- read.delim(targetspath, comment.char = "#")
targets[1:4, -c(5, 6)]
                  FileName SampleName Factor SampleLong SampleReference
1 ./data/SRR446027_1.fastq        M1A     M1  Mock.1h.A                
2 ./data/SRR446028_1.fastq        M1B     M1  Mock.1h.B                
3 ./data/SRR446029_1.fastq        A1A     A1   Avr.1h.A             M1A
4 ./data/SRR446030_1.fastq        A1B     A1   Avr.1h.B             M1B
3.2 Read 质量筛选与修剪
args <- systemArgs(sysma = "param/trim.param", mytargets = "targets_chip.txt")
filterFct <- function(fq, cutoff = 20, Nexceptions = 0) {
    qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE)
    fq[qcount <= Nexceptions]
    # 保留读取 Phred 分数 > = 阈值 cutoff 为 N 的 reads
    # 例外
}
preprocessReads(args = args, Fct = "filterFct(fq, cutoff=20, Nexceptions=0)", 
    batchsize = 1e+05)
writeTargetsout(x = args, file = "targets_chip_trim.txt", overwrite = TRUE)
3.3 FASTQ 质量报告
args <- systemArgs(sysma = "param/tophat.param", mytargets = "targets_chip.txt")
library(BiocParallel)
library(batchtools)
f <- function(x) {
    library(systemPipeR)
    args <- systemArgs(sysma = "param/tophat.param", mytargets = "targets_chip.txt")
    seeFastq(fastq = infile1(args)[x], batchsize = 1e+05, klength = 8)
}
moduleload(modules(args))  # Skip if a module system is not used
resources <- list(walltime = 120, ntasks = 1, ncpus = cores(args), 
    memory = 1024)
param <- BatchtoolsParam(workers = 4, cluster = "slurm", template = "batchtools.slurm.tmpl", 
    resources = resources)
fqlist <- bplapply(seq(along = args), f, BPPARAM = param)

pdf("./results/fastqReport.pdf", height = 18, width = 4 * length(fqlist))
seeFastqPlot(unlist(fqlist, recursive = FALSE))
dev.off()
Figure 1:18 个样本的 `FASTQ` 质量报告
4 比对

4.1 使用 Bowtie2 进行 Reads 的比对
rgs <- systemArgs(sysma = "param/bowtieSE.param", mytargets = "targets_chip_trim.txt")

sysargs(args)[1]  # 第一个 FASTQ 文件的命令行参数

moduleload(modules(args))  # 如果不需要 module 则可以跳过

system("bowtie2-build ./data/tair10.fasta ./data/tair10.fasta")
# 对基因组文件建立索引

resources <- list(walltime = 120, ntasks = 1, ncpus = cores(args), 
    memory = 1024)

reg <- clusterRun(args, conffile = ".batchtools.conf.R", Njobs = 18, 
    template = "batchtools.slurm.tmpl", runid = "01", resourceList = resources)

getStatus(reg = reg)

waitForJobs(reg = reg)

writeTargetsout(x = args, file = "targets_bam.txt", overwrite = TRUE)
runCommandline(args)
file.exists(outpaths(args))
4.2 Reads 和比对信息统计
read_statsDF <- alignStats(args = args)

write.table(read_statsDF, "results/alignStats.xls", row.names = FALSE, 
    quote = FALSE, sep = "\t")

read.delim("results/alignStats.xls")
4.3 创建用于在 IGV 中查看 BAM 文件的链接
symLink2bam(sysargs = args, htmldir = c("~/.html/", "somedir/"), 
    urlbase = "http://biocluster.ucr.edu/~tgirke/", urlfile = "./results/IGVurl.txt")
5 coverage 数据的实用程序( Utilities )

5.1 Rle 属性中储存的 coverage 信息
library(rtracklayer)
library(GenomicRanges)
library(Rsamtools)
library(GenomicAlignments)

aligns <- readGAlignments(outpaths(args)[1])
cov <- coverage(aligns)
cov
5.2 调整已比对上的 reads
trim(resize(as(aligns, "GRanges"), width = 200))
5.3 Naive Peak calling
islands <- slice(cov, lower = 15)
islands[[1]]
5.4 绘制定义区间的 coverage 信息
library(ggbio)
myloc <- c("Chr1", 1, 1e+05)
ga <- readGAlignments(outpaths(args)[1], use.names = TRUE, param = ScanBamParam(which = GRanges(myloc[1], 
    IRanges(as.numeric(myloc[2]), as.numeric(myloc[3])))))
autoplot(ga, aes(color = strand, fill = strand), facets = strand ~ 
    seqnames, stat = "coverage")
6 使用 MACS2 进行 Peak calling

6.1 在进行 Peak calling 之前将每个样本的重复文件 BAM 合并成一个 BAM 文件。
args <- systemArgs(sysma = NULL, mytargets = "targets_bam.txt")

args_merge <- mergeBamByFactor(args, overwrite = TRUE)

writeTargetsout(x = args_merge, file = "targets_mergeBamByFactor.txt", 
    overwrite = TRUE)
6.2 没有input/referencePeak calling
args <- systemArgs(sysma = "param/macs2_noinput.param", mytargets = "targets_mergeBamByFactor.txt")

sysargs(args)[1]  # 第一个 FASTQ 文件的命令行参数

runCommandline(args)
file.exists(outpaths(args))

writeTargetsout(x = args, file = "targets_macs.txt", overwrite = TRUE)
6.3 有input/referencePeak calling
writeTargetsRef(infile = "targets_mergeBamByFactor.txt", outfile = "targets_bam_ref.txt", 
    silent = FALSE, overwrite = TRUE)

args_input <- systemArgs(sysma = "param/macs2.param", mytargets = "targets_bam_ref.txt")

sysargs(args_input)[1]  # 第一个 FASTQ 文件的命令行参数

runCommandline(args_input)
file.exists(outpaths(args_input))

writeTargetsout(x = args_input, file = "targets_macs_input.txt", 
    overwrite = TRUE)
6.4 鉴定共同的 Peaks
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/rangeoverlapper.R")

peak_M1A <- outpaths(args)["M1A"]

peak_M1A <- as(read.delim(peak_M1A, comment = "#")[, 1:3], "GRanges")

peak_A1A <- outpaths(args)["A1A"]

peak_A1A <- as(read.delim(peak_A1A, comment = "#")[, 1:3], "GRanges")

(myol1 <- subsetByOverlaps(peak_M1A, peak_A1A, minoverlap = 1))
# 返回任何有交集的区域

myol2 <- olRanges(query = peak_M1A, subject = peak_A1A, output = "gr")
# 返回 0L 长度的区域

myol2[values(myol2)["OLpercQ"][, 1] >= 50]
# 返回最小交集 50% 的区域
7 用基因组背景注 Peaks

7.1 使用 ChIPpeakAnno 包进行注释
library(ChIPpeakAnno)
library(GenomicFeatures)

args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")

# txdb <- loadDb('./data/tair10.sqlite')
txdb <- makeTxDbFromGFF(file = "data/tair10.gff", format = "gff", 
    dataSource = "TAIR", organism = "Arabidopsis thaliana")

ge <- genes(txdb, columns = c("tx_name", "gene_id", "tx_type"))

for (i in seq(along = args)) {
    peaksGR <- as(read.delim(infile1(args)[i], comment = "#"), 
        "GRanges")
    annotatedPeak <- annotatePeakInBatch(peaksGR, AnnotationData = genes(txdb))
    df <- data.frame(as.data.frame(annotatedPeak), as.data.frame(values(ge[values(annotatedPeak)$feature, 
        ])))
    write.table(df, outpaths(args[i]), quote = FALSE, row.names = FALSE, 
        sep = "\t")
}

writeTargetsout(x = args, file = "targets_peakanno.txt", overwrite = TRUE)
7.2 使用 ChIPseeker 包来注释
library(ChIPseeker)

for (i in seq(along = args)) {
    peakAnno <- annotatePeak(infile1(args)[i], TxDb = txdb, verbose = FALSE)
    df <- as.data.frame(peakAnno)
    write.table(df, outpaths(args[i]), quote = FALSE, row.names = FALSE, 
        sep = "\t")
}

writeTargetsout(x = args, file = "targets_peakanno.txt", overwrite = TRUE)
peak <- readPeakFile(infile1(args)[1])

covplot(peak, weightCol = "X.log10.pvalue.")
peakHeatmap(outpaths(args)[1], TxDb = txdb, upstream = 1000, 
    downstream = 1000, color = "red")

plotAvgProf2(outpaths(args)[1], TxDb = txdb, upstream = 1000, 
    downstream = 1000, xlab = "Genomic Region (5'->3')", ylab = "Read Count Frequency")
8 交集的 Peaksreads 计数

library(GenomicRanges)

args <- systemArgs(sysma = "param/count_rangesets.param", mytargets = "targets_macs.txt")

args_bam <- systemArgs(sysma = NULL, mytargets = "targets_bam.txt")

bfl <- BamFileList(outpaths(args_bam), yieldSize = 50000, index = character())

countDFnames <- countRangeset(bfl, args, mode = "Union", ignore.strand = TRUE)

writeTargetsout(x = args, file = "targets_countDF.txt", overwrite = TRUE)
9 差异结合分析

args_diff <- systemArgs(sysma = "param/rundiff.param", mytargets = "targets_countDF.txt")

cmp <- readComp(file = args_bam, format = "matrix")

dbrlist <- runDiff(args = args_diff, diffFct = run_edgeR, targets = targetsin(args_bam), 
    cmp = cmp[[1]], independent = TRUE, dbrfilter = c(Fold = 2, 
        FDR = 1))

writeTargetsout(x = args_diff, file = "targets_rundiff.txt", 
    overwrite = TRUE)
10 GO富集分析

args <- systemArgs(sysma = "param/macs2.param", mytargets = "targets_bam_ref.txt")

args_anno <- systemArgs(sysma = "param/annotate_peaks.param", 
    mytargets = "targets_macs.txt")

annofiles <- outpaths(args_anno)

gene_ids <- sapply(names(annofiles), function(x) unique(as.character(read.delim(annofiles[x])[, 
    "gene_id"])), simplify = FALSE)

load("data/GO/catdb.RData")

BatchResult <- GOCluster_Report(catdb = catdb, setlist = gene_ids, 
    method = "all", id_type = "gene", CLSZ = 2, cutoff = 0.9, 
    gocats = c("MF", "BP", "CC"), recordSpecGO = NULL)
11 Motif 分析

11.1 从基因组中获得 Peaks 区间的DNA序列
library(Biostrings)
library(seqLogo)
library(BCRANK)

args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")

rangefiles <- infile1(args)

for (i in seq(along = rangefiles)) {
    df <- read.delim(rangefiles[i], comment = "#")
    peaks <- as(df, "GRanges")

    names(peaks) <- paste0(as.character(seqnames(peaks)), "_", 
        start(peaks), "-", end(peaks))

    peaks <- peaks[order(values(peaks)$X.log10.pvalue., decreasing = TRUE)]

    pseq <- getSeq(FaFile("./data/tair10.fasta"), peaks)
    names(pseq) <- names(peaks)
    writeXStringSet(pseq, paste0(rangefiles[i], ".fasta"))
}
11.2 使用 BCRANK 发现 Motif
set.seed(0)

BCRANKout <- bcrank(paste0(rangefiles[1], ".fasta"), restarts = 25, 
    use.P1 = TRUE, use.P2 = TRUE)

toptable(BCRANKout)

topMotif <- toptable(BCRANKout, 1)

weightMatrix <- pwm(topMotif, normalize = FALSE)

weightMatrixNormalized <- pwm(topMotif, normalize = TRUE)

pdf("results/seqlogo.pdf")
seqLogo(weightMatrixNormalized)
dev.off()
Figure 2: 被 BCRANK 鉴定的其中一个 motif
12 版本信息

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.10-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.10-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices
## [6] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] systemPipeR_1.19.0          ShortRead_1.43.0           
##  [3] GenomicAlignments_1.21.0    SummarizedExperiment_1.15.0
##  [5] DelayedArray_0.11.0         matrixStats_0.54.0         
##  [7] Biobase_2.45.0              BiocParallel_1.19.0        
##  [9] Rsamtools_2.1.0             Biostrings_2.53.0          
## [11] XVector_0.25.0              GenomicRanges_1.37.0       
## [13] GenomeInfoDb_1.21.0         IRanges_2.19.0             
## [15] S4Vectors_0.23.0            BiocGenerics_0.31.0        
## [17] BiocStyle_2.13.0           
## 
## loaded via a namespace (and not attached):
##  [1] Category_2.51.0          bitops_1.0-6            
##  [3] bit64_0.9-7              RColorBrewer_1.1-2      
##  [5] progress_1.2.0           httr_1.4.0              
##  [7] Rgraphviz_2.29.0         backports_1.1.4         
##  [9] tools_3.6.0              R6_2.4.0                
## [11] DBI_1.0.0                lazyeval_0.2.2          
## [13] colorspace_1.4-1         withr_2.1.2             
## [15] tidyselect_0.2.5         prettyunits_1.0.2       
## [17] bit_1.1-14               compiler_3.6.0          
## [19] graph_1.63.0             formatR_1.6             
## [21] rtracklayer_1.45.0       bookdown_0.9            
## [23] scales_1.0.0             checkmate_1.9.3         
## [25] genefilter_1.67.0        RBGL_1.61.0             
## [27] rappdirs_0.3.1           stringr_1.4.0           
## [29] digest_0.6.18            rmarkdown_1.12          
## [31] AnnotationForge_1.27.0   pkgconfig_2.0.2         
## [33] htmltools_0.3.6          BSgenome_1.53.0         
## [35] limma_3.41.0             rlang_0.3.4             
## [37] RSQLite_2.1.1            GOstats_2.51.0          
## [39] hwriter_1.3.2            dplyr_0.8.0.1           
## [41] VariantAnnotation_1.31.0 RCurl_1.95-4.12         
## [43] magrittr_1.5             GO.db_3.8.2             
## [45] GenomeInfoDbData_1.2.1   Matrix_1.2-17           
## [47] Rcpp_1.0.1               munsell_0.5.0           
## [49] stringi_1.4.3            yaml_2.2.0              
## [51] edgeR_3.27.0             zlibbioc_1.31.0         
## [53] plyr_1.8.4               grid_3.6.0              
## [55] blob_1.1.1               crayon_1.3.4            
## [57] lattice_0.20-38          splines_3.6.0           
## [59] GenomicFeatures_1.37.0   annotate_1.63.0         
## [61] hms_0.4.2                batchtools_0.9.11       
## [63] locfit_1.5-9.1           knitr_1.22              
## [65] pillar_1.3.1             rjson_0.2.20            
## [67] base64url_1.4            codetools_0.2-16        
## [69] biomaRt_2.41.0           XML_3.98-1.19           
## [71] glue_1.3.1               evaluate_0.13           
## [73] latticeExtra_0.6-28      data.table_1.12.2       
## [75] BiocManager_1.30.4       gtable_0.3.0            
## [77] purrr_0.3.2              assertthat_0.2.1        
## [79] ggplot2_3.1.1            xfun_0.6                
## [81] xtable_1.8-4             survival_2.44-1.1       
## [83] tibble_2.1.1             pheatmap_1.0.12         
## [85] AnnotationDbi_1.47.0     memoise_1.1.0           
## [87] brew_1.0-6               GSEABase_1.47.0
13 基金

参考文献

上一篇 下一篇

猜你喜欢

热点阅读