ChIPpeakAnno 注释peak
2022-09-12 本文已影响0人
JeremyL
# 1. 初步使用
- 运行下面几行代码,初步了解ChIPpeakAnno使用
library(ChIPpeakAnno)
## import the MACS output
macs <- system.file("extdata", "MACS_peaks.xls", package="ChIPpeakAnno")
macsOutput <- toGRanges(macs, format="MACS")
## annotate the peaks with precompiled ensembl annotation
data(TSS.human.GRCh38)
macs.anno <- annotatePeakInBatch(macsOutput, AnnotationData=TSS.human.GRCh38)
## add gene symbols
library(org.Hs.eg.db)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno,
orgAnn="org.Hs.eg.db",
IDs2Add="symbol")
if(interactive()){## annotate the peaks with UCSC annotation
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
ucsc.hg38.knownGene <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
macs.anno <- annotatePeakInBatch(macsOutput,
AnnotationData=ucsc.hg38.knownGene)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno,
orgAnn="org.Hs.eg.db",
feature_id_type="entrez_id",
IDs2Add="symbol")
}
# 1. 详细例子
ChIPpeakAnno使用GRanges 保存peak数据。ChIPpeakAnno内置toGRanges 函数可以将BED, GFF等格式转换为GRanges。
- toGRanges()函数
toGRanges(data, format=c("BED", "GFF", "GTF",
"MACS", "MACS2", "MACS2.broad",
"narrowPeak", "broadPeak",
"others"),
header=FALSE, comment.char="#", colNames=NULL, ...)
bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno")
gr1 <- toGRanges(bed, format="BED", header=FALSE)
## one can also try import from rtracklayer
library(rtracklayer)
gr1.import <- import(bed, format="BED")
identical(start(gr1), start(gr1.import))
# [1] TRUE
gr1[1:2]
GRanges object with 2 ranges and 1 metadata column:
seqnames ranges strand | score
<Rle> <IRanges> <Rle> | <numeric>
MACS_peak_1 chr1 28341-29610 * | 160.81
MACS_peak_2 chr1 90821-91234 * | 133.12
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
gr1.import[1:2] #note the name slot is different from gr1
GRanges object with 2 ranges and 2 metadata columns:
seqnames ranges strand | name score
<Rle> <IRanges> <Rle> | <character> <numeric>
[1] chr1 28341-29610 * | MACS_peak_1 160.81
[2] chr1 90821-91234 * | MACS_peak_2 133.12
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno")
gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3)
ol <- findOverlapsOfPeaks(gr1, gr2)
- 韦恩图展示overlap结果
makeVennDiagram(ol,
fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2"))

- 饼图展示结果
pie1(table(ol$overlappingPeaks[["gr1///gr2"]]$overlapFeature))

- 找到overlap peak后,可以使用annotatePeakInBatch在maxgap使用AnnotationData中的基因组特征对peak进行注释,在以下示例maxgap=5kb。
overlaps <- ol$peaklist[["gr1///gr2"]]
## ============== old style ===========
## data(TSS.human.GRCh37)
## overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData,
## output="overlapping", maxgap=5000L)
## overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", "symbol")
## ============== new style ===========
library(EnsDb.Hsapiens.v75) ##(hg19)
## create annotation file from EnsDb or TxDb
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
annoData[1:2]
GRanges object with 2 ranges and 1 metadata column:
seqnames ranges strand | gene_name
<Rle> <IRanges> <Rle> | <character>
ENSG00000223972 chr1 11869-14412 + | DDX11L1
ENSG00000227232 chr1 14363-29806 - | WASH7P
-------
seqinfo: 273 sequences from GRCh37 genome
- annotatePeakInBatch进行注释
获取距离最近的TSS、miRNA、外显子等的距离
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData,
output="overlapping", maxgap=5000L)
overlaps.anno$gene_name <-
annoData$gene_name[match(overlaps.anno$feature,
names(annoData))]
head(overlaps.anno)
GRanges object with 6 ranges and 11 metadata columns:
seqnames ranges strand |
<Rle> <IRanges> <Rle> |
X001.ENSG00000228327 chr1 713791-715578 * |
X001.ENSG00000237491 chr1 713791-715578 * |
X002.ENSG00000237491 chr1 724851-727191 * |
X003.ENSG00000272438 chr1 839467-840090 * |
X004.ENSG00000223764 chr1 856361-856999 * |
X004.ENSG00000187634 chr1 856361-856999 * |
peakNames
<CharacterList>
X001.ENSG00000228327 gr1__MACS_peak_13,gr2__001,gr2__002
X001.ENSG00000237491 gr1__MACS_peak_13,gr2__001,gr2__002
X002.ENSG00000237491 gr2__003,gr1__MACS_peak_14
X003.ENSG00000272438 gr1__MACS_peak_16,gr2__004
X004.ENSG00000223764 gr1__MACS_peak_17,gr2__005
X004.ENSG00000187634 gr1__MACS_peak_17,gr2__005
peak feature
<character> <character>
X001.ENSG00000228327 001 ENSG00000228327
X001.ENSG00000237491 001 ENSG00000237491
X002.ENSG00000237491 002 ENSG00000237491
X003.ENSG00000272438 003 ENSG00000272438
X004.ENSG00000223764 004 ENSG00000223764
X004.ENSG00000187634 004 ENSG00000187634
start_position end_position
<integer> <integer>
X001.ENSG00000228327 700237 714006
X001.ENSG00000237491 714150 745440
X002.ENSG00000237491 714150 745440
X003.ENSG00000272438 840214 851356
X004.ENSG00000223764 852245 856396
X004.ENSG00000187634 860260 879955
feature_strand insideFeature
<character> <character>
X001.ENSG00000228327 - overlapStart
X001.ENSG00000237491 + overlapStart
X002.ENSG00000237491 + inside
X003.ENSG00000272438 + upstream
X004.ENSG00000223764 - overlapStart
X004.ENSG00000187634 + upstream
distancetoFeature shortestDistance
<numeric> <integer>
X001.ENSG00000228327 215 215
X001.ENSG00000237491 -359 359
X002.ENSG00000237491 10701 10701
X003.ENSG00000272438 -747 124
X004.ENSG00000223764 35 35
X004.ENSG00000187634 -3899 3261
fromOverlappingOrNearest gene_name
<character> <character>
X001.ENSG00000228327 Overlapping RP11-206L10.2
X001.ENSG00000237491 Overlapping RP11-206L10.9
X002.ENSG00000237491 Overlapping RP11-206L10.9
X003.ENSG00000272438 Overlapping RP11-54O7.16
X004.ENSG00000223764 Overlapping RP11-54O7.3
X004.ENSG00000187634 Overlapping SAMD11
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
- 绘制peak与最近特征(如转录起始位点TSS)的距离分布。
gr1.copy <- gr1
gr1.copy$score <- 1
binOverFeature(gr1, gr1.copy, annotationData=annoData,
radius=5000, nbins=10, FUN=c(sum, length),
ylab=c("score", "count"),
main=c("Distribution of aggregated peak scores around TSS",
"Distribution of aggregated peak numbers around TSS"))

- assignChromosomeRegion()可以统计 exon, intron, enhancer, proximal promoter, 5’ UTR 和3’ UTR在peak中的分布。
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
aCR<-assignChromosomeRegion(gr1, nucleotideLevel=FALSE,
precedence=c("Promoters", "immediateDownstream",
"fiveUTRs", "threeUTRs",
"Exons", "Introns"),
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
barplot(aCR$percentage)
}
