R语言可视化(二十七):序列logo图绘制
2020-10-17 本文已影响0人
Davey1220
27. 序列logo图绘制
清除当前环境中的变量
rm(list=ls())
设置工作目录
setwd("C:/Users/Dell/Desktop/R_Plots/27seqlogo/")
使用seqLogo包绘制序列logo图
# 安装并加载所需的R包
#BiocManager::install("seqLogo")
library(seqLogo)
# 读取示例位置频率矩阵(PWM)数据
mFile <- system.file("Exfiles/pwm1", package="seqLogo")
m <- read.table(mFile)
m
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 0.0 0.0 0.0 0.3 0.2 0.0 0.0 0.0
## 2 0.8 0.2 0.8 0.3 0.4 0.2 0.8 0.2
## 3 0.2 0.8 0.2 0.4 0.3 0.8 0.2 0.8
## 4 0.0 0.0 0.0 0.0 0.1 0.0 0.0 0.0
# 使用makePWM函数转换成PWM矩阵
pwm <- makePWM(m)
pwm
## 1 2 3 4 5 6 7 8
## A 0.0 0.0 0.0 0.3 0.2 0.0 0.0 0.0
## C 0.8 0.2 0.8 0.3 0.4 0.2 0.8 0.2
## G 0.2 0.8 0.2 0.4 0.3 0.8 0.2 0.8
## T 0.0 0.0 0.0 0.0 0.1 0.0 0.0 0.0
# 使用seqLogo函数绘制序列logo图
seqLogo(pwm)
image.png
使用ggseqlogo包绘制序列logo图
# 安装并加载所需的R包
#install.packages("ggseqlogo")
library(ggseqlogo)
# 加载并查看示例数据
data(ggseqlogo_sample)
# 查看示例氨基酸序列数据
length(seqs_aa)
## [1] 4
head(seqs_aa[[1]])
## [1] "VVGARRSSWRVVSSI" "GPRSRSRSRDRRRKE" "LLCLRRSSLKAYGNG" "TERPRPNTFIIRCLQ"
## [5] "LSRERVFSEDRARFY" "PSTSRRFSPPSSSLQ"
# 查看示例DNA序列数据
length(seqs_dna)
## [1] 12
head(seqs_dna[[1]])
## [1] "CCATATATAG" "CCATATATAG" "CCATAAATAG" "CCATAAATAG" "CCATAAATAG"
## [6] "CCATAAATAG"
# 查看示例位置频率矩阵数据
length(pfms_dna)
## [1] 4
head(pfms_dna[[1]])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A 0 0 11 0 1 0 2 8
## C 1 1 0 9 0 3 7 0
## G 1 10 0 2 10 0 1 1
## T 9 0 0 0 0 8 1 2
# 使用ggseqlogo函数绘制序列logo图
ggseqlogo(seqs_dna[[1]])
image.png
# 绘制多个序列logo
ggseqlogo(seqs_dna, facet = "wrap",ncol = 4)
image.png
# seq_type参数指定序列类型,默认为“auto”自动设别,可以设置为"aa","dna","rna","other"等
ggseqlogo(seqs_aa, seq_type = "aa")
image.png
# method参数指定序列展示的方法,默认为“bits”
ggseqlogo(seqs_dna[1:4], method = "prob")
image.png
# col_scheme参数设置配色方案
# 使用list_col_schemes()函数查看内置配色方案
list_col_schemes(v = T)
## Available ggseqlogo color schemes:
## auto
## chemistry
## chemistry2
## hydrophobicity
## nucleotide
## nucleotide2
## base_pairing
## clustalx
## taylor
ggseqlogo(pfms_dna, col_scheme = "clustalx")
image.png
ggseqlogo(pfms_dna, col_scheme = "base_pairing")
image.png
# 也可以使用make_col_scheme()函数自定义配色方案
# 离散型配色方案 Discrete color scheme examples
cs1 = make_col_scheme(chars=c('A', 'T', 'G', 'C'), groups=c('g1', 'g1', 'g2', 'g2'),
cols=c('red', 'red', 'blue', 'blue'), name='custom1')
cs1
## letter group col
## 1 A g1 red
## 2 T g1 red
## 3 G g2 blue
## 4 C g2 blue
# 连续型配色方案 Quantitative color scheme
cs2 = make_col_scheme(chars=c('A', 'T', 'G', 'C'), values=1:4,
name='custom3')
cs2
## letter group
## 1 A 1
## 2 T 2
## 3 G 3
## 4 C 4
ggseqlogo(pfms_dna, col_scheme = cs1)
image.png
ggseqlogo(pfms_dna, col_scheme = cs2)
image.png
# font参数设置logo字体
# 使用list_fonts()函数查看内置字体
list_fonts(v = T)
## Available ggseqlogo fonts:
## helvetica_regular
## helvetica_bold
## helvetica_light
## roboto_medium
## roboto_bold
## roboto_regular
## akrobat_bold
## akrobat_regular
## roboto_slab_bold
## roboto_slab_regular
## roboto_slab_light
## xkcd_regular
ggseqlogo(seqs_dna[5:8],font="helvetica_bold")
image.png
ggseqlogo(seqs_dna[5:8],font="roboto_regular")
image.png
# stack_width参数设置字母的宽度
ggseqlogo(seqs_dna[5:8],stack_width=0.5)
image.png
ggseqlogo(seqs_dna[5:8],stack_width=0.9)
image.png
使用motifStack包绘制序列logo图
# 安装并加载所需的R包
#BiocManager::install("motifStack")
library(motifStack)
# 读取motif文件
pcm <- read.table(file.path(find.package("motifStack"),
"extdata", "bin_SOLEXA.pcm"))
head(pcm)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 A | 462 0 1068 1025 1068 0 1019
## 2 C | 71 60 0 24 0 993 0
## 3 G | 504 0 0 0 0 12 0
## 4 T | 31 1008 0 19 0 63 49
# 生成motif矩阵
pcm <- pcm[,3:ncol(pcm)]
rownames(pcm) <- c("A","C","G","T")
head(pcm)
## V3 V4 V5 V6 V7 V8 V9
## A 462 0 1068 1025 1068 0 1019
## C 71 60 0 24 0 993 0
## G 504 0 0 0 0 12 0
## T 31 1008 0 19 0 63 49
motif <- new("pcm", mat=as.matrix(pcm), name="bin_SOLEXA")
motif
## An object of class "pcm"
## Slot "mat":
## V3 V4 V5 V6 V7 V8 V9
## A 462 0 1068 1025 1068 0 1019
## C 71 60 0 24 0 993 0
## G 504 0 0 0 0 12 0
## T 31 1008 0 19 0 63 49
##
## Slot "name":
## [1] "bin_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
# 生成motif logo图形
plot(motif)
image.png
#plot the logo with same height
plot(motif, ic.scale=FALSE, ylab="probability")
image.png
#try a different font and a different color group
motif@color <- colorset(colorScheme='basepairing')
plot(motif,font="Times")
image.png
# plot an affinity logo
# 绘制双链关联序列logo图
motif<-matrix(
c(
.846, .631, .593, .000, .000, .000, .434, .410, 1.00, .655, .284, .000, .000, .771, .640, .961,
.625, .679, .773, 1.00, 1.00, .000, .573, .238, .397, 1.00, 1.00, .000, .298, 1.00, 1.00, .996,
1.00, 1.00, 1.00, .228, .000, 1.00, 1.00, .597, .622, .630, .000, 1.00, 1.00, .871, .617, 1.00,
.701, .513, .658, .000, .000, .247, .542, 1.00, .718, .686, .000, .000, .000, .595, .437, .970
), nrow=4, byrow = TRUE)
rownames(motif) <- c("A", "C", "G", "T")
motif<-new("psam", mat=motif, name="affinity logo")
motif
## An object of class "psam"
## Slot "mat":
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## A 0.846 0.631 0.593 0.000 0 0.000 0.434 0.410 1.000 0.655 0.284 0
## C 0.625 0.679 0.773 1.000 1 0.000 0.573 0.238 0.397 1.000 1.000 0
## G 1.000 1.000 1.000 0.228 0 1.000 1.000 0.597 0.622 0.630 0.000 1
## T 0.701 0.513 0.658 0.000 0 0.247 0.542 1.000 0.718 0.686 0.000 0
## [,13] [,14] [,15] [,16]
## A 0.000 0.771 0.640 0.961
## C 0.298 1.000 1.000 0.996
## G 1.000 0.871 0.617 1.000
## T 0.000 0.595 0.437 0.970
##
## Slot "name":
## [1] "affinity logo"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
plot(motif)
image.png
# plot sequence logo stack
# 导入多个序列矩阵
motifs<-importMatrix(dir(file.path(find.package("motifStack"), "extdata"),"pcm$", full.names = TRUE))
motifs
## $bin_SOLEXA
## An object of class "pcm"
## Slot "mat":
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A 462 0 1068 1025 1068 0 1019
## C 71 60 0 24 0 993 0
## G 504 0 0 0 0 12 0
## T 31 1008 0 19 0 63 49
##
## Slot "name":
## [1] "bin_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## $fd64A_SOLEXA
## An object of class "pcm"
## Slot "mat":
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A 0 47 0 0 0 347 15
## C 0 0 0 0 0 0 208
## G 0 504 0 5 61 98 112
## T 551 0 551 546 490 106 216
##
## Slot "name":
## [1] "fd64A_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## $fkh_NAR
## An object of class "pcm"
## Slot "mat":
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## A 3 5 0 0 0 13 4 6 0 23 15
## C 0 0 0 0 0 0 13 7 11 0 4
## G 0 22 0 0 1 14 2 3 2 1 4
## T 24 0 27 27 26 0 8 11 14 3 4
##
## Slot "name":
## [1] "fkh_NAR"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## $foxo_SOLEXA
## An object of class "pcm"
## Slot "mat":
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A 0 122 0 0 107 978 8
## C 0 16 0 0 0 0 834
## G 0 1443 2 5 96 163 191
## T 1581 0 1579 1576 1378 440 516
##
## Slot "name":
## [1] "foxo_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## $FoxP_SOLEXA
## An object of class "pcm"
## Slot "mat":
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A 380 52 1158 1178 1202 0 1191 651
## C 83 184 44 24 0 1057 0 151
## G 652 1 0 0 0 0 1 154
## T 5 958 0 0 0 145 0 209
##
## Slot "name":
## [1] "FoxP_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## $slp1_SOLEXA
## An object of class "pcm"
## Slot "mat":
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## A 844 290 641 351 1747 1832 1851 0 1842
## C 361 642 432 302 104 19 0 1710 9
## G 482 277 745 7 0 0 0 14 0
## T 158 642 33 1191 0 0 0 127 0
##
## Slot "name":
## [1] "slp1_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
...
## plot stacks
# 绘制多序列堆叠logo图
motifStack(motifs, layout="stack", ncex=1.0)
image.png
## plot stacks with hierarchical tree
# 添加进化树(layout="tree")
motifStack(motifs, layout="tree")
image.png
## When the number of motifs is too much to be shown in a vertical stack,
## motifStack can draw them in a radial style.
## random sample from MotifDb
#BiocManager::install("MotifDb")
library("MotifDb")
matrix.fly <- query(MotifDb, "Dmelanogaster")
motifs2 <- as.list(matrix.fly)
## use data from FlyFactorSurvey
motifs2 <- motifs2[grepl("Dmelanogaster\\-FlyFactorSurvey\\-",
names(motifs2))]
## format the names
names(motifs2) <- gsub("Dmelanogaster_FlyFactorSurvey_", "",
gsub("_FBgn\\d+$", "",
gsub("[^a-zA-Z0-9]","_",
gsub("(_\\d+)+$", "", names(motifs2)))))
motifs2 <- motifs2[unique(names(motifs2))]
pfms <- sample(motifs2, 50)
## creat a list of object of pfm
motifs2 <- lapply(names(pfms),
function(.ele, pfms){new("pfm",mat=pfms[[.ele]], name=.ele)}
,pfms)
## trim the motifs
motifs2 <- lapply(motifs2, trimMotif, t=0.4)
motifs2
## [[1]]
## An object of class "pfm"
## Slot "mat":
## 3 4 5 6 7 8
## A 0.23333333 0.4333333 1 0 0.0 0.5
## C 0.03333333 0.0000000 0 0 0.0 0.0
## G 0.00000000 0.0000000 0 0 0.2 0.5
## T 0.73333333 0.5666667 0 1 0.8 0.0
##
## Slot "name":
## [1] "CG4328_Cell"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## [[2]]
## An object of class "pfm"
## Slot "mat":
## 1 2 3 4 5 6 7 8
## A 0.1111111 0 0.2222222 1 1 0.2222222 0 0.8888889
## C 0.0000000 0 0.0000000 0 0 0.2222222 0 0.1111111
## G 0.8888889 0 0.7777778 0 0 0.2222222 1 0.0000000
## T 0.0000000 1 0.0000000 0 0 0.3333333 0 0.0000000
##
## Slot "name":
## [1] "CG7386_F10_12_SANGER_5"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## [[3]]
## An object of class "pfm"
## Slot "mat":
## 2 3 4 5 6 7
## A 0.012944984 0.98705502 1 0 0.01618123 0.58899676
## C 0.006472492 0.01294498 0 0 0.00000000 0.02265372
## G 0.000000000 0.00000000 0 0 0.04854369 0.34951456
## T 0.980582524 0.00000000 0 1 0.93527508 0.03883495
##
## Slot "name":
## [1] "CG9876_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## [[4]]
## An object of class "pfm"
## Slot "mat":
## 2 3 4 5 6 7
## A 0.09137056 0.69035533 0.90609137 0.00000000 0.03045685 0.67005076
## C 0.01269036 0.09644670 0.09390863 0.00000000 0.01776650 0.06598985
## G 0.01776650 0.19289340 0.00000000 0.03045685 0.14720812 0.23857868
## T 0.87817259 0.02030457 0.00000000 0.96954315 0.80456853 0.02538071
##
## Slot "name":
## [1] "Lim3_SOLEXA"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
##
##
## [[5]]
## An object of class "pfm"
## Slot "mat":
## 2 3 4 5 6 7 8
## A 0.1052632 0.05263158 1 1 0 0.0000000 0.8421053
## C 0.5263158 0.00000000 0 0 0 0.0000000 0.0000000
## G 0.0000000 0.00000000 0 0 0 0.3157895 0.1578947
## T 0.3684211 0.94736842 0 0 1 0.6842105 0.0000000
##
## Slot "name":
## [1] "Ap_Cell"
##
## Slot "alphabet":
## [1] "DNA"
##
## Slot "color":
## A C G T
## "#00811B" "#2000C7" "#FFB32C" "#D00001"
##
## Slot "background":
## A C G T
## 0.25 0.25 0.25 0.25
...
## setting colors
library(RColorBrewer)
color <- brewer.pal(12, "Set3")
color
## [1] "#8DD3C7" "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69"
## [8] "#FCCDE5" "#D9D9D9" "#BC80BD" "#CCEBC5" "#FFED6F"
## plot logo stack with radial style
# 设置环形多序列logo图(layout="radialPhylog")
motifStack(motifs2, layout="radialPhylog",
circle=0.3, cleaves = 0.2,
clabel.leaves = 0.5,
col.bg=rep(color, each=5), col.bg.alpha=0.3,
col.leaves=rep(color, each=5),
col.inner.label.circle=rep(color, each=5),
inner.label.circle.width=0.05,
col.outer.label.circle=rep(color, each=5),
outer.label.circle.width=0.02,
circle.motif=1.2,
angle=350)
image.png
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.936
## [2] LC_CTYPE=Chinese (Simplified)_China.936
## [3] LC_MONETARY=Chinese (Simplified)_China.936
## [4] LC_NUMERIC=C
## [5] LC_TIME=Chinese (Simplified)_China.936
##
## attached base packages:
## [1] stats4 parallel grid stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] RColorBrewer_1.1-2 MotifDb_1.26.0 motifStack_1.28.0
## [4] Biostrings_2.52.0 XVector_0.24.0 IRanges_2.18.1
## [7] S4Vectors_0.22.0 ade4_1.7-13 MotIV_1.40.0
## [10] BiocGenerics_0.30.0 grImport2_0.2-0 ggseqlogo_0.1
## [13] seqLogo_1.50.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 lattice_0.20-38
## [3] png_0.1-7 Rsamtools_2.0.0
## [5] assertthat_0.2.1 digest_0.6.20
## [7] R6_2.4.0 GenomeInfoDb_1.20.0
## [9] evaluate_0.14 ggplot2_3.2.0
## [11] pillar_1.4.2 zlibbioc_1.30.0
## [13] rlang_0.4.7 lazyeval_0.2.2
## [15] data.table_1.12.2 Matrix_1.2-17
## [17] rmarkdown_1.13 labeling_0.3
## [19] BiocParallel_1.17.18 stringr_1.4.0
## [21] htmlwidgets_1.3 RCurl_1.95-4.12
## [23] munsell_0.5.0 DelayedArray_0.10.0
## [25] compiler_3.6.0 rtracklayer_1.44.0
## [27] xfun_0.8 pkgconfig_2.0.2
## [29] base64enc_0.1-3 htmltools_0.3.6
## [31] tidyselect_0.2.5 SummarizedExperiment_1.14.0
## [33] tibble_2.1.3 GenomeInfoDbData_1.2.1
## [35] matrixStats_0.54.0 XML_3.98-1.20
## [37] crayon_1.3.4 dplyr_0.8.3
## [39] GenomicAlignments_1.20.1 MASS_7.3-51.4
## [41] bitops_1.0-6 gtable_0.3.0
## [43] magrittr_1.5 scales_1.0.0
## [45] stringi_1.4.3 splitstackshape_1.4.8
## [47] rGADEM_2.32.0 tools_3.6.0
## [49] BSgenome_1.52.0 Biobase_2.44.0
## [51] glue_1.3.1 purrr_0.3.2
## [53] jpeg_0.1-8.1 yaml_2.2.0
## [55] colorspace_1.4-1 GenomicRanges_1.36.0
## [57] knitr_1.23