R

R语言可视化(二十七):序列logo图绘制

2020-10-17  本文已影响0人  Davey1220

27. 序列logo图绘制


清除当前环境中的变量

rm(list=ls())

设置工作目录

setwd("C:/Users/Dell/Desktop/R_Plots/27seqlogo/")

使用seqLogo包绘制序列logo图

# 安装并加载所需的R包
#BiocManager::install("seqLogo")
library(seqLogo)

# 读取示例位置频率矩阵(PWM)数据
mFile <- system.file("Exfiles/pwm1", package="seqLogo")
m <- read.table(mFile)
m
##    V1  V2  V3  V4  V5  V6  V7  V8
## 1 0.0 0.0 0.0 0.3 0.2 0.0 0.0 0.0
## 2 0.8 0.2 0.8 0.3 0.4 0.2 0.8 0.2
## 3 0.2 0.8 0.2 0.4 0.3 0.8 0.2 0.8
## 4 0.0 0.0 0.0 0.0 0.1 0.0 0.0 0.0

# 使用makePWM函数转换成PWM矩阵
pwm <- makePWM(m)
pwm
##     1   2   3   4   5   6   7   8
## A 0.0 0.0 0.0 0.3 0.2 0.0 0.0 0.0
## C 0.8 0.2 0.8 0.3 0.4 0.2 0.8 0.2
## G 0.2 0.8 0.2 0.4 0.3 0.8 0.2 0.8
## T 0.0 0.0 0.0 0.0 0.1 0.0 0.0 0.0

# 使用seqLogo函数绘制序列logo图
seqLogo(pwm)
image.png

使用ggseqlogo包绘制序列logo图

# 安装并加载所需的R包
#install.packages("ggseqlogo")
library(ggseqlogo)

# 加载并查看示例数据
data(ggseqlogo_sample)
# 查看示例氨基酸序列数据
length(seqs_aa)
## [1] 4
head(seqs_aa[[1]])
## [1] "VVGARRSSWRVVSSI" "GPRSRSRSRDRRRKE" "LLCLRRSSLKAYGNG" "TERPRPNTFIIRCLQ"
## [5] "LSRERVFSEDRARFY" "PSTSRRFSPPSSSLQ"

# 查看示例DNA序列数据
length(seqs_dna)
## [1] 12
head(seqs_dna[[1]])
## [1] "CCATATATAG" "CCATATATAG" "CCATAAATAG" "CCATAAATAG" "CCATAAATAG"
## [6] "CCATAAATAG"

# 查看示例位置频率矩阵数据
length(pfms_dna)
## [1] 4
head(pfms_dna[[1]])
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A    0    0   11    0    1    0    2    8
## C    1    1    0    9    0    3    7    0
## G    1   10    0    2   10    0    1    1
## T    9    0    0    0    0    8    1    2

# 使用ggseqlogo函数绘制序列logo图
ggseqlogo(seqs_dna[[1]])
image.png
# 绘制多个序列logo
ggseqlogo(seqs_dna, facet = "wrap",ncol = 4)
image.png
# seq_type参数指定序列类型,默认为“auto”自动设别,可以设置为"aa","dna","rna","other"等
ggseqlogo(seqs_aa, seq_type = "aa")
image.png
# method参数指定序列展示的方法,默认为“bits”
ggseqlogo(seqs_dna[1:4], method = "prob")
image.png
# col_scheme参数设置配色方案
# 使用list_col_schemes()函数查看内置配色方案
list_col_schemes(v = T)
## Available ggseqlogo color schemes:
##  auto
##  chemistry
##  chemistry2
##  hydrophobicity
##  nucleotide
##  nucleotide2
##  base_pairing
##  clustalx
##  taylor

ggseqlogo(pfms_dna, col_scheme = "clustalx")
image.png
ggseqlogo(pfms_dna, col_scheme = "base_pairing")
image.png
# 也可以使用make_col_scheme()函数自定义配色方案
# 离散型配色方案 Discrete color scheme examples
cs1 = make_col_scheme(chars=c('A', 'T', 'G', 'C'), groups=c('g1', 'g1', 'g2', 'g2'), 
                      cols=c('red', 'red', 'blue', 'blue'), name='custom1')
cs1
##   letter group  col
## 1      A    g1  red
## 2      T    g1  red
## 3      G    g2 blue
## 4      C    g2 blue

# 连续型配色方案 Quantitative color scheme
cs2 = make_col_scheme(chars=c('A', 'T', 'G', 'C'), values=1:4, 
                      name='custom3')
cs2
##   letter group
## 1      A     1
## 2      T     2
## 3      G     3
## 4      C     4

ggseqlogo(pfms_dna, col_scheme = cs1)
image.png
ggseqlogo(pfms_dna, col_scheme = cs2)
image.png
# font参数设置logo字体
# 使用list_fonts()函数查看内置字体
list_fonts(v = T)
## Available ggseqlogo fonts:
##  helvetica_regular
##  helvetica_bold
##  helvetica_light
##  roboto_medium
##  roboto_bold
##  roboto_regular
##  akrobat_bold
##  akrobat_regular
##  roboto_slab_bold
##  roboto_slab_regular
##  roboto_slab_light
##  xkcd_regular

ggseqlogo(seqs_dna[5:8],font="helvetica_bold")
image.png
ggseqlogo(seqs_dna[5:8],font="roboto_regular")
image.png
# stack_width参数设置字母的宽度
ggseqlogo(seqs_dna[5:8],stack_width=0.5)
image.png
ggseqlogo(seqs_dna[5:8],stack_width=0.9)
image.png

使用motifStack包绘制序列logo图

# 安装并加载所需的R包
#BiocManager::install("motifStack")
library(motifStack)

# 读取motif文件
pcm <- read.table(file.path(find.package("motifStack"), 
                            "extdata", "bin_SOLEXA.pcm"))
head(pcm)
##   V1 V2  V3   V4   V5   V6   V7  V8   V9
## 1  A  | 462    0 1068 1025 1068   0 1019
## 2  C  |  71   60    0   24    0 993    0
## 3  G  | 504    0    0    0    0  12    0
## 4  T  |  31 1008    0   19    0  63   49

# 生成motif矩阵
pcm <- pcm[,3:ncol(pcm)]
rownames(pcm) <- c("A","C","G","T")
head(pcm)
##    V3   V4   V5   V6   V7  V8   V9
## A 462    0 1068 1025 1068   0 1019
## C  71   60    0   24    0 993    0
## G 504    0    0    0    0  12    0
## T  31 1008    0   19    0  63   49

motif <- new("pcm", mat=as.matrix(pcm), name="bin_SOLEXA")
motif
## An object of class "pcm"
## Slot "mat":
##    V3   V4   V5   V6   V7  V8   V9
## A 462    0 1068 1025 1068   0 1019
## C  71   60    0   24    0 993    0
## G 504    0    0    0    0  12    0
## T  31 1008    0   19    0  63   49
## 
## Slot "name":
## [1] "bin_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25

# 生成motif logo图形
plot(motif)
image.png
#plot the logo with same height
plot(motif, ic.scale=FALSE, ylab="probability")
image.png
#try a different font and a different color group
motif@color <- colorset(colorScheme='basepairing')
plot(motif,font="Times")
image.png
# plot an affinity logo
# 绘制双链关联序列logo图
motif<-matrix(
  c(
    .846, .631, .593, .000, .000, .000, .434, .410, 1.00, .655, .284, .000, .000, .771, .640, .961,
    .625, .679, .773, 1.00, 1.00, .000, .573, .238, .397, 1.00, 1.00, .000, .298, 1.00, 1.00, .996,
    1.00, 1.00, 1.00, .228, .000, 1.00, 1.00, .597, .622, .630, .000, 1.00, 1.00, .871, .617, 1.00,
    .701, .513, .658, .000, .000, .247, .542, 1.00, .718, .686, .000, .000, .000, .595, .437, .970
  ), nrow=4, byrow = TRUE)
rownames(motif) <- c("A", "C", "G", "T")
motif<-new("psam", mat=motif, name="affinity logo")
motif
## An object of class "psam"
## Slot "mat":
##    [,1]  [,2]  [,3]  [,4] [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12]
## A 0.846 0.631 0.593 0.000    0 0.000 0.434 0.410 1.000 0.655 0.284     0
## C 0.625 0.679 0.773 1.000    1 0.000 0.573 0.238 0.397 1.000 1.000     0
## G 1.000 1.000 1.000 0.228    0 1.000 1.000 0.597 0.622 0.630 0.000     1
## T 0.701 0.513 0.658 0.000    0 0.247 0.542 1.000 0.718 0.686 0.000     0
##   [,13] [,14] [,15] [,16]
## A 0.000 0.771 0.640 0.961
## C 0.298 1.000 1.000 0.996
## G 1.000 0.871 0.617 1.000
## T 0.000 0.595 0.437 0.970
## 
## Slot "name":
## [1] "affinity logo"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001"

plot(motif)
image.png
# plot sequence logo stack
# 导入多个序列矩阵
motifs<-importMatrix(dir(file.path(find.package("motifStack"), "extdata"),"pcm$", full.names = TRUE))
motifs
## $bin_SOLEXA
## An object of class "pcm"
## Slot "mat":
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A  462    0 1068 1025 1068    0 1019
## C   71   60    0   24    0  993    0
## G  504    0    0    0    0   12    0
## T   31 1008    0   19    0   63   49
## 
## Slot "name":
## [1] "bin_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## $fd64A_SOLEXA
## An object of class "pcm"
## Slot "mat":
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A    0   47    0    0    0  347   15
## C    0    0    0    0    0    0  208
## G    0  504    0    5   61   98  112
## T  551    0  551  546  490  106  216
## 
## Slot "name":
## [1] "fd64A_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## $fkh_NAR
## An object of class "pcm"
## Slot "mat":
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## A    3    5    0    0    0   13    4    6    0    23    15
## C    0    0    0    0    0    0   13    7   11     0     4
## G    0   22    0    0    1   14    2    3    2     1     4
## T   24    0   27   27   26    0    8   11   14     3     4
## 
## Slot "name":
## [1] "fkh_NAR"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## $foxo_SOLEXA
## An object of class "pcm"
## Slot "mat":
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## A    0  122    0    0  107  978    8
## C    0   16    0    0    0    0  834
## G    0 1443    2    5   96  163  191
## T 1581    0 1579 1576 1378  440  516
## 
## Slot "name":
## [1] "foxo_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## $FoxP_SOLEXA
## An object of class "pcm"
## Slot "mat":
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## A  380   52 1158 1178 1202    0 1191  651
## C   83  184   44   24    0 1057    0  151
## G  652    1    0    0    0    0    1  154
## T    5  958    0    0    0  145    0  209
## 
## Slot "name":
## [1] "FoxP_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## $slp1_SOLEXA
## An object of class "pcm"
## Slot "mat":
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## A  844  290  641  351 1747 1832 1851    0 1842
## C  361  642  432  302  104   19    0 1710    9
## G  482  277  745    7    0    0    0   14    0
## T  158  642   33 1191    0    0    0  127    0
## 
## Slot "name":
## [1] "slp1_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
...

## plot stacks
# 绘制多序列堆叠logo图
motifStack(motifs, layout="stack", ncex=1.0)
image.png
## plot stacks with hierarchical tree
# 添加进化树(layout="tree")
motifStack(motifs, layout="tree")
image.png
## When the number of motifs is too much to be shown in a vertical stack, 
## motifStack can draw them in a radial style.
## random sample from MotifDb
#BiocManager::install("MotifDb")
library("MotifDb")
matrix.fly <- query(MotifDb, "Dmelanogaster")
motifs2 <- as.list(matrix.fly)
## use data from FlyFactorSurvey
motifs2 <- motifs2[grepl("Dmelanogaster\\-FlyFactorSurvey\\-",
                         names(motifs2))]
## format the names
names(motifs2) <- gsub("Dmelanogaster_FlyFactorSurvey_", "",
                       gsub("_FBgn\\d+$", "",
                            gsub("[^a-zA-Z0-9]","_",
                                 gsub("(_\\d+)+$", "", names(motifs2)))))
motifs2 <- motifs2[unique(names(motifs2))]
pfms <- sample(motifs2, 50)
## creat a list of object of pfm 
motifs2 <- lapply(names(pfms), 
                  function(.ele, pfms){new("pfm",mat=pfms[[.ele]], name=.ele)}
                  ,pfms)
## trim the motifs
motifs2 <- lapply(motifs2, trimMotif, t=0.4)
motifs2
## [[1]]
## An object of class "pfm"
## Slot "mat":
##            3         4 5 6   7   8
## A 0.23333333 0.4333333 1 0 0.0 0.5
## C 0.03333333 0.0000000 0 0 0.0 0.0
## G 0.00000000 0.0000000 0 0 0.2 0.5
## T 0.73333333 0.5666667 0 1 0.8 0.0
## 
## Slot "name":
## [1] "CG4328_Cell"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## [[2]]
## An object of class "pfm"
## Slot "mat":
##           1 2         3 4 5         6 7         8
## A 0.1111111 0 0.2222222 1 1 0.2222222 0 0.8888889
## C 0.0000000 0 0.0000000 0 0 0.2222222 0 0.1111111
## G 0.8888889 0 0.7777778 0 0 0.2222222 1 0.0000000
## T 0.0000000 1 0.0000000 0 0 0.3333333 0 0.0000000
## 
## Slot "name":
## [1] "CG7386_F10_12_SANGER_5"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## [[3]]
## An object of class "pfm"
## Slot "mat":
##             2          3 4 5          6          7
## A 0.012944984 0.98705502 1 0 0.01618123 0.58899676
## C 0.006472492 0.01294498 0 0 0.00000000 0.02265372
## G 0.000000000 0.00000000 0 0 0.04854369 0.34951456
## T 0.980582524 0.00000000 0 1 0.93527508 0.03883495
## 
## Slot "name":
## [1] "CG9876_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## [[4]]
## An object of class "pfm"
## Slot "mat":
##            2          3          4          5          6          7
## A 0.09137056 0.69035533 0.90609137 0.00000000 0.03045685 0.67005076
## C 0.01269036 0.09644670 0.09390863 0.00000000 0.01776650 0.06598985
## G 0.01776650 0.19289340 0.00000000 0.03045685 0.14720812 0.23857868
## T 0.87817259 0.02030457 0.00000000 0.96954315 0.80456853 0.02538071
## 
## Slot "name":
## [1] "Lim3_SOLEXA"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
## 
## 
## [[5]]
## An object of class "pfm"
## Slot "mat":
##           2          3 4 5 6         7         8
## A 0.1052632 0.05263158 1 1 0 0.0000000 0.8421053
## C 0.5263158 0.00000000 0 0 0 0.0000000 0.0000000
## G 0.0000000 0.00000000 0 0 0 0.3157895 0.1578947
## T 0.3684211 0.94736842 0 0 1 0.6842105 0.0000000
## 
## Slot "name":
## [1] "Ap_Cell"
## 
## Slot "alphabet":
## [1] "DNA"
## 
## Slot "color":
##         A         C         G         T 
## "#00811B" "#2000C7" "#FFB32C" "#D00001" 
## 
## Slot "background":
##    A    C    G    T 
## 0.25 0.25 0.25 0.25 
...

## setting colors
library(RColorBrewer)
color <- brewer.pal(12, "Set3")
color
##  [1] "#8DD3C7" "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69"
##  [8] "#FCCDE5" "#D9D9D9" "#BC80BD" "#CCEBC5" "#FFED6F"

## plot logo stack with radial style
# 设置环形多序列logo图(layout="radialPhylog")
motifStack(motifs2, layout="radialPhylog", 
           circle=0.3, cleaves = 0.2, 
           clabel.leaves = 0.5, 
           col.bg=rep(color, each=5), col.bg.alpha=0.3, 
           col.leaves=rep(color, each=5),
           col.inner.label.circle=rep(color, each=5), 
           inner.label.circle.width=0.05,
           col.outer.label.circle=rep(color, each=5), 
           outer.label.circle.width=0.02, 
           circle.motif=1.2,
           angle=350)
image.png
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.936 
## [2] LC_CTYPE=Chinese (Simplified)_China.936   
## [3] LC_MONETARY=Chinese (Simplified)_China.936
## [4] LC_NUMERIC=C                              
## [5] LC_TIME=Chinese (Simplified)_China.936    
## 
## attached base packages:
##  [1] stats4    parallel  grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2  MotifDb_1.26.0      motifStack_1.28.0  
##  [4] Biostrings_2.52.0   XVector_0.24.0      IRanges_2.18.1     
##  [7] S4Vectors_0.22.0    ade4_1.7-13         MotIV_1.40.0       
## [10] BiocGenerics_0.30.0 grImport2_0.2-0     ggseqlogo_0.1      
## [13] seqLogo_1.50.0     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5                  lattice_0.20-38            
##  [3] png_0.1-7                   Rsamtools_2.0.0            
##  [5] assertthat_0.2.1            digest_0.6.20              
##  [7] R6_2.4.0                    GenomeInfoDb_1.20.0        
##  [9] evaluate_0.14               ggplot2_3.2.0              
## [11] pillar_1.4.2                zlibbioc_1.30.0            
## [13] rlang_0.4.7                 lazyeval_0.2.2             
## [15] data.table_1.12.2           Matrix_1.2-17              
## [17] rmarkdown_1.13              labeling_0.3               
## [19] BiocParallel_1.17.18        stringr_1.4.0              
## [21] htmlwidgets_1.3             RCurl_1.95-4.12            
## [23] munsell_0.5.0               DelayedArray_0.10.0        
## [25] compiler_3.6.0              rtracklayer_1.44.0         
## [27] xfun_0.8                    pkgconfig_2.0.2            
## [29] base64enc_0.1-3             htmltools_0.3.6            
## [31] tidyselect_0.2.5            SummarizedExperiment_1.14.0
## [33] tibble_2.1.3                GenomeInfoDbData_1.2.1     
## [35] matrixStats_0.54.0          XML_3.98-1.20              
## [37] crayon_1.3.4                dplyr_0.8.3                
## [39] GenomicAlignments_1.20.1    MASS_7.3-51.4              
## [41] bitops_1.0-6                gtable_0.3.0               
## [43] magrittr_1.5                scales_1.0.0               
## [45] stringi_1.4.3               splitstackshape_1.4.8      
## [47] rGADEM_2.32.0               tools_3.6.0                
## [49] BSgenome_1.52.0             Biobase_2.44.0             
## [51] glue_1.3.1                  purrr_0.3.2                
## [53] jpeg_0.1-8.1                yaml_2.2.0                 
## [55] colorspace_1.4-1            GenomicRanges_1.36.0       
## [57] knitr_1.23
上一篇下一篇

猜你喜欢

热点阅读