methylKit 包中 getCorrelation 函数怎么
2020-04-30 本文已影响0人
热衷组培的二货潜
今天有一个小伙伴遇到就是不知道为啥 getCorrelation 函数输出的结果都不能储存到变量中,我也尝试但是费解。
但是本质就是 先计算每个位点中每个样品的甲基化值,然后通过 cor()
函数来计算相关性。
> test <- getCorrelation(methylBase.obj,method="pearson",plot=FALSE)
test1 test2 ctrl1 ctrl2
test1 1.0000000 0.9252530 0.8767865 0.8737509
test2 0.9252530 1.0000000 0.8791864 0.8801669
ctrl1 0.8767865 0.8791864 1.0000000 0.9465369
ctrl2 0.8737509 0.8801669 0.9465369 1.0000000
查看 test 是空的,不论我我怎么转换。。

然后我就开启了查看源码模式
先聊点无关的,我们可以看到如果不指定 nrow 参数多少行,默认是用前 200 万行来计算相关性,如果指定,就按照你指定多少行来计算。那么问题来了,结果明明是由 cor() 函数来计算得到的相关性结果,怎么就不能保存到一个变量中呢?我也不知道。
然后我就拆开来运行了
- 首先将几个相关函数 copy 出来
headTabix:用来将数据转换成 data.frame 、data.table、GRanges 文件格式的函数
fread.gzipped:用来读取文件的函数,调用 data.table 包中的 fread 函数来进行
headTabix <- function(tbxFile, nrow = 10,
return.type = c("data.table","data.frame","GRanges") ){
if(nrow < 1e6) {
if( class(tbxFile) != "TabixFile" ){
tbxFile <- TabixFile(tbxFile)
open(tbxFile)
}
df <- getTabixByChunk( tbxFile,chunk.size=nrow,return.type)
}
else {
returnDt = if(return.type[1] == "data.table") TRUE else FALSE
df <- fread.gzipped(tbxFile,nrow = nrow, stringsAsFactors = TRUE, data.table = returnDt)
if(return.type[1] == "GRanges"){
return( GRanges(seqnames=as.character(df$V1),
ranges=IRanges(start=df$V2, end=df$V3),
strand=df$V4, df[,5:ncol(df)]) )
}
}
return(df)
}
fread.gzipped <- function(filepath, ..., runShell=TRUE){
# check if file is gzipped (either gz or bgz)
if (R.utils::isGzipped(filepath, method = "content")){
if( runShell && ( .Platform$OS.type == "unix" ) ) {
# being on unix we can run shell comands to uncompress the file
if(!file.exists(filepath) )
# to protect against exploits
stop("No such file: ", filepath)
# run the command
ext = if( tools::file_ext(filepath) == "bgz") "bgz" else "gz"
tmpFile <- paste0(tempdir(),"/",gsub(ext,"",basename(filepath)))
if(!file.exists(tmpFile)) {
system2("gunzip",args = c("-c",shQuote(filepath)), stdout = tmpFile)
}
filepath <- tmpFile
# on.exit(unlink( tmpFile ), add = TRUE)
} else {
# on windows we have to decompress first ...
ext = if( tools::file_ext(filepath) == "bgz") "bgz" else "gz"
tmpFile <- R.utils::gunzip(filepath,temporary = TRUE, overwrite = TRUE,
remove = FALSE, ext = ext, FUN = gzfile)
filepath <- tmpFile
# on.exit(unlink( tmpFile ), add = TRUE)
}
}
## finally we read in the uncompressed file
df <- data.table::fread(file = filepath,...)
return(df)
}
- 好了,函数 copy 出来了,就进行分布操作
其实本质就是先计算每个位点中每个样品的甲基化值,然后通过
cor()
函数来计算相关性。
mydbpath <- methylKit::getDBPath(makeMethylDB(methylBase.obj,"my/path"))
# [1] "my/path\\methylBase_58b42b3d58d3.txt.bgz"
data = headTabix(mydbpath, nrow = 2e6, return.type = "data.frame")
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
# 1 chr21 9853296 9853296 + 17 10 7 333 268 65 18 16 2 395 341 54
# 2 chr21 9853326 9853326 + 17 12 5 329 249 79 16 14 2 379 284 95
# 3 chr21 9860126 9860126 + 39 38 1 83 78 5 83 83 0 41 40 1
# 4 chr21 9906604 9906604 + 68 42 26 111 97 14 23 18 5 37 33 4
# 5 chr21 9906616 9906616 + 68 52 16 111 104 7 23 14 9 37 27 10
# 6 chr21 9906619 9906619 + 68 59 9 111 109 2 22 18 4 37 29 8
# 数据中代表样品 C 的数目的列
methylBase.obj@numCs.index
# [1] 6 9 12 15
# 数据中代表样品 T 的数目的列
methylBase.obj@numTs.index
# [1] 7 10 13 16
# meth = C/(C + T)
meth.mat = data[, methylBase.obj@numCs.index]/
( data[,methylBase.obj@numCs.index] + data[,methylBase.obj@numTs.index] )
# V6 V9 V12 V15
# 1 0.5882353 0.8048048 0.8888889 0.8632911
# 2 0.7058824 0.7591463 0.8750000 0.7493404
# 3 0.9743590 0.9397590 1.0000000 0.9756098
# 4 0.6176471 0.8738739 0.7826087 0.8918919
# 5 0.7647059 0.9369369 0.6086957 0.7297297
# 6 0.8676471 0.9819820 0.8181818 0.7837838
# 样品名称
methylBase.obj@sample.ids
# [1] "test1" "test2" "ctrl1" "ctrl2"
names(meth.mat) = methylBase.obj@sample.ids
# test1 test2 ctrl1 ctrl2
# 1 0.5882353 0.8048048 0.8888889 0.8632911
# 2 0.7058824 0.7591463 0.8750000 0.7493404
# 3 0.9743590 0.9397590 1.0000000 0.9756098
# 4 0.6176471 0.8738739 0.7826087 0.8918919
# 5 0.7647059 0.9369369 0.6086957 0.7297297
# 6 0.8676471 0.9819820 0.8181818 0.7837838
a <- cor(meth.mat, method = "pearson")
a
# test1 test2 ctrl1 ctrl2
# test1 1.0000000 0.9252530 0.8767865 0.8737509
# test2 0.9252530 1.0000000 0.8791864 0.8801669
# ctrl1 0.8767865 0.8791864 1.0000000 0.9465369
# ctrl2 0.8737509 0.8801669 0.9465369 1.0000000
class(a)
# [1] "matrix"