BS-seq

methylKit 包中 getCorrelation 函数怎么

2020-04-30  本文已影响0人  热衷组培的二货潜

今天有一个小伙伴遇到就是不知道为啥 getCorrelation 函数输出的结果都不能储存到变量中,我也尝试但是费解。
但是本质就是 先计算每个位点中每个样品的甲基化值,然后通过 cor() 函数来计算相关性。

> test <- getCorrelation(methylBase.obj,method="pearson",plot=FALSE)
          test1     test2     ctrl1     ctrl2
test1 1.0000000 0.9252530 0.8767865 0.8737509
test2 0.9252530 1.0000000 0.8791864 0.8801669
ctrl1 0.8767865 0.8791864 1.0000000 0.9465369
ctrl2 0.8737509 0.8801669 0.9465369 1.0000000

查看 test 是空的,不论我我怎么转换。。



然后我就开启了查看源码模式

先聊点无关的,我们可以看到如果不指定 nrow 参数多少行,默认是用前 200 万行来计算相关性,如果指定,就按照你指定多少行来计算。那么问题来了,结果明明是由 cor() 函数来计算得到的相关性结果,怎么就不能保存到一个变量中呢?我也不知道。
然后我就拆开来运行了

headTabix:用来将数据转换成 data.frame 、data.table、GRanges 文件格式的函数
fread.gzipped:用来读取文件的函数,调用 data.table 包中的 fread 函数来进行

headTabix <- function(tbxFile, nrow = 10,
                      return.type = c("data.table","data.frame","GRanges") ){
  
  if(nrow < 1e6) {
    if( class(tbxFile) != "TabixFile" ){
      tbxFile <- TabixFile(tbxFile)
      open(tbxFile)
    }
    df <- getTabixByChunk( tbxFile,chunk.size=nrow,return.type)
  } 
  else {
    returnDt = if(return.type[1] == "data.table") TRUE else FALSE 
    df <- fread.gzipped(tbxFile,nrow = nrow, stringsAsFactors = TRUE, data.table = returnDt)
    
    if(return.type[1] == "GRanges"){
      return( GRanges(seqnames=as.character(df$V1),
                      ranges=IRanges(start=df$V2, end=df$V3),
                      strand=df$V4, df[,5:ncol(df)]) )
    } 
  }
  
  return(df)
  
}

fread.gzipped <- function(filepath, ..., runShell=TRUE){
  
  # check if file is gzipped (either gz or bgz)
  if (R.utils::isGzipped(filepath, method = "content")){
    
    if( runShell && ( .Platform$OS.type == "unix" ) ) {
      # being on unix we can run shell comands to uncompress the file
      if(!file.exists(filepath) ) 
        # to protect against exploits
        stop("No such file: ", filepath)
      # run the command
      ext = if( tools::file_ext(filepath) == "bgz") "bgz" else "gz"
      tmpFile <- paste0(tempdir(),"/",gsub(ext,"",basename(filepath)))
      if(!file.exists(tmpFile)) {
        system2("gunzip",args = c("-c",shQuote(filepath)), stdout = tmpFile)
      }
      filepath <- tmpFile
      # on.exit(unlink( tmpFile ), add = TRUE)
      
    } else {
      # on windows we have to decompress first ... 
      ext = if( tools::file_ext(filepath) == "bgz") "bgz" else "gz"
      tmpFile <- R.utils::gunzip(filepath,temporary = TRUE, overwrite = TRUE,
                                 remove = FALSE, ext = ext, FUN = gzfile)
      filepath <- tmpFile
      # on.exit(unlink( tmpFile ), add = TRUE)
    }
    
  }
  ## finally we read in the uncompressed file  
  df <- data.table::fread(file = filepath,...)
  
  
  return(df)
}

其实本质就是先计算每个位点中每个样品的甲基化值,然后通过 cor() 函数来计算相关性。

mydbpath <- methylKit::getDBPath(makeMethylDB(methylBase.obj,"my/path"))
# [1] "my/path\\methylBase_58b42b3d58d3.txt.bgz"

data = headTabix(mydbpath, nrow = 2e6, return.type = "data.frame")
# V1      V2      V3 V4 V5 V6 V7  V8  V9 V10 V11 V12 V13 V14 V15 V16
# 1 chr21 9853296 9853296  + 17 10  7 333 268  65  18  16   2 395 341  54
# 2 chr21 9853326 9853326  + 17 12  5 329 249  79  16  14   2 379 284  95
# 3 chr21 9860126 9860126  + 39 38  1  83  78   5  83  83   0  41  40   1
# 4 chr21 9906604 9906604  + 68 42 26 111  97  14  23  18   5  37  33   4
# 5 chr21 9906616 9906616  + 68 52 16 111 104   7  23  14   9  37  27  10
# 6 chr21 9906619 9906619  + 68 59  9 111 109   2  22  18   4  37  29   8

# 数据中代表样品 C 的数目的列
methylBase.obj@numCs.index
# [1]  6  9 12 15

# 数据中代表样品 T 的数目的列
methylBase.obj@numTs.index
# [1]  7 10 13 16

# meth = C/(C + T)
meth.mat = data[, methylBase.obj@numCs.index]/
  ( data[,methylBase.obj@numCs.index] + data[,methylBase.obj@numTs.index] )
#       V6        V9       V12       V15
# 1 0.5882353 0.8048048 0.8888889 0.8632911
# 2 0.7058824 0.7591463 0.8750000 0.7493404
# 3 0.9743590 0.9397590 1.0000000 0.9756098
# 4 0.6176471 0.8738739 0.7826087 0.8918919
# 5 0.7647059 0.9369369 0.6086957 0.7297297
# 6 0.8676471 0.9819820 0.8181818 0.7837838

# 样品名称
methylBase.obj@sample.ids
# [1] "test1" "test2" "ctrl1" "ctrl2"

names(meth.mat) = methylBase.obj@sample.ids
# test1     test2     ctrl1     ctrl2
# 1 0.5882353 0.8048048 0.8888889 0.8632911
# 2 0.7058824 0.7591463 0.8750000 0.7493404
# 3 0.9743590 0.9397590 1.0000000 0.9756098
# 4 0.6176471 0.8738739 0.7826087 0.8918919
# 5 0.7647059 0.9369369 0.6086957 0.7297297
# 6 0.8676471 0.9819820 0.8181818 0.7837838

a <- cor(meth.mat, method = "pearson")
 a
#           test1     test2     ctrl1     ctrl2
# test1 1.0000000 0.9252530 0.8767865 0.8737509
# test2 0.9252530 1.0000000 0.8791864 0.8801669
# ctrl1 0.8767865 0.8791864 1.0000000 0.9465369
# ctrl2 0.8737509 0.8801669 0.9465369 1.0000000


class(a)
# [1] "matrix"
上一篇 下一篇

猜你喜欢

热点阅读