复盘总结（一）

2021-01-01 本文已影响0人蓬举举

新年最适合干什么？当然是复盘总结！！
先从代码开始复盘。

linux处理bed文件

#对输入的文件进行处理，使得其满足methylkit的输入文件格式
for file in /data/home/lt/data/brain_adrenal/*
do  
var="$file"
tmp=$(echo ${var##*/}) 
a=$(echo ${tmp%.*}) 
echo $a #在屏幕上显示即将生成的文件名
gawk '{print $1"."$3,$1,$3,$6,$10,$11,100-$11}' $file | \
gawk '{if($4=="+"){$4="F"}else{$4="R"}print $0}' | \
sed '1d' | sed '1i\chrBase chr base strand coverage freqC freqT' > $a #
done

用linux不多，还是一开始尝试上游分析（测序数据的质控、比对）的时候用的比较多。linux的优点是比较快。

methylkit R包上游处理

setwd("D:/DMP_mk_")
library(methylKit)
#创建file list
#多个对象
if(T)
{
  file.list <- list(
    "adrenal_1.txt",
    "adrenal_2.txt",
    "brain_1.txt",
    "brain_2.txt",
    "Breast_1.txt",
    "Breast_2.txt",
    "Kidney_1.txt",
    "Kidney_2.txt",
    "Left_Ventricle_1.txt",
    "Left_Ventricle_2.txt",
    "Liver_1.txt",
    "Liver_2.txt",
    "Lung_1.txt",
    "Lung_2.txt",
    "Pancreas_1.txt",
    "Pancreas_2.txt",
    "SkeletalMuscle_1.txt",
    "SkeletalMuscle_2.txt",
    "SkeletalMuscle_3.txt",
    "SkeletalMuscle_4.txt",
    "Skin_1.txt",
    "Skin_2.txt",
    "Stomach_1.txt",
    "Stomach_2.txt",
    "Testis_1.txt",
    "Testis_2.txt",
    "Uterus_1.txt",
    "Uterus_2.txt"
  )
  
}

# read the files to a methylRawList object: myobj
#多个分组对象读取
if(T)
{
  myobj <- methRead(
    file.list,
    sample.id=list(
      "adrenal_1.txt",
      "adrenal_2.txt",
      "brain_1.txt",
      "brain_2.txt",
      "Breast_1.txt",
      "Breast_2.txt",
      "Kidney_1.txt",
      "Kidney_2.txt",
      "Left_Ventricle_1",
      "Left_Ventricle_2",
      "Liver_1.txt",
      "Liver_2.txt",
      "Lung_1.txt",
      "Lung_2.txt",
      "Pancreas_1.txt",
      "Pancreas_2.txt",
      "SkeletalMuscle_1.txt",
      "SkeletalMuscle_2.txt",
      "SkeletalMuscle_3.txt",
      "SkeletalMuscle_4.txt",
      "Skin_1.txt",
      "Skin_2.txt",
      "Stomach_1.txt",
      "Stomach_2.txt",
      "Testis_1.txt",
      "Testis_2.txt",
      "Uterus_1.txt",
      "Uterus_2.txt"),
    assembly="hg19",
    sep = " ",
    treatment=c(0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8,8,9,9,10,10,11,11,12,12),
    context="CpG",
    mincov = 10
  )
  
}
#去除覆盖率较低的reads(count<10)
myobj=filterByCoverage(myobj,lo.count=10,lo.perc=NULL,hi.count=NULL,hi.perc=99.9)
#归一化
myobj <- normalizeCoverage(myobj) 
#合并所有样本中共有的位点
meth=unite(myobj, destrand=FALSE)
save(meth,file = "united_data.Rdata")
#样品聚类信息
pdf(file = "correlation_meth_.pdf",width = 7,height = 20)#在pdf编辑器中进行了编辑
clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
dev.off()

methylKit是一个用来分析RRBS数据及其变体的R包，利用 filterByCoverage 函数可以对RRBS数据进行初步的过滤以及归一化，消除PCR过度扩增以及在此位点上覆盖的reads过小所带来的偏差。

测序深度和覆盖度

在这里的coverage个人认为是覆盖了某个base的reads数目。hi.perc=99.9表示去掉最大的0.1%。

normalizeCoverage 函数使用从样本覆盖率分布的均值或中位数之间的差异派生的比例因子来标准化样本之间的覆盖率值。这样可以消除不同样本的测序总reads数目所带来的差异。（个人感觉如果最后计算的是甲基化的百分比，标准化的作用不大。）

没有过滤之前：

> getMethylationStats(myobj[[1]],both.strands=FALSE)
methylation statistics per base
summary:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   0.000   3.226  22.795  33.333 100.000 
percentiles:
        0%        10%        20%        30%        40%        50%        60%        70% 
  0.000000   0.000000   0.000000   0.000000   1.612903   3.225806   6.086957  15.789474 
       80%        90%        95%        99%      99.5%      99.9%       100% 
 60.869565  90.909091  96.551724 100.000000 100.000000 100.000000 100.000000 

> getCoverageStats(myobj[[1]],both.strands=FALSE)
read coverage statistics per base
summary:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   10.00    24.00    42.00    58.41    69.00 46296.00 
percentiles:
   0%   10%   20%   30%   40%   50%   60%   70%   80%   90%   95%   99% 99.5% 99.9% 
   10    15    21    28    35    42    51    62    77   106   150   241   271   336 
 100% 
46296

过滤之后：

> getMethylationStats(myobj[[1]],both.strands=FALSE)
methylation statistics per base
summary:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   0.000   3.226  22.799  33.333 100.000 
percentiles:
        0%        10%        20%        30%        40%        50%        60%        70% 
  0.000000   0.000000   0.000000   0.000000   1.612903   3.225806   6.060606  15.789474 
       80%        90%        95%        99%      99.5%      99.9%       100% 
 60.869565  90.909091  96.551724 100.000000 100.000000 100.000000 100.000000 

> getCoverageStats(myobj[[1]],both.strands=FALSE)
read coverage statistics per base
summary:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.00   24.00   42.00   54.65   68.00  335.00 
percentiles:
   0%   10%   20%   30%   40%   50%   60%   70%   80%   90%   95%   99% 99.5% 99.9% 
   10    15    21    28    35    42    51    62    77   105   148   237   263   306 
 100% 
  335

归一化之后：

> getMethylationStats(myobj[[1]],both.strands=FALSE)
methylation statistics per base
summary:
  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.000   0.000   3.125  22.760  33.333 100.000 
percentiles:
       0%        10%        20%        30%        40%        50%        60%        70% 
 0.000000   0.000000   0.000000   0.000000   1.351351   3.125000   6.000000  15.789474 
      80%        90%        95%        99%      99.5%      99.9%       100% 
61.016949  91.176471  96.774194 100.000000 100.000000 100.000000 100.000000 

> getCoverageStats(myobj[[1]],both.strands=FALSE)
read coverage statistics per base
summary:
  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 13.00   30.00   53.00   68.96   86.00  423.00 
percentiles:
  0%   10%   20%   30%   40%   50%   60%   70%   80%   90%   95%   99% 99.5% 99.9% 
  13    19    26    35    44    53    64    78    97   132   187   299   332   386 
100% 
 423

生成聚类图：

> clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
The "ward" method has been renamed to "ward.D"; note new "ward.D2"

Call:
hclust(d = d, method = HCLUST.METHODS[hclust.method])

Cluster method   : ward.D 
Distance         : pearson 
Number of objects: 28

在这一步中我们得到了united_data.Rdata文件。

另外一种unite选择

if(F){
meth=unite(myobj,destrand=FALSE,min.per.group = 1L)
perc.meth=percMethylation(meth)
  tmp <- data.frame(perc.meth)
  for (i in 1:715765)
  {
  for (j in seq(1,25,2))
    {
    if(is.na(tmp[i,j]))
      tmp[i,j] =  tmp[i,j+1]
    }
  
  }
  
  rm(i,j)
  for (i in 1:715765)
  {

      if(is.na(tmp[i,27]))
        {tmp[i,27] =  tmp[i,28]}

    
  }

  
  rm(i)
  tmp_1 <- tmp

  for (i in 1:715765)
  {
    for (j in seq(2,28,2))
    {
      if(is.na(tmp_1[i,j]))
        tmp_1[i,j] =  tmp_1[i,j-1]
    }
    
  }
  
  rm(i,j)
  tmp_2 <- tmp_1
  for (i in 1:715765)
  {
   
      if(is.na(tmp_2[i,17]))
        tmp_2[i,17] =  tmp_2[i,19]
      if(is.na(tmp_2[i,18]))
        tmp_2[i,18] =  tmp_2[i,20]
      if(is.na(tmp_2[i,19]))
        tmp_2[i,19] =  tmp_2[i,17]
      if(is.na(tmp_2[i,20]))
        tmp_2[i,20] =  tmp_2[i,18]

  }
meth_1 <- meth
perc.meth_1 <- tmp_2
rm(meth,perc.meth)
save(meth_1,perc.meth_1,file = "united_data_1.Rdata") 
  
  }

这里我们就可以得到同组至少包含一个样本的unite文件。我们将它另存为united_data_1.Rdata文件。但是上述代码运行的时间在一个小时以上，R语言的for循环速度实在太慢了，暂时还没有找到好的解决方法。

复盘总结（一）

linux处理bed文件

methylkit R包上游处理

另外一种unite选择

猜你喜欢

热点阅读