Topic1.克隆进化之sciClone
最近研究了一下克隆进化的流行pipeline,主流的主要有两套如下:
Pyclone+ citup+ Timespace/fishplot(倾向具有CNV数据);
sciClone+ clonevol(只具有体细胞突变位点)
其中第一套流程主要是python计算,R进行可视化,第二套流程完全使用R即可实现可视化。
这里我将克隆进化的两套流程进行分批次讲解,主要是因为在安装以及调用过程中会出下很多问题,在这里我也会一一解答遇到的bug该怎样解决。
首先就讲讲sciClone这个大家比较熟悉的软件,它主要是推断克隆结构和跟踪肿瘤演化的时空模式。
关于安装:大多是版本问题,具体看您安装的R版本对应着的R包以及Rtools,也可参考GitHub - genome/sciclone: An R package for inferring the subclonal architecture of tumors 如下:
install.packages('devtools')
install.packages('gridBase')
install.packages('gridExtra')
install.packages('ggplot2')
install.packages('igraph')
install.packages('packcircles')
install_github('hdng/trees')
install.packages("installr")
install.packages("stringr")
install_github("genome/bmm")
install_github("genome/sciClone")
关于数据读取问题:一是SNV读取;二是CNV读取;三是LOH读取。对于这三类数据模式,在WGS或WES的分析中都可通过软件计算出来。
一是SNV数据格式主要包括五列:
1. 染色体;
2.绝对位置;
3.支持是ref的reads数;
4.支持是变异的reads数;
5.突变频率[0-100]。如下:
chr pos ref_reads var_reads vaf
1 chr1 226564877 92 108 54.00
2 chr3 47164829 133 56 29.63
3 chr5 79974803 144 36 20.00
4 chr5 86642512 10 5 33.33
5 chr5 112173275 39 16 29.09
6 chr6 20402666 343 288 45.57
二是CNV数据格式主要包括四列:
1.染色体;
2.CNV起始位置;
3.CNV终止位置;
4.拷贝值(假设正常的拷贝数为2)。
三是LOH数据格式主要包括三列:
1.染色体;
2.窗口的起始位置;
3.窗口的终止位置。
关于文章数据重现:
数据下载链接:GitHub - genome/sciclone-meta: accessory scripts and documentation related to the sciclone R package at genome/sciclone
文章中Figure 1 如下图:
########Figure 1
library(sciClone)
setwd("sciclone-meta-master/manuscript/data/suppFigureN.theta.mmy")
load("out.Rdata")
v0 = read.table("data/mmy.snv.vafs", sep="\t")
cn0 = read.table("data/cn.dat")
cn0 = cn0[,c(1,2,3,5)]
reg0 = read.table("data/exclude.loh")
reg0 = reg0[,c(1,2,3)]
clusterParams="empty"
#clusterParams="no.pvalue.outlier.detection"
sc = sciClone(vafs=list(v0),
sampleNames=c("MMY4"),
copyNumberCalls=list(cn0),
regionsToExclude=list(reg0),
minimumDepth=100,
doClustering=TRUE,
clusterParams=clusterParams,
maximumClusters=10,
copyNumberMargins=0.25,
useSexChrs=FALSE
)
文章中Figure 2 如下图:
setwd("sciclone-meta-master/sciclone-meta-master/manuscript/data/figure2")
#BRC sample
rcnt = read.table(paste("data/TCGA-A2-A0YG/vafs",sep=""),sep="\t")
cn = read.table(paste("data/TCGA-A2-A0YG/cn",sep=""),sep="\t")
sc = sciClone(vafs=list(rcnt), sampleNames=c("TCGA-2-A0YG"), copyNumberCalls=list(cn),
cnCallsAreLog2=TRUE, minimumDepth=50)
writeClusterTable(sc,"clusters.brc")
writeClusterSummaryTable(sc,"clusters.summary.brc")
#UCEC sample
rcnt = read.table("data/TCGA-D1-A17T/variants.rcnt")
cn0 = read.table("data/TCGA-D1-A17T/copy_number.cbs")
cn0 = cn0[,c(1,2,3,5)]
sc2 = sciClone(vafs=list(rcnt), sampleNames=c("TCGA-D1-A17T"), copyNumberCalls=list(cn0), minimumDepth=50, cnCallsAreLog2=TRUE)
writeClusterTable(sc2, "clusters.ucec")
writeClusterSummaryTable(sc2, "cluster.summary.ucec")
#output cluster probabilities for genes of interest
anno = read.table("data/TCGA-D1-A17T/genesToAnnotate")
b = read.table("clusters.ucec",header=T)
cat("gene\tcluster\tcluster.prob.1\tcluster.prob.2\n")
for(i in 1:length(anno[,1])){
x=b[b$chr==anno[i,1] & b$st==anno[i,2],];
cat(paste(anno[i,3],x$cluster,x$cluster.prob.1,x$cluster.prob.2,sep="\t"),"\n")
}
文章中Figure 3a 如下图:
setwd("sciclone-meta-master/manuscript/data/figure3")
tum = read.table("data/tumor.vafs", sep="\t", header=TRUE)
rel = read.table("data/relapse.vafs", sep="\t", header=TRUE)
highlight.genes <- read.table("data/genesToAnnotate", header=FALSE, sep="\t", as.is=TRUE)
highlight.genes<-highlight.genes[,1:2]
samples = c("AML28 Tumor", "AML28 Relapse")
cn1 = read.table("data/tumor.cn", sep="\t")
cn1 = cn1[,c(1,2,3,5)]
cn2 = read.table("data/relapse.cn", sep="\t")
cn2 = cn2[,c(1,2,3,5)]
# Change this to 1 to output an intermediate plot at every iteration (as shown in supplement)
plotIntermediateResults <- 0
sc <- sciClone(vafs=list(tum,rel),
sampleNames=samples,
useSexChrs=FALSE,
copyNumberCalls=list(cn1,cn2),
copyNumberMargins=0.25,
minimumDepth=100,
verbose=1,
doClusteringAlongMargins=TRUE,
plotIntermediateResults = plotIntermediateResults)
writeClusterTable(sc, "clusters")
writeClusterSummaryTable(sc, "clusters.summary")
sc.plot2d(sc, highlightsHaveNames=TRUE, positionsToHighlight=highlight.genes, "figure3.pdf")
connectivity.matrix <- getConnectivityMatrix(sc)
write.table(file="connectivity.matrix.tsv", connectivity.matrix, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
文章中Figure 5abc 如下图:
setwd("sciclone-meta-master/manuscript/data/figure5")
#read in data
regions_to_exclude = read.table(file='data/loh.regions',sep="\t",header=F);
samples = c("Pre-treatment Tumor 1","Pre-treatment Tumor 2","Post-treatment Tumor")
v1 = read.table("data/pre1.vafs",sep="\t")
v2 = read.table("data/pre2.vafs",sep="\t")
v3 = read.table("data/post1.vafs",sep="\t")
cn1 = read.table("data/pre1.cn",sep="\t")
cn1 = cn1[,c(1,2,3,5)]
cn2 = read.table("data/pre2.cn",sep="\t")
cn2 = cn2[,c(1,2,3,5)]
cn3 = read.table("data/post1.cn",sep="\t")
cn3 = cn3[,c(1,2,3,5)]
#cluster
sc = sciClone(vafs=list(v1,v2,v3), sampleNames=samples, copyNumberCalls=list(cn1,cn2,cn3), doClusteringAlongMargins=FALSE, maximumClusters=10, regionsToExclude=regions_to_exclude)
writeClusterTable(sc,"cluster")
writeClusterSummaryTable(sc,"cluster.summary")
sc.plot2d(sc,"figure5abc.pdf", singlePage=TRUE, scale=1.8)
文章中Figure 5d 如下图:
##接上面的sc
sco = sc
samplesToPlot=sc@sampleNames
size=700
open3d(windowRect = c(0,0, size, size) )
a = sco@vafs.merged[,c(paste(samplesToPlot,".vaf",sep=""),"cluster")]
a = a[!is.na(a$cluster),]
a = a[!(a$cluster==0),]
numClusters=max(a$cluster)
cols=getClusterColors(numClusters)
colvec = cols[a$cluster]
samples = c("Pre-treatment Tumor 1","Pre-treatment Tumor 2","Post-treatment Tumor")
plot3d(a[,1], a[,2], a[,3], xlim=c(0,100), ylim=c(0,100),zlim=c(0,100), axes=FALSE,
xlab="Pre-treatment Tumor 1 VAF",ylab="Pre-treatment Tumor 2 VAF", zlab="Post-treatment Tumor VAF",
type="s", col=colvec)
axes3d( edges=c("x--", "y--", "z"),labels=FALSE)
for(i in c("+","-")){
for(j in c("+","-")){
axes3d( edges=paste("x",i,j,sep=""), tick=FALSE, labels=FALSE)
axes3d( edges=paste("y",i,j,sep=""), tick=FALSE, labels=FALSE)
axes3d( edges=paste("z",i,j,sep=""), tick=FALSE, labels=FALSE)
}
}
到处为止,文章中出现使用sciClone包做出来的图表都已复现完成,希望对大家有一定的帮助!
关注公众号,每日有更新!
桓峰基因 生物信息分析,SCI文章撰写及生物信息基础知识学习:R语言学习,perl基础编程,linux系统命令,Python遇见更好的你 44篇原创内容 --> 公众号
本文使用 文章同步助手 同步