用R语言对vcf文件进行数据挖掘.8 窗口缩放
2021-08-20 本文已影响0人
Jason数据分析生信教室
目录
vcfR可以对单独的每一段染色体进行分析。比起全基因总体的变化,很多时候染色体单位的变化往往会更加引起我们的兴趣。
数据准备和导入
和之前文章里介绍的一样,先导入基础vcf数据
library(vcfR)
data(vcfR_example)
data("vcfR_test")
然后为了方便理解,自己虚拟三条染色体的参照序列和注释信息。
library(ape)
dna.l <- vector('list', length=3)
dna.l[[1]] <- as.character(dna[1,1:length(dna)])
set.seed(123)
dna.l[[2]] <- sample( c('a','c','g','t'), size = getPOS(vcfR_test)[length(getPOS(vcfR_test))], replace = TRUE )
dna.l[[3]] <- sample( c('a','c','g','t'), size = getPOS(vcfR_test)[length(getPOS(vcfR_test))], replace = TRUE )
dna.l <- as.DNAbin(dna.l)
names(dna.l)[1] <- "Supercontig_1.50"
names(dna.l)[2] <- "Supercontig_1.5"
names(dna.l)[3] <- "Supercontig_1.10"
dna.l
## 3 DNA sequences in binary format stored in a list.
##
## Mean sequence length: 856378.3
## Shortest sequence: 100001
## Longest sequence: 1234567
##
## Labels:
## Supercontig_1.50
## Supercontig_1.5
## Supercontig_1.10
##
## Base composition:
## a c g t
## 0.25 0.25 0.25 0.25
分别虚拟三条染色体的注释文件,然后合并成一个整体。
gff2 <- gff[1:10,]
gff2[,1] <- "Supercontig_1.5"
gff3 <- gff[11:20,]
gff3[,1] <- "Supercontig_1.10"
gff <- rbind(gff, gff2, gff3)
写出三条染色体的vcf
write.vcf(vcf, file = "Supercontig_1.50.vcf.gz")
vcfR_test@fix[,'CHROM'] <- "Supercontig_1.5"
write.vcf(vcfR_test, file = "Supercontig_1.5.vcf.gz")
vcfR_test@fix[,'CHROM'] <- "Supercontig_1.10"
write.vcf(vcfR_test, file = "Supercontig_1.10.vcf.gz")
这样,三条染色体样本数据就准备好了。
缩放染色体
这样就可以自定义窗口尺寸,对染色体进行分析了。
myFiles <- list.files(".", pattern = "Supercontig.*vcf.gz$")
myChroms <- unlist( lapply( strsplit(myFiles, "\\.vcf"), function(x){ x[1] } ) )
myWin.size <- 1e4
i <- 1
myChrom <- myChroms[i]
dna1 <- dna.l[myChrom]
gff1 <- gff[ gff[,1] == myChrom,]
vcf <- read.vcfR(myFiles[i], verbose = FALSE)
chrom <- create.chromR(name=myChrom, vcf=vcf, seq=dna1, ann=gff1, verbose = FALSE)
chrom <- proc.chromR(chrom, win.size = myWin.size, verbose=FALSE)
write.var.info(chrom, file = "pinf_ref_vars.csv", APPEND = FALSE)
write.win.info(chrom, file = "pinf_ref_wins.csv", APPEND = FALSE)
for(i in 2:length(myFiles)){
myChrom <- myChroms[i]
cat(myChrom)
cat("\n")
dna1 <- dna.l[myChrom]
gff1 <- gff[ gff[,1] == myChrom,]
vcf <- read.vcfR(myFiles[i], verbose = FALSE)
chrom <- create.chromR(name=myChrom, vcf=vcf, seq=dna1, ann=gff1, verbose = FALSE)
chrom <- proc.chromR(chrom, win.size = myWin.size, verbose=FALSE)
write.var.info(chrom, file = "pinf_ref_vars.csv", APPEND = TRUE)
write.win.info(chrom, file = "pinf_ref_wins.csv", APPEND = TRUE)
}