基因组denovo组装计算
2018-12-01 本文已影响103人
Biofantasy
SOAPdenovo 配置example.config
#mal read length
max_rd_len=100
[LIB]
#average
avg_ins=450
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#use only first 100 bps of each read
rd_len_cutoff=100
#in which order the reads are used while scaffolding
rank=1
#cutoff of pair number for a reliable connection (at least 3 for short insert size)
pair_num_cutoff=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#a pair of fastq file, read 1 file should always be followed by read 2 file
q1=newBGIseq500_1.fq1
q2=newBGIseq500_2.fq1
length.pl
#!/bin/perl -w
use strict;
$/ = ">";
open(IN,"output.scafSeq") || die $!;
while (<IN>){
chomp;
next if (/^$/);
my @tmp = split /\n/,$_,2;
$tmp[1] =~ s/\n//g;
print length($tmp[1])."\n"
}
close IN;
计算长度累计曲线
perl length.pl |sort -k 1rn |awk 'BEGIN{sum=0}{if(!NF){next};sum+=$1;print sum}' >len.txt
计算contig N50
perl -e 'my ($len,$total)=(0,0);my @x;while(<>){if(/^[\>\@]/){if($len>0){$total+=$len;push@x,$len;};$len=0;}else{s/\s//g;$len+=length($_);}}if ($len>0){$total+=$len;push @x,$len;}@x=sort{$b<=>$a}@x; my ($count,$half)=(0,0);for (my $j=0;$j<@x;$j++){$count+=$x[$j];if(($count>=$total/2)&&($half==0)){print "N50: $x[$j]\n";$half=$x[$j]}elsif($count>=$total*0.9){print "N90: $x[$j]\n";exit;}}' output.scafSeq
R
setwd("/home/u2406/files/")
rm(list=ls())
pdf("length.pdf")
lens <- read.table("len.txt")
plot(x=seq(1,length(lens$V1)),y=lens$V1,type="l")
dev.off()
chr17染色体
vcftools --vcf chr17.vcf --chr chr17 --to-bp 7687550 --out TP53 --recode --from-bp 7661779
TP53位置信息:
Chromosome 17: 7,661,779-7,687,550
该区域中包含的变异位点的脚本:
#!/usr/bin/perl
use strict;
use warnings;
my @sample;
my %hash;
open (IN,"chr17.vcf") or die $!;
while (<IN>) {
chomp;
next if (/^##/);
my $line=$_;
if ($line=~/^#CHROM/) { (undef,undef,undef,undef,undef,undef,undef,undef,undef,@sample)=split/\t/,$line;
next;
}
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 27DMBDM4YT 7XKZJA3JWX APRDKV0LDS
my ($chrom,$pos,$id,$ref,$alt,$qual,$filter,$info,$format,@Other)=split/\t/,$line;
if ($pos>=7661779 && $pos <=7687550){
for (my $i=0;$i<@sample ;$i++) {
my $lis=(split/:/,$Other[$i])[0];
my ($aa,$bb)=split/\//,$lis;
if ($aa == $bb) {
if (exists $hash{$sample[$i]}{'aa'}) {
$hash{$sample[$i]}{'aa'} +=1;
}else{
$hash{$sample[$i]}{'aa'}=1;
}
}else{
if (exists $hash{$sample[$i]}{'ab'}) {
$hash{$sample[$i]}{'ab'} +=1;
}else{
$hash{$sample[$i]}{'ab'}=1;
}
}
}
}
}
close IN;
print "#Sample\tHomozygous\tHeterozygote\n";
foreach my $name (keys %hash) {
print "$name\t$hash{$name}{'aa'}\t$hash{$name}{'ab'}\n";
}
看到脚本在后台稳稳运行,好舒心
看到脚本在后台稳稳运行,好舒心