生物信息生物信息学遗传学

snp密度图、treemix、D检测

2021-07-30  本文已影响0人  琴酒martini

snp密度图

#去除多等位基因位点和indels
nohup /home/software/bcftools/bcftools view -m 2 -M 2 --type "snps"  test.vcf.gz -O z -o test.record.snps.vcf.gz &
gunzip -c test.record.snps.vcf.gz > test.record.snps.vcf
nohup  /home/software/plink --vcf test.record.snps.vcf --recode --allow-extra-chr --chr-set 30 --out test.record.snps &
#提取map文件位点名称、染色体号、位置,作成snp.position.txt,加表头
nohup awk  '{print $2,$1,$4}'  test.record.snps.map > test.record.snps.position.txt &
# 使用R画图
install.packages("CMplot")
library("CMplot")
m <- read.table("test.record.snps.position.txt ")
head(mydata)
# snp         chr       pos
# snp1_1    1        2041
# snp1_2    1        2062
# snp1_3    1        2190
CMplot(m,plot.type="d",bin.size=1e6,col=c("darkgreen", "yellow", "red"),file="jpg",memo="",dpi=600,file.output=TRUE, verbose=TRUE)
基因渗透.png
随机遗传漂变.png

treemix画图ML树

1.制作test.clust:(5列)
order   id1 id2 breed   num
1   SRR1    SRR1    Angus   1
2   SRR2    SRR2    Angus   1
3   SRR3    SRR3    Angus   1
4   SRR4    SRR4    Angus   1
5   SRR5    SRR5    Angus   1
2.使用脚本vcf_treemix_allele_num.pl
nohup perl vcf_treemix_allele_num.pl &
3.mv treemix_test.txt treemix_test.frq
4.gzip treemix_test.frq 
5.source ~/.bashrc
6.nohup bash treemix.sh &
7.最后R画图使用treemix-plot.R,另一个辅助不需要打开。
# vcf_treemix_allele_num.pl
# 制作.clust文件时,注意1.加表头2.做5列,1、5列为数字3.品种名特殊符号为下划线格式4.个体名称与vcf文件一致

@breed=("Angus","Hereford","Holstein"......);

open (AAA,"ld.QC.test.vcf"); ##vcf
open (BBB,"test.clust"); ##breed name
@bb=<BBB>;
open (CCC,">treemix_"."test".".txt");

#print CCC "SB DH BS DQ LJ MG BN JP YJ \n"; #此处使用简写
print CCC "An He Hol  \n";

$nnn=0;

while($line=<AAA>){
chomp $line;

#$nnn=$nnn+1;

#if($nnn eq 100){last;}

if($line=~/^#/){next;}
else{
@arraynum=split (/\t/,$line);

    for($num=0;$num<=$#breed;$num++ ){

$freq=0;
$n0=0;
$n1=0;
$n2=0;
$ii=0;

             for($i=9;$i<=399;$i++){
                                    $ii++;
                                    chomp $bb[$ii];

                                                                        @arraynum_bb=split(/\t/,$bb[$ii]);
                                    if( $arraynum_bb[3] eq $breed[$num] ){
                                                                          if($arraynum[$i] eq "0/0"){$n0++;}
                                                                          if($arraynum[$i] eq "0/1"){$n1++;}
                                                                          if($arraynum[$i] eq "1/1"){$n2++;}
                                                                         }
                                   } #从第9列开始读入,例共11个个体,则最后个体为19列

 $freq1=(2*$n0+$n1);     ##alle num
 $freq2=(2*$n2+$n1);     ##alle num

 print CCC "$freq1,$freq2 ";
}
print CCC "\n";
}

}

close CCC;
close BBB;
close AAA;
# treemix.sh脚本,判断基因流为几次
for N in 0 1 2 3 4 5 ; do /home/software/treemix-1.13/src/treemix -i treemix_test.frq.gz -m $N -o treemix_test_$N -k 500 -noss;done
# 将所有结果放入文件夹中
# R画图treemix-plot.R
prefix="treemix_test"
library(RColorBrewer)
library(R.utils)
source("plotting_funcs.R") #辅助脚本

par(mfrow=c(2,3))
for(edge in 0:0){
  plot_tree(cex=0.8,paste0(prefix,"_",edge))
  title(paste(edge,"edges"))
} # 放的是m012345,则0:5 
# plotting_funcs.R脚本
# functions for plotting a tree

library(RColorBrewer)
set_y_coords = function(d){
    i = which(d[,3]=="ROOT")
    y = d[i,8]/ (d[i,8]+d[i,10])
    d[i,]$y = 1-y
    d[i,]$ymin = 0
    d[i,]$ymax = 1
    c1 = d[i,7]
    c2 = d[i,9]
    ni = which(d[,1]==c1)
    ny = d[ni,8]/ (d[ni,8]+d[ni,10])
        d[ni,]$ymin = 1-y
    d[ni,]$ymax = 1
    d[ni,]$y = 1- ny*(y)

        ni = which(d[,1]==c2)
        ny = d[ni,8]/ (d[ni,8]+d[ni,10])
        d[ni,]$ymin = 0
        d[ni,]$ymax = 1-y
        d[ni,]$y = (1-y)-ny*(1-y)

    for (j in 1:nrow(d)){
        d = set_y_coord(d, j)
    }   
    return(d)
}

set_y_coord = function(d, i){
    index = d[i,1]
    parent = d[i,6]
    if (!is.na(d[i,]$y)){
        return(d)
    }
    tmp = d[d[,1] == parent,]
    if ( is.na(tmp[1,]$y)){
        d = set_y_coord(d, which(d[,1]==parent))
        tmp = d[d[,1]== parent,]
    }
    py = tmp[1,]$y
    pymin = tmp[1,]$ymin
    pymax = tmp[1,]$ymax
    f = d[i,8]/( d[i,8]+d[i,10])
    #print (paste(i, index, py, pymin, pymax, f))
    if (tmp[1,7] == index){
        d[i,]$ymin = py
        d[i,]$ymax = pymax
        d[i,]$y = pymax-f*(pymax-py)
        if (d[i,5]== "TIP"){
            d[i,]$y = (py+pymax)/2
        }
    }
    else{
        d[i,]$ymin = pymin
        d[i,]$ymax = py
        d[i,]$y = py-f*(py-pymin)
                if (d[i,5]== "TIP"){
                        d[i,]$y = (pymin+py)/2
                }   
    
    }
    return(d)
}


set_x_coords = function(d, e){
    i = which(d[,3]=="ROOT")
    index = d[i,1]
        d[i,]$x = 0
        c1 = d[i,7]
        c2 = d[i,9]
        ni = which(d[,1]==c1)
    tmpx =  e[e[,1]==index & e[,2] == c1,3]
    if (length(tmpx) == 0){
        tmp = e[e[,1] == index,]
        tmpc1 = tmp[1,2]
        if ( d[d[,1]==tmpc1,4] != "MIG"){
            tmpc1 = tmp[2,2]
        }
        tmpx = get_dist_to_nmig(d, e, index, tmpc1)
    }
    if(tmpx < 0){
        tmpx = 0
    }
        d[ni,]$x = tmpx

        ni = which(d[,1]==c2)
        tmpx =  e[e[,1]==index & e[,2] == c2,3]
    if (length(tmpx) == 0){
            tmp = e[e[,1] == index,]
                tmpc2 = tmp[2,2]
                if ( d[d[,1]==tmpc2,4] != "MIG"){
                        tmpc2 = tmp[1,2]
                }
                tmpx = get_dist_to_nmig(d, e, index, tmpc2)
    }
        if(tmpx < 0){
                tmpx = 0
        }
        d[ni,]$x = tmpx
        
        for (j in 1:nrow(d)){
                d = set_x_coord(d, e, j)
        }
        return(d)
    print(d)
}


set_x_coord = function(d, e, i){
    index = d[i,1]
        parent = d[i,6]
        if (!is.na(d[i,]$x)){
                return(d)
        }
        tmp = d[d[,1] == parent,]
        if ( is.na(tmp[1,]$x)){
                d = set_x_coord(d, e, which(d[,1]==parent))
                tmp = d[d[,1]== parent,]
        }
    #print (paste(parent, index))
    tmpx =  e[e[,1]==parent & e[,2] == index,3]
    if (length(tmpx) == 0){
                tmp2 = e[e[,1] == parent,]
                tmpc2 = tmp2[2,2]
        #print
                if ( d[d[,1]==tmpc2,4] != "MIG"){
                        tmpc2 = tmp2[1,2]
                }
        tmpx = get_dist_to_nmig(d, e, parent, tmpc2)
    }
        if(tmpx < 0){
                tmpx = 0
        }
    d[i,]$x = tmp[1,]$x+ tmpx
    return(d)
}

plot_tree_internal = function(d, e, o = NA, cex = 1, disp = 0.005, plus = 0.005, arrow = 0.05, ybar = 0.01, scale = T, mbar = T, mse = 0.01, plotmig = T, plotnames = T, xmin = 0, lwd = 1, font = 1){
    plot(d$x, d$y, axes = F, ylab = "", xlab = "Drift parameter", xlim = c(xmin, max(d$x)+plus), pch = "")
    axis(1)
    mw = max(e[e[,5]=="MIG",4])
    mcols = rev(heat.colors(150))
    for(i in 1:nrow(e)){
        col = "black"
        if (e[i,5] == "MIG"){
            w = floor(e[i,4]*200)+50
            if (mw > 0.5){
                w = floor(e[i,4]*100)+50
            }
            col = mcols[w]
            if (is.na(col)){
                col = "blue"
            }
        }
        v1 = d[d[,1] == e[i,1],]
        v2 = d[d[,1] == e[i,2],]
        if (e[i,5] == "MIG"){
            if (plotmig){
            arrows( v1[1,]$x, v1[1,]$y, v2[1,]$x, v2[1,]$y, col = col, length = arrow)
            }
        }
        else{
            lines( c(v1[1,]$x, v2[1,]$x), c(v1[1,]$y, v2[1,]$y), col = col, lwd = lwd)
        }
    }
    tmp = d[d[,5] == "TIP",]
    print(tmp$x)
    print(disp)
    if ( !is.na(o)){
        for(i in 1:nrow(tmp)){
            tcol = o[o[,1] == tmp[i,2],2]
            if(plotnames){
                #print(tmp[i,2])
                text(tmp[i,]$x+disp, tmp[i,]$y, labels = tmp[i,2], adj = 0, cex = cex, col  = tcol, font = font)
            }
        }
    }
    else{
        if (plotnames){
        text(tmp$x+disp, tmp$y, labels = tmp[,2], adj = 0, cex = cex, font = font)
        }
    }
    if (scale){
    print (paste("mse", mse))
        lines(c(0, mse*10), c(ybar, ybar))
    text( 0, ybar - 0.04, lab = "10 s.e.", adj = 0, cex  = 0.8)
    lines( c(0, 0), c( ybar - 0.01, ybar+0.01))
    lines( c(mse*10, mse*10), c(ybar- 0.01, ybar+ 0.01))
    }
        if (mbar){
                mcols = rev( heat.colors(150) )
                mcols = mcols[50:length(mcols)]
                ymi = ybar+0.15
                yma = ybar+0.35
                l = 0.2
                w = l/100
                xma = max(d$x/20)
                rect( rep(0, 100), ymi+(0:99)*w, rep(xma, 100), ymi+(1:100)*w, col = mcols, border = mcols)
                text(xma+disp, ymi, lab = "0", adj = 0, cex = 0.7)
        if ( mw >0.5){ text(xma+disp, yma, lab = "1", adj = 0, cex = 0.7)}
                else{
            text(xma+disp, yma, lab = "0.5", adj = 0, cex =0.7)
        }
        text(0, yma+0.06, lab = "Migration", adj = 0 , cex = 0.6)
        text(0, yma+0.03, lab = "weight", adj = 0 , cex = 0.6)
        }   
}

set_mig_coords = function(d, e){
    for (j in 1:nrow(d)){
        if (d[j,4] == "MIG"){
            p = d[d[,1] == d[j,6],]
            c = d[d[,1] == d[j,7],]
            tmpe = e[e[,1] == d[j,1],]
            y1 = p[1,]$y
            y2 = c[1,]$y
            x1 = p[1,]$x
            x2 = c[1,]$x

            mf = tmpe[1,6]  
            if (is.nan(mf)){
                mf = 0
            }
            #d[j,]$y = (y1+y2)* mf
                        #d[j,]$x = (x1+x2) *mf
                        d[j,]$y = y1+(y2-y1)* mf
            print(paste(mf, x1, x2))
                        d[j,]$x = x1+(x2-x1) *mf
        }   

    }
    return(d)

}

get_f = function(stem){
    d = paste(stem, ".cov.gz", sep = "")
    d2 = paste(stem, ".modelcov.gz", sep = "")
    d = read.table(gzfile(d), as.is = T, comment.char = "", quote = "")
    d2 = read.table(gzfile(d2), as.is = T, comment.char = "", quote = "")
    d = d[order(names(d)), order(names(d))]
    d2 = d2[order(names(d2)), order(names(d2))]
    tmpcf = vector()
        tmpmcf = vector()
        for (j in 1:nrow(d)){
                for (k in (j+1):nrow(d)){
                        tmpcf = append(tmpcf, d[j,k])
                        tmpmcf = append(tmpmcf, d[j,k] - d2[j,k])
                }
        }
        tmpv = var(tmpmcf)/var(tmpcf)
    return(1-tmpv)

}

plot_tree = function(stem, o = NA, cex = 1, disp = 0.003, plus = 0.01, flip = vector(), arrow = 0.05, scale = T, ybar = 0.1, mbar = T, plotmig = T, plotnames = T, xmin = 0, lwd = 1, font = 1){
    d = paste(stem, ".vertices.gz", sep = "")
    e = paste(stem, ".edges.gz", sep = "")
    se = paste(stem, ".covse.gz", sep = "")
    d = read.table(gzfile(d), as.is = T, comment.char = "", quote = "")
    e = read.table(gzfile(e), as.is  = T, comment.char = "", quote = "")
    if (!is.na(o)){
        o = read.table(o, as.is = T, comment.char = "", quote = "")
    }
    e[,3] = e[,3]*e[,4]
    e[,3] = e[,3]*e[,4]
    
    se = read.table(gzfile(se), as.is = T, comment.char = "", quote = "")
    m1 = apply(se, 1, mean)
    m = mean(m1)
    #m = 0
    for(i in 1:length(flip)){
        d = flip_node(d, flip[i])
    }
    d$x = "NA"
    d$y = "NA"
    d$ymin = "NA"
    d$ymax = "NA"
    d$x = as.numeric(d$x)
    d$y = as.numeric(d$y)
    d$ymin = as.numeric(d$ymin)
    d$ymax = as.numeric(d$ymax)

    d = set_y_coords(d)
    d = set_x_coords(d, e)
    print(d)
    d = set_mig_coords(d, e)
    plot_tree_internal(d, e, o = o, cex = cex, xmin = xmin, disp = disp, plus = plus, arrow = arrow, ybar = ybar, mbar = mbar, mse = m, scale = scale, plotmig = plotmig, plotnames = plotnames, lwd = lwd, font = font)
    return(list( d= d, e = e))
}

get_dist_to_nmig = function(d, e, n1, n2){
    toreturn = e[e[,1] == n1 & e[,2] == n2,3]
    #print(toreturn)
    while ( d[d[,1] ==n2,4] == "MIG"){
        tmp = e[e[,1] == n2 & e[,5] == "NOT_MIG",]
        toreturn = toreturn+tmp[1,3]
        n2 = tmp[1,2]
    }
    return(toreturn)
}

flip_node = function(d, n){
    i = which(d[,1] == n)
    t1 = d[i,7]
    t2 = d[i,8]
    d[i,7] = d[i,9]
    d[i,8] = d[i,10]
    d[i,9] = t1
    d[i,10] = t2
    return(d)

}

plot_modelcov = function(stem, pop_order, min = -0.009, max = 0.009, cex = 1, usemax = T){
        c = read.table(gzfile(paste(stem, ".modelcov.gz", sep = "")), as.is = T, head = T)
        o = read.table(pop_order, as.is = T, comment.char = "", quote = "")


        toplot = data.frame(matrix(nrow = nrow(c), ncol = ncol(c)))
        for(i in 1:nrow(o)){
                for( j in 1:nrow(o)){

                        toplot[i, j] = c[which(names(c)==o[i,1]), which(names(c)==o[j,1])]
                }
        }
        if (usemax){
                m1 = max(abs(toplot))
                max = m1*1.1
                min = -(m1*1.1)
        }
        names(toplot) = o[,1]
        plot_resid_internal(toplot, max = max, min = min)
}



plot_cov = function(stem, pop_order, min = -0.009, max = 0.009, cex = 1, usemax = T, wcols = ""){
        c = read.table(gzfile(paste(stem, ".cov.gz", sep = "")), as.is = T, head = T)
        o = read.table(pop_order, as.is = T)


        toplot = data.frame(matrix(nrow = nrow(c), ncol = ncol(c)))
        for(i in 1:nrow(o)){
                for( j in 1:nrow(o)){

                        toplot[i, j] = c[which(names(c)==o[i,1]), which(names(c)==o[j,1])]
                }
        }
        if (usemax){
                m1 = max(abs(toplot))
                max = m1*1.1
                min = 0
        }
        names(toplot) = o[,1]
        plot_cov_internal(toplot, max = max, min = min, wcols = wcols, o = o, cex = cex)
}


plot_resid = function(stem, pop_order, min = -0.009, max = 0.009, cex = 1, usemax = T, wcols = "r"){
    c = read.table(gzfile(paste(stem, ".cov.gz", sep = "")), as.is = T, head = T, quote = "", comment.char = "")
    m = read.table(gzfile(paste(stem, ".modelcov.gz", sep = "")), as.is = T, head = T, quote = "", comment.char = "")
    names(c) = rownames(c)
    names(m) = rownames(m)
    o = read.table(pop_order, as.is = T, comment.char = "", quote = "")
    se = read.table(gzfile(paste(stem, ".covse.gz", sep = "")), as.is = T, head = T, quote = "", comment.char = "")
    mse = apply(se, 1, mean)
    mse = mean(mse)
    print(mse)  
    c = c[order(names(c)), order(names(c))]
    m = m[order(names(m)), order(names(m))]
    tmp = c -m 
    #tmp = m - c
    #tmp = (m-c)/m
    #print(tmp)
    toplot = data.frame(matrix(nrow = nrow(tmp), ncol = ncol(tmp)))
    for(i in 1:nrow(o)){
            for( j in 1:nrow(o)){
            #print(paste(o[i,1], o[j,1]))
            if (o[i,1] %in% names(tmp) ==F){
                print(paste("not found", o[i,1]))
            }
            if (o[j,1] %in% names(tmp) ==F){
                print(paste("not found", o[j,1]))
            }
                    toplot[i, j] = tmp[which(names(tmp)==o[i,1]), which(names(tmp)==o[j,1])]
            }
    }
    #print(toplot)
    if (usemax){
        m1 = max(abs(toplot), na.rm = T)
        max = m1*1.02
        min = -(m1*1.02)    
    }
    print("here")
    names(toplot) = o[,1]
    toreturn = plot_resid_internal(toplot, max = max, min = min, wcols = wcols, mse = mse, o = o, cex = cex)
    return(toreturn)
}

plot_cov_internal = function(d, o = NA, max = 0.009, min = -0.009, cex =0.5, wcols = "", mse = 5){
        npop = nrow(d)
        width = 1/npop
        height = 1/npop
        colors = brewer.pal(9, "Spectral")
        colors = c("red", "orange","yellow", "white", "green", "blue", "black")
        pal = colorRampPalette(colors)
        ncol = 80
        cols = pal(ncol)
        plot("NA", xlim = c(0, 1), ylim = c(0, 1), axes = F, xlab = "", ylab = "")
        for (i in 1:npop){
                for( j in 1:i){
                        v = d[i,j]
                        col= "white"
                        if (v < 0){
                                if (wcols == "rb"){
                                col = rgb(0, 0, 1, v/min)
                                }
                                else{
                                #col = rgb(0, 0, 1, 0.1+0.9*(v/min))
                                col = cols[ncol/2-floor( (v/min)*(ncol/2))]
                                }
                        }
                        else{
                                if (wcols == "rb"){
                                col = rgb(1, 0, 0, v/max)
                                }
                                else{
                                #col = rgb(1, 0, 0, 0.1+0.9*(v/max))
                                col = cols[ceiling((v/max)*(ncol))]
                                }
                        }
                        xmin = j/npop - 1/npop
                        xmax = j/npop
                        ymin = 1-(i/npop)
                        ymax = 1-(i/npop)+1/npop
            if (v == 0){ col = "white"}
                        rect(xmin, ymin, xmax, ymax, col = col, border = col)
                }
                tcol = "black"
                tmp = o[o[,1] == names(d)[i],]
                if (length(tmp) != 1){
                        tcol = tmp[1,2]
                }
                mtext(names(d)[i], side = 2, at = 1-i/npop+0.5/npop, las = 1, cex = cex, col = tcol)
                mtext(names(d)[i], side = 1, at =  i/npop-0.5/npop, las = 3, cex = cex, col = tcol)
        }
        if ( !is.na(mse)){
                ymi = 0.5
                yma = 0.9
                w = (yma-ymi)/ncol
                xma = 0.80
                lmi = round(min, digits = 1)
                lma = round(max, digits = 1)
                print(cols)
                print(ymi+(0:ncol)*w)
                rect( rep(0.75, ncol), ymi+(0:(ncol-1))*w, rep(xma, ncol), ymi+(1:ncol)*w, col = cols, border = cols)
                text(xma+0.01, ymi, lab = paste(lmi),  adj = 0, cex = 0.8)
                text(xma+0.01, yma, lab = paste(lma, "(Variance)"), adj = 0, cex = 0.8)

        }
        return(d)
        #image(as.matrix(d), col = cols)
}

plot_resid_internal = function(d, o = NA, max = 0.009, min = -0.009, cex =0.5, wcols = "rb", mse = NA){
        npop = nrow(d)
        width = 1/npop
        height = 1/npop
    colors = brewer.pal(9, "Spectral")
    colors = c("red", "orange","yellow", "white", "green", "blue", "black")
    pal = colorRampPalette(colors)
    ncol = 80
    cols = pal(ncol)
        plot("NA", xlim = c(0, 1), ylim = c(0, 1), axes = F, xlab = "", ylab = "")
        for (i in 1:npop){
                for( j in 1:i){
                        v = d[i,j]
            print(paste(i, j, v))
                        col= "white"
                        if (v < 0){
                if (wcols == "rb"){
                col = rgb(0, 0, 1, v/min)
                }
                else{
                                #col = rgb(0, 0, 1, 0.1+0.9*(v/min))
                col = cols[ncol/2-floor( (v/min)*(ncol/2))]
                #col = "white"
                }
                        }
                        else{
                if (wcols == "rb"){
                col = rgb(1, 0, 0, v/max)
                }
                else{
                                #col = rgb(1, 0, 0, 0.1+0.9*(v/max))
                col = cols[ncol/2+ceiling((v/max)*(ncol/2))]
                }
                        }
                        xmin = j/npop - 1/npop
                        xmax = j/npop
                        ymin = 1-(i/npop)
                        ymax = 1-(i/npop)+1/npop
                        rect(xmin, ymin, xmax, ymax, col = col, border = col)
                }
        tcol = "black"
        tmp = o[o[,1] == names(d)[i],]
        if (length(tmp) != 1){
            tcol = tmp[1,2]
        }
                mtext(names(d)[i], side = 2, at = 1-i/npop+0.5/npop, las = 1, cex = cex, col = tcol)
                mtext(names(d)[i], side = 1, at =  i/npop-0.5/npop, las = 3, cex = cex, col = tcol)
        }
    if ( !is.na(mse)){
                ymi = 0.5
                yma = 0.9
                w = (yma-ymi)/ncol
                xma = 0.80
        lmi = round(min/mse, digits = 1)
        lma = round(max/mse, digits = 1)
        print(cols)
        print(ymi+(0:ncol)*w)
                rect( rep(0.75, ncol), ymi+(0:(ncol-1))*w, rep(xma, ncol), ymi+(1:ncol)*w, col = cols, border = cols)
                text(xma+0.01, ymi, lab = paste(lmi, "SE"),  adj = 0, cex = 0.8)
                text(xma+0.01, yma, lab = paste(lma, "SE"), adj = 0, cex = 0.8)

    }
    return(d)
    #image(as.matrix(d), col = cols)
}

D检测 (ABBA-BABA test)

D检测.png

IGV 读取bam文件

IGV导入基因组和bam文件。
genomes>load from file 选择对应的基因组参考文件
file>load from file 选择对应的bam文件

上一篇下一篇

猜你喜欢

热点阅读