bam diff-bowtie2&bwa结果比较-2020-01

2020-01-09  本文已影响0人  爬山小虎

参考:https://genome.sph.umich.edu/wiki/BamUtil:_diff

安装BamUtil:

conda install -c bioconda bamutil

conda install -c bioconda/label/cf201901 bamut

准备文件{bam1},{bam2}
bam diff --in1 ${bam1} --in2 ${bam2} --mapQual --onlyDiffs > ${bamdiff}.txt

结果比较:
导入包:

import os, subprocess
import pandas as pd

定义函数整理文件为pandas的dataframe:

def GetBamdiffArr(Bamdifftxt):
    BamdiffArr = []
    SeqIDs = []
    Bam1Dic, Bam2Dic = {}, {}
    with open(Bamdifftxt, 'r') as fr:
        lines = fr.readlines()
        for line in lines:
            if line.startswith(">"):
                items = line.strip().split()
                flag, pos, cigar, mapqual = "", "", "", ""
                for index, item in enumerate(items[1:]):
                    if index == 0:
                        flag = item
                    elif not str.isalnum(item):
                        pos = item
                    elif str.isdigit(item):
                        mapqual = item
                    else:
                        cigar = item

                Bam1Dic[key] = [flag, pos, cigar, mapqual]
            elif line.startswith("<"):
                items = line.strip().split()
                flag, pos, cigar, mapqual = "", "", "", ""
                for index, item in enumerate(items[1:]):
                    if index == 0:
                        flag = item
                    elif not str.isalnum(item):
                        pos = item
                    elif str.isdigit(item):
                        mapqual = item
                    else:
                        cigar = item

                Bam2Dic[key] = [flag, pos, cigar, mapqual]
            else:
                key = line.strip()
                SeqIDs.append(key)
    for key in SeqIDs:
        try:
            flag1, pos1, cigar1, mapqual1 = Bam1Dic[key]
            tag1 = 1
        except KeyError:
            flag1, pos1, cigar1, mapqual1 = "", "", "", ""
            tag1 = 0
        try:
            flag2, pos2, cigar2, mapqual2 = Bam2Dic[key]
            tag2 = -1
        except KeyError:
            flag2, pos2, cigar2, mapqual2 = "", "", "", ""
            tag2 = 0
        tag = tag1+tag2
        BamdiffArr.append([key,flag1, pos1, cigar1, mapqual1, tag1, flag2, pos2, cigar2, mapqual2, tag2, tag])

    return(BamdiffArr)

定义函数写文件:

def WritFile(GetBamdiffArr, samplename):
    BamdiffDF = pd.DataFrame(BamdiffArr,columns = ["key", "flag1", "pos1", "cigar1", "mapqual1", "tag1", "flag2", "pos2", "cigar2", "mapqual2", "tag2", "tag"])
    BamdiffDF[['mapqual1','mapqual2']] = BamdiffDF[['mapqual1','mapqual2']].apply(pd.to_numeric)

    BamdiffDF["tag"][(BamdiffDF.loc[:,"pos2"]=="")&(BamdiffDF.loc[:,"cigar2"]=="")&(BamdiffDF.loc[:,"tag"]==1)] = "Bam1only"

    BamdiffDF["tag"][(BamdiffDF.loc[:,"pos1"]=="")&(BamdiffDF.loc[:,"cigar1"]=="")&(BamdiffDF.loc[:,"tag"]==-1)] = "Bam2only"

    BamdiffDF["tag"][((BamdiffDF.loc[:,"mapqual1"]<20)&(BamdiffDF.loc[:,"mapqual2"]<20))&(BamdiffDF.loc[:,"tag"]==0)] = "poorquality"

    BamdiffDF["tag"][(BamdiffDF.loc[:,"pos1"]=="")&(BamdiffDF.loc[:,"cigar1"]=="")&(BamdiffDF.loc[:,"pos2"]=="")&(BamdiffDF.loc[:,"cigar2"]=="")&(BamdiffDF.loc[:,"tag"]!="poorquality")] = "match"
    BamdiffDF["tag"][(BamdiffDF.loc[:,"pos1"]=="")&(BamdiffDF.loc[:,"cigar1"]=="")&(BamdiffDF.loc[:,"tag"]!="poorquality")] = "match"

    BamdiffDF["tag"][BamdiffDF.loc[:,"tag"]==0] = "mismatch"

    pd.DataFrame(BamdiffDF).to_csv(f"./{samplename}.csv", index=False)

    TagUniqCounts = pd.Series(BamdiffDF.loc[:,"tag"]).value_counts()
    pd.DataFrame(TagUniqCounts).to_excel(f"./{samplename}.counts.xlsx", sheet_name='sheet1')

执行函数:

samplename = Bamdifftxt.strip(".txt")
BamdiffArr = GetBamdiffArr(Bamdifftxt)
WritFile(GetBamdiffArr, samplename)

最后根据xlsx文件,可以将结果整理成venn图。

上一篇下一篇

猜你喜欢

热点阅读