生信代码

筛选出两个分支的菌株的显著性差异为点的Python代码

2020-06-10  本文已影响0人  邱俊辉
import numpy
import collections
import scipy.stats
import math
import vcf
###读取SNP位点
pos_list=[]
vcf_reader=vcf.Reader(filename=r'suis.vcf')
for record in vcf_reader:
    pos_list.append(int(record.POS))


branch1_snp_list=[]
branch2_snp_list=[]
total_snp_list=[]
#读取fasta文件
with open(r'suis_branch1.fasta') as f:
    for line in f:
        if line.startswith('>'):
            continue
        else:
            snp=line.replace('\n','')
            snp=str(snp)
            branch1_snp_list.append(list(snp))
            total_snp_list.append(list(snp))
with open(r'suis_branch2.fasta') as f:
    for line in f:
        if line.startswith('>'):
            continue
        else:
            snp=line.replace('\n','')
            snp = str(snp)
            branch2_snp_list.append(list(snp))
            total_snp_list.append(list(snp))
branch1_array=[]
branch2_array=[]
total_array=[]
array_len=len(branch1_snp_list[0])
print("一共有%d个位点" % array_len)
for i in range(0,array_len):
    branch1_array.append(0)
    branch2_array.append(0)
    total_array.append(0)
branch1_array=numpy.array(branch1_array)
branch2_array=numpy.array(branch2_array)
total_array=numpy.array(total_array)
#将两个分支的序列合并成矩阵
for i in branch1_snp_list:
    i=numpy.array(i)
    branch1_array=numpy.column_stack((branch1_array,i))
branch1_array = numpy.delete(branch1_array, 0, axis=1)###矩阵的列数为branch1内菌株的个数,行数为总共SNP位点的个数

for i in branch2_snp_list:
    i=numpy.array(i)
    branch2_array=numpy.column_stack((branch2_array,i))
branch2_array = numpy.delete(branch2_array, 0, axis=1)

for i in total_snp_list:
    i=numpy.array(i)
    total_array=numpy.column_stack((total_array,i))
total_array = numpy.delete(total_array, 0, axis=1)

###卡方检验函数
def kafang(a,b,c,d,thre,index):
    n=(a+b+c+d)*1.0
    branch1=(a+b)*1.0
    branch2=(c+d)*1.0
    major=(a+c)*1.0
    minor=(b+d)*1.0
    TRC_a=branch1*(major/n)
    TRC_b=branch1-TRC_a
    TRC_c=branch2*(major/n)
    TRC_d=branch2-TRC_c
    out_res=0
    X2=0
    Pval=0
    ###判断理论数的大小,以选择不同的卡方检验
    if(n>=40):
        if(TRC_a>=5 and TRC_b>=5 and TRC_c>=5 and TRC_d >=5):
            X2=math.pow((a-TRC_a),2)/TRC_a+math.pow((b-TRC_b),2)/TRC_b+math.pow((c-TRC_c),2)/TRC_c+math.pow((d-TRC_d),2)/TRC_d
            if(thre==0.05):
                if(X2>=3.84):
                    out_res=1
            elif(thre==0.01):
                if(X2>=6.64):
                    out_res=1
        elif(TRC_a>=1 and TRC_b>=1 and TRC_c>=1 and TRC_d>=1):
            X2=math.pow((abs(a-TRC_a)-0.5),2)/TRC_a+math.pow((abs(b-TRC_b)-0.5),2)/TRC_b+math.pow((abs(c-TRC_c)-0.5),2)/TRC_c+math.pow((abs(d-TRC_d)-0.5),2)/TRC_d
            if (thre == 0.05):
                if (X2 >= 3.84):
                    out_res = 1
            elif (thre == 0.01):
                if (X2 >= 6.64):
                    out_res = 1
        else:
            Pval=scipy.stats.fisher_exact([[a,b],[c,d]])[1]
            if(thre==0.05):
                if(Pval<0.05):
                    out_res=1
            elif(thre==0.01):
                if(Pval<0.01):
                    out_res=1
    else:
        Pval = scipy.stats.fisher_exact([[a, b], [c, d]])[1]
        if (thre == 0.05):
            if (Pval < 0.05):
                out_res = 1
        elif (thre == 0.01):
            if (Pval < 0.01):
                out_res = 1
    with open(r'test.log','a+') as f:###输出所有位点的记录
        print("snp_pos:",pos_list[index],file=f)
        if(X2==0):
            print("Pval:",Pval,file=f)
        else:
            print("X2:", X2,file=f)
        print("num\tmajor_allele\tminor_allele",file=f)
        print("bran1\t%d\t%d" % (a,b),file=f)
        print("bran2\t%d\t%d" % (c,d),file=f)
    finall_out=0
    if(out_res==1):
        if(a>b and d>c):
            if(a/branch1>=0.8 and d/branch2>=0.8):
                finall_out = 1
                with open(r'diff_pos.log','a+') as f:
                    print("snp_pos:", pos_list[index], file=f)
                    if (X2 == 0):
                        print("Pval:", Pval, file=f)
                    else:
                        print("X2:", X2, file=f)
                    print("num\tmajor_allele\tminor_allele", file=f)
                    print("bran1\t%d\t%d" % (a, b), file=f)
                    print("bran2\t%d\t%d" % (c, d), file=f)
        if(b>a and c>d):
            if(b/branch1>=0.8 and c/branch2>=0.8):
                finall_out = 1
                with open(r'diff_pos.log','a+') as f:
                    print("snp_pos:", pos_list[index], file=f)
                    if (X2 == 0):
                        print("Pval:", Pval, file=f)
                    else:
                        print("X2:", X2, file=f)
                    print("num\tmajor_allele\tminor_allele", file=f)
                    print("bran1\t%d\t%d" % (a, b), file=f)
                    print("bran2\t%d\t%d" % (c, d), file=f)
    return finall_out



###选出两个分支矩阵中的两种碱基及其对应的个数
for i in range(0,len(total_array)):
    total_most_common=collections.Counter(total_array[i]).most_common()
    if len(total_most_common)==2:
        major_base=total_most_common[0][0]
        minor_base=total_most_common[1][0]
        branch1_most_common=collections.Counter(branch1_array[i]).most_common()
        branch2_most_common = collections.Counter(branch2_array[i]).most_common()
        ###对branch1进行处理
        if len(branch1_most_common)==1:
            if branch1_most_common[0][0]==major_base:
                branch1_major_num=branch1_most_common[0][1]
                branch1_minor_num=0
            else:
                branch1_major_num = 0
                branch1_minor_num = branch1_most_common[0][1]
        else:
            if branch1_most_common[0][0]==major_base:
                branch1_major_num = branch1_most_common[0][1]
                branch1_minor_num = branch1_most_common[1][1]
            else:
                branch1_major_num = branch1_most_common[1][1]
                branch1_minor_num = branch1_most_common[0][1]
        ###对branch2进行处理
        if len(branch2_most_common)==1:
            if branch2_most_common[0][0]==major_base:
                branch2_major_num=branch2_most_common[0][1]
                branch2_minor_num=0
            else:
                branch2_major_num = 0
                branch2_minor_num = branch2_most_common[0][1]
        else:
            if branch2_most_common[0][0]==major_base:
                branch2_major_num = branch2_most_common[0][1]
                branch2_minor_num = branch2_most_common[1][1]
            else:
                branch2_major_num = branch2_most_common[1][1]
                branch2_minor_num = branch2_most_common[0][1]
        ###进行卡方检验
        finall_out=kafang(a=branch1_major_num,b=branch1_minor_num,c=branch2_major_num,d=branch2_minor_num,thre=0.05,index=i)
        if(finall_out==1):
            with open(r'sign_diff_pos', 'a+') as f:
                chrome="FM252032.1"
                print("%s\t%d" % (chrome,pos_list[i]),file=f)
上一篇下一篇

猜你喜欢

热点阅读