xshell-7 本地构建blastp并快速计算POCP值202

2022-01-13  本文已影响0人  RashidinAbdu

*#### 目的: 进行保守蛋白相似性,即 percentage of conserved proteins (POCP) 从而区分属水平的距离。为此,首先需要构建本地blastp,并用现有的Python程序进行计算!

1. 本地blastp的构建:

ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
cd E:\POCP
图片.png 图片.png 图片.png image.png

2. 下载和安装pocp计算程序:

https://github.com/2015qyliang/POCP/commit/f2adfb205fb6d91a28d7afe308f209c175f266da#diff-5124dfd2e4a9ae1bf5b439932a2b65892f0ba108d5643c346e62ccc125109c64
https://github.com/2015qyliang/POCP

print("hello world")
import os
import pandas as pd
from collections import OrderedDict

def Total(filename):
    os.chdir(rawFilePath)
    file = open(filename)
    text = file.read()
    proteinTotal = text.count('>')
    file.close()
    return proteinTotal


def aaTotal(filename):
    os.chdir(rawFilePath)
    file = open(filename)
    text = file.read()
    file.close()
    ltsp = text.split('>')
    aaDict = OrderedDict()
    for line in ltsp:
        if line != '':
            firstBreakIndex = line.index('\n')
            key = line[:firstBreakIndex].split(' ')[0]
            aaDict[key] = len(line[firstBreakIndex:])
        else:
            pass
    return aaDict


def creatdb():
    os.chdir(scriptPath)
    proc1 = r'.\makeblastdb.exe -dbtype prot -parse_seqids -in .\Rawdata\\'
    proc2 = r' -out .\Database\\'
    for fn in fnlist:
        process = proc1 + fn + proc2 + fn.split('.')[0]
        print('='*10,process)
        os.system(process)

# 定义流程化blastp函数
def blast():
    proc1 = r'.\blastp.exe -evalue 1e-5 -max_target_seqs 1 -outfmt 6'
    proc2 = r' -query .\Rawdata\\'
    proc3 = r' -db .\Database\\'
    proc4 = r' -out .\Result\\'
    for fn in fnlist:
        os.chdir(scriptPath)
        index = fnlist.index(fn)
        if index != len(fnlist)-1:
            A = fn
            B = fnlist[index + 1]
            # A2B
            processAB = proc1 + proc2 + A + proc3 + B.split('.')[0] + proc4 + A.split('.')[0] + 'TO' + B.split('.')[0] + '.tab'
            # B2A
            processBA = proc1 + proc2 + B + proc3 + A.split('.')[0] + proc4 + B.split('.')[0] + 'TO' + A.split('.')[0] + '.tab'
            print('='*10,processAB)
            os.system(processAB)
            print('='*10,processBA)
            os.system(processBA)
            os.chdir(resultPath)
            resultFileName1 = A.split('.')[0] + 'TO' + B.split('.')[0] + '.tab'
            resultFileName2 = B.split('.')[0] + 'TO' + A.split('.')[0] + '.tab'
            dframe1 = pd.read_csv(resultFileName1, sep = '\t')
            dframe2 = pd.read_csv(resultFileName2, sep = '\t')
            D1 = aaTotal(A)
            for i in range(dframe1.shape[0]):
                dframe1.iloc[i,11] = (dframe1.iloc[i,3])/(D1[dframe1.iloc[i,0]])
            cdframe1 = dframe1[dframe1.iloc[:,2] > 40]
            cdframe11 = cdframe1[cdframe1.iloc[:,11] > 0.5]
            C1 = cdframe11.shape[0]
            D2 = aaTotal(B)
            for i in range(dframe2.shape[0]):
                dframe2.iloc[i,11] = (dframe2.iloc[i,3])/(D2[dframe2.iloc[i,0]])
            cdframe2 = dframe2[dframe2.iloc[:,2] > 40]
            cdframe22 = cdframe2[cdframe2.iloc[:,11] > 0.5]
            C2 = cdframe22.shape[0]
            T1 = Total(A)
            T2 = Total(B)
            POCP = round((((C1 + C2)/(T1 + T2))*100),2)

            pocpResult.append(A.split('.')[0] + '\t' + B.split('.')[0] + '\t' + str(POCP) + '\n')
        else:
            pass


rawFilePath = r'D:\POCP\Rawdata'
scriptPath = r'D:\POCP'
resultPath = r'D:\POCP\Result'
fnlist = os.listdir(rawFilePath)
pocpResult = []
creatdb()
blast()
os.chdir(scriptPath)




from datetime import datetime

with open('POCP_Result_'+datetime.now().date().isoformat()+'.txt', 'w')as file:
    for line in pocpResult:
        file.write(line)
    file.close()
print('='*50)

rawFilePath = r'D:\POCP\Rawdata'
scriptPath = r'D:\POCP'
resultPath = r'D:\POCP\Result'
POCP = round((((C1 + C2)/(T1 + T2))*100),2)
from datetime import datetime

with open('POCP'+datetime.now().date().isoformat()+'.txt', 'w')as file:
    for line in pocpResult:
        file.write(line)
    file.close()
print('='*50)
图片.png

3. 下载细菌基因组对用的蛋白质序列!注意是蛋白质序列!!!!

https://www.ncbi.nlm.nih.gov/genome/?term=Blautia+faecis

图片.png

4. 进行pocp的计算:

图片.png
  1. 进入路径:
 cd E:\POCP\
  1. 运行里面放着的程序(注意:事先确保电脑上装过Python,如果没有安装过,可以点击最下面那个下载并安装即可):
python .\pocpPipeline.py
图片.png
image.png
上一篇下一篇

猜你喜欢

热点阅读