test19 pho_split 不同场景下提取方括号内外的元素

2020-12-01  本文已影响0人  夕颜00

一、准备文件:

1、YNC.xlsx(运行脚本前需另存为制表符分隔的文本文件 YNC.txt,脚本读取的是 YNC.txt)

“Modifications in Master Proteins” 一列包含以下几种情况:
需提取蛋白对应的位点(位点必须以数字结尾)

二、结果

输出文件 YNC_phos_out.txt

在原始文件 YNC.xlsx 的各列后面追加两列:pro 列和 res 列

pro res
P12270 T2116
Q14676 S453
Q6PD62 S941

三、脚本

#! /usr/bin/env python
# _*_ coding: utf-8 _*_
# Format to extract for Modifications in Master Proteins
# eg:python3 pho_split.py YNC.txt
# output:YNC_phos_out.txt

__email__ = ".com.cn"

import re
import sys

# Matches a token whose last character is an ASCII digit (e.g. "S453").
_ENDS_WITH_DIGIT = re.compile(r"[0-9]$")
# Boundary between consecutive "... [sites]" entries inside one cell.
_ENTRY_SEPARATOR = re.compile(r"\];\s*")
# Zero-based index of the column that holds the modification string.
_MODIFICATIONS_COLUMN = 4


def extract_sites(modifications):
    """Return the (protein, site) pairs found in one modification string.

    The string is a sequence of entries separated by "];".  An entry is
    either "<protein> <modification> [<sites>]" or, when its first token
    starts with a digit (e.g. "1xPhospho"), "<modification> [<sites>]"
    that belongs to the protein named by the closest preceding entry.
    Sites inside the brackets are separated by ";".  Only sites whose last
    character is a digit are kept, so labels such as "N-Term" are dropped.

    Examples of supported shapes:
        P12270 1xPhospho [T2116]
        Q96FQ6 1xAcetyl [N-Term]; 1xPhospho [S2]
        O75531 1xAcetyl [N-Term]; 2xPhospho [T3; S4]
        P49792 1xPhospho [S1894]; Q99666 1xPhospho [S918]
        P49792 2xPhospho [T2450; S2454]; Q99666 2xPhospho [T1474; S1478]
        Q9HCD6 2xPhospho [S1824; S1827]

    Args:
        modifications: content of one modification cell (may be empty).

    Returns:
        A list of (protein, site) tuples in order of appearance.  Entries
        without a "[" or without any known protein contribute nothing.
    """
    pairs = []
    protein = None
    for entry in _ENTRY_SEPARATOR.split(modifications):
        head, bracket, inside = entry.partition("[")
        if not bracket:
            continue
        tokens = head.split()
        # A leading token such as "1xPhospho" is a modification count, not
        # an accession, so the entry continues the current protein.
        if tokens and not tokens[0][:1].isdigit():
            protein = tokens[0]
        if protein is None:
            continue
        # The last entry of a cell still carries its closing bracket.
        inside = inside.split("]", 1)[0]
        for site in inside.split(";"):
            site = site.strip()
            if _ENDS_WITH_DIGIT.search(site):
                pairs.append((protein, site))
    return pairs


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("usage: python pho_split.py YNC.txt")
        sys.exit()

    # The context manager closes both files (flushing the output) even if
    # a row raises part-way through.
    with open(sys.argv[1], 'r') as table, open("YNC_phos_out.txt", 'w') as out:
        header = next(table, None)
        if header is not None:
            out.write("{0}\t{1}\t{2}\n".format(header.rstrip("\r\n"), "pro", "res"))

            for raw in table:
                # Strip only the line terminator: stripping tabs would shift
                # the columns of rows whose first or last field is empty.
                row = raw.rstrip("\r\n")
                fields = row.split("\t")
                if len(fields) <= _MODIFICATIONS_COLUMN:
                    # Blank or truncated row: there is no modification cell.
                    continue
                # One output row per extracted site, each repeating the input row.
                for protein, site in extract_sites(fields[_MODIFICATIONS_COLUMN]):
                    out.write("{0}\t{1}\t{2}\n".format(row, protein, site))

上一篇下一篇

猜你喜欢

热点阅读