使用python处理生物信息数据（九）

2020-03-16 本文已影响0人你猜我菜不菜

Python学习的第九天，主要学习如何写自己的函数功能。

1. 从PDB文件中提取fasta序列

import struct

pdb_format = '6s5s1s4s1s3s1s1s4s1s3s8s8s8s6s6s10s2s3s'

amino_acids = {
    'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
    'PHE':'F', 'GLY':'G', 'HIS':'H', 'LYS':'K',
    'ILE':'I', 'LEU':'L', 'MET':'M', 'ASN':'N',
    'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
    'THR':'T', 'VAL':'V', 'TYR':'Y', 'TRP':'W'
    }

def threeletter2oneletter(residues):
    for i, threeletter in enumerate(residues):
        residues[i][0] = amino_acids[threeletter[0]] 
#这将氨基酸的三个字母的代码转换为一个字母的代码。 
#该函数读取[res_type，chain]对的列表，
#并用对应的单字母代码替换氨基酸三字母代码的第一个元素。

def get_residues(pdb_file):
    residues = []
    for line in pdb_file:
        if line[0:4] == "ATOM":
            tmp = struct.unpack(pdb_format, line.encode('utf-8'))
            tmp_to_string = [line.decode('utf-8') for line in tmp]
            ca = tmp_to_string[3].strip()
            if ca == 'CA':
                res_type = tmp_to_string[5].strip()
                chain = tmp_to_string[7]
                residues.append([res_type, chain])
    return residues

def write_fasta_records(residues, pdb_id, fasta_file):
    seq = ''
    chain = residues[0][1]
    for aa, new_chain in residues:
        if new_chain == chain:
            seq = seq + aa
        else:
            fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))
            seq = aa
            chain = new_chains
    fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))

def extract_sequence(pdb_id):
    pdb_file = open(pdb_id + ".pdb")
    fasta_file = open(pdb_id + ".fasta", "w")
    residues = get_residues(pdb_file)
    threeletter2oneletter(residues)
    write_fasta_records(residues, pdb_id, fasta_file)
    pdb_file.close()
    fasta_file.close()
    
extract_sequence("3G5U")

2. 写一个极简单的函数

def calc_sum(num1, num2):
    '''一个名为calc_sum函数，定义函数功能，对两个参数求和，然后返回出结果'''
    result = num1 + num2
    return result
    
calc_sum(23,7)  #calc_sum计算23和7的和
Out[6]: 30

###直接返回函数运行后的结果
def increment(number):
    '''返回给定的数字加1的结果'''
    return number + 1
    
increment(5) #给定5，返回值为6
Out[8]: 6

###直接打印出给定的参数
def print_arg(number):
    print(number)
    
print_arg(5) #输入5，打印出5
5

###上述两个函数的套用
print_arg(increment(5)) #先是increment(5)函数，对5进行加1操作，然后print_arg()直接打印出increment(5)函数运行后的结果

3. 使用struct模块将字符串转为元组

import struct

format = "1s2s1s1s" #1s表示一个字符为一个单位， 2s则表示两个字符为一个单位

line = "12345"

col = struct.unpack(format, line.encode("utf-8") #将line这个字符串以format的格式拆分开
)

col_to_string = [line.decode("utf-8") for line in col]

print(col)
(b'1', b'23', b'4', b'5')

print(col_to_string)
['1', '23', '4', '5']

###calcsize()函数这个是统计format格式中有多少个字符
format = '30s30s20s1s'

print(struct.calcsize(format))
81

4. 将任意数量的参数转化为tab符分隔的字符串

def convert_to_string(*args):
    result = [str(a) for a in args]
    return "\t".join(result) + "\n"

output_file = open("nucleotideSubstitMatrix.txt", "w")

output_file.write(convert_to_string('', 'A', 'T', 'C', 'G'))
Out[17]: 9

output_file.write(convert_to_string('A', 1.0))
Out[18]: 6

output_file.write(convert_to_string('T', 0.5, 1.0))
Out[19]: 10

output_file.write(convert_to_string('C', 0.1, 0.1, 1.0))
Out[20]: 14

output_file.write(convert_to_string('G', 0.1, 0.1, 0.5, 1.0))
Out[21]: 18

output_file.close()

5. 提取PDB文件中特定的残基

import struct

from parse_pdb import parse_atom_line

def main(pdb_filename, residues, output_filename):
    pdb = open(pdb_filename)
    outfile = open(output_filename, "w")
    for line in pdb:
        if line.startswith("ATOM"):
            chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
            for aa, num in residues:
                if res_type == aa and res_num == num:
                    outfile.write(line)
    outfile.close()

residues = [('ASP', '102'), ('HIS', '57'), ('SER', '195')]
main("1TLD.pdb", residues, "trypsin_triad.pdb")

6. 计算两个原子之间的距离

#计算PDB文件中两个原子间的距离
from math import sqrt
from distance import calc_dist
from parse_pdb import parse_atom_line

pdb = open('3G5U.pdb')
points = []

while len(points) < 2:
    line = pdb.readline()
    if line.startswith("ATOM"):
        chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
        if res_num == '123' and chain == 'A' and atom == 'CA':
            points.append((x, y, z))
        if res_num == '209' and chain == 'A' and atom == 'CA':
            points.append((x, y, z))


print(calc_dist(points[0], points[1]))
35.219262627147664

####计算PDB中所有CA原子的距离
from math import sqrt
from distance import calc_dist
from parse_pdb import parse_atom_line

def get_ca_atoms(pdb_filename):
    pdb_file = open(pdb_filename, "r") #提取ca原子的数据列表
    ca_list = []
    for line in pdb_file:
        if line.startswith('ATOM'):
            data = parse_atom_line(line)
            chain, res_type, res_num, atom, x, y, z = data
            if atom == 'CA' and chain == 'A': 
                ca_list.append(data)
    pdb_file.close()
    return ca_list

ca_atoms = get_ca_atoms("1TLD.pdb")
for i, atom1 in enumerate(ca_atoms):
    # save coordinates in a variable
    name1 = atom1[1] + atom1[2]
    coord1 = atom1[4:]
    # compare atom1 with all other atoms
    for j in range(i+1, len(ca_atoms)):
        atom2 = ca_atoms[j]
        name2 = atom2[1] + atom2[2]
        coord2 = atom2[4:]
        # calculate the distance between atoms
        dist = calc_dist(coord1, coord2)
        print(name1, name2, dist)

使用python处理生物信息数据（九）

1. 从PDB文件中提取fasta序列

2. 写一个极简单的函数

3. 使用struct模块将字符串转为元组

4. 将任意数量的参数转化为tab符分隔的字符串

5. 提取PDB文件中特定的残基

6. 计算两个原子之间的距离

猜你喜欢

热点阅读