基因组生信GO注释

「生信」基因组功能注释后的基因筛选

2019-05-13  本文已影响7人  bioinfo_boy

目录

  • 写在前面
  • 功能注释数据库介绍
  • 方法一: 以KEGG的注释结果为主, 筛选出每个品种包含的特异通路及基因
  • 方法二: 利用pfam的注释结果, 过滤掉每个品种具有相同结构域的基因, 对于剩下的基因, 利用其swissprot id 在DAVID数据库中查找他们的功能信息, 单个进行筛选
  • 最后

写在前面

功能注释数据库介绍

GO数据库

NCBI数据库

Swissprot数据库

KEGG数据库

InterProScan数据库

Pfam

方法一: 以KEGG的注释结果为主, 筛选出每个品种包含的特异通路及基因

文件准备

基本思路

脚本

这个脚本输出部分写的异常垃圾, 当时只是觉得自己能看懂就行了

# -*- encoding:UTF-8 -*-
import re
import os
import pprint

def text_open(file_path):
    '''用于一个通路文件和9个特异基因文件的导入'''
    try:
        text_name = []
        for line in open(file_path):
            line = line.strip()
            text_name.append(line)
        return text_name
    except:
        print "The 'text_open' has a question!"

def pathway_dic(pathway_text):
    '''通路文件生成字典,通路名称为key,通路包含的基因为value'''
    try:
        dic = {}
        partern1 = re.compile('^C')
        partern2 = re.compile('^D')
        for line in pathway_text:
            if partern1.search(line):
                key_name =line
                dic[key_name] = []
                continue
            if partern2.search(line):
                dic[key_name].append(line)
        return dic
    except:
        print "The 'pathway_dic' has a question!"

def geneid2keggid_dic(pear_text):
    '''特异基因信息生成字典1,基因id为key,kegg id为value'''
    try:
        dic = {}
        partern1 = re.compile('K\d{5}')
        for line in pear_text:
            line_table = line.split('\t')
            if partern1.search(line_table[8]):
                dic[line_table[0]] = partern1.search(line_table[8]).group()
        return dic
    except:
        print "The 'geneid2keggid_dic' has a question!"

def geneid2keggline_dic(pear_text):
    '''特异基因信息生成字典2,基因id为key,kegg注释信息为value'''
    try:
        dic = {}
        partern1 = re.compile('K\d{5}')
        for line in pear_text:
            line_table = line.split('\t')
            if partern1.search(line_table[8]):
                dic[line_table[0]] = line_table[8]
        return dic
    except:
        print "The 'geneid2keggline_dic' has q question!"

def nine_list(osdir):
    '''将9个文件的列表写入一个更大列表中'''
    try:
        specific_genefile_list = []
        big_table = []
        partern1 = re.compile('^P\w+_.*')
        # partern1 = re.compile('^sun.*')
        for lists in os.listdir(osdir):
            if partern1.search(lists):
                specific_genefile_list.append(lists)
        for file in specific_genefile_list:
            big_table.append(text_open(osdir+'/'+file))
        return big_table
    except:
        print "The 'nine_list' has a question!"

def our_assume1(big_dic):
    try:
        new_dic = {}
        for big_dic_key in big_dic.keys():
            if big_dic[big_dic_key] != []:
                n = 1
                small_table = big_dic[big_dic_key][-1].split('==>')
                partern2 = re.compile('^P[a-zA-Z]+')
                word = partern2.search(small_table[0]).group()
                partern1 = re.compile(word)
                for big_dic_value in big_dic[big_dic_key]:
                    if not partern1.search(big_dic_value):
                        n +=1
                if n == 1:
                    new_dic[big_dic_key] = big_dic[big_dic_key]
        return new_dic
    except:
        print "The 'our_assume1' has a question!"

def our_assume2and3(big_dic):
    try:
        new_dic1 = {}
        new_dic2 = {}
        new_dic3 = {}
        new_dic4 = {}
        for big_dic_key in big_dic.keys():
            if big_dic[big_dic_key] != []:
                words = []
                for big_dic_value in big_dic[big_dic_key]:
                    small_table = big_dic_value.split('==>')
                    partern2 = re.compile('^P[a-zA-Z]+')
                    words.append(partern2.search(small_table[0]).group())
                if 'Pbr' in words and 'Ppc' in words and 'Puc' in words and 'Pco' in words and 'Psi' in words and  'Ppw' not in words and 'Puw' not in words and 'Ppy' not in words and 'Pbe' not in words:
                    new_dic1[big_dic_key] = big_dic[big_dic_key]
                if 'Pbr' not in words and 'Ppc' not in words and 'Puc' not in words and 'Pco' not in words and 'Psi' not in words and  'Ppw' in words and 'Puw' in words and 'Ppy' in words and 'Pbe' in words:
                    new_dic2[big_dic_key] = big_dic[big_dic_key]
                if 'Pbr' in words and 'Ppc' in words and 'Puc' in words and 'Ppw' in words and 'Psi' in words and 'Puw' in words and 'Pbe' in words and 'Pco' not in words and 'Ppy' not in words:
                    new_dic3[big_dic_key] = big_dic[big_dic_key]
                if 'Pbr'not in words and 'Ppc'not in words and 'Puc' not in words and 'Ppw'not in words and 'Psi'not in words and 'Puw'not in words and 'Pbe' not in words and 'Pco' in words and 'Ppy' in words:
                    new_dic4[big_dic_key] = big_dic[big_dic_key]
        return new_dic1, new_dic2, new_dic3, new_dic4
    except:
        print "The 'our_assume2' has a question!"


if __name__ == '__main__':
    '''problems 不能找到其他物种没有定义的通路;没有KEGG注释的基因没办法参与定位;没有定位到的基因可能位于非特异区域'''

    big_table = nine_list('/Users/suncongrui/home/03_workshop/specific_gene/target_gene')
    pathway_text = text_open('/Users/suncongrui/home/03_workshop/specific_gene/plant/ko00001.keg')
    pathway_dic1 = pathway_dic(pathway_text)
    big_dic = {}
    for def2_key in pathway_dic1.keys():
        big_dic[def2_key] = []
    for specise in big_table:
        geneid2keggline_dic1 = geneid2keggline_dic(specise)
        geneid2keggid_dic1 = geneid2keggid_dic(specise)
        for geneid in geneid2keggid_dic1.keys():
            partern1 = re.compile(geneid2keggid_dic1[geneid])
            for pathway_dir_key in pathway_dic1.keys():
                for pathway_dir_value in pathway_dic1[pathway_dir_key]:
                    if partern1.search(pathway_dir_value):
                        big_dic[pathway_dir_key].append(geneid+'==>'+geneid2keggline_dic1[geneid])

    sun = our_assume1(big_dic)
    cong1, cong2, rui1, rui2 = our_assume2and3(big_dic)
    pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(sun)
    # pp.pprint(cong1)
    # pp.pprint(cong2)
    # pp.pprint(rui1)
    pp.pprint(rui2)

部分结果展示

输出异常垃圾...


垃圾输出

此方法的问题

方法二: 利用pfam的注释结果, 过滤掉每个品种具有相同结构域的基因, 对于剩下的基因, 利用其swissprot id 在DAVID数据库中查找他们的功能信息, 单个进行筛选

文件准备

脚本

# -*- encoding:UTF-8 -*-
import re
import os

def text_open(file_path):
    '''用于一个通路文件和9个特异基因文件的导入'''
    try:
        text_name = []
        for line in open(file_path):
            line = line.strip()
            text_name.append(line)
        return text_name
    except:
        print "The 'text_open' has a question!"

def nine_list(osdir):
    '''将9个文件的id-pfam列写入一个更大列表中'''
    try:
        specific_genefile_list = []
        big_table = []
        partern1 = re.compile('^P\w+_.*')
        for lists in os.listdir(osdir):
            if partern1.search(lists):
                specific_genefile_list.append(lists)
        for file in specific_genefile_list:
            pfam = []
            text_name = text_open(osdir+'/'+file)
            for line in text_name:
                line_table = line.split('\t')
                pfam.append(line_table[0]+'\t'+line_table[-1])
            big_table.append(pfam)
        return big_table
    except:
        print "The 'nine_list' has a question!"

if __name__ == '__main__':
    big_table = nine_list('/Users/suncongrui/home/03_workshop/specific_gene/target_gene')
    table = []
    for i in range(9):
        big_table_copy = big_table[:]
        table1 = []
        table2 = []
        del big_table_copy[i]
        for small_table_copy in big_table_copy:
            for line in small_table_copy:
                line_table1 = line.split('\t')
                table1.append(line_table1[1])
        for line in big_table[i]:
            line_table2 = line.split('\t')
            if line_table2[1] in table1:
                table2.append(line_table2[0]+'\t'+line_table2[1]+'\t'+'NO')
            else:
                table2.append(line_table2[0]+'\t'+line_table2[1]+'\t'+'YES')
        table.append(table2)
    for b in table:
        for c in b:
            print c
# -*- encoding:UTF-8 -*-
import re
import os

def text_open(file_path):
    '''用于一个通路文件和9个特异基因文件的导入'''
    try:
        text_name = []
        for line in open(file_path):
            line = line.strip()
            text_name.append(line)
        return text_name
    except:
        print "The 'text_open' has a question!"

def nine_list(osdir):
    '''将9个文件的id-swissprot-id列写入一个更大列表中'''
    try:
        specific_genefile_list = []
        big_table = []
        partern1 = re.compile('^P\w+_.*')
        for lists in os.listdir(osdir):
            if partern1.search(lists):
                specific_genefile_list.append(lists)
        for file in specific_genefile_list:
            text_name = text_open(osdir+'/'+file)
            for line in text_name:
                line_table = line.split('\t')
                big_table.append(line_table[0]+'\t'+line_table[7])
        return big_table
    except:
        print "The 'nine_list' has a question!"

if __name__ == '__main__':
    big_table = nine_list('/home/Dai_XG/software/user/74/target_gene')
    swissprot_text = text_open('/home/Dai_XG/software/user/74/uniprot_sprot.head')
    table = []
    for element1 in big_table:
        line_table = element1.split('\t')
        n = 0
        for element2 in swissprot_text:
            if line_table[1] == 'NA':
                continue
            if line_table[1] in element2:
                split = element2.split('|')
                print line_table[0]+'\t'+split[1]+'\t'+line_table[1]
                n += 1
        if n == 0:
            print line_table[0]+'\t'+'NO'+'\t'+line_table[1]

查找部分核心展示

这里要感谢一下chuxinxzz同学的帮助, 效率提高了不少

最后

夏天很热 -_-


热死我了
上一篇下一篇

猜你喜欢

热点阅读