python：获取BPGA基因聚类表

2021-07-09 本文已影响0人胡童远

BPGA结果是这个样子的

打开Sequence是这样的

core_seq.txt是这样的

解析该格式，发现聚类对应信息：
>泛基因组类型/基因cluster编号/物种编号/物种编号_基因编号
序列序列序列序列序列序列序列序列序列序列序列序列序列序列

基因cluster总数是和该物种cluster core gene number一致的，可推断

由此设计基因聚类表基本格式：
rowname/index -> 基因cluster编号
colname -> 物种编号
cell -> 物种编号_基因编号

python3实现

#!/usr/bin/env python3
import re, os, sys
import pandas as pd  # pandas 编辑表格？？
import numpy as np  # numpy 设计初始表

ms, core_file, out = sys.argv

##########
## core ##
##########
#infile = "core_seq.txt"
#out = "cluster_info_core.txt"

org = []
cluster = []

# 读取，拆分fasta标题行，存储物种，存储基因cluster
with open(core_file, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            #>core/72/3/Org3_Gene164
            #['', 'core', '72', '3', 'Org3_Gene164']
            org.append(line[3])
            cluster.append(line[2])

org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)

# numpy 构造数据框，用 org cluster 行列数
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
    np.arange(num_total).reshape((num_row, num_col)),
    columns = org,
    index = cluster)

# 给table 的每个cell赋上org_gene
with open(core_file, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            df.loc[line[2], line[3]] = line[4]

# 保存csv
df.to_csv(out, sep='\t', index = True)

运行：

conda activate python3.7
python3 script/cluster_info_core.py core_seq.txt cluster_info_core.txt

结果：

core

accessory

accessory/uniq文件也用类似的方法处理
python3
稍有不同的是，uniq gene cluster文件中的物种可能不全（造成表格空缺），因此是最全的core文件物种代谢。

#!/usr/bin/env python3
import re, os, sys
import pandas as pd
import numpy as np

ms, core_file, infile, out = sys.argv

###################
## accessory or uniq ##
###################
#infile = "accessory_seq.txt"
#out = "cluster_info_accessory.txt"

org = []
cluster = []

# get all core org only
with open(core_file, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            #>core/72/3/Org3_Gene164
            #['', 'core', '72', '3', 'Org3_Gene164']
            org.append(line[3])

# get all accessory cluster only
with open(infile, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            #>core/72/3/Org3_Gene164
            #['', 'core', '72', '3', 'Org3_Gene164']
            cluster.append(line[2])

org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)

# 构造数据框
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
    np.arange(num_total).reshape((num_row, num_col)),
    columns = org,
    index = cluster)

# table 重新赋值
with open(infile, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            df.loc[line[2], line[3]] = line[4]

df.to_csv(out, sep='\t', index = True)

运行：

python3 script/cluster_info_acc_or_uniq.py core_seq.txt accessory_seq.txt cluster_info_accessory.txt

python3 script/cluster_info_acc_or_uniq.py core_seq.txt unique_seq.txt cluster_info_unique.txt

结果：

数字空白转NA

numpy构造的初始表使用数字填充的，accessory uniq部分会出现未被重新赋值的数字部分其实是NA

#!/usr/bin/env python3
import pandas as pd
import re, sys, os
ms, infile, out = sys.argv

df = pd.read_csv(infile, index_col = 0, header = 0, sep = "\t")
for i in range(len(df.index)):
    for j in range(len(df.columns)):
        if "Gene" not in str(df.iloc[i, j]):
            df.iloc[i, j] = "NA"

df.to_csv(out, sep='\t', index = True)

运行：

mkdir merge
python3 script/cluster_num2na.py \
cluster_info_unique.txt merge/cluster_info_unique_na.txt
python3 script/cluster_num2na.py \
cluster_info_accessory.txt merge/cluster_info_accessory_na.txt

accessory

uniq

python：获取BPGA基因聚类表

猜你喜欢

热点阅读