python编程

python:获取BPGA基因聚类表

2021-07-09  本文已影响0人  胡童远

BPGA结果是这个样子的

打开Sequence是这样的

core_seq.txt是这样的


解析该格式,发现聚类对应信息:
>泛基因组类型/基因cluster编号/物种编号/物种编号_基因编号
序列序列序列序列序列序列序列序列序列序列序列序列序列序列

基因cluster总数是和该物种cluster core gene number一致的,可推断

由此设计基因聚类表基本格式:
rowname/index -> 基因cluster编号
colname -> 物种编号
cell -> 物种编号_基因编号

python3实现

#!/usr/bin/env python3
import re, os, sys
import pandas as pd  # pandas 编辑表格??
import numpy as np  # numpy 设计初始表

ms, core_file, out = sys.argv

##########
## core ##
##########
#infile = "core_seq.txt"
#out = "cluster_info_core.txt"

org = []
cluster = []

# 读取,拆分fasta标题行,存储物种,存储基因cluster
with open(core_file, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            #>core/72/3/Org3_Gene164
            #['', 'core', '72', '3', 'Org3_Gene164']
            org.append(line[3])
            cluster.append(line[2])

org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)

# numpy 构造数据框,用 org cluster 行列数
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
    np.arange(num_total).reshape((num_row, num_col)),
    columns = org,
    index = cluster)

# 给table 的每个cell赋上org_gene
with open(core_file, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            df.loc[line[2], line[3]] = line[4]

# 保存csv
df.to_csv(out, sep='\t', index = True)

运行:

conda activate python3.7
python3 script/cluster_info_core.py core_seq.txt cluster_info_core.txt

结果:


core accessory

accessory/uniq文件也用类似的方法处理
python3
稍有不同的是,uniq gene cluster文件中的物种可能不全(造成表格空缺),因此是最全的core文件物种代谢。

#!/usr/bin/env python3
import re, os, sys
import pandas as pd
import numpy as np

ms, core_file, infile, out = sys.argv

###################
## accessory or uniq ##
###################
#infile = "accessory_seq.txt"
#out = "cluster_info_accessory.txt"

org = []
cluster = []

# get all core org only
with open(core_file, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            #>core/72/3/Org3_Gene164
            #['', 'core', '72', '3', 'Org3_Gene164']
            org.append(line[3])

# get all accessory cluster only
with open(infile, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            #>core/72/3/Org3_Gene164
            #['', 'core', '72', '3', 'Org3_Gene164']
            cluster.append(line[2])

org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)

# 构造数据框
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
    np.arange(num_total).reshape((num_row, num_col)),
    columns = org,
    index = cluster)

# table 重新赋值
with open(infile, 'r') as i:
    for line in i:
        if ">" in line:
            line = line.strip()
            line = re.split(r'>|/', line)
            df.loc[line[2], line[3]] = line[4]

df.to_csv(out, sep='\t', index = True)

运行:

python3 script/cluster_info_acc_or_uniq.py core_seq.txt accessory_seq.txt cluster_info_accessory.txt

python3 script/cluster_info_acc_or_uniq.py core_seq.txt unique_seq.txt cluster_info_unique.txt

结果:

数字空白转NA

numpy构造的初始表使用数字填充的,accessory uniq部分会出现未被重新赋值的数字部分其实是NA

#!/usr/bin/env python3
import pandas as pd
import re, sys, os
ms, infile, out = sys.argv

df = pd.read_csv(infile, index_col = 0, header = 0, sep = "\t")
for i in range(len(df.index)):
    for j in range(len(df.columns)):
        if "Gene" not in str(df.iloc[i, j]):
            df.iloc[i, j] = "NA"

df.to_csv(out, sep='\t', index = True)

运行:

mkdir merge
python3 script/cluster_num2na.py \
cluster_info_unique.txt merge/cluster_info_unique_na.txt
python3 script/cluster_num2na.py \
cluster_info_accessory.txt merge/cluster_info_accessory_na.txt
accessory uniq
上一篇 下一篇

猜你喜欢

热点阅读