处理deeparg结果的python脚本
2021-05-23 本文已影响0人
九月_1012
#!/bin/python3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import click
import os
#test="XXX.mapping.ARG"
#features_gene_length="/XX/deeparg-ss/data/database/v2/features.gene.length"
def arg_png(out_png, arg_pd):
#1 药物类型饼图
drug_class=arg_pd['predicted_ARG-class']
drug_class_type=drug_class.value_counts()
drug_class_df=pd.DataFrame(drug_class_type)
#plt.figure()
drug_class_df.plot.pie(subplots=True, figsize=(8, 4))
#fig.savefig('AMR.'+out_png, dpi=300)
plt.savefig('AMR.'+out_png)
#plt.show()
#return out_png
#得到数据库中每个ARG的平均长度
def read_amr_len(ARG_length):
d_arg_len={}
d_arg_avg_len={}
with open (ARG_length,'r') as f:
for line in f:
ARG, length = line.rstrip().split()
ARG = ARG.split("|")[-1]
ARG=ARG.upper()
d_arg_len.setdefault(ARG,[]).append(int(length))
for k,v in d_arg_len.items():
avg = int(np.mean(v))
d_arg_avg_len[k]=avg
return d_arg_avg_len
#去除覆盖区间的overlap区域,计算ARG的cover region
def cover_region_depth(num_list):
'''
输入:线段起点、终点
输出:总的覆盖长度
'''
start=10000
end=0
for one_list in num_list:
if one_list[0]<start:
start=one_list[0]
if one_list[1]>end:
end=one_list[1]
#print start, end
flag=['false']*end
for i in range(len(num_list)):
for j in range(num_list[i][0], num_list[i][1]):
flag[j]=True
#print flag
return flag.count(True)
def arg_table(deeparg_file, arg_pd, d_arg_avg_len):
#2 drug class ARG的reads数
drug_class_arg=arg_pd.loc[:, ['predicted_ARG-class','#ARG','counts']]
arg_count=drug_class_arg.groupby(['predicted_ARG-class','#ARG'],as_index=False).sum()
#3 计算cover、深度、相似度
d_arg_cover={}
d_arg_identity={}
with open (deeparg_file,'r') as f:
next(f) # skip the first line.
for line in f:
info=line.rstrip().split()
#temp=[]
temp=[int(info[1]),int(info[2])]
#d_arg_reads[info[0]]+=1
d_arg_cover.setdefault(info[0],[]).append(temp)
d_arg_identity.setdefault(info[0],[]).append(float(info[7]))
#d_arg_avg_len=read_amr_len(test2)
temp_list = []
for arg in d_arg_cover.keys():
cover_len=cover_region_depth(d_arg_cover[arg])
cover_p=format(cover_len*100/d_arg_avg_len[arg],'.1f')
id=max(d_arg_identity[arg])
#cover_p=cover_len/read_amr_len(arg)
#print("%s\t%d\t%.2f%%\t%.2f%%" % (arg, cover_region_depth(d_arg_cover[arg]),cover_p,id))
temp_list.append([arg, cover_region_depth(d_arg_cover[arg]),d_arg_avg_len[arg],cover_p,id])
cover_id_pd=pd.DataFrame(temp_list,columns=['#ARG', 'Cover_length','Gene_length','Coverage(%)', 'Identity(%)'])
final=pd.merge(arg_count, cover_id_pd, on='#ARG')
final=final.rename(columns={'counts':'Reads','#ARG':'ARG','Gene_length':'ARG_length'})
return final
@click.command()
@click.option('--deeparg_file', help='the deeparg results as: XXX.mapping.ARG')
@click.option('--out_table', help='The output ARG table.')
@click.option('--out_png', help='The output ARG drug class png.')
@click.option('--features_gene_length', default="/XX/deeparg/deeparg-ss/data/database/v2/features.gene.length", help='The database ARG length file.')
def main(features_gene_length, deeparg_file, out_png, out_table):
#features_gene_length="/XX/deeparg/deeparg-ss/data/database/v2/features.gene.length"
d_arg_avg_len=read_amr_len(features_gene_length)
arg_pd=pd.read_table(deeparg_file, header=0)
arg_png(out_png, arg_pd)
final=arg_table(deeparg_file, arg_pd, d_arg_avg_len)
#xls=open(out_table,'w')
#xls.write(final)
final.to_csv(out_table)
if not os.path.exists(out_table):
os.system(r"touch {}".format(path))#调用系统命令行来创建文件
if __name__=="__main__":
main()
"""
注:predicted_ARG-class:耐药基因的种类;ARG:Antibiotic resistance genes, 耐药基因名称;
Reads:覆盖到耐药基因的reads数;Cover_length:覆盖的耐药基因的长度;ARG_length:耐药基因长度;
Coverage:覆盖率;Identity:reads和耐药基因的相似度;
"""