python:批量汇总统计fastq文件序列数、碱基数、GC%、
2020-07-13 本文已影响0人
胡童远
导读
python:文件查询,统计fastq序列数、碱基数、GC%、MaxLength、MinLength
前面写了类似的上篇,用来处理一个样品的测序数据。这篇可以处理多个测序数据。
一、输入数据
tree rawdata
rawdata
├── CON1_R1.fastq
├── CON1_R2.fastq
├── CON2_R1.fastq
├── CON2_R2.fastq
├── CON3_R1.fastq
├── CON3_R2.fastq
├── TREAT1_R1.fastq
├── TREAT1_R2.fastq
├── TREAT2_R1.fastq
├── TREAT2_R2.fastq
├── TREAT3_R1.fastq
└── TREAT3_R2.fastq
或者
tree Clean_data/
Clean_data/
├── CON1_1.fastq
├── CON1_2.fastq
├── CON2_1.fastq
├── CON2_2.fastq
├── CON3_1.fastq
├── CON3_2.fastq
├── TREAT1_1.fastq
├── TREAT1_2.fastq
├── TREAT2_1.fastq
├── TREAT2_2.fastq
├── TREAT3_1.fastq
└── TREAT3_2.fastq
二、python3实现
2.1 思路:
1 写序列统计函数
2 读取文件名,split,获取样品名
3 re.findall确定后缀【列表排序后取后缀,保证分别是R1,R2】
4 函数处理文件
5 格式化输出
2.2 python3脚本
vi summary_fastq.py
#!/usr/bin/env python3
import os, re, sys
ms, infold, outfile = sys.argv
def count_fastq(fastq_1, fastq_2):
with open(fastq_1) as f1, open(fastq_2) as f2:
result = []
lengths = []
read_num = 0
base_num = 0
gc_num = 0
line_num = 0
for line in f1:
line_num += 1
if line_num % 4 == 2:
read_num += 1
base_num += len(line)
gc_num += line.count("G") + line.count("g") + line.count("C") + line.count("c")
lengths.append(len(line))
line_num = 0
for line in f2:
line_num += 1
if line_num % 4 == 2:
read_num += 1
base_num += len(line)
gc_num += line.count("G") + line.count("g") + line.count("C") + line.count("c")
lengths.append(len(line))
gc_percent = format(gc_num/base_num*100, '0.2f')
result.append(read_num)
result.append(base_num)
result.append(gc_percent)
result.append(max(lengths))
result.append(min(lengths))
return(result)
file_name = []
for each in os.listdir(infold):
each = re.findall(r'(^.*)_', each)
file_name.append(''.join(each))
postfix_1 = re.findall(r'_(.*)', sorted(os.listdir(infold))[0])
postfix_2 = re.findall(r'_(.*)', sorted(os.listdir(infold))[1])
with open(outfile, 'w') as o:
o.write("\tInsertSize(bp)\tSeqStrategy\tReads(#)\tBase(GB)\tGC(%)\tMaxLength\tMinLength\n")
for each in list(set(file_name)):
fastq_1 = "{}/{}_{}".format(infold, each, ''.join(postfix_1))
fastq_2 = "{}/{}_{}".format(infold, each, ''.join(postfix_2))
reads_info = count_fastq(fastq_1, fastq_2)
read_num = reads_info[0]
base_num = format(reads_info[1]/10**9, '0.2f')
gc_percent = reads_info[2]
max_length = reads_info[3]
min_length = reads_info[4]
o.write("{}\t350\t(150:150)\t{}\t{}\t{}\t{}\t{}\n".format(each, read_num, base_num, gc_percent, max_length, min_length))
三、批量统计
python3 summary_fastq.py rawdata summary_data.txt
cat summary_data.txt
InsertSize(bp) SeqStrategy Reads(#) Base(GB) GC(%) MaxLength MinLength
TREAT1 350 (150:150) 86126172 13.01 60.98 151 151
TREAT2 350 (150:150) 90939422 13.73 61.54 151 151
CON1 350 (150:150) 75509882 11.4 64.44 151 151
CON2 350 (150:150) 67161272 10.14 64.16 151 151
CON3 350 (150:150) 83932176 12.67 63.66 151 151
TREAT3 350 (150:150) 80084508 12.09 61.96 151 151
四、排序,网页格式化
4.1 思路:
1 frame.sort_index(axis=0, ascending=True)按0列排序,升序
2 html表头head:<tr><th>{}</th></tr>
3 html标体data:<tr><td>{}<>/td></tr>
4.2 代码:
vi table_to_html_summary_data.py
#!/usr/bin/env python3
import os, re, sys
import pandas as pd
ms, infile, outfile = sys.argv
with open(infile, 'r') as f:
f = pd.read_csv(f, index_col=0, header=0, sep="\t")
f = f.sort_index(axis=0, ascending=True)
f.to_csv("temp.txt", sep="\t", index=True)
with open(outfile, 'w') as o:
with open("./temp.txt") as temp:
num = 1
for line in temp:
line = line.strip()
cell = re.split(r'\t', line)
if num == 1:
o.write("<tr><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th></tr>\n".format("SampleID", "InsertSize(bp)", "SeqStrategy", "Reads(#)", "Base(GB)", "GC(%)", "MaxLength", "MinLength"))
num += 1
else:
o.write("\t\t\t\t\t<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>\n".format(cell[0], cell[1], cell[2], cell[3], cell[4], cell[5], cell[6], cell[7]))
os.remove("temp.txt")
4.3 把排序好的temp.txt文件直接格式化了,接着删除:
cat temp.txt
InsertSize(bp) SeqStrategy Reads(#) Base(GB) GC(%) MaxLength MinLength
CON1 350 (150:150) 75509882 11.4 64.44 151 151
CON2 350 (150:150) 67161272 10.14 64.16 151 151
CON3 350 (150:150) 83932176 12.67 63.66 151 151
TREAT1 350 (150:150) 86126172 13.01 60.98 151 151
TREAT2 350 (150:150) 90939422 13.73 61.54 151 151
TREAT3 350 (150:150) 80084508 12.09 61.96 151 151
4.4 运行:
python3 table_to_html_summary_fastq.py summary_fastq.txt summary_fastq_html.txt
五、插到html table
python3 table_insert_html.py summary_rawdata table4 summary_rawdata_html.txt temp.html