python:文件查询,统计fastq序列数、碱基数、GC%、M
2020-06-30 本文已影响0人
胡童远
导读
做最基本的fastq序列统计。这里的输入数据是一个上游文件,一个下游文件。
一、新建文件
vi summary_fastq.py
二、写函数
- 函数1:查询文件
思路:
1 for os.listdir 遍历文件
2 if in判断是否含关键字符串
- 函数2:统计fastq序列数、碱基数、GC%
思路:
1 for in 逐行遍历fastq文件
2 行数%4==2是序列行
3 累加序列数、碱基数、GC数【format格式化GB单位】,记录序列长度,求GC%和MaxLength、MinLength
4 加到列表return
#!/usr/bin/env python3
import os, re, sys
def search(path, key):
result = []
for filename in os.listdir(path):
file = os.path.join(path, filename)
if os.path.isfile(file) and key in filename:
result.append(file)
return(result)
def reads(file):
with open(file) as f:
result = []
lengths = []
line_num = 0
gc_num = 0
base_num = 0
read_num = 0
for line in f:
line_num += 1
if line_num%4 ==2:
read_num += 1
gc_num += line.count("G") + line.count("g") + line.count("C") + line.count("c")
base_num += len(line)
lengths.append(len(line))
print("\033[32m---------------read_num, base_num, gc_percent, max_length, min_length---------------\033[0m")
gc_percent = format(gc_num/base_num*100, '0.2f')
result.append(read_num)
result.append(base_num)
result.append(gc_percent)
result.append(max(lengths))
result.append(min(lengths))
return(result)
三、主函数
思路:
1 合并raw clean数据r1 r2
2 reads函数处理
3 格式化输出
4 判断并删除中间文件
if __name__ == '__main__':
print("\033[32m---------------merge rawdata---------------\033[0m")
os.system("cat " + search("rawdata", "R")[0] + " " + search("rawdata", "R")[1] + " > merge_raw.fq")
print("\033[32m---------------merge cleandata---------------\033[0m")
os.system("cat " + search("Result/kneaddata", "trimmed.1")[0] + " " + search("Result/kneaddata", "trimmed.2")[0] + " > merge_clean.fq")
with open("Result/kneaddata/fastq_summary.txt", 'w') as o:
o.write("\tInsertSize(bp)\tSeqStrategy\tReads(#)\tBase(GB)\tGC(%)\tMaxLength\tMinLength\n")
if os.path.isfile("./merge_raw.fq") and os.path.isfile("./merge_raw.fq"):
f1 = "./merge_raw.fq"
reads_info = reads(f1)
read_num = reads_info[0]
base_num = format(reads_info[1]/10**9, '0.2f')
gc_percent = reads_info[2]
max_length = reads_info[3]
min_length = reads_info[4]
#print(read_num, base_num, gc_percent, max_length, min_length)
o.write("RawData\t350\t(150:150)\t{}\t{}\t{}\t{}\t{}\n".format(read_num, base_num, gc_percent, max_length, min_length))
f2 = "./merge_clean.fq"
reads_info = reads(f2)
read_num = reads_info[0]
base_num = format(reads_info[1]/10**9, '0.2f')
gc_percent = reads_info[2]
max_length = reads_info[3]
min_length = reads_info[4]
#print(read_num, base_num, gc_percent, max_length, min_length)
o.write("CleanData\t350\t(150:150)\t{}\t{}\t{}\t{}\t{}".format(read_num, base_num, gc_percent, max_length, min_length))
if os.path.isfile("Result/kneaddata/fastq_summary.txt"):
os.remove("./merge_raw.fq")
os.remove("./merge_clean.fq")
else:
print("\033[31m no fastq_summary.txt! please check!\033[0m")
四、结果
python3 summary_fastq.py
InsertSize(bp) SeqStrategy Reads(#) Base(GB) GC(%) MaxLength MinLength
RawData 350 (150:150) 16493814 2.49 32.90 151 151
CleanData 350 (150:150) 14872620 2.14 32.82 151 51