python：批量汇总统计fastq文件序列数、碱基数、GC%、

2020-07-13 本文已影响0人胡童远

导读

python：文件查询，统计fastq序列数、碱基数、GC%、MaxLength、MinLength

前面写了类似的上篇，用来处理一个样品的测序数据。这篇可以处理多个测序数据。

一、输入数据

tree rawdata
rawdata
├── CON1_R1.fastq
├── CON1_R2.fastq
├── CON2_R1.fastq
├── CON2_R2.fastq
├── CON3_R1.fastq
├── CON3_R2.fastq
├── TREAT1_R1.fastq
├── TREAT1_R2.fastq
├── TREAT2_R1.fastq
├── TREAT2_R2.fastq
├── TREAT3_R1.fastq
└── TREAT3_R2.fastq

或者

tree Clean_data/
Clean_data/
├── CON1_1.fastq
├── CON1_2.fastq
├── CON2_1.fastq
├── CON2_2.fastq
├── CON3_1.fastq
├── CON3_2.fastq
├── TREAT1_1.fastq
├── TREAT1_2.fastq
├── TREAT2_1.fastq
├── TREAT2_2.fastq
├── TREAT3_1.fastq
└── TREAT3_2.fastq

二、python3实现

2.1 思路：
1 写序列统计函数
2 读取文件名，split，获取样品名
3 re.findall确定后缀【列表排序后取后缀，保证分别是R1，R2】
4 函数处理文件
5 格式化输出

2.2 python3脚本

vi summary_fastq.py

#!/usr/bin/env python3
import os, re, sys
ms, infold, outfile = sys.argv

def count_fastq(fastq_1, fastq_2):
    with open(fastq_1) as f1, open(fastq_2) as f2:
        result = []
        lengths = []
        read_num = 0
        base_num = 0
        gc_num = 0
        
        line_num = 0
        for line in f1:
            line_num += 1
            if line_num % 4 == 2:
                read_num += 1
                base_num += len(line)
                gc_num += line.count("G") + line.count("g") + line.count("C") + line.count("c")
                lengths.append(len(line))
                
        line_num = 0
        for line in f2:
            line_num += 1
            if line_num % 4 == 2:
                read_num += 1
                base_num += len(line)
                gc_num += line.count("G") + line.count("g") + line.count("C") + line.count("c")
                lengths.append(len(line))
        gc_percent = format(gc_num/base_num*100, '0.2f')
        result.append(read_num)
        result.append(base_num)
        result.append(gc_percent)
        result.append(max(lengths))
        result.append(min(lengths))
        return(result)
  
file_name = []
for each in os.listdir(infold):
    each = re.findall(r'(^.*)_', each)
    file_name.append(''.join(each))

postfix_1 = re.findall(r'_(.*)', sorted(os.listdir(infold))[0])
postfix_2 = re.findall(r'_(.*)', sorted(os.listdir(infold))[1])

with open(outfile, 'w') as o:
    o.write("\tInsertSize(bp)\tSeqStrategy\tReads(#)\tBase(GB)\tGC(%)\tMaxLength\tMinLength\n")
    for each in list(set(file_name)):
        fastq_1 = "{}/{}_{}".format(infold, each, ''.join(postfix_1))
        fastq_2 = "{}/{}_{}".format(infold, each, ''.join(postfix_2))
        reads_info = count_fastq(fastq_1, fastq_2)
        read_num = reads_info[0]
        base_num = format(reads_info[1]/10**9, '0.2f')
        gc_percent = reads_info[2]
        max_length = reads_info[3]
        min_length = reads_info[4]
        o.write("{}\t350\t(150:150)\t{}\t{}\t{}\t{}\t{}\n".format(each, read_num, base_num, gc_percent, max_length, min_length))

三、批量统计

python3 summary_fastq.py rawdata summary_data.txt
cat summary_data.txt
        InsertSize(bp)  SeqStrategy     Reads(#)        Base(GB)        GC(%)   MaxLength       MinLength
TREAT1  350     (150:150)       86126172        13.01   60.98   151     151
TREAT2  350     (150:150)       90939422        13.73   61.54   151     151
CON1    350     (150:150)       75509882        11.4    64.44   151     151
CON2    350     (150:150)       67161272        10.14   64.16   151     151
CON3    350     (150:150)       83932176        12.67   63.66   151     151
TREAT3  350     (150:150)       80084508        12.09   61.96   151     151

四、排序，网页格式化

4.1 思路：
1 frame.sort_index(axis=0, ascending=True)按0列排序，升序
2 html表头head：<tr><th>{}</th></tr>
3 html标体data：<tr><td>{}<>/td></tr>

4.2 代码：

vi table_to_html_summary_data.py

#!/usr/bin/env python3
import os, re, sys
import pandas as pd
ms, infile, outfile = sys.argv
with open(infile, 'r') as f:
    f = pd.read_csv(f, index_col=0, header=0, sep="\t")
    f = f.sort_index(axis=0, ascending=True)
    f.to_csv("temp.txt", sep="\t", index=True)
    with open(outfile, 'w') as o:
        with open("./temp.txt") as temp:
            num = 1
            for line in temp:
                line = line.strip()
                cell = re.split(r'\t', line)
                if num == 1:
                    o.write("<tr><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th><th>{}</th></tr>\n".format("SampleID", "InsertSize(bp)", "SeqStrategy", "Reads(#)", "Base(GB)", "GC(%)", "MaxLength", "MinLength"))
                    num += 1
                else:
                    o.write("\t\t\t\t\t<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>\n".format(cell[0], cell[1], cell[2], cell[3], cell[4], cell[5], cell[6], cell[7]))
os.remove("temp.txt")

4.3 把排序好的temp.txt文件直接格式化了，接着删除:

cat temp.txt
        InsertSize(bp)  SeqStrategy     Reads(#)        Base(GB)        GC(%)   MaxLength       MinLength
CON1    350     (150:150)       75509882        11.4    64.44   151     151
CON2    350     (150:150)       67161272        10.14   64.16   151     151
CON3    350     (150:150)       83932176        12.67   63.66   151     151
TREAT1  350     (150:150)       86126172        13.01   60.98   151     151
TREAT2  350     (150:150)       90939422        13.73   61.54   151     151
TREAT3  350     (150:150)       80084508        12.09   61.96   151     151

4.4 运行：

python3 table_to_html_summary_fastq.py summary_fastq.txt summary_fastq_html.txt

五、插到html table

python：在文本指定位置插入文本

python3 table_insert_html.py summary_rawdata table4 summary_rawdata_html.txt temp.html