[python]根据基因提取信息
2024-07-31 本文已影响0人
花生学生信
合并
先读取基因列表文件(每行一个基因),再读取第二个文件(制表符分隔,第七列是基因,第十一列是 SV 类型),把第七列中包含列表内基因的记录筛选出来,输出"基因 + SV 类型"。

import sys
def read_gene_list(filename):
    """Read gene names from a text file that holds one name per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped: an empty name is a substring of every string, so keeping it
    would make any later ``gene in text`` check succeed for every record.

    Args:
        filename: Path of the gene list file.

    Returns:
        A set with the unique, non-empty gene names found in the file.
    """
    with open(filename, 'r') as file:
        stripped = (line.strip() for line in file)
        return {name for name in stripped if name}
def process_sv_file(genes, sv_filename, output_filename):
    """Write the gene and SV type of every SV record that hits a listed gene.

    Each line of ``sv_filename`` is split on tab characters. Records with
    fewer than 11 columns are skipped. For the remaining records, column 7
    is searched for the names in ``genes`` using a substring test. On a hit
    one line is written to ``output_filename``: the matched gene name, a
    tab, and the content of column 11. At most one line is written per
    input record.

    Args:
        genes: Iterable of gene names to look for. Empty names are ignored
            because an empty string is contained in every string.
        sv_filename: Path of the tab-separated SV file to read.
        output_filename: Path of the result file; it is overwritten.
    """
    # Only one gene is reported per record, so the search order decides
    # which one. Iterating a set directly gives an order that can change
    # between runs; a fixed order keeps the output reproducible. Longer
    # names come first so that "LOC10" wins over "LOC1" for the text "LOC10".
    candidates = sorted((g for g in genes if g), key=lambda g: (-len(g), g))

    with open(sv_filename, 'r') as sv_file, open(output_filename, 'w') as output_file:
        for line in sv_file:
            # Remove only the line terminator. A plain strip() would also
            # eat leading/trailing tabs and shift the column positions
            # whenever the first or last fields are empty.
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 11:  # need at least 11 columns
                continue
            gene_part = parts[6]
            hit = next((g for g in candidates if g in gene_part), None)
            if hit is not None:
                sv_type = parts[10]
                output_file.write(f"{hit}\t{sv_type}\n")
if __name__ == "__main__":
    # Exactly three arguments must follow the script name.
    if len(sys.argv) != 4:
        print("Usage: python script.py <gene_list_file> <sv_data_file> <output_file>")
        sys.exit(1)

    gene_list_filename, sv_data_filename, output_filename = sys.argv[1:]

    # Load the gene names, then filter the SV file against them.
    genes = read_gene_list(gene_list_filename)
    process_sv_file(genes, sv_data_filename, output_filename)

    print("处理完成!")

统计每个基因受SV影响的数量
import sys
def read_gene_list(filename):
    """Read gene names, in file order, from a file with one name per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped: an empty name is a substring of every string, so keeping it
    would make any later ``gene in text`` check succeed for every record.

    Args:
        filename: Path of the gene list file.

    Returns:
        A list of the non-empty gene names in the order they appear.
        Repeated names are kept.
    """
    with open(filename, 'r') as file:
        stripped = (line.strip() for line in file)
        return [name for name in stripped if name]
def process_sv_file(genes, sv_filename, output_filename):
    """Count, for every listed gene, the SV records whose gene column names it.

    Each line of ``sv_filename`` is split on tab characters. Records with
    fewer than 11 columns are skipped. For the remaining records, column 7
    is searched for each name in ``genes`` using a substring test, and the
    counter of every gene found is increased by one. A record that names
    several listed genes therefore counts once for each of them, so the
    result does not depend on the order of the gene list.

    One line per entry of ``genes`` is written to ``output_filename``, in
    the order given: the gene name, a tab, and its count. Genes that never
    matched are written with a count of 0.

    Args:
        genes: Iterable of gene names. Empty names are never matched
            because an empty string is contained in every string.
        sv_filename: Path of the tab-separated SV file to read.
        output_filename: Path of the result file; it is overwritten.
    """
    genes = list(genes)  # iterated more than once below
    gene_counts = {gene: 0 for gene in genes}  # every gene starts at 0
    # Unique, non-empty names: a name repeated in the list must not be
    # counted twice for the same record.
    searchable = [gene for gene in gene_counts if gene]

    with open(sv_filename, 'r') as sv_file, open(output_filename, 'w') as output_file:
        for line in sv_file:
            # Remove only the line terminator. A plain strip() would also
            # eat leading/trailing tabs and shift the column positions
            # whenever the first or last fields are empty.
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 11:  # need at least 11 columns
                continue
            gene_part = parts[6]
            for gene in searchable:
                if gene in gene_part:
                    gene_counts[gene] += 1

        # Report every gene in the order of the input list.
        for gene in genes:
            output_file.write(f"{gene}\t{gene_counts[gene]}\n")
if __name__ == "__main__":
    if len(sys.argv) == 4:
        gene_list_filename = sys.argv[1]
        sv_data_filename = sys.argv[2]
        output_filename = sys.argv[3]
    else:
        # Wrong number of arguments: show the expected call and stop.
        print("Usage: python script.py <gene_list_file> <sv_data_file> <output_file>")
        sys.exit(1)

    # Step 1: load the gene list.
    genes = read_gene_list(gene_list_filename)

    # Step 2: count the SV records per gene and write the table.
    process_sv_file(genes, sv_data_filename, output_filename)

    print("处理完成!")

# Run wo1.py once for every sample ID listed in the file "all" (one ID per line).
# The input for each ID is ../<id>; the result is written to x_g_diff2/<id>.
mkdir -p x_g_diff2  # the Python script does not create the output directory
while IFS= read -r id
do
    # -r keeps backslashes literal; the quotes keep IDs with spaces or
    # glob characters intact.
    python wo1.py gene "../$id" "x_g_diff2/$id"
done < all

合并结果:
import os
import csv
def merge_tsv_files(output_filename):
    """Merge all ``.tsv`` files of the current directory into one TSV file.

    The first line of every input file is treated as its header (all files
    are assumed to share the same header). The header is written once,
    extended by a ``File_Name`` column; every data row is copied with the
    name of the file it came from appended as the last column.

    Input files are processed in sorted name order. The output file itself
    is never used as an input, so the function can be run repeatedly in
    the same directory. Files without any line are skipped.

    Args:
        output_filename: Path of the merged file; it is overwritten.
    """
    output_path = os.path.abspath(output_filename)
    # Sorted so the row order of the result is reproducible; os.listdir()
    # returns names in arbitrary order.
    tsv_files = sorted(
        name for name in os.listdir('.')
        if name.endswith('.tsv') and os.path.abspath(name) != output_path
    )

    # Nothing to merge.
    if not tsv_files:
        print("没有找到 TSV 文件。")
        return

    header_written = False
    with open(output_filename, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        for filename in tsv_files:
            with open(filename, 'r', newline='') as infile:
                reader = csv.reader(infile, delimiter='\t')
                # Always consume the header line so that the headers of the
                # second and later files are not copied as data rows.
                header = next(reader, None)
                if header is None:  # file has no lines at all
                    continue
                if not header_written:
                    writer.writerow(header + ['File_Name'])
                    header_written = True
                # Copy the data rows, tagging each with its source file.
                for row in reader:
                    writer.writerow(row + [filename])

    print(f"文件已成功合并到 {output_filename}")
if __name__ == "__main__":
    # The merged table is written to this file in the current directory.
    output_filename = "merged_with_filenames.tsv"

    # Combine every TSV file found in the current directory.
    merge_tsv_files(output_filename)
