python生信小练习(三)
2018-02-23 本文已影响25人
杨亮_SAAS
生信菜鸟团的编程练习:
对FASTQ的操作
- 5'端、3'端截掉几个碱基
- 序列长度分布统计
- FASTQ 转换成 FASTA
- 统计碱基个数及GC%
对FASTA的操作
- 取互补序列
- 取反向序列
- DNA to RNA
- 大小写字母形式输出
- 每行指定长度输出序列
- 按照序列长度/名字排序
- 提取指定ID的序列
- 随机抽取序列
def trim(file, terminal5, terminal3, output=None):
    """Trim bases from both ends of every read in a FASTQ file.

    For each 4-line FASTQ record only the header line and the trimmed
    sequence are written (two lines per read); the '+' separator and the
    quality line are not carried over. Reads sharing the same header line
    overwrite each other, the last one winning.

    Args:
        file: Path to the input FASTQ file.
        terminal5: Number of bases to remove from the 5' end.
        terminal3: Number of bases to remove from the 3' end. ``0`` keeps
            the 3' end intact.
        output: Path of the file to write. When omitted, the original
            hard-coded location is used.
    """
    if output is None:
        output = r'E:\Bioinformatics\Python\practice\PyCharm\practice of biotrainee\trim.txt'

    trimmed = {}
    read_id = ''
    with open(file) as handle:
        for index, line in enumerate(handle):
            position = index % 4
            if position == 0:
                # First line of a record: the read name.
                read_id = line.strip()
            elif position == 1 and read_id:
                # Second line of a record: the sequence.
                seq = line.strip()
                # Use an explicit end index: a slice ending at -0 would be
                # empty, and the clamp keeps over-long trims from wrapping.
                end = max(len(seq) - terminal3, 0)
                trimmed[read_id] = seq[terminal5:end]

    with open(output, 'w') as f:
        for key, value in trimmed.items():
            print('{}\n{}'.format(key, value), file=f)
# Example invocation: trim 5 bases from the 5' end and 8 bases from the
# 3' end of every read in the test FASTQ file below.
# NOTE(review): this appears to run at module level, so it executes on
# import and needs this Windows path to exist — confirm that is intended.
f1 = r'E:\Bioinformatics\Python\practice\chentong\notebook-master\data\test1.fq'
trim(f1, 5, 8)
def readLength(file):
    """Print the sequence length of every read in a FASTQ file.

    One integer is printed per record, in file order. Every record is
    reported, including records whose read name repeats an earlier one.
    Records with an empty header line (e.g. trailing blank lines) are
    ignored.

    Args:
        file: Path to the input FASTQ file (4 lines per record).
    """
    lengths = []
    header = ''
    with open(file) as handle:
        for index, line in enumerate(handle):
            position = index % 4
            if position == 0:
                # First line of a record: the read name.
                header = line.strip()
            elif position == 1 and header:
                # Second line of a record: the sequence.
                lengths.append(len(line.strip()))

    for length in lengths:
        print(length)
def fq2fa(file, output=None):
    """Convert a FASTQ file to FASTA.

    The FASTA identifier is the FASTQ header with the leading '@' removed
    and everything after the first space dropped. Reads sharing the same
    identifier overwrite each other, the last one winning.

    Args:
        file: Path to the input FASTQ file (4 lines per record).
        output: Path of the FASTA file to write. When omitted, the
            original hard-coded location is used.
    """
    if output is None:
        output = r'E:\Bioinformatics\Python\practice\PyCharm\practice of biotrainee\fq2fa.txt'

    records = {}
    read_id = ''
    with open(file) as handle:
        for index, line in enumerate(handle):
            position = index % 4
            if position == 0:
                # First line of a record: the read name.
                header = line.strip()
                if header.startswith('@'):
                    header = header[1:]
                # Keep only the text before the first space; the result
                # must be a string so it can be used as a dict key.
                read_id = header.split(' ', 1)[0]
            elif position == 1 and read_id:
                # Second line of a record: the sequence.
                records[read_id] = line.strip()

    with open(output, 'w') as f:
        for key, value in records.items():
            print('>{}\n{}'.format(key, value), file=f)
def countGC(file):
    """Print the total number of bases and the GC percentage of a FASTQ file.

    Only the sequence line (second line of each 4-line record) is counted.
    G and C are counted case-insensitively. A file without any sequence
    reports a length of 0 and a GC percentage of 0.0 instead of raising
    ZeroDivisionError.

    Args:
        file: Path to the input FASTQ file.
    """
    total = 0
    gc = 0
    with open(file) as handle:
        for index, line in enumerate(handle):
            if index % 4 == 1:
                # Second line of a record: the sequence.
                seq = line.strip().upper()
                total += len(seq)
                gc += seq.count('G') + seq.count('C')

    print('The number of length is {}'.format(total))
    # Guard the division so an empty input does not crash.
    percent = gc / total * 100 if total else 0.0
    print('GC% is {}%'.format(percent))
def complementary(file, width=60):
    """Print the complement of every sequence in a FASTA file.

    Each record is printed as its header line followed by the complemented
    sequence in upper case, wrapped at ``width`` characters per line.
    A/C/G/T are complemented regardless of case; any other character is
    only upper-cased. Lines appearing before the first '>' header are
    ignored. Records sharing the same header line are merged into the
    last one.

    Args:
        file: Path to the input FASTA file.
        width: Number of characters per output sequence line (must be > 0).
    """
    # One translation pass handles both cases.
    table = str.maketrans('ACGTacgt', 'TGCATGCA')
    fasta = {}
    key = None
    with open(file) as handle:
        for line in handle:
            if line.startswith('>'):
                key = line.strip()
                fasta[key] = []
            elif key is not None:
                fasta[key].append(line.strip().translate(table).upper())

    for header, chunks in fasta.items():
        print(header)
        sequence = ''.join(chunks)
        for start in range(0, len(sequence), width):
            print(sequence[start:start + width])
def reverse(file, width=60):
    """Print every sequence of a FASTA file reversed.

    Each record is printed as its header line followed by the full
    sequence (all of its lines joined) in reverse order, wrapped at
    ``width`` characters per line. Lines appearing before the first '>'
    header are ignored. Records sharing the same header line are merged
    into the last one.

    Args:
        file: Path to the input FASTA file.
        width: Number of characters per output sequence line (must be > 0).
    """
    fasta = {}
    key = None
    with open(file) as handle:
        for line in handle:
            if line.startswith('>'):
                key = line.strip()
                fasta[key] = []
            elif key is not None:
                fasta[key].append(line.strip())

    for header, chunks in fasta.items():
        print(header)
        # Join first, then reverse, so the whole record is reversed rather
        # than each input line separately.
        rev = ''.join(chunks)[::-1]
        for start in range(0, len(rev), width):
            print(rev[start:start + width])
def dna2rna(file, width=60):
    """Print every sequence of a FASTA file with T replaced by U.

    'T' becomes 'U' and 't' becomes 'u'; all other characters are left
    untouched. Each record is printed as its header line followed by the
    converted sequence wrapped at ``width`` characters per line. Lines
    appearing before the first '>' header are ignored. Records sharing
    the same header line are merged into the last one.

    Args:
        file: Path to the input FASTA file.
        width: Number of characters per output sequence line (must be > 0).
    """
    table = str.maketrans('Tt', 'Uu')
    fasta = {}
    key = None
    with open(file) as handle:
        for line in handle:
            if line.startswith('>'):
                key = line.strip()
                fasta[key] = []
            elif key is not None:
                fasta[key].append(line.strip().translate(table))

    for header, chunks in fasta.items():
        print(header)
        sequence = ''.join(chunks)
        for start in range(0, len(sequence), width):
            print(sequence[start:start + width])
def upperandlower(file, width=60):
    """Print a FASTA file twice: first in upper case, then in lower case.

    All records are printed in upper case first, followed by all records
    in lower case. Header lines are printed unchanged; sequences are
    wrapped at ``width`` characters per line. Lines appearing before the
    first '>' header are ignored. Records sharing the same header line
    are merged into the last one.

    Args:
        file: Path to the input FASTA file.
        width: Number of characters per output sequence line (must be > 0).
    """
    fasta = {}
    key = None
    with open(file) as handle:
        for line in handle:
            if line.startswith('>'):
                key = line.strip()
                fasta[key] = []
            elif key is not None:
                fasta[key].append(line.strip())

    # Store each sequence once and apply the case conversion at print time.
    for convert in (str.upper, str.lower):
        for header, chunks in fasta.items():
            print(header)
            sequence = convert(''.join(chunks))
            for start in range(0, len(sequence), width):
                print(sequence[start:start + width])
def sortLength():
fasta = {}
for line in open(file):
if line.startswith('>'):
key = line.strip()
fasta[key] = []
else:
fasta[key].append(line.strip())
for key, value in fasta.items():
seq = ''.join(value)
fasta[key] = seq