使用python处理生物信息数据(七)
2020-03-09 本文已影响0人
你猜我菜不菜
Python学习的第七天,主要学习数据排序。
1. 表格按指定列排序并打印输出。
from operator import itemgetter
# Read a whitespace-separated table of numbers, sort its rows by one column,
# and print the sorted rows as tab-separated text.
table = []
with open("random_distribution.tsv") as in_file:  # closed automatically, even on error
    for line in in_file:
        columns = [float(x) for x in line.split()]  # parse every field as a float
        if columns:  # skip blank lines; an empty row would break the column lookup below
            table.append(columns)

# itemgetter indices are 0-based, so 1 selects the SECOND column of each row.
sort_column = 1
table_sorted = sorted(table, key=itemgetter(sort_column))
for row in table_sorted:
    print("\t".join(str(x) for x in row))
6153.0 58.0 0.00942629611572 40.0 0.00650089387291 260.0 0.0422558101739
6101.0 64.0 0.0104900835929 41.0 0.00672020980167 299.0 0.0490083592854
6101.0 68.0 0.0111457138174 39.0 0.0063923946894 274.0 0.0449106703819
6066.0 70.0 0.0115397296406 35.0 0.00576986482031 260.0 0.0428618529509
6100.0 72.0 0.0118032786885 21.0 0.00344262295082 257.0 0.042131147541
6131.0 72.0 0.011743598108 36.0 0.00587179905399 260.0 0.0424074376121
6119.0 73.0 0.0119300539304 39.0 0.00637359045596 249.0 0.0406929236803
6107.0 73.0 0.0119534959882 68.0 0.0111347633863 276.0 0.0451940396267
6114.0 74.0 0.0121033693163 29.0 0.00474321229964 241.0 0.0394177298005
6138.0 74.0 0.0120560443141 44.0 0.00716845878136 274.0 0.0446399478658
6121.0 74.0 0.0120895278549 39.0 0.00637150792354 269.0 0.0439470674726
6101.0 75.0 0.0122930667104 36.0 0.00590067202098 279.0 0.0457302081626
6104.0 75.0 0.0122870249017 29.0 0.00475098296199 278.0 0.0455439056356
6142.0 76.0 0.0123738196027 37.0 0.00602409638554 252.0 0.041028980788
6184.0 76.0 0.0122897800776 40.0 0.00646830530401 250.0 0.0404269081501
6216.0 76.0 0.0122265122265 50.0 0.00804375804376 256.0 0.041184041184
6073.0 76.0 0.0125144080356 40.0 0.00658653054504 266.0 0.0438004281245
6111.0 77.0 0.0126002290951 31.0 0.00507281950581 280.0 0.0458190148912
6127.0 78.0 0.0127305369675 43.0 0.00701811653338 259.0 0.0422719112127
6113.0 79.0 0.0129232782594 22.0 0.00359888761655 290.0 0.0474398822182
.....
.......
.........
2. 字典排序
# A small dictionary whose keys were inserted out of numeric order.
data = {
    1: 'a',
    2: 'b',
    4: 'd',
    3: 'c',
    5: 't',
    6: 'm',
    36: 'z',
}
keys = [*data]  # iterating a dict yields its keys, in insertion order
keys
Out[41]: [1, 2, 4, 3, 5, 6, 36]
keys.sort()  # sort the key list in place
for key in keys:
    print(key, data[key])  # show each key with its value, in sorted key order
1 a
2 b
3 c
4 d
5 t
6 m
36 z
# sorted() on a dict returns a new sorted list of its keys; the dict is unchanged.
for key in sorted(data):
    print(key, data[key])
1 a
2 b
3 c
4 d
5 t
6 m
36 z
3. 元组排序
data = (1, 4, 5, 3, 8, 9, 2, 6, 8, 9, 30)
# Tuples have no sort() method, so copy the values into a list,
# sort that list in place, and build a new tuple from the result.
list_data = [*data]
list_data.sort()
new_tup = tuple(list_data)
print(new_tup)
(1, 2, 3, 4, 5, 6, 8, 8, 9, 9, 30)
# Shorter alternative: sorted() accepts any iterable and returns a new list,
# which is then converted back into a tuple.
ordered = sorted(data)
new_tup = tuple(ordered)
print(new_tup)
(1, 2, 3, 4, 5, 6, 8, 8, 9, 9, 30)
4. 根据DNA序列长度排序
data = ['ACCTGGCCA', 'ACTG', 'TACGGCAGGAGACG', 'TTGGATC']
# len already takes one argument and returns the length, so it can be passed
# as the sort key directly; wrapping it in a lambda adds nothing.
bylength = sorted(data, key=len)
print(bylength)
['ACTG', 'TTGGATC', 'ACCTGGCCA', 'TACGGCAGGAGACG']
5. 根据多列对制表符分隔的文件进行排序。
from operator import itemgetter
# Read a whitespace-separated numeric table and sort its rows on several
# columns at once. itemgetter(0, 1, ..., 6) returns a tuple of those fields,
# so rows are compared on column 0 first and ties fall through to the
# following columns in order.
table = []
with open("random_distribution.tsv") as in_file:  # closed automatically, even on error
    for line in in_file:
        columns = [float(x) for x in line.split()]
        if columns:  # skip blank lines; an empty row would break the key lookup
            table.append(columns)

sort_key = itemgetter(0, 1, 2, 3, 4, 5, 6)
table_sorted = sorted(table, key=sort_key, reverse=True)  # descending order
table_sorted_2 = sorted(table, key=sort_key)  # ascending order
print(table_sorted)
print(table_sorted_2)
5. 对Blast结果文件进行排序
Blast结果文件
from operator import itemgetter
# Sort the rows of a comma-separated BLAST result file by their third column,
# highest value first, and write them to a new file as tab-separated text.
table = []
with open("BlastOut.csv") as input_file:
    for line in input_file:
        # Drop the line terminator first; otherwise the last field keeps its
        # "\n" and every written row is followed by an empty line.
        line = line.rstrip("\n")
        if not line:
            continue  # ignore blank lines
        col = line.split(",")  # fields are separated by ","
        col[2] = float(col[2])  # third column as a float so it sorts numerically
        table.append(col)

# Descending sort on the third column. Per the original author's note, this
# column is the percent identity of each BLAST hit, so the sequences most
# similar to the query come first.
table_sorted = sorted(table, key=itemgetter(2), reverse=True)

# The output is opened only after the input parsed successfully, and both
# files are closed by their with-blocks even if an error occurs.
with open("BlastOutSorted.csv", "w") as output_file:
    for row in table_sorted:
        output_file.write("\t".join(str(x) for x in row) + "\n")
6. 对PDB文件进行排序
PDB文件
from operator import itemgetter
# Sort the rows of a comma-separated PDB report by its 4th column, then by its
# 5th column, and write the result as tab-separated text below the header.
table = []
with open("PDBhaemoglobinReport.csv") as input_file:
    header = input_file.readline()  # first line is kept aside as the header
    for line in input_file:
        # Strip the line terminator up front so the last field can be handled
        # like any other (and a final line without "\n" does not lose a digit).
        line = line.rstrip("\n")
        if not line:
            continue  # ignore blank lines
        col = line.split(",")
        # Both fields carry one wrapping character on each side
        # (presumably double quotes - verify against the report file).
        col[3] = float(col[3][1:-1])
        col[4] = int(col[4][1:-1])
        table.append(col)

# Tuple key: rows are ordered by column 4, ties are broken by column 5.
table_sorted = sorted(table, key=itemgetter(3, 4))

with open("PDBhaemoglobinSorted.csv", "w") as output_file:
    # The header line already ends with "\n"; appending "\t" to it (as the
    # original did) put a stray tab in front of the first data row.
    output_file.write(header)
    # In the original interactive session that write echoed "Out[92]: 53",
    # the number of characters written for header + '\t'.
    for row in table_sorted:
        output_file.write("\t".join(str(x) for x in row) + "\n")