使用python处理生物信息数据（七）

2020-03-09 本文已影响0人你猜我菜不菜

Python学习的第七天，主要学习数据排序。

1. 表格按指定列排序并写入文件。

from operator import itemgetter

# Load a whitespace-separated numeric table, sort its rows by a single column
# and print the sorted rows as tab-separated text.
table = []

# The context manager closes the file even if a value fails to parse.
with open("random_distribution.tsv") as handle:  # read the data
    for line in handle:
        fields = line.split()
        if not fields:
            continue  # a blank line has no column to sort on
        table.append([float(x) for x in fields])  # store every column as a float

# Zero-based index of the sort column: 1 selects the SECOND column.
columns = 1

# Ascending sort of the rows on the selected column.
table_sorted = sorted(table, key=itemgetter(columns))

for row in table_sorted:
    print("\t".join(str(x) for x in row))
    
6153.0  58.0    0.00942629611572        40.0    0.00650089387291        260.0   0.0422558101739
6101.0  64.0    0.0104900835929 41.0    0.00672020980167        299.0   0.0490083592854
6101.0  68.0    0.0111457138174 39.0    0.0063923946894 274.0   0.0449106703819
6066.0  70.0    0.0115397296406 35.0    0.00576986482031        260.0   0.0428618529509
6100.0  72.0    0.0118032786885 21.0    0.00344262295082        257.0   0.042131147541
6131.0  72.0    0.011743598108  36.0    0.00587179905399        260.0   0.0424074376121
6119.0  73.0    0.0119300539304 39.0    0.00637359045596        249.0   0.0406929236803
6107.0  73.0    0.0119534959882 68.0    0.0111347633863 276.0   0.0451940396267
6114.0  74.0    0.0121033693163 29.0    0.00474321229964        241.0   0.0394177298005
6138.0  74.0    0.0120560443141 44.0    0.00716845878136        274.0   0.0446399478658
6121.0  74.0    0.0120895278549 39.0    0.00637150792354        269.0   0.0439470674726
6101.0  75.0    0.0122930667104 36.0    0.00590067202098        279.0   0.0457302081626
6104.0  75.0    0.0122870249017 29.0    0.00475098296199        278.0   0.0455439056356
6142.0  76.0    0.0123738196027 37.0    0.00602409638554        252.0   0.041028980788
6184.0  76.0    0.0122897800776 40.0    0.00646830530401        250.0   0.0404269081501
6216.0  76.0    0.0122265122265 50.0    0.00804375804376        256.0   0.041184041184
6073.0  76.0    0.0125144080356 40.0    0.00658653054504        266.0   0.0438004281245
6111.0  77.0    0.0126002290951 31.0    0.00507281950581        280.0   0.0458190148912
6127.0  78.0    0.0127305369675 43.0    0.00701811653338        259.0   0.0422719112127
6113.0  79.0    0.0129232782594 22.0    0.00359888761655        290.0   0.0474398822182
.....
.......
.........

2. 字典排序

# Sample dictionary; note that the keys are deliberately not inserted in order.
data = {
    1: 'a',
    2: 'b',
    4: 'd',
    3: 'c',
    5: 't',
    6: 'm',
    36: 'z',
}

# Iterating a dict yields its keys, so unpacking it gives the key list
# in insertion order.
keys = [*data]

keys

Out[41]: [1, 2, 4, 3, 5, 6, 36]

# Reorder the existing key list in place (ascending).
keys[:] = sorted(keys)

# Walk the sorted keys and show each key next to its value.
for key in keys:
    value = data[key]
    print(key, value)
    
1 a
2 b
3 c
4 d
5 t
6 m
36 z

# sorted() leaves the dictionary untouched and returns a new ordered list.
# Keys are unique, so ordering the (key, value) pairs orders by key alone.
for key, value in sorted(data.items()):
    print(key, value)
    
1 a
2 b
3 c
4 d
5 t
6 m
36 z

3. 元组排序

# Tuples are immutable, so sort a mutable copy and convert it back.
data = (1, 4, 5, 3, 8, 9, 2, 6, 8, 9, 30)

list_data = [*data]        # mutable list copy of the tuple
list_data.sort()           # in-place ascending sort
new_tup = (*list_data,)    # back to a tuple

print(new_tup)
(1, 2, 3, 4, 5, 6, 8, 8, 9, 9, 30)

# Same result without the explicit list copy: sorted() accepts any iterable
# and returns a new sorted list, which is then frozen into a tuple.
ordered = sorted(data)
new_tup = tuple(ordered)

print(new_tup)
(1, 2, 3, 4, 5, 6, 8, 8, 9, 9, 30)

4. 根据DNA序列长度排序

# Order DNA sequences from shortest to longest.
data = ['ACCTGGCCA', 'ACTG', 'TACGGCAGGAGACG', 'TTGGATC']

# len is already a one-argument callable, so it can be the key directly;
# wrapping it in a lambda adds nothing.
bylength = sorted(data, key=len)

print(bylength)
['ACTG', 'TTGGATC', 'ACCTGGCCA', 'TACGGCAGGAGACG']

5. 根据多列对制表符分隔的文件进行排序。

from operator import itemgetter

# Load a whitespace-separated numeric table and sort whole rows on columns
# 0 through 6, once in descending and once in ascending order.
table = []

# The context manager closes the file even if a value fails to parse.
with open("random_distribution.tsv") as in_file:
    for line in in_file:
        columns = [float(x) for x in line.split()]
        if not columns:
            continue  # a blank line has no columns to sort on
        table.append(columns)

# itemgetter with several indices returns a tuple, so rows are compared
# column by column from left to right.
table_sorted = sorted(table, key=itemgetter(0, 1, 2, 3, 4, 5, 6), reverse=True)

table_sorted_2 = sorted(table, key=itemgetter(0, 1, 2, 3, 4, 5, 6))

print(table_sorted)
print(table_sorted_2)

5. 对Blast结果文件进行排序

Blast结果文件

from operator import itemgetter

# Sort the rows of a comma-separated BLAST result file on the third column
# (descending) and write them to a new file.
table = []

# Read and parse everything first; the output file is only created once the
# input has been parsed successfully.
with open("BlastOut.csv") as input_file:
    for line in input_file:
        if not line.strip():
            continue  # skip blank lines
        # Drop the line terminator before splitting; otherwise it stays
        # attached to the last field and every written row is followed by an
        # empty line.
        col = line.rstrip("\r\n").split(",")  # "," is the field separator
        col[2] = float(col[2])  # third column as a float so it sorts numerically
        table.append(col)

# Descending sort on the third column. According to the original author this
# column is the percent identity of the BLAST hit, so the sequences most
# similar to the query come first.
table_sorted = sorted(table, key=itemgetter(2), reverse=True)

# NOTE(review): rows are written tab-separated although the file name ends in
# ".csv" -- confirm which delimiter downstream tools expect.
with open("BlastOutSorted.csv", "w") as output_file:
    for row in table_sorted:
        output_file.write("\t".join(str(x) for x in row) + "\n")

6. 对PDB文件进行排序

PDB文件

from operator import itemgetter

# Sort the rows of a comma-separated PDB report on its fourth and fifth
# columns (ascending) and write them, with the header line, to a new file.
table = []

# Read and parse everything first; the output file is only created once the
# input has been parsed successfully.
with open("PDBhaemoglobinReport.csv") as input_file:
    header = input_file.readline()  # first line holds the column names

    for line in input_file:
        if not line.strip():
            continue  # skip blank lines
        col = line.rstrip("\r\n").split(",")
        # Presumably these two fields are wrapped in double quotes (the
        # original sliced off the first and last character) -- verify against
        # the input file. Stripping quotes explicitly also works when the
        # final line has no trailing newline.
        col[3] = float(col[3].strip().strip('"'))
        col[4] = int(col[4].strip().strip('"'))
        table.append(col)

# Ascending sort on the fourth column, ties broken by the fifth.
table_sorted = sorted(table, key=itemgetter(3, 4))

with open("PDBhaemoglobinSorted.csv", "w") as output_file:
    # Write the header with the same tab delimiter as the data rows and end
    # it with exactly one newline, so the first data row is not prefixed by a
    # stray tab.
    output_file.write("\t".join(header.rstrip("\r\n").split(",")) + "\n")

    for row in table_sorted:
        output_file.write("\t".join(str(x) for x in row) + "\n")

使用python处理生物信息数据（七）

1. 表格按指定列排序并写入文件。

2. 字典排序

3. 元组排序

4. 根据DNA序列长度排序

5. 根据两列对逗号分隔的文件进行排序。

5. 对Blast结果文件进行排序

6. 对PDB文件进行排序

猜你喜欢

热点阅读