python统计hg38的A,T,C,G碱基含量(第二题)
2020-02-18 本文已影响0人
多啦A梦詹
生信技能树第二题
import os
import sys

# Directory holding the genome FASTA, and the FASTA file name inside it.
# Raw string: the backslashes are Windows path separators, not escapes.
GENOME_DIR = r"G:\R\Genome\hsa"
GENOME_FILE = "genome.fa"

# Symbols tallied for every sequence, in the order they are reported.
bases = ["A", "T", "C", "G", "N"]
# Per-sequence tallies filled in by main(): {sequence id: {base: count}}.
hash1 = {}


def count_bases(lines):
    """Tally the symbols in ``bases`` for each record of a FASTA stream.

    Args:
        lines: Iterable of text lines (an open file works). A line starting
            with ">" opens a new record; every other non-blank line is
            treated as sequence data of the current record.

    Returns:
        dict mapping each record id to a ``{base: count}`` dict, in the
        order the records appear. The id is the first whitespace-delimited
        token after ">" (an empty string if the header has none). A repeated
        id restarts that record's counts from zero.

    Raises:
        ValueError: if sequence data appears before the first header.
    """
    counts = {}
    current = None
    for line in lines:
        if line.startswith(">"):
            # split() with no argument also drops the trailing newline, so a
            # header without a description (">MT\n") yields "MT", not "MT\n".
            fields = line[1:].split()
            seq_id = fields[0] if fields else ""
            current = counts[seq_id] = dict.fromkeys(bases, 0)
        elif line.strip():
            if current is None:
                raise ValueError(
                    "sequence data found before the first FASTA header")
            # Upper-case first so lower-case letters are counted as well.
            line = line.upper()
            for base in bases:
                current[base] += line.count(base)
    return counts


def format_percent(count, total):
    """Return ``count`` as a percentage of ``total``, cut to 4 characters.

    The value is rendered in fixed-point notation before being truncated
    (not rounded) to its first four characters, so very small shares print
    as "0.00" rather than as the leading digits of a scientific-notation
    float. A ``total`` of zero yields "0.00" instead of dividing by zero.
    """
    pct = count * 100 / total if total else 0.0
    return "%.4s" % ("%f" % pct)


def format_report(seq_id, counts):
    """Build the text report for one sequence.

    Args:
        seq_id: Record id, printed on the first line.
        counts: ``{base: count}`` dict holding every symbol in ``bases``.

    Returns:
        A string with one line for the id, one "<base>: <count>\\t<pct>%"
        line per symbol in ``bases``, a "GC: ..." line for C+G combined,
        and the total number of counted symbols.
    """
    # Sum over this record's own counts, not over the whole table.
    total = sum(counts.values())
    gc = counts["C"] + counts["G"]
    rows = [seq_id]
    for base in bases:
        rows.append("%s: %s\t%s%%" % (
            base, counts[base], format_percent(counts[base], total)))
    rows.append("GC: %s\t%s%%" % (gc, format_percent(gc, total)))
    # Trailing space and newline reproduce the original `print(SUM, "\n")`,
    # which leaves a blank line between consecutive reports.
    rows.append("%s \n" % total)
    return "\n".join(rows)


def main():
    """Count bases in GENOME_DIR/GENOME_FILE and print one report per record."""
    os.chdir(GENOME_DIR)
    with open(GENOME_FILE, "rt") as f:
        hash1.update(count_bases(f))
    for seq_id, counts in hash1.items():
        print(format_report(seq_id, counts))


if __name__ == "__main__":
    main()
1
A: 67070277 26.9%
T: 67244164 27.0%
C: 48055043 19.3%
G: 48111528 19.3%
N: 18475410 7.42%
GC: 96166571 38.6%
248956422
10
A: 38875926 29.0%
T: 39027555 29.1%
C: 27639505 20.6%
G: 27719976 20.7%
N: 534460 0.39%
GC: 55359481 41.3%
133797422
11
A: 39286730 29.0%
T: 39361954 29.1%
C: 27903257 20.6%
G: 27981801 20.7%
N: 552880 0.40%
GC: 55885058 41.3%
135086622
12
A: 39370109 29.5%
T: 39492225 29.6%
C: 27092804 20.3%
G: 27182678 20.3%
N: 137493 0.10%
GC: 54275482 40.7%
133275309
13
A: 30047611 26.2%
T: 30162717 26.3%
C: 18839192 16.4%
G: 18933605 16.5%
N: 16381203 14.3%
GC: 37772797 33.0%
114364328
14
A: 26673415 24.9%
T: 26911943 25.1%
C: 18423758 17.2%
G: 18559033 17.3%
N: 16475569 15.3%
GC: 36982791 34.5%
107043718
15
A: 24508669 24.0%
T: 24553812 24.0%
C: 17752941 17.4%
G: 17825903 17.4%
N: 17349864 17.0%
GC: 35578844 34.8%
101991189
16
A: 22558319 24.9%
T: 22774906 25.2%
C: 18172742 20.1%
G: 18299976 20.2%
N: 8532402 9.44%
GC: 36472718 40.3%
90338345
17
A: 22639499 27.1%
T: 22705261 27.2%
C: 18723944 22.4%
G: 18851500 22.6%
N: 337237 0.40%
GC: 37575444 45.1%
83257441
18
A: 24050680 29.9%
T: 24182819 30.0%
C: 15794455 19.6%
G: 16061651 19.9%
N: 283680 0.35%
GC: 31856106 39.6%
80373285
19
A: 15142293 25.8%
T: 15282753 26.0%
C: 13954580 23.8%
G: 14061132 23.9%
N: 176858 0.30%
GC: 28015712 47.7%
58617616
2
A: 71791213 29.6%
T: 71987932 29.7%
C: 48318180 19.9%
G: 48450903 20.0%
N: 1645301 0.67%
GC: 96769083 39.9%
242193529
20
A: 17867246 27.7%
T: 18066406 28.0%
C: 13916133 21.5%
G: 14094472 21.8%
N: 499910 0.77%
GC: 28010605 43.4%
64444167
21
A: 11820664 25.3%
T: 11856330 25.3%
C: 8185244 17.5%
G: 8226381 17.6%
N: 6621364 14.1%
GC: 16411625 35.1%
46709983
22
A: 10382214 20.4%
T: 10370725 20.4%
C: 9160652 18.0%
G: 9246186 18.1%
N: 11658691 22.9%
GC: 18406838 36.2%
50818468
3
A: 59689091 30.1%
T: 59833302 30.1%
C: 39233483 19.7%
G: 39344259 19.8%
N: 195424 0.09%
GC: 78577742 39.6%
198295559
4
A: 58561236 30.7%
T: 58623430 30.8%
C: 36236976 19.0%
G: 36331025 19.1%
N: 461888 0.24%
GC: 72568001 38.1%
190214555
5
A: 54699094 30.1%
T: 54955010 30.2%
C: 35731600 19.6%
G: 35879674 19.7%
N: 272881 0.15%
GC: 71611274 39.4%
181538259
6
A: 51345477 30.0%
T: 51373025 30.0%
C: 33646690 19.6%
G: 33713330 19.7%
N: 727457 0.42%
GC: 67360020 39.4%
170805979
7
A: 47058248 29.5%
T: 47215040 29.6%
C: 32317984 20.2%
G: 32378859 20.3%
N: 375842 0.23%
GC: 64696843 40.6%
159345973
8
A: 43333530 29.8%
T: 43300646 29.8%
C: 29030173 20.0%
G: 29103787 20.0%
N: 370500 0.25%
GC: 58133960 40.0%
145138636
9
A: 35736329 25.8%
T: 35783748 25.8%
C: 25099811 18.1%
G: 25170662 18.1%
N: 16604167 11.9%
GC: 50270473 36.3%
138394717
MT
A: 5124 30.9%
T: 4094 24.7%
C: 5181 31.2%
G: 2169 13.0%
N: 1 0.00%
GC: 7350 44.3%
16569
X
A: 46754807 29.9%
T: 46916701 30.0%
C: 30523780 19.5%
G: 30697741 19.6%
N: 1147866 0.73%
GC: 61221521 39.2%
156040895
Y
A: 7155845 12.5%
T: 7217789 12.6%
C: 4632232 8.09%
G: 4630489 8.09%
N: 33591060 58.6%
GC: 9262721 16.1%
57227415