python统计hg38的A,T,C,G碱基含量(第二题)

2020-02-18  本文已影响0人  多啦A梦詹

生信技能树第二题

import sys
import os

# Symbols that are tallied for every sequence; anything else is ignored.
bases = ["A", "T", "C", "G", "N"]


def count_bases(lines):
    """Count A/T/C/G/N occurrences per FASTA record.

    Args:
        lines: Iterable of text lines in FASTA format (for example an open
            file object).

    Returns:
        A dict mapping each sequence ID (the header text between ">" and the
        first space) to a dict of ``{base: count}`` for every entry in
        ``bases``.  Records appear in file order; a repeated ID restarts its
        counts, as the last record wins.

    Raises:
        ValueError: If non-blank sequence data appears before any header.
    """
    counts = {}
    current = None
    for line in lines:
        if line.startswith(">"):
            # Drop the line ending first so that a header without a
            # description (">chr1") does not keep "\n" inside its ID.
            chr_id = line[1:].rstrip("\r\n").split(" ")[0]
            current = counts[chr_id] = dict.fromkeys(bases, 0)
        elif current is None:
            if line.strip():
                raise ValueError(
                    "sequence data found before the first FASTA header"
                )
        else:
            # Soft-masked (lower-case) bases count the same as upper-case.
            line = line.upper()
            for base in bases:
                current[base] += line.count(base)
    return counts


def format_report(chr_id, counts):
    """Build the printable summary for one sequence.

    Args:
        chr_id: Sequence ID used as the first line of the report.
        counts: ``{base: count}`` dict containing every entry in ``bases``.

    Returns:
        A multi-line string: the ID, one ``BASE: count<TAB>percent%`` line per
        base, a ``GC`` line, and the total number of counted bases.
        Percentages are rounded to two decimals; an empty record reports
        0.00% instead of dividing by zero.
    """
    total = sum(counts.values())
    gc = counts["C"] + counts["G"]

    def percent(n):
        return n * 100 / total if total else 0.0

    report = [chr_id]
    for base in bases:
        # "%.2f" rounds the number; the previous "%.4s" cut the *string*
        # form after four characters and mangled values such as 1e-05.
        report.append(
            "%s: %s\t%.2f%%" % (base, counts[base], percent(counts[base]))
        )
    report.append("GC: %s\t%.2f%%" % (gc, percent(gc)))
    report.append("%s \n" % total)
    return "\n".join(report)


def main():
    """Count bases in ``genome.fa`` and print one report per sequence.

    Returns:
        The per-sequence counts produced by ``count_bases``.
    """
    # Raw string: the backslashes are literal path separators, not escapes.
    # (An alternative data directory used previously was D:\python.)
    os.chdir(r"G:\R\Genome\hsa")

    # Only the counting needs the file; close it before reporting.
    with open("genome.fa", "rt") as f:
        counts = count_bases(f)

    for chr_id, per_base in counts.items():
        print(format_report(chr_id, per_base))
    return counts


if __name__ == "__main__":
    hash1 = main()
1
A: 67070277 26.9%
T: 67244164 27.0%
C: 48055043 19.3%
G: 48111528 19.3%
N: 18475410 7.42%
GC: 96166571    38.6%
248956422 

10
A: 38875926 29.0%
T: 39027555 29.1%
C: 27639505 20.6%
G: 27719976 20.7%
N: 534460   0.39%
GC: 55359481    41.3%
133797422 

11
A: 39286730 29.0%
T: 39361954 29.1%
C: 27903257 20.6%
G: 27981801 20.7%
N: 552880   0.40%
GC: 55885058    41.3%
135086622 

12
A: 39370109 29.5%
T: 39492225 29.6%
C: 27092804 20.3%
G: 27182678 20.3%
N: 137493   0.10%
GC: 54275482    40.7%
133275309 

13
A: 30047611 26.2%
T: 30162717 26.3%
C: 18839192 16.4%
G: 18933605 16.5%
N: 16381203 14.3%
GC: 37772797    33.0%
114364328 

14
A: 26673415 24.9%
T: 26911943 25.1%
C: 18423758 17.2%
G: 18559033 17.3%
N: 16475569 15.3%
GC: 36982791    34.5%
107043718 

15
A: 24508669 24.0%
T: 24553812 24.0%
C: 17752941 17.4%
G: 17825903 17.4%
N: 17349864 17.0%
GC: 35578844    34.8%
101991189 

16
A: 22558319 24.9%
T: 22774906 25.2%
C: 18172742 20.1%
G: 18299976 20.2%
N: 8532402  9.44%
GC: 36472718    40.3%
90338345 

17
A: 22639499 27.1%
T: 22705261 27.2%
C: 18723944 22.4%
G: 18851500 22.6%
N: 337237   0.40%
GC: 37575444    45.1%
83257441 

18
A: 24050680 29.9%
T: 24182819 30.0%
C: 15794455 19.6%
G: 16061651 19.9%
N: 283680   0.35%
GC: 31856106    39.6%
80373285 

19
A: 15142293 25.8%
T: 15282753 26.0%
C: 13954580 23.8%
G: 14061132 23.9%
N: 176858   0.30%
GC: 28015712    47.7%
58617616 

2
A: 71791213 29.6%
T: 71987932 29.7%
C: 48318180 19.9%
G: 48450903 20.0%
N: 1645301  0.67%
GC: 96769083    39.9%
242193529 

20
A: 17867246 27.7%
T: 18066406 28.0%
C: 13916133 21.5%
G: 14094472 21.8%
N: 499910   0.77%
GC: 28010605    43.4%
64444167 

21
A: 11820664 25.3%
T: 11856330 25.3%
C: 8185244  17.5%
G: 8226381  17.6%
N: 6621364  14.1%
GC: 16411625    35.1%
46709983 

22
A: 10382214 20.4%
T: 10370725 20.4%
C: 9160652  18.0%
G: 9246186  18.1%
N: 11658691 22.9%
GC: 18406838    36.2%
50818468 

3
A: 59689091 30.1%
T: 59833302 30.1%
C: 39233483 19.7%
G: 39344259 19.8%
N: 195424   0.09%
GC: 78577742    39.6%
198295559 

4
A: 58561236 30.7%
T: 58623430 30.8%
C: 36236976 19.0%
G: 36331025 19.1%
N: 461888   0.24%
GC: 72568001    38.1%
190214555 

5
A: 54699094 30.1%
T: 54955010 30.2%
C: 35731600 19.6%
G: 35879674 19.7%
N: 272881   0.15%
GC: 71611274    39.4%
181538259 

6
A: 51345477 30.0%
T: 51373025 30.0%
C: 33646690 19.6%
G: 33713330 19.7%
N: 727457   0.42%
GC: 67360020    39.4%
170805979 

7
A: 47058248 29.5%
T: 47215040 29.6%
C: 32317984 20.2%
G: 32378859 20.3%
N: 375842   0.23%
GC: 64696843    40.6%
159345973 

8
A: 43333530 29.8%
T: 43300646 29.8%
C: 29030173 20.0%
G: 29103787 20.0%
N: 370500   0.25%
GC: 58133960    40.0%
145138636 

9
A: 35736329 25.8%
T: 35783748 25.8%
C: 25099811 18.1%
G: 25170662 18.1%
N: 16604167 11.9%
GC: 50270473    36.3%
138394717 

MT
A: 5124 30.9%
T: 4094 24.7%
C: 5181 31.2%
G: 2169 13.0%
N: 1    0.00%
GC: 7350    44.3%
16569 

X
A: 46754807 29.9%
T: 46916701 30.0%
C: 30523780 19.5%
G: 30697741 19.6%
N: 1147866  0.73%
GC: 61221521    39.2%
156040895 

Y
A: 7155845  12.5%
T: 7217789  12.6%
C: 4632232  8.09%
G: 4630489  8.09%
N: 33591060 58.6%
GC: 9262721 16.1%
57227415 
上一篇下一篇

猜你喜欢

热点阅读