RNA-seq重点关注我自己的生信百宝箱

Python根据注释表实现基于超几何分布的Go/KEGG, en

2022-01-24  本文已影响0人  瓶瓶瓶平平
9d2cb7c78bc70d9c553732f167992b0.jpg
拿着一个DEGs的list和基因functional annotation竟然没有找到简单的脚本一健生成Go或者Kegg的富集结果,啊这!(可能是我懒得找吧)
有需求就自己写一个好了,基于超几何分布和 Benjamini - Hochberg的P值矫正方法。
有关数学得详情点击这个https://guangchuangyu.github.io/cn/2012/04/enrichment-analysis/

需要原料:

1642996388(1).jpg

这个类似这样功能注释的格式的表。


1642996467(1).jpg

DEGs的list
依赖包:

import sys
from scipy import stats
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
statsr = importr('stats')

主要代码:
两个函数

def goread(goingo):#Go counts
    totalgenenum = 0
    filego = open(goingo,"r")
    linessgo = filego.readlines()
    linessgo = list(linessgo)
    dicgonum = {}
    dicgenegolist = {}
    for i in linessgo:
        totalgenenum +=1
        i = i.strip()
        i =i.split()
        #print(i)
        dicgenegolist[i[0]] = []
        try:
            for j in i[1].split(";"):
                if j.find("GO") != -1: #only for Go
                    j = j.strip()
                    dicgenegolist[i[0]].append(j)
                    if j not in list(dicgonum.keys()):
                        dicgonum[j] = 1
                    else:
                        dicgonum[j] +=1
        except:
            continue
    print(dicgenegolist)                
    return (dicgonum,dicgenegolist,totalgenenum)
                    
                    
                    
def enrichment(m,n,a,b):#enrichment test adjustment 
    pvalue = stats.hypergeom.sf(m,n,a,b)
    return pvalue

输入输出统计啥的:

goin = sys.argv[1]
goingo = sys.argv[2]
goout = sys.argv[3]
gogoout = open(goout,"w")
file2 = open(goin,"r")
liness = file2.readlines()
liness = list(liness)

dicgonum, dicgenegolist, totalgenenum  = goread(goingo)
pvaluelist = []
mapnum =0
dicquerygonum = {}
samplenum = 0
print(dicgenegolist)
dicquerylist =[]

for i in liness:
    i = i.strip()
    samplenum +=1
    try:
        
        for j in dicgenegolist[i]:
            if j not in dicquerylist:
                dicquerygonum[j] = 1
                dicquerylist.append(j)
            else:
                dicquerygonum[j] += 1
        mapnum +=1
    except:
        continue
        
if mapnum == 0:
    print("No gene mapped, check you data!")
else:
    print(mapnum,"genes mapped")
    
for i in dicquerygonum.keys():
    m = dicquerygonum[i]-1
    n = totalgenenum
    a = dicgonum[i]
    b = samplenum
    pvalue = stats.hypergeom.sf(m,n,a,b)
    pvaluelist.append(pvalue)

P值矫正(BH)

p_adjust = statsr.p_adjust(FloatVector(pvaluelist), method = 'BH') #P value adjustment

输出:

diclist = list(dicquerygonum.keys())
for i in range(len(list(dicquerygonum.keys()))):
    name = diclist[i]
    oa = dicquerygonum[diclist[i]]
    ob = samplenum
    oc = dicgonum[diclist[i]]
    od = totalgenenum-dicgonum[diclist[i]]
    op = pvaluelist[i]
    opadj  = p_adjust[i]
    print(name,oa,ob,oc,od,op,opadj,file = gogoout) 
    
gogoout.close()

输出样品:

1642996800(1).png
完整版看看Github:https://github.com/lifangpings/enrichmentgo.py
上一篇下一篇

猜你喜欢

热点阅读