KEGG分析

2022-02-02  本文已影响0人  余绕
library('R2HTML')
Rice KEGG 分析
#读取背景基因(所有注释到KEGG的基因)和差异基因列表,以K号表示

allgene=read.delim('Rice_all_gene_KOs.txt',header=F,sep="\t",stringsAsFactors = F)#读取水稻所有pathways

head(allgene) #展示allgene数据
##       V1
## 1 K02989
## 2 K02989
## 3 K01634
## 4 K09880
## 5 K13151
## 6 K13151
deggene=read.delim('Degs_KOs.txt',header=F,sep="\t",stringsAsFactors = F) #取差异基因的pathway信息

head(deggene) #展示deggene数据
##       V1
## 1 K02989
## 2 K01595
## 3 K15414
## 4 K11252
## 5 K07874
## 6 K09489
pathinfo=read.delim('kegg.pathway.id',header=T,sep="\t",stringsAsFactors = F) #取所有的pathway信息

head(pathinfo) #展示pathinfo数据
##     KOID                       PATHWAY     PID Category
## 1 K00844 glycolysis / Gluconeogenesis  ko00010     PATH
## 2 K12407 glycolysis / Gluconeogenesis  ko00010     PATH
## 3 K00845 glycolysis / Gluconeogenesis  ko00010     PATH
## 4 K01810 glycolysis / Gluconeogenesis  ko00010     PATH
## 5 K06859 glycolysis / Gluconeogenesis  ko00010     PATH
## 6 K13810 glycolysis / Gluconeogenesis  ko00010     PATH
#读取背景基因(所有注释到KEGG的基因)和差异基因列表,以K号表示

pathway=unique(pathinfo[,c('PID','PATHWAY')]) #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目

head(pathway) #展示pathway数据
##         PID                                   PATHWAY
## 1   ko00010             glycolysis / Gluconeogenesis 
## 105 ko00020                citrate cycle (TCA cycle) 
## 164 ko00030                pentose phosphate pathway 
## 247 ko00040 pentose and glucuronate interconversions 
## 326 ko00051          fructose and mannose metabolism 
## 435 ko00052                     galactose metabolism
allgene=unique(allgene$V1)   #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目

head(allgene) #展示allgene数据
## [1] "K02989" "K01634" "K09880" "K13151" "K01114" "K08066"
deggene=unique(deggene$V1)   #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目

head(deggene)  #展示deggene数据
## [1] "K02989" "K01595" "K15414" "K11252" "K07874" "K09489"
#==============设定输出的文件名称============

outfile="kegg.enrichment"
#=======将基因与Pathway中有注释的基因取交集========

deginfo=pathinfo[pathinfo[,1]%in%deggene,] #相当于merge

head(deginfo)#展示deginfo数据
##      KOID                       PATHWAY     PID Category
## 1  K00844 glycolysis / Gluconeogenesis  ko00010     PATH
## 4  K01810 glycolysis / Gluconeogenesis  ko00010     PATH
## 8  K00850 glycolysis / Gluconeogenesis  ko00010     PATH
## 12 K00895 glycolysis / Gluconeogenesis  ko00010     PATH
## 18 K01623 glycolysis / Gluconeogenesis  ko00010     PATH
## 24 K01803 glycolysis / Gluconeogenesis  ko00010     PATH
allinfo=pathinfo[pathinfo[,1]%in%allgene,]  #相当于merge

head(allinfo) #展示allinfo数据
##      KOID                       PATHWAY     PID Category
## 1  K00844 glycolysis / Gluconeogenesis  ko00010     PATH
## 4  K01810 glycolysis / Gluconeogenesis  ko00010     PATH
## 8  K00850 glycolysis / Gluconeogenesis  ko00010     PATH
## 12 K00895 glycolysis / Gluconeogenesis  ko00010     PATH
## 13 K03841 glycolysis / Gluconeogenesis  ko00010     PATH
## 18 K01623 glycolysis / Gluconeogenesis  ko00010     PATH
k=length(unique(deginfo[,1]))  #差异基因总数,length函数求基因数目

k  #展示K数据
## [1] 353
N=length(unique(allinfo[,1]))  #总基因数   #length函数求基因数目

N  #展示N数据
## [1] 2012
#=========对每个通路基于超几何分布计算富集p值=======

p<-data.frame() #新建data frame用于储存KO数目


urlColor<-NULL # used to color the genes in pathway
geneLink<-NULL # used to set hyperlink to DEGs in pathway

#通过循环计算每个KEGGpathways上的Ko号

for (i in seq(1,nrow(pathway))) {
 
  pid=pathway[i,'PID'] #获取Pathway的KO号码,例如:pathway[1,'PID']=》 "ko00010";pathway[1,'PID']=》"ko00020"等
  
  allInPath=allinfo[allinfo[,'PID']%in%pid,] #获得每一个pathway所有的KO数目,具体见备注
  
  m=length(unique(allInPath[,1]))  #计算i Pathway基因总数
  
  degInPath=deginfo[deginfo[,'PID']%in%pid,] #获得每一个pathway所有的KO数目
  
  x=length(unique(degInPath[,1]))  #计算i Pathway中差异基因数
  
  p[i,1]=x #数据框p的第i行,第一列值
  p[i,2]=m #数据框p的第i行,第二列值
  p[i,3]=phyper(x-1,m,N-m,k,lower.tail=FALSE)  #lower.tail=FALSE,计算的是P[X>(x-1)]的概率,其中x-1:某一pathway中差异基因数目(或KO数),m:背景数(或KO数),N:所有基因数(或KO数),K:所有差异数(或KO数)
  
  urlColor[i]=apply(as.matrix(paste("/",t(degInPath[,1]),'%09red',sep="")),2,paste,collapse="")
  Link=paste('<a href=', shQuote(paste('https://www.kegg.jp/dbget-bin/www_bget?ko:',degInPath[,1],sep="")),'/','>', degInPath[,1],'</a>',sep="")
  if(x==0){
    geneLink[i]="--"
  }else{
    geneLink[i]=apply(as.matrix(Link),2,paste,collapse=", ")
  }
}
#============计算FDR,整理结果==============

fdr=p.adjust(p[,3],method="BH") #利用第三列的p值计算FDR值,!!!可以借鉴!!!!

output=cbind(pathway,p,fdr)#合并数据库框,产生新的数据框,因为for循环按照pathway内容从上到下依次运行的,所以顺序是相同的,可以进行列合并

head(output)#展示数据内容
##         PID                                   PATHWAY V1 V2           V3
## 1   ko00010             glycolysis / Gluconeogenesis  21 34 7.896160e-09
## 105 ko00020                citrate cycle (TCA cycle)  16 20 1.469740e-09
## 164 ko00030                pentose phosphate pathway  10 17 1.484026e-04
## 247 ko00040 pentose and glucuronate interconversions   6 14 2.381463e-02
## 326 ko00051          fructose and mannose metabolism   7 18 2.628347e-02
## 435 ko00052                     galactose metabolism   7 15 8.569568e-03
##              fdr
## 1   5.152244e-07
## 105 1.278674e-07
## 164 5.533295e-03
## 247 2.702443e-01
## 326 2.840186e-01
## 435 1.574602e-01
colnames(output)=c('ID','Pathway','DEGsInPathway','GenesInPathway','Pvalue','FDR') #重新给数据框命名

ind=order(output[,"Pvalue"])#按照P值从小到大对结果排序
output=output[ind,]       #用于输出到表格文件
urlColor=urlColor[ind]
DEGs=geneLink[ind]

output2=cbind(output,DEGs) #用于输出到HTML文件
#==============导出到表格文件================

write.table(output,file=paste(outfile,".xls",sep=""),quote = F,row.names = F,col.names = T,sep="\t")
#==============导出结果到HTML文件============

#==============导出结果到HTML文件============
urlTitles <- output[,'ID']
url=paste('https://www.genome.jp/kegg-bin/show_pathway?',gsub("ko", "map", urlTitles),urlColor,sep="")

htmlOUT <- transform(output2, ID = paste('<a href = ', shQuote(url),'/','>', urlTitles, '</a>'))

target <- HTMLInitFile(Title = "KEGG Pathway Ernichment Results",outdir=getwd(),filename=outfile, BackGroundColor="#f8f8f8",useGrid = T,useLaTeX = F,HTMLframe = F)
write("<style>table{border-collapse:collapse;width:1280px;}a:link,a:visited {color: #3366FF;text-decoration: none; }a:hover,a:active {color: #ff3366;text-decoration: none; };</style>",target,append = T)
HTML(as.title("KEGG Pathway Ernichment Results"),file=target)
HTML(htmlOUT,file=target,innerBorder = 1,row.names = F,digits=3)
HTMLEndFile()
#============================================
#——————备注——————-

#------------------备注-------------------
pid=pathway[2,'PID']#获得第二个pathway

pid #pid内容
## [1] "ko00020"
allInPath=allinfo[allinfo[,'PID']%in%pid,]#计算第二个pathway中所有背景Ko号

allInPath #展示allInPath内容
##       KOID                    PATHWAY     PID Category
## 105 K01647 citrate cycle (TCA cycle)  ko00020     PATH
## 106 K01648 citrate cycle (TCA cycle)  ko00020     PATH
## 110 K01681 citrate cycle (TCA cycle)  ko00020     PATH
## 112 K00031 citrate cycle (TCA cycle)  ko00020     PATH
## 113 K00030 citrate cycle (TCA cycle)  ko00020     PATH
## 115 K00164 citrate cycle (TCA cycle)  ko00020     PATH
## 116 K00658 citrate cycle (TCA cycle)  ko00020     PATH
## 117 K00382 citrate cycle (TCA cycle)  ko00020     PATH
## 124 K01899 citrate cycle (TCA cycle)  ko00020     PATH
## 125 K01900 citrate cycle (TCA cycle)  ko00020     PATH
## 127 K00234 citrate cycle (TCA cycle)  ko00020     PATH
## 128 K00235 citrate cycle (TCA cycle)  ko00020     PATH
## 129 K00236 citrate cycle (TCA cycle)  ko00020     PATH
## 144 K01679 citrate cycle (TCA cycle)  ko00020     PATH
## 145 K00025 citrate cycle (TCA cycle)  ko00020     PATH
## 146 K00026 citrate cycle (TCA cycle)  ko00020     PATH
## 152 K01610 citrate cycle (TCA cycle)  ko00020     PATH
## 155 K00161 citrate cycle (TCA cycle)  ko00020     PATH
## 156 K00162 citrate cycle (TCA cycle)  ko00020     PATH
## 157 K00627 citrate cycle (TCA cycle)  ko00020     PATH

参考:

Beijing PlantTech Technologies
Email:tech@planttech.com.cn
上一篇 下一篇

猜你喜欢

热点阅读