注释和富集ATAC

生信 | 构建物种的OrgDb

2021-06-07  本文已影响0人  生信卷王

写在前面

1. 使用eggnog-mapper生成注释文件

1.1. 下载eggnog-mapper软件

git clone https://github.com/jhcepas/eggnog-mapper.git

1.2. 下载EggNOG数据库

选择最新的文件下载
gunzip eggnog.db.gz eggnog.proteins.dmnd.gz
解压

1.3. 将eggnog-mapper文件夹添加至环境变量【conda装的可忽略】

echo 'export PATH=/public/user/software/eggnog-mapper:$PATH' >> ~/.bashrc
source ~/.bashrc

1.4. 配置eggnog-mapper的运行环境(可省略)

conda create -n python3 python=3.7.8 -y
activate python3

1.5. 运行eggnog-mapper

注意:蛋白文件的中基因名必须要和你以后富集分析的基因名格式相同,如果不同,后面的操作都是徒劳,千万注意。

emapper.py -i mm39.pep.fa -o mm39 -d euk --cpu 10 --dbmem
Eukaryota Bacteria Bacteria Bacteria Archaea
Arthropoda Acidimicrobiia Oceanospirillales Gammaproteobacteria Archaeoglobi
Nematoda Acidithiobacillales Opitutae Gemmatimonadetes Crenarchaeota
Chordata Aeromonadales Pasteurellales Planctomycetes Halobacteria
Ascomycota Bacilli Rhodocyclales Tenericutes Methanobacteria
Basidiomycota Bacteroidetes Rhodospirillales Verrucomicrobia Methanococci
Bilateria Bdellovibrionales Rickettsiales Acidobacteria Methanomicrobia
Coccidia Caulobacterales Rubrobacteria Aquificae Thaumarchaeota
Aconoidasida Chlorobi Sphingomonadales Deferribacteres Thermococci
Peronosporales Chloroflexia Thermomicrobia Fusobacteria Thermoplasmata
Pythiales Chromatiales Thiotrichales Nitrospirae Euryarchaeota
Apicomplexa Clostridia Verrucomicrobiae Proteobacteria -
Bacillariophyta Coriobacteriia Vibrionales Spirochaetes -
Chlorophyta Cyanobacteria Xanthomonadales Synergistetes -
Ciliophora Dehalococcoidia Acidobacteriia Thermodesulfobacteria -
Fungi Erysipelotrichia Actinobacteria Thermotogae -
Kinetoplastida Hydrogenophilales Alphaproteobacteria - -
Metazoa Legionellales Betaproteobacteria - -
Streptophyta Methylococcales Chlamydiae - -
Amoebozoa Negativicutes Chloroflexi - -
Opisthokonta Neisseriales Deinococcus-Thermus - -
Viridiplantae Nitrosomonadales Firmicutes - -

1.6. 漫长的等待

1.7. 运行结果

结果文件

1.8. 处理数据

id=mm39  #刚才的前缀
sed '/^##/d' ${id}.emapper.annotations| sed 's/#//g'| awk -vFS="\t" -vOFS="\t" '{print $1,$9,$10,$12}' > ${id}.annotations

2. 构建OrgDb

2.1. 安装并导入所需R包

#install.packages("")
library(dplyr)
library(stringr)
library(jsonlite)
library(AnnotationForge)
#顺手设置一下options
options(stringsAsFactors = F)

2.2. 读入生成的annotations文件

emapper <- read.table("mm39.annotations", header=TRUE, sep = "\t",quote = "")
#将空值替换为NA,方便后续使用na.omit()函数提出没有注释到的行
emapper[emapper==""]<-NA

2.3. 提取GO信息

# library(dplyr)
gene_info <- emapper %>% dplyr::select(GID = query, GENENAME = Preferred_name) %>% na.omit()
gos <- emapper %>% dplyr::select(query, GOs) %>% na.omit()
gene2go = data.frame(GID = character(),
                     GO = character(),
                     EVIDENCE = character())
# library(stringr)
gos_list <- function(x){
  the_gos <- str_split(x[2], ",", simplify = FALSE)[[1]]
  df_temp <- data.frame(GID = rep(x[1], length(the_gos)),
                        GO = the_gos,
                        EVIDENCE = rep("IEA", length(the_gos)))
  return(df_temp)
}
gene2gol <- apply(as.matrix(gos),1,gos_list)
gene2gol_df <- do.call(rbind.data.frame, gene2gol)
gene2go <- gene2gol_df
gene2go$GO[gene2go$GO=="-"]<-NA
gene2go<-na.omit(gene2go)
#                                               query1  GO:XXXXXX1
#query1 GO:XXXXXX1,GO:XXXXXX2,GO:XXXXXX3   =>   query1  GO:XXXXXX2
#                                               query1  GO:XXXXXX3

2.4. 同理,提取KEGG信息

gene2ko <- emapper %>% dplyr::select(GID = query, Ko = KEGG_ko)
gene2ko$Ko[gene2ko$Ko=="-"]<-NA
gene2ko<-na.omit(gene2ko)
gene2kol <- apply(as.matrix(gene2ko),1,gos_list)
gene2kol_df <- do.call(rbind.data.frame, gene2kol)
gene2ko <- gene2kol_df[,1:2]
colnames(gene2ko) <- c("GID","Ko")
gene2ko$Ko <- gsub("ko:","",gene2ko$Ko)
# library(jsonlite)
# 下面的json = "ko00001.json",如果你下载到其他地方,记得加上路径
update_kegg <- function(json = "ko00001.json") {
  pathway2name <- tibble(Pathway = character(), Name = character())
  ko2pathway <- tibble(Ko = character(), Pathway = character())
  kegg <- fromJSON(json)
  for (a in seq_along(kegg[["children"]][["children"]])) {
    A <- kegg[["children"]][["name"]][[a]]
    for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
      B <- kegg[["children"]][["children"]][[a]][["name"]][[b]] 
      for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
        pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
        pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
        pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
        pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
        kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
        kos <- str_match(kos_info, "K[0-9]*")[,1]
        ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))}}}
  save(pathway2name, ko2pathway, file = "kegg_info.RData")}
# 调用函数后在本地创建kegg_info.RData文件,以后只需要载入 "kegg_info.RData"即可
update_kegg()
# 载入kegg_info.RData文件
load(file = "kegg_info.RData")
gene2pathway <- gene2ko %>% left_join(ko2pathway, by = "Ko") %>% dplyr::select(GID, Pathway) %>% na.omit()

2.5. 获取物种信息

tax_id = "10090"
genus = "Mus" 
species = "musculus"
# tax_id = "9606"
# genus = "Homo" 
# species = "sapiens"
# tax_id = "3702"
# genus = "Arabidopsis" 
# species = "thaliana"
# gene2go <- unique(gene2go)
# gene2go <- gene2go[!duplicated(gene2go),]
# gene2ko <- gene2ko[!duplicated(gene2ko),]
# gene2pathway <- gene2pathway[!duplicated(gene2pathway),]
# gene_info <- gene_info[!duplicated(gene_info),]

2.6. 构建OrgDb

makeOrgPackage(gene_info=gene_info,
               go=gene2go,
               ko=gene2ko,
               pathway=gene2pathway,
               version="1.34.1",  #版本,使用?makeOrgPackage,拉到最下面查看
               maintainer = "your name <swuthor@163.com>",  #修改为你的名字和邮箱
               author = "your name <swuthor@163.com>",  #修改为你的名字和邮箱
               outputDir = ".",  #输出文件位置
               tax_id=tax_id,  #你在NCBI上查并定义的id
               genus=genus,  #你在NCBI上查并定义的属名
               species=species,  #你在NCBI上查并定义的种名
               goTable="go")

3. 导入OrgDb库

install.packages("./org.Mmusculus.eg.db", repos=NULL)
library(org.Mmusculus.eg.db)
# 查看所有列的信息
columns(org.Mmusculus.eg.db)
# 查看所有基因
keys(org.Mmusculus.eg.db)
# 查看特定基因的信息
# library(dplyr)
select(org.Mmusculus.eg.db, keys = "CW07G09620", columns = c("GO"))

4. 导出背景文件

# 做一些常规格式化
pathway2name$Name <- gsub(" \\[BR:ko[0-9]{5}\\]", "",pathway2name$Name)
pathway2name<- na.omit(pathway2name)
pathway2gene <-gene2pathway[, c("Pathway","GID")]
# 输出
write.table(pathway2name,file = "./pathway2name", sep = "\t", quote = F, row.names = F)
write.table(pathway2gene,file = "./pathway2gene", sep = "\t", quote = F, row.names = F)

5. 使用clusterProfiler富集分析

5.1 KEGG富集分析(不需要OrgDB)

library(clusterProfiler)
# 每次只需导入下面两个文件即可
pathway2gene <- read.table("./pathway2gene",header = T,sep = "\t")
pathway2name <- read.table("./pathway2name",header = T,sep = "\t")
# 导入你鉴定到的差异基因列表,并转化为向量
gene <- read.csv("/root/total_diff_gene.csv")
gene_list <- gene[,1]
# KEGG pathway 富集
ekp <- enricher(gene_list, 
                TERM2GENE = pathway2gene, 
                TERM2NAME = pathway2name, 
                pvalueCutoff = 1,  # 表示全部保留,可以设为0.05作为阈值
                qvalueCutoff = 1, # 表示全部保留,可以设为0.05作为阈值
                pAdjustMethod = "BH",
                minGSSize = 1)
dotplot(ekp)

5.2 GO富集分析(需要OrgDB)

library(org.Mmusculus.eg.db)
ego <- enrichGO(gene=gene_list,
                OrgDb=org.Mmusculus.eg.db,
                keyType="GID",
                ont="ALL",   #CC/BP/MF可选
                qvalueCutoff = 0.05,
                pvalueCutoff =0.05)
dotplot(ego)
# 无论是ekp还是ego都可以选择将数据导出,然后可以自己使用ggplot画
ego_results<-as.data.frame(ego)
write.table(ego_results, file = "./ego_results.txt", quote = F)
image.png
上一篇下一篇

猜你喜欢

热点阅读