转录组下游分析

2022-11-08  本文已影响0人  略略咯

(linux)上游分析得到count矩阵,进行下游分析(R)
流程:各组count合并矩阵--去除Ensemble ID版本号(.)--去重过滤--ID转换(1、AnnoProbe包annoGene函数转换 2clusterProfiler包bitr函数转换)--

# 清空环境
rm(list = ls())
## 报错设置为英文
Sys.setenv(LANGUAGE = "en")

######2.1. 软件准备
if (!requireNamespace("BiocManager", quietly = TRUE))install.packages("BiocManager")
if (!requireNamespace("stringr", quietly = TRUE))install.packages("stringr")
if (!requireNamespace("biomaRt", quietly = TRUE))install.packages("biomaRt")
if (!requireNamespace("filelock", quietly = TRUE))install.packages("filelock")
if (!requireNamespace("AnnotationDbi", quietly = TRUE))install.packages("AnnotationDbi")
if (!requireNamespace("clusterProfiler", quietly = TRUE))install.packages("clusterProfiler")
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE))install.packages("org.Hs.eg.db")
if (!requireNamespace("AnnoProbe", quietly = TRUE))install.packages("AnnoProbe")

######2.2.多个count矩阵的合并(merge)
setwd("D:\\fsdownload\\学校服务器\\psj\\SW620 LOVO 测序数据\\SW620\\bam文件")
v1<- read.table("SW620-V-1.txt",header = TRUE,sep = "\t")
v2<- read.table("SW620-V-2.txt",header = TRUE,sep = "\t")
v3<- read.table("SW620-V-3.txt",header = TRUE,sep = "\t")
Con_count<- merge(merge(v1,v2,by="Geneid"),v3,by="Geneid")
x1<- read.table("SW620-82-1.txt",header = TRUE,sep = "\t")
x2<- read.table("SW620-82-2.txt",header = TRUE,sep = "\t")
x3<- read.table("SW620-82-3.txt",header = TRUE,sep = "\t")
UL82_count<- merge(merge(x1,x2,by="Geneid"),x3,by="Geneid")
counts <- merge(Con_count,UL82_count,by="Geneid")#merge函数通过相同的Geneid列合并矩阵
names(counts)<-c("ENSEMBL","Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3")#数据框name()即列名
##简化
#counts <- merge(merge(merge(v1,v2,by="Geneid"),v3,by="Geneid"),merge(merge(x1,x2,by="Geneid"),x3,by="Geneid"),by="Geneid")
#names(counts)<-c("ENSEMBL","Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3")
##

######2.3. 去除Ensemble ID版本号
counts$ENSEMBL=unlist(str_split(counts$ENSEMBL,"[.]",simplify=T))[,1]###!!!
#counts$ENSEMBL列根据字符串的点号进行分割。
#a = str_split(a,"\\.",simplify = T)[,1] #不带\\直接写"."是正则表达式任意字符的意思,需要\\来转义。

######2.4.去重:保留平均值大的重复行
dim(counts)
counts <- na.omit(counts)#删除NA值  
counts<- counts[apply(counts,1,max)>=10, ]#剔除count值小于10的行
dup_gene<-data.frame(gene=counts$ENSEMBL,mean=apply(counts[,2:7],1,mean))#创建矩阵,计算每行的平均值(不能包括第一列)
counts1 <-counts[order(dup_gene$mean,decreasing = T),]#按平均值降序排列
counts1<- counts1[!duplicated(counts1 $ENSEMBL),]#保留平均值大的重复行
dim(counts1)
counts1 <- counts1[order(counts1 $ENSEMBL),]
setwd("D:\\fsdownload\\学校服务器\\psj\\SW620 LOVO 测序数据\\SW620")
write.table(counts1,file=".\\sw620-counts.txt" , sep ="\t", row.names =F,col.names =T,quote = FALSE)
write.csv(counts1,file=".\\sw620-counts.csv",row.names = F)

#######2.5 ID转换(如果提取蛋白编码基因,建议用AnnoProbe包)
###2.5.1. 基因类型注释:AnnoProbe包annoGene函数提取protein_coding基因,利用自带gene_symbol完成ID转换
library(AnnoProbe)
IDs <- counts1$ENSEMBL
ID_type = "ENSEMBL"
type <- annoGene(IDs, ID_type = "ENSEMBL",out_file ='ID_biotypes.csv')



![cdbf2c0d47d1641dd7ca250404e5035.png](https://img.haomeiwen.com/i28015006/c48c3b201c17b871.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

#提取protein_coding基因
library(dplyr)
pro_coding<- filter(type,biotypes=="protein_coding")#提取type中biotypes=="protein_coding"的行
pro_coding <- pro_coding[!duplicated(pro_coding$ENSEMBL), ]#去除重复蛋白基因注释
protein_coding <- merge(counts1, pro_coding, by = c("ENSEMBL", "ENSEMBL"))#通过ENSEMBL合并数据框
protein_ENSEMBL <- protein_coding[,1:7]#保留"ENSEMBL","Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3"列
write.csv(protein_ENSEMBL, file = "./procode_ENSEMBL.csv",row.names = F)
#x <- protein_coding[duplicated(protein_coding$SYMBOL),] 报错原因:多个ENSEMBL对应一个SYMBLE;或SYMBL未标出亚家族
##ENSEMBL转为SYMBOL
#保留"Con_1","Con_2","Con_3","UL82_1","UL82_2","UL82_3","SYMBOL"列,并将SYMBOL列作为第一列
protein_SYMBOL <- protein_coding %>% select(8,2,3,4,5,6,7)
#简化 protein_SYMBOL <- protein_coding[,c(8,2,3,4,5,6,7)]
write.csv(protein_SYMBOL, file = "./protein_SYMBOL_AnnoProbe.csv",row.names = F)

###2.5.2. clusterProfiler包bitr()函数转换ID
library(clusterProfiler)
library(org.Hs.eg.db)
table(duplicated(protein_ENSEMBL$ENSEMBL)) # 检查是否有重复的ensemble ID
dim(protein_ENSEMBL)
gene_id <- bitr(protein_ENSEMBL$ENSEMBL, 
            fromType = "ENSEMBL",
            toType = "SYMBOL",
            OrgDb = org.Hs.eg.db)#将ENSEMBL转为SYMBOL
dim(gene_id)
protein_symbol <- merge(gene_id, protein_ENSEMBL, by = c("ENSEMBL", "ENSEMBL"))
write.csv(protein_symbol, file = "./protein_symbol_bitr.csv",row.names = F)
dim(protein_SYMBOL)
dim(protein_symbol)
上一篇下一篇

猜你喜欢

热点阅读