#多个数据集差异基因整合#

2021-03-23 本文已影响0人辛晓红

image.png

加载包，设置差异倍数logFC和P值

#=======================================================

#=======================================================
setwd('D:\\SCIwork\\F34TNBC\\s5deg')
library(dplyr)
library(tidyr)
library(tibble)
#清除环境变量
rm(list=ls()) 
padj=0.05
logFC=0.5

读取各个数据集的差异基因

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE18864')
a <- read.csv("allDiff.csv", header = T, row.names = 1)

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE58812')
b <- read.csv("allDiff.csv", header = T, row.names = 1)

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76124')
c <-read.csv("allDiff.csv", header = T, row.names = 1)

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76250')
d <- read.csv("allDiff.csv", header = T, row.names = 1)

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE83937')
e <- read.csv("allDiff.csv", header = T, row.names = 1)

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE95700')
f <- read.csv("allDiff.csv", header = T, row.names = 1)

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE106977')
g  <- read.csv("allDiff.csv", header = T, row.names = 1)

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\TCGA')
h  <- read.csv("allDiff.csv", header = T, row.names = 1)
setwd('D:\\SCIwork\\F34TNBC\\s5deg')

读取各个数据集的差异基因

image.png

读取这些差异基因的数据框，其必须含有的列是logfc和p值两类

image.png

保存符合我们设置的差异基因标准基因

# --------------------------------------------------------
# 
# --------------------------------------------------------

a <- a %>% 
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(a, file = "allDiff1.csv", row.names = F)

b <- b %>% 
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(b, file = "allDiff2.csv", row.names = F)

c <- c %>% 
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(c, file = "allDiff3.csv", row.names = F)


d <- d%>% 
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(d, file = "allDiff4.csv", row.names = F)


e <- e%>% 
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(e, file = "allDiff5.csv", row.names = F)



f <- f%>%
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(f, file = "allDiff6.csv", row.names = F)


g <- g%>% 
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(g, file = "allDiff7.csv", row.names = F)


h <- h%>% 
  rownames_to_column(var='gene')%>% 
  dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
  write.csv(h, file = "allDiff8.csv", row.names = F)

得到合并后的差异基因

# --------------------------------------------------------
# 
# --------------------------------------------------------
#读取差异基因分析结果，其必须包含两列：基因名和差异倍数#
files=c("allDiff1.csv","allDiff2.csv",
        "allDiff3.csv","allDiff4.csv",
        "allDiff5.csv","allDiff6.csv",
        "allDiff7.csv", "allDiff8.csv")
upList=list()
downList=list()
allFCList=list()

for(i in 1:length(files)){
  inputFile=files[i]
  rt=read.csv(inputFile,header=T,sep = ',', row.names = 1) # 注意文件读取
  rt$Gene <- rownames(rt)
  rt <- rt %>%
    distinct(Gene, .keep_all = T)%>%
    arrange(logFC)%>%
    dplyr::select(Gene, logFC)
  header=unlist(strsplit(inputFile,"_"))
  downList[[header[1]]]=as.vector(rt[,1])
  upList[[header[1]]]=rev(as.vector(rt[,1]))
  fcCol=rt[,1:2]
  colnames(fcCol)=c("Gene",header[[1]])
  allFCList[[header[1]]]=fcCol}

#合并文件
mergeLe=function(x,y){
  merge(x,y,by="Gene",all=T)}
newTab=Reduce(mergeLe,allFCList)
rownames(newTab)=newTab[,1]
newTab=newTab[,2:ncol(newTab)]
newTab[is.na(newTab)]=0

#计算共同上调基因
library(RobustRankAggreg)
upMatrix = rankMatrix(upList)
upAR = aggregateRanks(rmat=upMatrix)
colnames(upAR)=c("Name","Pvalue")
upAdj=p.adjust(upAR$Pvalue,method="bonferroni")
upXls=cbind(upAR,adjPvalue=upAdj)
upFC=newTab[as.vector(upXls[,1]),]
upXls=cbind(upXls,logFC=rowMeans(upFC))
write.table(upXls,file="up.xls",sep="\t",quote=F,row.names=F)
upSig=upXls[(upXls$Pvalue<padj & upXls$logFC>logFC),]
write.table(upSig,file="upSig.xls",sep="\t",quote=F,row.names=F)

#计算共同下调基因
downMatrix = rankMatrix(downList)
downAR = aggregateRanks(rmat=downMatrix)
colnames(downAR)=c("Name","Pvalue")
downAdj=p.adjust(downAR$Pvalue,method="bonferroni")
downXls=cbind(downAR,adjPvalue=downAdj)
downFC=newTab[as.vector(downXls[,1]),]
downXls=cbind(downXls,logFC=rowMeans(downFC))
write.table(downXls,file="down.xls",sep="\t",quote=F,row.names=F)
downSig=downXls[(downXls$Pvalue<padj & downXls$logFC< -logFC),]
write.table(downSig,file="downSig.xls",sep="\t",quote=F,row.names=F)

#合并上调与下调基因
allSig = rbind(upSig,downSig)
colnames(allSig)
write.csv(allSig,file = 'allSign.csv')

绘图准备：仅仅保留那些在所有数据存在的基因,以及删除TCGA和GEO趋势不一致的基因

setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE18864')
a <- read.csv("allDiff.csv", header = T, row.names = 1) %>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE58812')
b <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76124')
c <-read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76250')
d <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE83937')
e <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE95700')
f <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE106977')
g  <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg\\TCGA')
h  <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
setwd('D:\\SCIwork\\F34TNBC\\s5deg')
target <- allSig$Name
length(target)
a <- subset(a, select =c("gene",   "logFC" ))
b <- subset(b, select =c("gene",   "logFC" ))
c <- subset(c, select =c("gene",   "logFC" ))
d <- subset(d, select =c("gene",   "logFC" ))
e <- subset(e, select =c("gene",   "logFC" ))
f <- subset(f, select =c("gene",   "logFC" ))
g <- subset(g, select =c("gene",   "logFC" ))
h <- subset(h, select =c("gene",   "logFC" ))
gene <- intersect(a$gene, b$gene)
gene <- intersect(c$gene, gene)
gene <- intersect(d$gene, gene)
gene <- intersect(e$gene, gene)
gene <- intersect(f$gene, gene)
gene <- intersect(g$gene, gene)
gene <- intersect(h$gene, gene)
gene <- intersect(allSig$Name, gene)
dat <-  data.frame()
for (i in 1:length(gene)) {   
  dat[i,1] <- a[which(a$gene %in% gene[i]),'logFC']
  dat[i,2] <- b[which(b$gene %in% gene[i]),'logFC']
  dat[i,3] <- c[which(c$gene %in% gene[i]),'logFC']
  dat[i,4] <- d[which(d$gene %in% gene[i]),'logFC']
  dat[i,5] <- e[which(e$gene %in% gene[i]),'logFC']
  dat[i,6] <- f[which(f$gene %in% gene[i]),'logFC']
  dat[i,7] <- g[which(g$gene %in% gene[i]),'logFC']
  dat[i,8] <- h[which(g$gene %in% gene[i]),'logFC']
}
colnames(dat) <- c("GSE18864",  "GSE58812",
                   "GSE76124",  "GSE76250",
                   "GSE83937",  "GSE95700",
                   "GSE106977",'TCGA'
)
rownames(dat) <- gene
dat <- dat[which(rownames(dat) %in% rownames(allSig)),]
hminput <- dat
for (i in 1:dim(hminput)[1]) {  
  hminput$re[i] <- hminput[i,1]*hminput[i,2]*hminput[i,3]*hminput[i,4]*hminput[i,5]*hminput[i,6]*hminput[i,7]*hminput[i,8]
  hminput$re1[i] <- hminput[i,1]*hminput[i,2]*hminput[i,3]*hminput[i,4]*hminput[i,5]*hminput[i,6]*hminput[i,7]  
}
table(hminput$re )
hminput <- subset(hminput, hminput$re != 0)
hminput$re <- NULL
hminput$re1 <- hminput$re1*hminput$TCGA
hminput <- subset(hminput, hminput$re1 >0)
hminput$re1 <- NULL
for (x in 1:nrow(hminput )) {  
  for ( y in 1:ncol(hminput )) {    
    if(hminput[x,y] > 2){
      hminput[x,y] <- 2}
    if(hminput[x,y] < -2){
      hminput[x,y] <- -2}   
    
  }
  
}
bk = unique(c(seq(-2,2, length=100)))
setwd('D:\\SCIwork\\F34TNBC\\s5deg')
library(pheatmap)
pdf(file="logFC.pdf",width = 4,height = 4)
pheatmap(hminput,display_numbers = F,
         cluster_cols = T,
         cluster_row = T,
         border_color  = NA,
         breaks = bk,
         show_colnames     = T,
         show_rownames     = T,
         color = colorRampPalette(c("#20B6E2",
                                    "#020303",
                                    "#F5EB17"))(100), 
         fontsize_row=1,
         fontsize_col=5 )
dev.off()

代码数据私信我

#多个数据集差异基因整合#

加载包，设置差异倍数logFC和P值

读取各个数据集的差异基因

读取各个数据集的差异基因

读取这些差异基因的数据框，其必须含有的列是logfc和p值两类

保存符合我们设置的差异基因标准基因

得到合并后的差异基因

绘图准备：仅仅保留那些在所有数据存在的基因,以及删除TCGA和GEO趋势不一致的基因

猜你喜欢

热点阅读