WGCNA分析-单细胞转录组高阶分析01

2023-09-10 本文已影响0人一车小面包人

背景：在umap聚类的基础上，进行WGCNA的分析。

WGCNA
（1）两个部分：表达量聚类分析和表型关联
（2）四个步骤：基因之间相关系数计算、基因模块的确定、共表达网络、模块与性状关联
预处理：将原始细胞基因表达矩阵转换为伪细胞基因表达矩阵
所谓的伪细胞就是将原来10(size=10)个细胞当作是1个细胞来分析，当然size是可以改变的。
这里写一个过程：

library(Seurat)
library(dplyr)
sc<-readRDS("./sc_umap.rds")
sc<-sc
#' 构建原始细胞名字和伪细胞名字的对照表格cell.id_table
old_cell.id<-c()
new_cell.id<-c()
for(i in unique(sc$seurat_clusters)){
        sub_meta.data<-sc@meta.data[sc@meta.data$seurat_clusters==i,]
        sub_cell.id<-rownames(sub_meta.data)
        int_num<-floor(length(sub_cell.id)/10)
        remain_num<-length(sub_cell.id)%%10

        for(j in 1:int_num){
                sub_new<-rep(paste0(i,"_cell_",j),10)
                new_cell.id<-c(new_cell.id,sub_new)
        }
        sub_new<-rep(paste0(i,"_cell_",j+1),remain_num)
        new_cell.id<-c(new_cell.id,sub_new)
        old_cell.id<-c(old_cell.id,sample(sub_cell.id)) #'这里sample函数相当重要，它打乱了原始的细胞顺序，增加随机性
}
cell.id_table<-data.frame(new.id=new_cell.id,old.id=old_cell.id)
head(cell.id_table)

cell.id_table.png
table(cell.id_table$new.id)可以看见每个伪细胞里其实包含了原始的10个细胞：

table.png

#' 提取原始seurat对象的标准化表达矩阵
t.counts<-data.table(t(as.matrix(slot(sc@assays[["RNA"]],"data"))),keep.rownames=T) #'这样提取避免行名是字符串
rownames(cell.id_table)<-cell.id_table$old.id
cell.id_table<-cell.id_table[t.counts$rn,]
t.counts$rn<-cell.id_table$new.id
t.counts<- t.counts[,lapply(.SD,mean), by=rn] #’根据rn那一列的值，合并基因表达量，mean代表使用平均表达量
gene.name <- t.counts$rn
cell.id <- rownames(t.counts)
my.counts <- t(t.counts[,rn:=NULL])
colnames(my.counts)<-cell.id)
new.sc<- CreateSeuratObject(my.counts)
saveRDS(new.sc,"pseudo_sc.rds")

my.counts.png

可以看到细胞名字已经变成了伪细胞的名字。

WGCNA分析

library(WGCNA)
options(stringsAsFactors = FALSE)
enableWGCNAThreads() #' 打开多线程
datExpr0<-as.data.frame(t(my.counts)) #' 行为伪细胞，列为基因
#' 检查缺失值
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK #' 这个值为TRUE就没问题，反之进行以下筛选
if (!gsg$allOK){
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0)
    printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
  if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
  # Remove the offending genes and samples from the data:
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}

这里也许过滤了一些细胞或者基因，接下来进一步过滤基因：

#' 过滤，根据阈值筛选基因列
meanFPKM= 0.5  #' 过滤标准，可以修改
n=nrow(datExpr0)
datExpr0[n+1,]=apply(datExpr0[c(1:nrow(datExpr0)),],2,mean) #' 计算每一列基因的平均值
datExpr0=datExpr0[1:n,datExpr0[n+1,] > meanFPKM] #' 只保留平均值大于阈值的列
# for meanFpkm in row n+1 and it must be above what you set--select meanFpkm>opt$meanFpkm(by rp)
filtered_fpkm=t(datExpr0) #' 转置，变为行为基因，列为细胞
filtered_fpkm=data.frame(rownames(filtered_fpkm),filtered_fpkm) #' 转换为数据框
names(filtered_fpkm)[1]="sample"
head(filtered_fpkm)
write.table(filtered_fpkm, file="mRNA.symbol.uniq.filter.txt",
            row.names=F, col.names=T,quote=FALSE,sep="\t")

nGenes = ncol(datExpr0)
nSamples = nrow(datExpr0)

head(filtered_fpkm).png

接下来过滤一些细胞：

#' 聚类，对伪细胞进行聚类,筛选伪细胞,去除聚类关系上较远的伪细胞
sampleTree = hclust(dist(datExpr0), method = "average")
pdf(file = "sampleClustering_fig1.pdf", width = 15, height = 8)
par(cex = 0.6)
par(mar = c(0,6,6,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,
     cex.axis = 1.5, cex.main = 2)
### Plot a line to show the cut
#abline(h = 180, col = "red")##剪切高度不确定，故无红线
dev.off()

sampleclustering.png

接下来是较为关键的一步，寻找合适的软阈值power：

#' 寻找软阈值power,power是对相关性的值进行幂次运算（所谓加权网络的边）的值
#' abs(cor(geneX, geneY)) ^ power
powers=c(c(1:10),seq(from = 12, to = 20, by = 2))
sft = pickSoftThreshold(datExpr0, powerVector = powers, verbose = 5)
#' 可视化power，拐点位置就是合适的值
pdf('powers_fig2.pdf',18,10)
par(mfrow = c(1,2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",
     ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")
abline(h=0.85,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
     cex=cex1, col="red")
dev.off()

powers_fig2.png

通常，选取在拐点处的值为软阈值，这里power=4。

#' 获取预测的power值，如果没有则根据以下规则获取
power = sft$powerEstimate
if (is.na(power)){
  power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
          ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
          ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
          ifelse(type == "unsigned", 6, 12))
          )
          )
}

接下来计算基因模块：

#' 确定模块modules
#' TOMType = "unsigned"构建无尺度网络；minModuleSize = 30最小模块基因数为30；
#' TOM矩阵(topological overlap matrix,TOM拓扑重叠矩阵):数值都在[0,1]间，
#' 是用于反应两两基因共表达的相似度，值越高，说明共表达相似度越高
cor = WGCNA::cor #' 指定使用WGCNA包中的相关性计算函数
net = blockwiseModules(datExpr0, power = power, maxBlockSize = nGenes,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs=TRUE, corType = "pearson",
                       maxPOutliers=1, loadTOMs=TRUE,
                       saveTOMFileBase = paste0("first", ".tom"),
                       verbose = 3)
cor = stats::cor
table(net$colors)
#' TOM矩阵的层次聚类及模块颜色结果可视化
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels) #' 将标签转换为颜色color
pdf('modules_fig3.pdf',18,10)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
dev.off()

modules_fig3.png

这个图呢，上面部分是基因聚类，下面颜色部分是不同的基因模块。
接下来绘制校正矩阵的热图：

#' 校正矩阵热图
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
pdf('adjacency_fig4.pdf',18,10)
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
                      marDendro = c(3,3,2,4),
                      marHeatmap = c(3,4,2,2), plotDendrograms = T,
                      xLabelsAngle = 90)
dev.off()

adjacency_fig4.png

模块与模块之间相关性可视化：

#’ TOM矩阵热图可视化，模块与模块之间的相关性
load(net$TOMFiles[1], verbose=T)
TOM <- as.matrix(TOM)
dissTOM = 1-TOM
plotTOM = dissTOM^7
diag(plotTOM) = NA
pdf('TOM_fig5.pdf',18,10)
TOMplot(plotTOM, net$dendrograms, moduleColors,
                main = "Network heatmap plot, all genes")
dev.off()

TOM_fig5.png

绘制每一个模块中关键基因的网状图：

#' 模块里的关键基因
probes = colnames(datExpr0)
dimnames(TOM) <- list(probes, probes)
cyt = exportNetworkToCytoscape(TOM,
             edgeFile = paste("first", ".edges.txt", sep=""),
             nodeFile = paste("first", ".nodes.txt", sep=""),
             weighted = TRUE, threshold = 0,
             nodeNames = probes, nodeAttr = moduleColors)
#' 可视化每个模块的关键基因的网络图，去掉了grey颜色的模块
edges <- read.table(paste("first", ".edges.txt", sep=""),sep='\t',header=T)
nodes <- read.table(paste("first", ".nodes.txt", sep=""),sep='\t',header=T)
library(RColorBrewer)
coul<-c(brewer.pal(8, "Dark2"),brewer.pal(12, "Paired"),brewer.pal(8, "Set2"),brewer.pal(9, "Set1"))
colnames(nodes) <- c(colnames(nodes)[1:2],'group')
lgroup <- unique(nodes$group)
lgroup <- lgroup[-grep('grey',lgroup)]
for (i in lgroup){
nodes1 <- subset(nodes,group==i)
c1 <- which(edges[,1] %in% nodes1[,1])
edges1 <- edges[c1,]
c1 <- which(edges1[,2] %in% nodes1[,1])
edges1 <- edges1[c1,]
mtitle <- dim(nodes1)[1]
plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight=NULL,coul = coul,layout=layout.sphere,pf=i,main=mtitle)
}

#' 筛选HUG基因，并绘制每个模块中的hug基因网络图
library(dplyr)
edgeData1 <- cyt$edgeData[,c(1,2,3)]
edgeData2 <- cyt$edgeData[,c(2,1,3)]
nodeattrib <- cyt$nodeData[,c(1,3)]
colnames(nodeattrib) <- c("nodeName", "Module")
colnames(edgeData1) <- c("Node1","Node2","Weight")
colnames(edgeData2) <- c("Node1","Node2","Weight")
edgeData <- rbind(edgeData1, edgeData2)
edgeData$Module1 <- nodeattrib[match(edgeData$Node1, nodeattrib$nodeName), 2]
edgeData$Module2 <- nodeattrib[match(edgeData$Node2, nodeattrib$nodeName), 2]
edgeData <- edgeData[edgeData$Module1==edgeData$Module2,c(1,3,4)] #'同一个模块内的基因网络信息
nodeTotalWeight <- edgeData %>% group_by(Node1, Module1) %>% summarise(weight=sum(Weight))
nodeTotalWeight <- nodeTotalWeight[with(nodeTotalWeight, order(Module1, -weight)),]
nodeTotalWeightTop = nodeTotalWeight %>% group_by(Module1) %>% top_n(10, weight) %>% data.frame()
write.table(nodeTotalWeightTop,'hug_gene_list.xls',sep='\t',quote=F)
lgroup <- unique(nodeTotalWeightTop$Module1)
lgroup <- lgroup[-grep('grey',lgroup)]
for (i in lgroup) {
nodes1 <- subset(nodeTotalWeightTop,Module1==i)
c1 <- which(edges[,1] %in% nodes1[,1])
edges1 <- edges[c1,]
c1 <- which(edges1[,2] %in% nodes1[,1])
edges1 <- edges1[c1,]

c1 <- which(nodes[,1] %in% nodes1[,1])
nodes1 <- nodes[c1,]
nodes1 <- subset(nodes1,group==i)
mtitle <- dim(nodes1)[1]
plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight='weight',coul = coul,layout=layout.sphere,pf=paste0("hug_genes_",i),main=mtitle)
}

接下来是将性状信息与基因模块关联起来的一些分析：

#' 计算基因模块与性状相关性
#' 需要性状相关的数据框信息
trait=new.sc@meta.data #' 性状相关的数据框信息
#trait=trait[,-1]
corType = "pearson"
robustY = ifelse(corType=="pearson",T,F)
if(!is.null(trait)) {
  traitData = trait
  sampleName = rownames(datExpr0)
  traitData = traitData[match(sampleName, rownames(traitData)), ]

if (corType=="pearsoon") {
  modTraitCor = cor(MEs_col, traitData, use = "p")
  modTraitP = corPvalueStudent(modTraitCor, nSamples)
} else {
  modTraitCorP = bicorAndPvalue(MEs_col, traitData, robustY=robustY)
  modTraitCor = modTraitCorP$bicor
  modTraitP   = modTraitCorP$p
}
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
pdf('corMetaGene_fig6.pdf',18,10)
labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(traitData),
               yLabels = colnames(MEs_col),
               cex.lab = 0.5,
               ySymbols = colnames(MEs_col), colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = textMatrix, setStdMargins = FALSE,
               cex.text = 0.5, zlim = c(-1,1),
               main = paste("Module-trait relationships"))
dev.off()
}

module='brown'
pheno='orig.ident'
if (corType=="pearsoon") {
  geneModuleMembership = as.data.frame(cor(datExpr0, MEs_col, use = "p"))
  MMPvalue = as.data.frame(corPvalueStudent(
             as.matrix(geneModuleMembership), nSamples))
}else {
  geneModuleMembershipA = bicorAndPvalue(datExpr0, MEs_col, robustY=robustY)
  geneModuleMembership = geneModuleMembershipA$bicor
  MMPvalue   = geneModuleMembershipA$p
}

if (corType=="pearsoon") {
  geneTraitCor = as.data.frame(cor(datExpr0, traitData, use = "p"))
  geneTraitP = as.data.frame(corPvalueStudent(
             as.matrix(geneTraitCor), nSamples))
} else {
  geneTraitCorA = bicorAndPvalue(datExpr0, traitData, robustY=robustY)
  geneTraitCor = as.data.frame(geneTraitCorA$bicor)
  geneTraitP   = as.data.frame(geneTraitCorA$p)
}
modNames = substring(colnames(MEs_col), 3)
module_column = match(module, modNames)
pheno_column = match(pheno,colnames(traitData))
moduleGenes = moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
pdf(paste0(module,"_",pheno,'_verboseScatterplot_fig7.pdf'),18,10)
verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),
                   abs(geneTraitCor[moduleGenes, pheno_column]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = paste("Gene significance for", pheno),
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()

总结：是稍微复杂了一点。

WGCNA分析-单细胞转录组高阶分析01

猜你喜欢

热点阅读