WGCNA学习笔记1

2021-09-01 本文已影响0人 jiarf

现在想要做一下dc细胞类型的WGCNA，参考教程是周运来小红书，大家可以去找一下，我的笔记如下：

# 37.WGCNA of  DC and  MA -------------------------------------------------
rm(list=ls())
setwd('/data1/nyy/HGPS_scRNAseq/seurat/integrate/cell_tyle/DC_WGCNA')
library(WGCNA)
library(Seurat)
library(tidyverse)
library(reshape2)
library(stringr)
library(dplyr)

load('/data1/nyy/HGPS_scRNAseq/seurat/integrate/Rdata/LS.sample.FindClusters.sub.cluster.group.sample.abb.RData')
names(sample.integrated@meta.data)
cell.type <- sample.integrated@meta.data[,c(1,20,24)]

table(cell.type$cell.type)
dc <- rownames(cell.type[cell.type$cell.type=='Myeloid dendritic cells',])
data <- as.data.frame(sample.integrated@assays$RNA@data)

dc_data <- data[,which(names(data)%in%dc)]#18060
head(dc_data[1:4,1:4])
dim(dc_data)#18060   280 这就是表达谱

# 进一步筛选细胞和基因
m.mad <- apply(dc_data,1,mad)
head(m.mad)
#dataExprVar <- dataExpr[which(m.mad > 
                               # max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]

dataExprVar <- dc_data[which(m.mad > 0),]#2039
## 转换为样品在行，基因在列的矩阵
dataExpr <- as.data.frame(t(dataExprVar))
dim(dataExpr)#280 2039
head(dataExpr)[,1:8]

## 检测缺失值
gsg = goodSamplesGenes(dataExpr, verbose = 3)
gsg$allOK
gsg$goodSamples

if (!gsg$allOK){
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0) 
    printFlush(paste("Removing genes:", 
                     paste(names(dataExpr)[!gsg$goodGenes], collapse = ",")));
  if (sum(!gsg$goodSamples)>0) 
    printFlush(paste("Removing samples:", 
                     paste(rownames(dataExpr)[!gsg$goodSamples], collapse = ",")));
  # Remove the offending genes and samples from the data:
  dataExpr = dataExpr[gsg$goodSamples, gsg$goodGenes]
}

nGenes = ncol(dataExpr)
nSamples = nrow(dataExpr)

dim(dataExpr)#280 2039
## 查看是否有离群样品
sampleTree = hclust(dist(dataExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")#这要是几千个细胞的话，简直一团黑

真的是一团黑

powers = c(c(1:10), seq(from = 12, to=30, by=2))
sft = pickSoftThreshold(dataExpr, powerVector=powers, 
                        networkType="signed", verbose=5)
par(mfrow = c(1,2))
cex1 = 0.9
# 横轴是Soft threshold (power)，纵轴是无标度网络的评估参数，数值越高，
# 网络越符合无标度特征 (non-scale)
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",
     ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red")
# 筛选标准。R-square=0.85
abline(h=0.85,col="red")

# Soft threshold与平均连通性
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, 
     cex=cex1, col="red")

power = sft$powerEstimate
softPower  = power
softPower#9
#参数beta取值默认是1到30，上述图形的横轴均代表权重参数β，左图纵轴代表对应的网络中log(k)与log(p(k))相关系数的平方。相关系数的平方越高，说明该网络越逼近无网路尺度的分布。右图的纵轴代表对应的基因模块中所有基因邻接函数的均值。最佳的beta值就是sft$powerEstimate，已经被保存到变量了，不需要知道具体是什么，后面的代码都用这个即可，在本例子里面是9。

最后画出来的图

#即使你不理解它，也可以使用代码拿到合适“软阀值（soft thresholding power）”beta进行后续分析。
# 无向网络在power小于15或有向网络power小于30内，没有一个power值可以使
# 无标度网络图谱结构R^2达到0.8，平均连接度较高如在100以上，可能是由于
# 部分样品与其他样品差别太大。这可能由批次效应、样品异质性或实验条件对
# 表达影响太大等造成。可以通过绘制样品聚类查看分组信息和有无异常样品。
# 如果这确实是由有意义的生物变化引起的，也可以使用下面的经验power值。
if (is.na(power)){
  power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
                 ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
                        ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
                               ifelse(type == "unsigned", 6, 12))       
                 )
  )
}

power
#一步法网络构建：One-step network construction and module detection##
# power: 上一步计算的软阈值
# maxBlockSize: 计算机能处理的最大模块的基因数量 (默认5000)；
#  4G内存电脑可处理8000-10000个，16G内存电脑可以处理2万个，32G内存电脑可
#  以处理3万个
#  计算资源允许的情况下最好放在一个block里面。
# corType: pearson or bicor
# numericLabels: 返回数字而不是颜色作为模块的名字，后面可以再转换为颜色
# saveTOMs：最耗费时间的计算，存储起来，供后续使用
# mergeCutHeight: 合并模块的阈值，越大模块越少

# type = unsigned

cor <- WGCNA::cor

 net = blockwiseModules(dataExpr, power = power, maxBlockSize = nGenes,#nGenes
+                        TOMType = "unsigned", minModuleSize = 10,
+                        reassignThreshold = 0, mergeCutHeight = 0.25,
+                        numericLabels = TRUE, pamRespectsDendro = FALSE,
+                        saveTOMs=TRUE, corType = 'pearson', 
+                         loadTOMs=TRUE,
+                        saveTOMFileBase = paste0("dataExpr", ".tom"),
+                        verbose = 3)
 Calculating module eigengenes block-wise from all genes
   Flagging genes and samples with too many missing values...
    ..step 1
 ..Working on block 1 .
    TOM calculation: adjacency..
    ..will not use multithreading.
     Fraction of slow calculations: 0.000000
    ..connectivity..
    ..matrix multiplication (system BLAS)..
    ..normalization..
    ..done.
   ..saving TOM for block 1 into file dataExpr.tom-block.1.RData
 ....clustering..
 ....detecting modules..
 ....calculating module eigengenes..
 ....checking kME in modules..
 ..merging modules that are too close..
     mergeCloseModules: Merging modules whose distance is less than 0.25
       Calculating new MEs...
table(net$colors)
net$unmergedColors
head(net$MEs)
net$goodSamples
net$goodGenes
net$TOMFiles
net$blockGenes
net$blocks
net$MEsOK
head(net$MEs)
table(net$colors)

#这里用不同的颜色来代表那些所有的模块，其中灰色默认是无法归类于任何模块的那些基因，如果灰色模块里面的基因太多，那么前期对表达矩阵挑选基因的步骤可能就不太合适。
## 灰色的为**未分类**到模块的基因。
# Convert labels to colors for plotting
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
moduleColors
# Plot the dendrogram and the module colors underneath
# 如果对结果不满意，还可以recutBlockwiseTrees，节省计算时间
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

灰色的也太多了……

让我做下去的原因是还有不是灰色的，哈哈哈哈，全程代码都很好用，直接点点点就行，不会报错

#这里用不同的颜色来代表那些所有的模块，其中灰色默认是无法归类于任何模块的那些基因，如果灰色模块里面的基因太多，那么前期对表达矩阵挑选基因的步骤可能就不太合适。
## 灰色的为**未分类**到模块的基因。
# Convert labels to colors for plotting
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
moduleColors
# Plot the dendrogram and the module colors underneath
# 如果对结果不满意，还可以recutBlockwiseTrees，节省计算时间
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)


# module eigengene, 可以绘制线图，作为每个模块的基因表达趋势的展示
MEs = net$MEs

### 不需要重新计算，改下列名字就好
### 官方教程是重新计算的，起始可以不用这么麻烦
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(
  as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)

# 根据基因间表达量进行聚类所得到的各模块间的相关性图
# marDendro/marHeatmap 设置下、左、上、右的边距
head(MEs_col)
?plotEigengeneNetworks
plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", 
                      marDendro = c(3,3,2,4),
                      marHeatmap = c(3,4,2,2),
                      plotDendrograms = T,
                      xLabelsAngle = 90)

image.png

WGCNA学习笔记1

猜你喜欢

热点阅读