1 介绍

针对基因大量样本量表达数据的分析方法(RNA-seq 、微阵列)，建议样本数量大于15，可以挖掘具有相似表达模式的模块并计算与形状的相关性。

2 构建WGCNA的步骤

✓ 构建共表达网络
✓ 基因模块划分
✓ 模块与性状的关联分析
✓ 模块之间的关联分析
✓ 模块中核心基因的鉴定

数据链接，提取码：xhg3

3 代码展示

3.1 首先读入数据并对数据进行预处理

library(WGCNA)
options(stringsAsFactors = FALSE);
femData = read.csv("LiverFemale3600.csv")
#Take a quick look at what is in the data set:
dim(femData)
[1] 3600  143
names(femData)
head(femData)

#筛选方差前5000的基因
#femData<- as.data.frame(femData[order(apply(femData,1,mad),decreasing = T)[1:5000],])

# femData0 <- femData[rowSums(femData) > 0]
#2.删除不必要的表型信息
datExpr0 = as.data.frame(t(femData[, -c(1:8)]));
names(datExpr0) = femData$substanceBXH;
rownames(datExpr0) = names(femData)[-c(1:8)]; 

#检查基因个样本并过滤掉有缺失值的基因和样本
#检查并标记缺失值
gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK

#删除具有缺失值的基因和样本
#这里如果存在缺失值，gsg$goodGenes显示为F，之后使用paste函数将‘Removing #genes:’与具有缺失值的基因名和模块名粘连在一起并print出来
if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0) 
    printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0) 
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}

#层次聚类观察离群值
#首先构树
sampleTree = hclust(dist(datExpr0), method = "average");
#将图片输出到本地
sizeGrWindow(12,9)
pdf("Plots_sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
     cex.axis = 1.5, cex.main = 2)
abline(h = 15, col = "red");
clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
#别忘了dev.off()
dev.off()
table(clust)
clust
  0   1 
  1 134

sample clustering


#设定阈值之后我们发现有一个离群值，随后我们要将其去除
#删除离群值，保留第一聚类簇
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)



#载入表型数据
traitData = read.csv("ClinicalTraits.csv");
dim(traitData)
[1] 361  38
names(traitData)

# 删除不必要的表型数据
allTraits = traitData[, -c(31, 16)];
allTraits = allTraits[, c(2, 11:36) ];
dim(allTraits)
names(allTraits)

# match表型信息与基因表达数据，并删除没有对应的样本
femaleSamples = rownames(datExpr);
traitRows = match(femaleSamples, allTraits$Mice);
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1];
collectGarbage();


#删除离群值后重新进行聚类
pdf("Sample dendrogram and trait heatmap.pdf",height=10,width=20)
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits), 
                    main = "Sample dendrogram and trait heatmap")
dev.off()

3.2 软阈值的筛选

#设定画图显示时的阈值，1-20
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sizeGrWindow(9, 5)# 设定窗口大小
par(mfrow = c(1,2));
cex1 = 0.9;
# 计算软阈值，并画图展示，设定abline寻找最适合的阈值
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# 根据0.9的R^2值筛选阈值
abline(h=0.90,col="red")
# 同时观察不同阈值的连接度
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

从这两个图可以看出最佳阈值是6

3.3 构建网络

#选择6作为软阈值
softPower = 6;
#构建邻接矩阵
adjacency = adjacency(datExpr, power = softPower);
#构建TOM矩阵
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM
#使用TOM矩阵进行聚类
geneTree = hclust(as.dist(dissTOM), method = "average");
# 作图展示
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04);
# 设定模块基因数量的下限-30
minModuleSize = 30;
# 鉴定模块
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                            deepSplit = 2, pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);
table(dynamicMods)
  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22 
 88 614 316 311 257 235 225 212 158 153 121 106 102 100  94  91  78  76  65  58  58  48  34

3.4 为模块划分颜色

dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# 层次聚类下添加颜色注释
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")

3.5 合并相似模块

MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# 计算特征值相异性
MEDiss = 1-cor(MEs);
# 聚类模块基因
METree = hclust(as.dist(MEDiss), method = "average");
# 结果可视化
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
MEDissThres = 0.25
# 在0.25处添加abline，并认为abline下方的模块相似度较高，后续将合并成一个模块
abline(h=MEDissThres, col = "red")
# 调用合并函数，并对abline下的模块进行合并
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 别忘了更改合并后模块的颜色注释
mergedColors = merge$colors;
mergedMEs = merge$newMEs;
#作图展示合并前后模块
sizeGrWindow(12, 9)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
按照每个模块基因数量对模块进行排序
colorOrder = c("grey", standardColors(50));
#match排序前后的模块顺序
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;

3.6 计算模块与表型的相关性及其p值

nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# 按照颜色重新计算模块
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
#利用皮尔森相关系数计算相关性
moduleTraitCor = cor(MEs, datTraits, use = "p");
#计算p值
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);

3.7针对表型和模块的相关性做热图展示结果

sizeGrWindow(10,6)
# 匹配相关性及其p值
textMatrix =  paste(signif(moduleTraitCor, 2), "\n(",
                    signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# 热图展示
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

3.8 提取模块

通过上一步我们发现weight表型与magenta模块具有最高的相关性，所以我们提取相关模块进行进一步分析，首先针对与weight表型相关的基因进行热图展示

#提取weight表型信息
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
# 提取模块信息
modNames = substring(names(MEs), 3)
#皮尔森系数计算模块与基因的相关性
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
#皮尔森系数计算基因与weight表型相关性
geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
names(geneTraitSignificance) = paste("GS.", names(weight), sep="");
names(GSPvalue) = paste("p.GS.", names(weight), sep="");
write.csv(GSPvalue, file = "GSPvalue")

3.9 计算关键模块中基因与表型的相关性

#确定模块magenta
module = "magenta"
column = match(module, modNames);
moduleGenes = moduleColors==module;
#作图展示相关性
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for body weight",
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
names(datExpr)
names(datExpr)[moduleColors=="magenta"]

3.10 对TOM矩阵的网络相关性计算

#随机筛选400个基因
nSelect = 400
# 数据的可重复性，使用set.seed()
set.seed(10);
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# 对于随机选择的基因我们无法对以前的聚类树进行剪枝，所以我们进行重新聚类
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select];
# 作图展示
sizeGrWindow(9,9)
#说实话这个TOM矩阵的热图我也看不出什么意义，取7次方的意义是为了热图更好看吗
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

3.11 对模块与表型进行聚类

MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
# 提取表型信息
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
# 在模块中添加weight表型信息
MET = orderMEs(cbind(MEs, weight))
# 随后对模块和感兴趣表型进行聚类
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)

这里取消了层次聚类效果
sizeGrWindow(6,6);
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
                      plotHeatmaps = FALSE)
# 作图展示
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
                      plotDendrograms = FALSE, xLabelsAngle = 90)

3.12 导出模块中文件

#重新计算TOM矩阵
TOM = TOMsimilarityFromExpr(datExpr, power = 6);
# 选择模块，可以选1至多个
modules = c('magenta');
# 选择模块中基因
probes = names(datExpr)
inModule = is.finite(match(moduleColors, modules));
modProbes = probes[inModule];
#modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
# 以cytoscape可读的方式导出基因列表以及相关性
cyt = exportNetworkToCytoscape(modTOM,
                               edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
                               nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
                               weighted = TRUE,
                               threshold = 0.02,
                               nodeNames = modProbes,
                               #altNodeNames = modGenes,
                               nodeAttr = moduleColors[inModule]);

WGCNA