WGCNA
rm(list = ls())
#加载R包
library(WGCNA)
library(tidyr)
library(ggplot2)
library(gridExtra)
setwd("D:\\项目探索\\水稻EDP\\文献\\WGCNA数据结果20190426")
#=================================================================#
#
# 1.导入表达数据
#
#=================================================================#
gene_exp <- read.table('D:\\项目探索\\水稻EDP\\文献\\WGCNA数据结果20190426\\DSC.txt',
sep = '\t',
header = T,
stringsAsFactors = F,
#第一列作为行名
row.names = 1)
gene_exp[is.na(gene_exp)] <- 0 # 将NA转换为0
# 根据方差筛选
m.vars=apply(gene_exp,1,var)
expro.upper=gene_exp[which(m.vars>quantile(m.vars, probs = seq(0, 1, 0.25))[4]),]
dim(expro.upper)
datExpr <- t(expro.upper) # 在WGCNA中需要转置表达矩阵
gsg = goodSamplesGenes(datExpr,verbose = 3)
gsg$allOK
#查看行和列
dim(datExpr)
# 聚类分析是否存在离群值
sampleTree = hclust(dist(datExpr),method = "average")
sizeGrWindow(12,9)
par(cex=0.6)
par(mar = c(0,4,2,0))
plot(sampleTree,
main = "Sample clustering to detect outliners",
sub = "",xlab = "",cex.lab=1.5,
cex.axis=1.5,cex.main = 2)
# 存在离群样本时可以剔除
if(F){abline(h = 15,col = "red")
clust = cutreeStatic(sampleTree,cutHeight = 15,minSize = 10)
table(clust)
keepSamples = (clust==1)
datExpr =datExpr0[keepSamples,]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
}
#=================================================================#
#
# 2.寻找最佳β值
#
#=================================================================#
# 选择并构建一个软阈值向量
powers = c(c(1:10),seq(from = 12,to = 20,by=2)
# 执行TOM功能
sft = pickSoftThreshold(datExpr,powerVector = powers,verbose = 5)
# 绘图
sizeGrWindow(9,5)
par(mfrow = c(1,2)) # 一页多图
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1],-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold(power)",ylab = "Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1],-sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red")
abline(h = 0.90,col = "red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1],sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab ="Mean Connectivity",type = "n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1],sft$fitIndices[,5],labels=powers,cex = cex1,col="red")
#=================================================================#
#
# 3.一步法构建网络模块
#3.1 minModuleSize指定Module所含基因数最少
#3.2 mergeCutHeight指定合并Module的阈值
#3.3 列表net中包含了许多信息
#3.4 可以通过recutBlockwiseTrees函数来修改模块的一些参数,而不用重新计算聚类树
#
#=================================================================#
net = blockwiseModules(datExpr,power = sft$powerEstimate,
TOMType = "unsigned",minModuleSize = 30,reassignThreshold = 0,mergeCutHeight = 0.25,
numericLabels =TRUE,pamRespectsDendro = FALSE,saveTOMs = TRUE,saveTOMFileBase = "RiceTOM",
verbose = 3)
table(net$colors)
# 绘图
sizeGrWindow(12,9)
# 将标签转换为颜色
mergedColors = labels2colors(net$colors)
# 绘制树图及模块颜色图
plotDendroAndColors(net$dendrograms[[1]],mergeColors[net$blockGenes[[1]],"Module colors",dendroLabels = FALSE,hang = 0.03,addGuide = TRUE,guideHang = 0.05)
# 保存相关数据
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs
geneTree = net$dendrograms[[1:2]] # 可能会存在基因较多,默认将基因分成了两部分
#=================================================================#
# 3.步步法构建
# 3.构建网络
# 3.1.计算相关系数
# 3.2.计算邻接矩阵
# 3.3.计算TOM矩阵
# 3.4.聚类并且划分模块
# 3.5.合并相似模块
#=================================================================#
# 计算邻接矩阵
softpower = sft$powerEstimate
adjacency = adjacency(datExpr,power = softPower)
# 通过邻接矩阵计算TOM矩阵
TOM = TOMsimilarity(adjacency)
dissTOM = 1 - TOM # 此为距离矩阵
# 通过距离矩阵进行聚类分析基因
geneTree = hclust(as.dist(dissTOM),method = "average")
sizeGrWindow(12,9)
plot(geneTree,xlab="",sub="",main="Gene clustering on TOM-based dissimilarity",labels=FALSE,hang = 0.04)
# 设置模块中基因最小数
minModuleSize = 30
# 通过动态树剪切鉴定出模块
dynamicMods = cutreeDynamic(dendro = geneTree,disM = dissTOM,deepSplit = 2,pamRespectsDendro = FALSE,minClusterSize = minModuleSize)
table(dynamicMods)
# 绘图
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
sizeGrWindow(8,6)
plotDendroAndColors(geneTree,dynamicColors,"Dynamic Tree Cut",dendroLabels = FALSE,hang = 0.03,addGuide = TRUE,guideHang = 0.05,main = " Gene dendrogram and module colors")
# 合并表达近似的Modules
# 计算Modules的特征基因
MEList = moduleEigengenes(datExpr,color = dynamicColors)
MEs = MEList$eigengenes
# 计算Modules之间的相关性
MEDiss = 1 - cor(MEs)
# 聚类特征基因
METree = hclust(as.dist(MEDiss),method = "average")
# 绘图
sizeGrWindow(7,6)
plot(METree,main = "Clustering of eigengenes",xlab="",sub="")
# 自动合并
MEDissThres = 0.25
abline(h=MEDissThres,col = "red")
merge = mergeCloseModules(datExpr,dynamicColors,cutHeight = MEDissThres,verbose = 3)
mergedColors = merge$colors
mergedMEs = merge$newMEs
# 绘图对比合并前后模块
sizeGrWindow(12,9)
plotDendroAndColors(geneTree,cbind(dynamicColors,mergedColors),c("Dynamic Tree Cut","Merged dynamic"),dendroLabels = FALSE,hang =0.03,addGuide = TRUE,guideHang = 0.05)
# 保存相关的变量数据
moduleColors = mergedColors
# 转换为数字标签
colorOrder = c("grey",standardColors(50))
moduleLabels = match(moduleColors,colorOrder)-1
MEs = mergedMEs
#=================================================================#
#
# 3.大数据集处理
# 基本思想:使用两层聚类
#
#=================================================================#
# Block-wise network construction and module detection
bwnet <- blockwiseModules(datExpr,maxBlockSize = 2000, # 指定电脑最大可运行的模块
power = sft$powerEstimate,TOMtype ="unsigned",minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,numericLabels = TRUE,
saveTOMs = TRUE,saveTOMFileBase = "RiceTOM-blockwise",
verbose = 3)
bwLabels = matchLabels(bwnet$colors,moduleLabels)
bwModuleColors = labels2colors(bwLabels)
table(bwLabels)
# 绘图
sizeGrWindow(6,6)
# block1绘图
plotDendroAndColors(bwnet$dendrograms[[1]], bwModuleColors[bwnet$blockGenes[[1]]], "Module colors", main = "Gene dendrogram and module colors in block 1", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
# block2绘图
plotDendroAndColors(bwnet$dendrograms[[2]], bwModuleColors[bwnet$blockGenes[[2]]], "Module colors", main = "Gene dendrogram and module colors in block 2", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
# 绘图比较single block和block-wise的区别
sizeGrWindow(12,9)
plotDendroAndColors(geneTree, cbind(moduleColors, bwModuleColors), c("Single block", "2 blocks"), main = "Single block gene dendrogram and module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
# 通过Eigengenes比较single block和block-wise的区别
singleBlockMEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
blockwiseMEs = moduleEigengenes(datExpr, bwModuleColors)$eigengenes
single2blockwise = match(names(singleBlockMEs), names(blockwiseMEs))
signif(diag(cor(blockwiseMEs[, single2blockwise], singleBlockMEs)), 3)
#=================================================================#
#
# 4.导入性状数据
#
#=================================================================#
traitData = read.csv("ClinicalTraits.csv")
dim(traitData)
names(traitData)
# 与表达数据进行校准后构建性状数据框
RiceSamples = rownames(datExpr)
traitRows = match(RiceSamples, allTraits$Sample)
datTraits = allTraits[traitRows, -1]
rownames(datTraits) = allTraits[traitRows, 1]
#=================================================================#
#
# 5.模块与性状的相关关系
#
#=================================================================#
# 量化Module-trait的关系
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
MEs0 = moduleEigengenes(datExpr,moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs,datTraits,use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTrsitCor,nSamples)
# 绘图
sizeGrWindow(10,6)
textMatrix = paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs), ySymbols = names(MEs), colorLabels = FALSE, colors = greenWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1), main = paste("Module-trait relationships"))
#=================================================================#
#
# 6.基因与模块、性状的相关关系
#
#=================================================================#
# 确定基因与模块的关系-MM
modNames = substring(names(MEs),3) # 提取模块数字标签
geneModuleMembership = as.data.frame(cor(datExpr,MEs,use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples)) # 通过计算每个Module的Eigengenes与基因的关系
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
# 提取某个特定的性状
Treatment = as.data.frame(datTraits$Treatment)
names(weight) = "Treatment"
# 确定基因与性状的关系-GS
geneTraitSignificance = as.data.frame(cor(datExpr, Treatment, use = "p"))
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(Treatment), sep="")
names(GSPvalue) = paste("p.GS.", names(Treatment), sep="")
#=================================================================#
#
# 7.获取与性状关联度大的特定模块中GS高且MM高的基因
#
#=================================================================#
# 绘制特定模块GS和MM热图
module = "saddlebrown"
column = match(module,modNames)
moduleGenes = moduleColors==module # 获取特定列的数据,只是一种数据提取方法而已
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = "Gene significance for body weight", main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
# 获取特定模块的特定基因
probes = names(datExpr)[moduleColors==module]
# 输出总体结果数据框
geneInfo0 = data.frame(substanceBXH = probes, geneSymbol = annot$gene_symbol[probes2annot], LocusLinkID = annot$LocusLinkID[probes2annot], moduleColor = moduleColors, geneTraitSignificance, GSPvalue)
# 根据与性状显著相关进行排序
modOrder = order(-abs(cor(MEs, weight, use = "p")))
# 添加模块关系信息
for (mod in 1:ncol(geneModuleMembership))
{ oldNames = names(geneInfo0) geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]], MMPvalue[, modOrder[mod]]); names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""), paste("p.MM.", modNames[modOrder[mod]], sep="")) }
# 第一根据颜色第二根据基因性状显著性进行排序
geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight))
geneInfo = geneInfo0[geneOrder, ]
#=================================================================#
#
# 8.进行后续分析GO功能注释等
#
#=================================================================#
# 选择感兴趣的Modules
intModules = c("brown", "red", "salmon")
for (module in intModules)
{
# Select module probes
modGenes = (moduleColors==module)
# Get their entrez ID codes
modLLIDs = allLLIDs[modGenes]
# Write them into a file
fileName = paste("LocusLinkIDs-", module, ".txt", sep="")
write.table(as.data.frame(modLLIDs), file = fileName, row.names = FALSE, col.names = FALSE)
}