1.安装
BiocManager::install("WGCNA")
library(WGCNA)
载入WGCNA包时会发现部分包没有安装需要手动安装
BiocManager::install(c("AnnotationDbi", "impute","GO.db", "preprocessCore"))
打开多线程
allowWGCNAThreads()#允许R语言程序最大线程运行
enableWGCNAThreads()# 打开多线程
2.导入数据
myfiles <- list.files(pattern = "*FPKM.csv")
myfiles
resdata<-read.table(myfiles[1],sep=',',header=T,row.names=1)
2.1 矩阵转置
Expr <-as.data.frame(t(resdata[,10:ncol(resdata)]))#
names(Expr) = resdata[,1]
2.2 查看
dim(Expr)
Expr[1:10,1:8]#查看前十行的前八列
3 检查离群样本
sampleTree = hclust(dist(Expr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
4 选择合适的软阈值
powers = c(c(1:10), seq(from = 12, to=30, by=2))
powers
#[1] 1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 22 24 26 28 30
sft = pickSoftThreshold(Expr, powerVector = powers,networkType = "signed hybrid", verbose = 5)
#pickSoftThreshold: will use block size 2776.
# pickSoftThreshold: calculating connectivity for given powers...
#..working on genes 1 through 2776 of 16116
#..working on genes 2777 through 5552 of 16116
#..working on genes 5553 through 8328 of 16116
#..working on genes 8329 through 11104 of 16116
#..working on genes 11105 through 13880 of 16116
#..working on genes 13881 through 16116 of 16116
# Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
#1 1 0.0314 0.422 0.833 1710.000 1.71e+03 2890.0
#2 2 0.2030 -0.675 0.759 597.000 5.70e+02 1380.0
#3 3 0.5500 -1.060 0.870 266.000 2.31e+02 814.0
#4 4 0.7340 -1.310 0.943 138.000 1.06e+02 557.0
#5 5 0.7990 -1.520 0.964 79.200 5.26e+01 422.0
#6 6 0.8590 -1.610 0.982 49.100 2.76e+01 334.0
#7 7 0.9050 -1.630 0.991 32.300 1.53e+01 274.0
#8 8 0.9310 -1.630 0.989 22.300 8.83e+00 230.0
#9 9 0.9480 -1.630 0.988 16.000 5.22e+00 200.0
#10 10 0.9560 -1.620 0.984 11.800 3.18e+00 176.0
#11 12 0.9780 -1.570 0.986 6.980 1.25e+00 141.0
#12 14 0.9850 -1.520 0.986 4.480 5.31e-01 118.0
#13 16 0.9880 -1.470 0.988 3.060 2.37e-01 100.0
#14 18 0.9900 -1.420 0.989 2.200 1.10e-01 87.3
#15 20 0.9930 -1.380 0.991 1.640 5.32e-02 77.1
#16 22 0.9960 -1.340 0.995 1.270 2.64e-02 68.8
#17 24 0.9940 -1.310 0.992 1.010 1.35e-02 62.1
#18 26 0.9940 -1.280 0.993 0.819 7.03e-03 56.7
#19 28 0.9900 -1.260 0.988 0.678 3.73e-03 52.3
#20 30 0.9950 -1.240 0.994 0.569 2.00e-03 48.5
sft为一个列表
SFT.R.sq为R2
也可以函数估计软阈值
power = sft$powerEstimate
power
#[1] 6
4.1 绘制选择软阈值的参考图
par(mfrow = c(1,2))
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
abline(h=0.85,col="red")
sft$fitIndices[,1]为Power这一列,sign函数如果为正数,返回1,如果为负数,返回-1,如果为0,返回0。
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
cex=cex1, col="red")
左图为R2随候选β值变化趋势图,右图为每个候选β值的平均连接度。
5. 加权基因共表达网络的构建
net = blockwiseModules(Expr, power = power, maxBlockSize = 5000,
TOMType = "signed", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.1,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs=TRUE, corType = "pearson",
loadTOMs=TRUE,saveTOMFileBase = "data.tom",
verbose = 3)
mergeCutHeight越大,模块个数越少。
6. 模块划分结果的可视化
#查看总共有多少模块
0
7. 导出网络用于cytoscape 可视化及特征分析
TOM = TOMsimilarityFromExpr(Expr, power = power)
cyt = exportNetworkToCytoscape(TOM,
edgeFile = paste("edges.txt", sep=""),
nodeFile = paste("nodes.txt", sep=""),
weighted = TRUE, threshold = 0.5,
nodeNames = probes, nodeAttr = moduleColors)
8. 绘制模块之间相关性图
library(stringr)
MEs = net$MEs
colnames(MEs_col) = paste0("ME", labels2colors(
as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
# 根据基因间表达量进行聚类所得到的各模块间的相关性图
# marDendro/marHeatmap 设置下、左、上、右的边距
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2), plotDendrograms = T,
xLabelsAngle = 90)
9.模块与表型相关性
library(stringr)
nSamples = nrow(Expr)
subtype<-c(rep('A',12),rep('B',14),rep('C',15),rep('D',16))
row.names(Expr)
datTraits = data.frame(gsm=row.names(Expr),
subtype)
rownames(datTraits)=datTraits[,1]
head(datTraits)
#确定临床表型与样本名字
sampleNames = rownames(Expr);
traitRows = match(sampleNames, datTraits$gsm)
rownames(datTraits) = datTraits[traitRows, 1]
design=model.matrix(~0+ datTraits$subtype)
colnames(design)=levels(datTraits$subtype)
##另一种方法
moduleLabelsAutomatic = net$colors
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
moduleColorsWW = moduleColorsAutomatic
MEs0 = moduleEigengenes(Expr, moduleColorsWW)$eigengenes
MEsWW = orderMEs(MEs0)
modTraitCor = cor(MEsWW, design, use = "p")
colnames(MEsWW)
modlues=MEsWW
modTraitP = corPvalueStudent(modTraitCor, nSamples)
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
labeledHeatmap(Matrix = modTraitCor,
xLabels = colnames(design),
yLabels = names(MEsWW),
cex.lab = 0.5,
yColorWidth=0.01,
xColorWidth = 0.01,
ySymbols = colnames(modlues),
colorLabels = FALSE, colors = blueWhiteRed(50),
textMatrix = textMatrix, setStdMargins = FALSE,
cex.text = 0.3, zlim = c(-1,1),
main = paste("Module-trait relationships")
library(stringr)
MEs = net$MEs
colnames(MEs) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs = orderMEs(MEs)
nSamples = nrow(Exprdata)
NW<-c(rep(1,14),rep(0,43))
OB<-c(rep(0,14),rep(1,12),rep(0,31))
NAFL<-c(rep(0,26),rep(1,15),rep(0,16))
NASH<-c(rep(0,41),rep(1,16))
traitData<-data.frame(NW,OB,NAFL,NASH)
#nrow(Exprdata)
row.names(traitData)<-rownames(Exprdata)
head(traitData)
#traitData <- read.table("trait.txt", sep='\t', header=T, row.names=1,check.names=FALSE, comment='',quote="",skipNul=TRUE)
sampleName = rownames(Exprdata)
label = colnames(traitData)
traits = traitData[match(sampleName, rownames(traitData)), ]
modTraitCor = cor(MEs, traits, use = "p")
modTraitP = corPvalueStudent(modTraitCor, nSamples)
textMatrix = paste(signif(modTraitCor, 3), "\n(", signif(modTraitP, 1), ")", sep = "")
#dim(textMatrix) = dim(modTraitCor)
#par(mar = c(6, 8.5, 3, 3))
sizeGrWindow(18,6)
par(mar = c(2, 6, 2, 1))
labeledHeatmap(Matrix = modTraitCor,
xLabels = label,
yLabels = names(MEs),
cex.lab = 0.5,
ySymbols = colnames(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.28,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
cex.lab可以更改X轴Y轴label字体的大小,cex.text可以更改热图中字体的大小,colors可以改变颜色。
10.提取指定模块的基因名
# Recalculate topological overlap
TOM = TOMsimilarityFromExpr(Expr, power = 6);
# Select module
module = "brown";
module = "blue";
module = "yellow";
probes = colnames(Expr) ## 我们例子里面的probe就是基因名
inModule = (moduleColors==module);
modProbes = probes[inModule];
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
cyt = exportNetworkToCytoscape(
modTOM,
edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = modProbes,
nodeAttr = moduleColors[inModule]
)
11 计算模块与基因的相关性矩阵
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
## 算出每个模块跟基因的皮尔森相关系数矩阵
## MEs是每个模块在每个样本里面的值
## datExpr是每个基因在每个样本的表达量
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
23