WGCNA(Weighted Correlation Network Analysis)是一个基于基因表达网络权重构建,描述基因表达的关联模式的R包。简单来说,就是一种基于网络的计算基因间表达的相关性的方法,网络上关于该方法相关原理的解释有很多,本博客主要描述具体做法,不再对原理进行详细讲解。
提示:以下是本篇文章正文内容,下面案例可供参考
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("WGCNA")
#导入的时候可能存在部分依赖包没有导进来的情况,把这些包单独导进来就好
#如果因为网络原因一直无法成功下载,试试本地安装
代码如下(示例):
#数据提取#
GE<-read.table('TCGA-OV.htseq_fpkm.tsv',header=T,sep='\t',stringsAsFactors = F)#379个样本
sig_genes<-read.table('sig_genes.txt',header = T,stringsAsFactors = F)#11072个感兴趣的基因
gencode<-read.table('gencode.v22.annotation.gene.probeMap',header = T,sep = '\t',stringsAsFactors = F)
#先提取出感兴趣基因的表达
sigE<-merge(GE,sig_genes,by.x = 'Ensembl_ID',by.y = 'x')
#转换基因ID#
sigSymbolE<-merge(gencode[,1:2],sigE,by.x='id',by.y='Ensembl_ID')
sigSymbolE<-sigSymbolE[,-1]
#处理多个探针对应一个基因的情况#
probe2gene = aggregate(sigSymbolE[,-1], by=list(sigSymbolE[,"gene"]),mean)
write.table(probe2gene,'sig_genes_exp.txt',sep='\t',row.names = F,quote = F)#the genes that we are interested has been exacted sucessfully.
#the phenotype and survival data
survival<-read.table('TCGA-OV.survival.tsv',header=T,stringsAsFactors=F)
#phenotype<-read.table('TCGA-OV.GDC_phenotype.tsv',header=T,stringsAsFactors=F)#有问题,先放这儿
samples<-data.frame(colnames(probe2gene)[-1])
samples<-data.frame(apply(samples,1,function(x){re=gsub('.','-',x,fixed=T)}))
colnames(samples)<-'sample ID'
survival1<-merge(samples,survival,by.x='sample ID',by.y='sample',all.x=T)#只有378个患者有相应的生存数据,因此剔除表达谱中该患者的数据
survival1<-survival1[-9,]
index<-which(samples=='TCGA-04-1357-01A')
probe2gene<-probe2gene[,-(index+1)]
#更新输出
write.table(probe2gene,'sig_genes_exp.txt',sep='\t',row.names = F,quote = F)#the genes that we are interested has been exacted sucessfully.
save.image('first_part.Rdata')
代码如下(示例):
#主要分为对样本进行清洗,即剔除异常样本;对基因进行清洗,即去除无用的基因(在超过80%的样本中不表达的基因)
library(WGCNA)
library(flashClust)
library(iterators)
options(stringsAsFactors=F)
enableWGCNAThreads()
#数据清洗,去除低表达的基因和异常的样本
countExpr<-data.frame(apply(probe2gene[,-1],1,function(x) {re=sum(x==0)}))
colnames(countExpr)<-'count Expr==0'
rmORnot<-data.frame(apply(countExpr,1,function(x) {if(x>302){re=0}else{re=1}}))#1保留,0去掉
colnames(rmORnot)<-'remove gene or not'
femData<-probe2gene[which(rmORnot==1),]
colnames(femData)[1]<-'gene_id'
datExpr0 = as.data.frame(t(log2(femData[,-1]+1)))
names(datExpr0) = femData$gene_id #列标签添加基因名称
rownames(datExpr0) = names(femData[,-1]) #行标签添加为样本名称
gsg = goodSamplesGenes(datExpr0, verbose = 3) #检测缺失值
gsg$allOK #结果为TRUE,则所有选定基因都用于后续WGCNA
datExpr0=datExpr0[gsg$goodGenes] #如果gsg$allOK结果为FALSE,则后续选择好的基因用于WGCNA。
# if (!gsg$allOK)
# {
# # Optionally, print the gene and sample names that were removed:
# if (sum(!gsg$goodGenes)>0)
# printFlush(paste("Removing genes:", paste(names(datExpr)[!gsg$goodGenes], collapse = ", ")));
# if (sum(!gsg$goodSamples)>0)
# printFlush(paste("Removing samples:", paste(rownames(datExpr)[!gsg$goodSamples], collapse = ", ")));
# # Remove the offending genes and samples from the data:
# datExpr = datExpr[gsg$goodSamples, gsg$goodGenes]
# }
#之后对样本进行聚类看是否有异常的样本
sampleTree = hclust(dist(datExpr0), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(12,9)
#pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
# Plot a line to show the cut
abline(h = 35, col = "red");
# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 35, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#临床数据清洗
traitData = survival1[-which(survival1[,1]=='TCGA-61-2088-01A'),];
traitData[,1]<-gsub('-','.',traitData[,1],fixed=T)
colnames(traitData)[1]<-'sample'
dim(traitData)
names(traitData)
# remove columns that hold information we do not need.
allTraits = traitData[, -c(2:3)];
dim(allTraits)
names(allTraits)
# Form a data frame analogous to expression data that will hold the clinical traits.
samples = rownames(datExpr);
traitRows = match(samples, allTraits$sample);
datTraits = allTraits[traitRows,];
rownames(datTraits) = allTraits[traitRows, 1];
collectGarbage();
colnames(datTraits)[2]<-'OStime'
# Re-cluster samples
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits$OStime, signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")
save.image('second_part.Rdata')
#选择合适的软阈值β进行后续的基因共表达网络构建
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
#构建基因共表达网络:使用加权的表达相关性。
softPower = 5;
adjacency = adjacency(datExpr, power = softPower);
# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
sizeGrWindow(12,9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
#网络构建完成后即可基于网络进行自己想要的分析,如最常见的加入表型,分析与表型相关的基因模块
# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;
sizeGrWindow(12, 9)
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#dev.off()
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# Save module colors and labels for use in subsequent parts
save(MEs, moduleLabels, moduleColors, geneTree, file = "third_part.RData")
# Define numbers of genes and samples
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
# Define variable time containing the time column of datTrait
time = as.data.frame(datTraits$OStime);
names(time) = "time"
# names (colors) of the modules
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
geneTraitSignificance = as.data.frame(cor(datExpr, time, use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
names(geneTraitSignificance) = paste("GS.", names(time), sep="");
names(GSPvalue) = paste("p.GS.", names(time), sep="");
module = "yellow"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for body time",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
names(datExpr)
names(datExpr)[moduleColors=="yellow"]
annot = read.table('gencode.v22.annotation.gene.probeMap',header = T,sep = '\t',stringsAsFactors = F)
dim(annot)
names(annot)
probes = names(datExpr)
probes2annot = match(probes, annot$substanceBXH)
# The following is the number or probes without annotation:
sum(is.na(probes2annot))
# Should return 0.
# Create the starting data frame
geneInfo0 = data.frame(substanceBXH = probes,
geneSymbol = annot$gene_symbol[probes2annot],
LocusLinkID = annot$LocusLinkID[probes2annot],
moduleColor = moduleColors,
geneTraitSignificance,
GSPvalue)
# Order modules by their significance for time
modOrder = order(-abs(cor(MEs, time, use = "p")));
# Add module membership information in the chosen order
for (mod in 1:ncol(geneModuleMembership))
{
oldNames = names(geneInfo0)
geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]],
MMPvalue[, modOrder[mod]]);
names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""),
paste("p.MM.", modNames[modOrder[mod]], sep=""))
}
# Order the genes in the geneInfo variable first by module color, then by geneTraitSignificance
geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.time));
geneInfo = geneInfo0[geneOrder, ]
WGCNA方法总的来说就是先将基因划分成模块,即具有相似表达量的基因被划分到相同模块中,之后通过计算表型与各个模块的相关性,来寻找与我们感兴趣的表型相关的基因集。更加详细的说明参见官方文档:WGCNA官方说明