rm(list = ls())
library(WGCNA)
library(data.table)
library(stringr)
library(openxlsx)
allowWGCNAThreads()
####step1 数据预处理
if(T){
#read expression data and pre-processing
exprset0 <- read.delim('LiverFemale3600.csv',header = T,sep = ',')
exprset <- exprset0[,c(2,9:ncol(exprset0))]
dim(exprset)
exprset <- exprset[exprset$gene_symbol != 0,]
dim(exprset)
table(duplicated(exprset[,1]))
exprset <- exprset[!duplicated(exprset$gene_symbol),]
dim(exprset)
table(duplicated(exprset[,1]))
rownames(exprset) <- exprset[,1];exprset <- exprset[,-1]
#read phenotype data and pre-processing
datTraits0 <- read.delim('ClinicalTraits.csv',header = T,sep = ',')
datTraits <- datTraits0[,c(2,9:15)]
datTraits <- datTraits[,c(1,4:8)]
dim(datTraits)
rownames(datTraits) <- datTraits[,1];datTraits <- datTraits[,-1]
#match expression data and phenotype data
sample <- colnames(exprset) #sample 是表达矩阵的样品名排序
datTraits <- datTraits[rownames(datTraits) %in% sample,] #删除临床表型中多余的sample
datTraits <- datTraits[match(sample,rownames(datTraits)),] #将数量匹配的sample进行排序
identical(colnames(exprset),rownames(datTraits)) ##validation:数量及排序是否一致
#转置expression
exprset <- t(exprset)
save(exprset,datTraits,file = 'WGCNA_HUALIN_DATApreprocessing.RData')
}
#判断sample里面是否有离群值
if(F){
#sample clust 其实我觉得这里需要放到最前面去判断样本是否有离群值
#明确样本数和基因
datExpr <- exprset
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#首先针对样本做个系统聚类
datExpr_tree<-hclust(dist(datExpr), method = "average")
sampleTree <-datExpr_tree
# 画出样本聚类图(上)与样本性状热图(下):
traitColors = numbers2colors(datTraits, signed = TRUE,centered=TRUE);
plotDendroAndColors(sampleTree,
traitColors,
groupLabels = names(datTraits),
rowTextAlignment = "right-justified",
addTextGuide = TRUE ,
hang = 0.03,
dendroLabels = NULL, # 是否显示树labels
addGuide = FALSE, # 显示虚线
guideHang = 0.05,
main = "Sample dendrogram and trait heatmap")
}
#####step2 选择一个β值建立临近矩阵根据连接度使我们的基因分布符合无尺度网络
if(T){
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# 获得各个阈值下的 R方 和平均连接度
sft = pickSoftThreshold(exprset, powerVector = powers, verbose = 5)
# 作图:
sizeGrWindow(9, 5)
#sizeGrWindow(width, height) size in inches
#sizeGrWindow: Opens a graphics window with specified dimensions
par(mfrow = c(1,2)); #par设置作图参数:tag = value
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
}
####step2.1 根据连接度和R^2判断softPower是否合理
if(F){
ADJ1_cor <- abs(WGCNA::cor(exprset,use = "p" ))^softPower
#ADJ1_cor是一个nGene*nGene的matrix,无向网络的边属性计算
# 基因少(<5000)的时候使用下面的代码:
k <- as.vector(apply(ADJ1_cor,2,sum,na.rm=T))#k表示把同一个基因与其他所有基因相关性的sum
# 基因多的时候使用下面的代码:
# k <- softConnectivity(datE=exprset,power=softPower)
sizeGrWindow(10, 5)
par(mfrow=c(1,2))
hist(k)
scaleFreePlot(k,main="Check Scale free topology\n")
## figure legends :
#图一可以看出只有较少的点具有较高的连接性,而大多数的点具有较低的连接性。
#图二可以看出k与p(k)成负相关(相关性系数0.91,0.9以上最好),说明选择的β值是没有问题的,(相关系数r的绝对值一般在0.8以上,认为A和B有强的相关性。0.3到0.8之间,可以认为有弱的相关性。0.3以下,认为没有相关性)
}
#### step3 根据β值获得临近矩阵adjacency和拓扑矩阵TOM
####分步法构建网络
if(T){
# 获得临近矩阵:
adjacency = adjacency(exprset, power = softPower);
# 将临近矩阵转为 Tom 矩阵
TOM = TOMsimilarity(adjacency);
# 计算基因之间的相异度dissTOM,然后根据dissTOM层次聚类得到geneTree,然后动态剪切法对树进行剪切成不同的模块
dissTOM = 1-TOM
geneTree = hclust(as.dist(dissTOM),method="average")
sizeGrWindow(12,9)
par(mfrow = c(2,1))
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# 使用动态剪切树挖掘模块:
minModuleSize = 30;
# 动态切割树:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)#这个时候还是数字代表module,下一步将数字转为color
dynamicColors=labels2colors(dynamicMods)
table(dynamicColors)
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
}
####step3.1 随机选择400个基因画拓扑重叠热图
if(F){
# 拓扑热图:
nSelect = 400
# For reproducibility, we set the random seed
set.seed(10);
nGenes <- ncol(exprset)
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = dynamicColors[select];
# Open a graphical window
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^softPower;
diag(plotDiss) = NA;
TOMplot(plotDiss,
selectTree, #selected geneTree
selectColors,
main = "Network heatmap plot, selected genes")
}
####step4 MEs and merged_MEs
if(T){
MEList = moduleEigengenes(exprset, colors = dynamicColors)
MEs = MEList$eigengenes ##MEs是一个列为模块特征基因,行为sample的dataframe
# 计算根据模块特征向量基因计算模块相异度:
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
plotEigengeneNetworks(MEs,
"Eigengene adjacency heatmap",
marHeatmap = c(3,4,2,2),
plotDendrograms = T,
xLabelsAngle = 90)
plot(METree,
main = "Clustering of module eigengenes",
xlab = "",
sub = "")
# 在聚类图中画出剪切线,红线以下的模块表示相关性>0.8,将被合并
MEDissThres = 0.2
abline(h=MEDissThres, col = "red")
# 合并模块:
merge_modules = mergeCloseModules(exprset, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 合并后的颜色:
mergedColors = merge_modules$colors;
# 新模块的特征向量基因:
mergedMEs = merge_modules$newMEs;
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
}
##step5 模块与样本性状相关性热图
if(T){
mergedMEs[1:4,1:4];datTraits[1:4,1:4]
moduleTraitCor_noFP <- cor(mergedMEs, datTraits, use = "p");
moduleTraitPvalue_noFP = corPvalueStudent(moduleTraitCor_noFP, nSamples);
textMatrix_noFP <- paste(signif(moduleTraitCor_noFP, 2), "\n(", signif(moduleTraitPvalue_noFP, 1), ")", sep = ""); #signif保留几位有效的digital
sizeGrWindow(12,8)
par(mar = c(5, 8.5, 3, 3));
labeledHeatmap(Matrix = moduleTraitCor_noFP,
xLabels = names(datTraits),
yLabels = names(mergedMEs),
ySymbols = names(mergedMEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix_noFP,
setStdMargins = FALSE,
cex.text = 0.65,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
}
##step5.1 单个形状进行画GS
if(F){
ph <- 'weight_g'
#Gene significance and module significance
#geneSignificance表示基因的表达量与临床形状Pearson相关系数
GS1 <- as.numeric(WGCNA::cor(datTraits[,ph],exprset,use="p",method="pearson"))
# 显著性是绝对值:
GeneSignificance <- abs(GS1)
# 获得该性状在每个模块中的显著性:
ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
plotModuleSignificance(GeneSignificance,mergedColors)
##批量出图《代码不完美》
if(F){for (i in 1:ncol(datTraits)){
file = paste('GS',colnames(datTraits)[i],sep = '_')
tiff(filename = paste(file,'.tiff',sep = ''))
GS1 <- as.numeric(WGCNA::cor(datTraits[,i],exprset,use="p",method="pearson"))
# 显著性是绝对值:
GeneSignificance <- abs(GS1)
# 获得该性状在每个模块中的显著性:
ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
#MS_max = which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
plotModuleSignificance(GeneSignificance,mergedColors)
dev.off()
}}
}
#####step6 挑选自己感兴趣的临床表型画:内部连接度VS MM
if(T){
ph <- 'weight_g'
cor_ADR <- signif(WGCNA::cor(datTraits,mergedMEs,use="p",method="pearson"),5)
p.values <- corPvalueStudent(cor_ADR,nSamples=nrow(datTraits))
Max_cor <- which.max(abs(cor_ADR[ph,-which(colnames(cor_ADR) == "MEgrey")])) # which 返回满足条件(true值)的下标
Min_p_value <- which.min(p.values[ph,-which(colnames(p.values) == "MEgrey")])
Max_cor == Min_p_value
#寻找与该性状相关的枢纽基因(hub genes)
#计算基因的inter connectivity和module membership,内部连接度衡量的是基因在模块内部的地位,而模块身份表明基因属于哪个模块。
# 计算每个基因模块内部连接度,也就是基因直接两两加权相关性。
load('softPower.RData')
ADJ1=abs(cor(exprset,use="p"))^softPower
# 根据上面结果和基因所属模块信息获得连接度:
# 整体连接度 kTotal,模块内部连接度:kWithin,kOut=kTotal-kWithin, kDiff=kIn-kOut=2*kIN-kTotal
colorh1=mergedColors ###教程中没有给出这个color的赋值,但是通过帮助文档可以知道:
#color: module labels. A vector of length ncol(ADJ1) giving a module label for each gene (node) of the network.
Alldegrees1=intramodularConnectivity(ADJ1, colorh1)
# 注意模块内基于特征向量基因连接度评估模块内其他基因: de ne a module eigengene-based connectivity measure for each gene as the correlation between a the gene expression and the module eigengene
# 如 brown 模块内:kM Ebrown(i) = cor(xi, MEbrown) , xi is the gene expression pro le of gene i and M Ebrown is the module eigengene of the brown module
# 而 module membership 与内部连接度不同。MM 衡量了基因在全局网络中的位置。
datKME=signedKME(exprset, mergedMEs, outputColumnName="MM.")
which.color=substring(names(Max_cor == Min_p_value),3);
which.color ##which.color就是返回感兴趣的形状所对应的最有意义的module颜色
restrictGenes=colorh1==which.color
# pdf(file = 'MM VS connec.pdf')
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^4,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^4")
# dev.off()
}
####step7 挑选module中的hub gene
if(T){
ph <- 'weight_g'
GS_spe=as.numeric(cor(datTraits[,ph],exprset, use="p")) #选择的样品性状datTraits$weight_g与基因之间的相关性
GeneSignificance_spe <- abs(GS_spe)
#基于显著性和MM计算每个基因与 指定trait 的关联,结果包括p, q, cor, z,
NS1=networkScreening(y=datTraits[,ph],
datME=mergedMEs,
datExpr=exprset,
oddPower=3,
blockSize=1000,
minimumSampleSize=4,
addMEy=TRUE,
removeDiag=FALSE,
weightESy=0.5)
rownames(NS1) <- colnames(exprset)
#根据基因与指定性状的直接相关性(biserial.cor),模块身份,和加权相关性筛选基因:
#网上的教程推荐使用的是使用基因与临床形状的直接相关性函数:biserial.cor;但是我这里用的是Pearson相关系数算的
FilterGenes_spe = ((GeneSignificance_spe > 0.2) & (abs(datKME[paste("MM.",which.color,sep="")])>0.8) & (NS1$q.Weighted < 0.01) )
table(FilterGenes_spe)
# 找到满足上面条件的基因:
trait_hubGenes_spe <- colnames(exprset)[FilterGenes_spe]
# hub 基因热图:
plotNetworkHeatmap(exprset,
plotGenes = trait_hubGenes_spe,
networkType = "unsigned",
useTOM = TRUE,
power=softPower,
main="unsigned correlations")
}
####step8 enrichment analysis
if(T){
#hub genes GO and KEGG analysis.
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
# GO 分析:
ego <- enrichGO(gene = trait_hubGenes_spe,
# universe = names(geneList), #background genes. If missing, the all genes listed in the database (eg TERM2GENE table) will be used as background.
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE)
GO_BP <- as.data.frame(ego)
GO_BP$point_shape<-"0"
GO_BP$point_size<-"15"
# write.xlsx(GO_BP,"./results/392_genes_GO_BP.xlsx")
ggplot(data=GO_BP)+
geom_bar(aes(x=reorder(Description,Count),y=Count, fill=-log10(qvalue)), stat='identity') +
coord_flip() +
scale_fill_gradient(expression(-log["10"]("q value")),low="red", high = "blue") +
xlab("") +
ylab("Gene count") +
scale_y_continuous(expand=c(0, 0))+
theme_bw()+
theme(
axis.text.x=element_text(color="black",size=rel(1.5)),
axis.text.y=element_text(color="black", size=rel(1.6)),
axis.title.x = element_text(color="black", size=rel(1.6)),
legend.text=element_text(color="black",size=rel(1.0)),
legend.title = element_text(color="black",size=rel(1.1))
# legend.position=c(0,1),legend.justification=c(-1,0)
# legend.position="top",
)
# KEGG:
kk <- enrichKEGG(gene = trait_hubGenes_spe,
organism = 'hsa',
pvalueCutoff = 0.05)
kegg_DF <- as.data.frame(kk)
}
####step9 export
if(T){
# 导出整个模块基因到 VisANT
modTOM <- TOM[mergedColors==which.color,mergedColors==which.color]
dimnames(modTOM) = list(colnames(exprset)[mergedColors==which.color], colnames(exprset)[which.color])
vis = exportNetworkToVisANT(modTOM,
file = paste("./WGCNA/ADR_drug_exp_new/VisANTInput-Mod-", which.color, ".txt", sep=""),
weighted = TRUE,
threshold = 0
# probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol)
)
# 导出枢纽基因到 Cytoscape
hubGene_TOM <- TOM[FilterGenes_spe,FilterGenes_spe]
dimnames(hubGene_TOM) = list(colnames(exprset)[FilterGenes_spe], colnames(exprset)[FilterGenes_spe])
cyt = exportNetworkToCytoscape(hubGene_TOM,
edgeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-edges-", paste(which.color, collapse="-"), ".txt", sep=""),
nodeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-nodes-", paste(which.color, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = trait_hubGenes_spe,
altNodeNames = trait_hubGenes_spe,
nodeAttr = mergedColors[FilterGenes_spe]
)
}