WGCNA

rm(list = ls())
library(WGCNA)
library(data.table)
library(stringr)
library(openxlsx)
allowWGCNAThreads()
####step1 数据预处理
if(T){
  #read expression data and pre-processing 
  exprset0 <- read.delim('LiverFemale3600.csv',header = T,sep = ',')
  exprset <- exprset0[,c(2,9:ncol(exprset0))]
  dim(exprset)
  exprset <- exprset[exprset$gene_symbol != 0,]
  dim(exprset)
  table(duplicated(exprset[,1]))
  exprset <- exprset[!duplicated(exprset$gene_symbol),]
  dim(exprset)
  table(duplicated(exprset[,1]))
  rownames(exprset) <- exprset[,1];exprset <- exprset[,-1]
  #read phenotype data and pre-processing 
  datTraits0 <- read.delim('ClinicalTraits.csv',header = T,sep = ',')
  datTraits <- datTraits0[,c(2,9:15)]
  datTraits <- datTraits[,c(1,4:8)]
  dim(datTraits)
  rownames(datTraits) <- datTraits[,1];datTraits <- datTraits[,-1]
  #match expression data and phenotype data
  sample <- colnames(exprset) #sample 是表达矩阵的样品名排序
  datTraits <- datTraits[rownames(datTraits) %in% sample,] #删除临床表型中多余的sample
  datTraits <- datTraits[match(sample,rownames(datTraits)),] #将数量匹配的sample进行排序
  identical(colnames(exprset),rownames(datTraits)) ##validation:数量及排序是否一致
  #转置expression 
  exprset <- t(exprset)
  save(exprset,datTraits,file = 'WGCNA_HUALIN_DATApreprocessing.RData')
}
#判断sample里面是否有离群值
if(F){
  #sample clust 其实我觉得这里需要放到最前面去判断样本是否有离群值
  #明确样本数和基因
  datExpr <- exprset
  nGenes = ncol(datExpr)
  nSamples = nrow(datExpr)
  #首先针对样本做个系统聚类
  datExpr_tree<-hclust(dist(datExpr), method = "average")
  sampleTree <-datExpr_tree
  # 画出样本聚类图(上)与样本性状热图(下):
  traitColors = numbers2colors(datTraits, signed = TRUE,centered=TRUE);
  plotDendroAndColors(sampleTree, 
                      traitColors, 
                      groupLabels = names(datTraits), 
                      rowTextAlignment = "right-justified",
                      addTextGuide = TRUE ,
                      hang = 0.03,
                      dendroLabels = NULL, # 是否显示树labels
                      addGuide = FALSE,  # 显示虚线
                      guideHang = 0.05,
                      main = "Sample dendrogram and trait heatmap") 
}

#####step2 选择一个β值建立临近矩阵根据连接度使我们的基因分布符合无尺度网络
if(T){
  powers = c(c(1:10), seq(from = 12, to=20, by=2))
  # 获得各个阈值下的 R方 和平均连接度
  sft = pickSoftThreshold(exprset, powerVector = powers, verbose = 5)
  # 作图:
  sizeGrWindow(9, 5) 
  #sizeGrWindow(width, height) size in inches
  #sizeGrWindow: Opens a graphics window with specified dimensions
  par(mfrow = c(1,2)); #par设置作图参数:tag = value
  cex1 = 0.9;
  # Scale-free topology fit index as a function of the soft-thresholding power
  plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
       xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
       main = paste("Scale independence"));
  text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
       labels=powers,cex=cex1,col="red");
  # this line corresponds to using an R^2 cut-off of h
  abline(h=0.90,col="red")
  # Mean connectivity as a function of the soft-thresholding power
  plot(sft$fitIndices[,1], sft$fitIndices[,5],
       xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
       main = paste("Mean connectivity"))
  text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
}

####step2.1 根据连接度和R^2判断softPower是否合理
if(F){
  ADJ1_cor <- abs(WGCNA::cor(exprset,use = "p" ))^softPower
  #ADJ1_cor是一个nGene*nGene的matrix,无向网络的边属性计算
  # 基因少(<5000)的时候使用下面的代码:
  k <- as.vector(apply(ADJ1_cor,2,sum,na.rm=T))#k表示把同一个基因与其他所有基因相关性的sum
  # 基因多的时候使用下面的代码:
  # k <- softConnectivity(datE=exprset,power=softPower) 
  sizeGrWindow(10, 5)
  par(mfrow=c(1,2))
  hist(k)
  scaleFreePlot(k,main="Check Scale free topology\n")
  ## figure legends :
  #图一可以看出只有较少的点具有较高的连接性,而大多数的点具有较低的连接性。
  #图二可以看出k与p(k)成负相关(相关性系数0.91,0.9以上最好),说明选择的β值是没有问题的,(相关系数r的绝对值一般在0.8以上,认为A和B有强的相关性。0.3到0.8之间,可以认为有弱的相关性。0.3以下,认为没有相关性)
  
}

#### step3 根据β值获得临近矩阵adjacency和拓扑矩阵TOM
####分步法构建网络
if(T){
  # 获得临近矩阵:
  adjacency = adjacency(exprset, power = softPower);
  # 将临近矩阵转为 Tom 矩阵
  TOM = TOMsimilarity(adjacency);
  # 计算基因之间的相异度dissTOM,然后根据dissTOM层次聚类得到geneTree,然后动态剪切法对树进行剪切成不同的模块
  dissTOM = 1-TOM
  geneTree = hclust(as.dist(dissTOM),method="average")
  sizeGrWindow(12,9)
  par(mfrow = c(2,1))
  plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
       labels = FALSE, hang = 0.04);
  # 使用动态剪切树挖掘模块:
  minModuleSize = 30;
  # 动态切割树:
  dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                              deepSplit = 2, pamRespectsDendro = FALSE,
                              minClusterSize = minModuleSize);
  table(dynamicMods)#这个时候还是数字代表module,下一步将数字转为color
  dynamicColors=labels2colors(dynamicMods)
  table(dynamicColors)
  sizeGrWindow(8,6)
  plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05,
                      main = "Gene dendrogram and module colors")
}

####step3.1 随机选择400个基因画拓扑重叠热图
if(F){
  # 拓扑热图:
  nSelect = 400 
  # For reproducibility, we set the random seed 
  set.seed(10); 
  nGenes <- ncol(exprset)
  select = sample(nGenes, size = nSelect); 
  selectTOM = dissTOM[select, select]; 
  # There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster. 
  selectTree = hclust(as.dist(selectTOM), method = "average") 
  selectColors = dynamicColors[select]; 
  # Open a graphical window 
  sizeGrWindow(9,9) 
  # Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing 
  # the color palette; setting the diagonal to NA also improves the clarity of the plot 
  plotDiss = selectTOM^softPower; 
  diag(plotDiss) = NA; 
  TOMplot(plotDiss, 
          selectTree, #selected geneTree
          selectColors, 
          main = "Network heatmap plot, selected genes") 
  
}

####step4 MEs and merged_MEs
if(T){
  MEList = moduleEigengenes(exprset, colors = dynamicColors)
  MEs = MEList$eigengenes ##MEs是一个列为模块特征基因,行为sample的dataframe
  # 计算根据模块特征向量基因计算模块相异度:
  MEDiss = 1-cor(MEs);
  # Cluster module eigengenes
  METree = hclust(as.dist(MEDiss), method = "average");
  # Plot the result
  plotEigengeneNetworks(MEs, 
                        "Eigengene adjacency heatmap", 
                        marHeatmap = c(3,4,2,2), 
                        plotDendrograms = T, 
                        xLabelsAngle = 90) 
  
  plot(METree, 
       main = "Clustering of module eigengenes",
       xlab = "", 
       sub = "")
  # 在聚类图中画出剪切线,红线以下的模块表示相关性>0.8,将被合并
  MEDissThres = 0.2
  abline(h=MEDissThres, col = "red")
  # 合并模块:
  merge_modules = mergeCloseModules(exprset, dynamicColors, cutHeight = MEDissThres, verbose = 3)
  # 合并后的颜色:
  mergedColors = merge_modules$colors;
  # 新模块的特征向量基因:
  mergedMEs = merge_modules$newMEs;
  plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                      c("Dynamic Tree Cut", "Merged dynamic"),
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05)
  
}


##step5 模块与样本性状相关性热图
if(T){
  mergedMEs[1:4,1:4];datTraits[1:4,1:4]
  moduleTraitCor_noFP <- cor(mergedMEs, datTraits, use = "p");
  moduleTraitPvalue_noFP = corPvalueStudent(moduleTraitCor_noFP, nSamples); 
  textMatrix_noFP <- paste(signif(moduleTraitCor_noFP, 2), "\n(", signif(moduleTraitPvalue_noFP, 1), ")", sep = ""); #signif保留几位有效的digital 
  sizeGrWindow(12,8)
  par(mar = c(5, 8.5, 3, 3)); 
  labeledHeatmap(Matrix = moduleTraitCor_noFP, 
                 xLabels = names(datTraits), 
                 yLabels = names(mergedMEs), 
                 ySymbols = names(mergedMEs), 
                 colorLabels = FALSE, 
                 colors = blueWhiteRed(50), 
                 textMatrix = textMatrix_noFP,
                 setStdMargins = FALSE, 
                 cex.text = 0.65, 
                 zlim = c(-1,1), 
                 main = paste("Module-trait relationships")) 
}

##step5.1 单个形状进行画GS
if(F){
    ph <- 'weight_g'
    #Gene significance and module significance
    #geneSignificance表示基因的表达量与临床形状Pearson相关系数
    GS1 <- as.numeric(WGCNA::cor(datTraits[,ph],exprset,use="p",method="pearson"))
    # 显著性是绝对值:
    GeneSignificance <- abs(GS1)
    # 获得该性状在每个模块中的显著性:
    ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
    which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
    plotModuleSignificance(GeneSignificance,mergedColors)
  ##批量出图《代码不完美》
  if(F){for (i in 1:ncol(datTraits)){
    file = paste('GS',colnames(datTraits)[i],sep = '_')
    tiff(filename = paste(file,'.tiff',sep = ''))
    GS1 <- as.numeric(WGCNA::cor(datTraits[,i],exprset,use="p",method="pearson"))
    # 显著性是绝对值:
    GeneSignificance <- abs(GS1)
    # 获得该性状在每个模块中的显著性:
    ModuleSignificance <- tapply(GeneSignificance,mergedColors,mean,na.rm=T)
    #MS_max = which.max(ModuleSignificance[names(ModuleSignificance != 'MEgrey')])
    plotModuleSignificance(GeneSignificance,mergedColors)
    dev.off()
  }}
}

#####step6 挑选自己感兴趣的临床表型画:内部连接度VS MM
if(T){
  ph <- 'weight_g'
  cor_ADR <- signif(WGCNA::cor(datTraits,mergedMEs,use="p",method="pearson"),5)
  p.values <- corPvalueStudent(cor_ADR,nSamples=nrow(datTraits))
  Max_cor <- which.max(abs(cor_ADR[ph,-which(colnames(cor_ADR) == "MEgrey")])) # which 返回满足条件(true值)的下标
  Min_p_value <- which.min(p.values[ph,-which(colnames(p.values) == "MEgrey")])
  Max_cor == Min_p_value
  #寻找与该性状相关的枢纽基因(hub genes)
  #计算基因的inter connectivity和module membership,内部连接度衡量的是基因在模块内部的地位,而模块身份表明基因属于哪个模块。
  # 计算每个基因模块内部连接度,也就是基因直接两两加权相关性。
  load('softPower.RData')
  ADJ1=abs(cor(exprset,use="p"))^softPower 
  # 根据上面结果和基因所属模块信息获得连接度:
  # 整体连接度 kTotal,模块内部连接度:kWithin,kOut=kTotal-kWithin, kDiff=kIn-kOut=2*kIN-kTotal 
  colorh1=mergedColors ###教程中没有给出这个color的赋值,但是通过帮助文档可以知道:
  #color: module labels. A vector of length ncol(ADJ1) giving a module label for each gene (node) of the network.
  Alldegrees1=intramodularConnectivity(ADJ1, colorh1) 
  
  # 注意模块内基于特征向量基因连接度评估模块内其他基因: de ne a module eigengene-based connectivity measure for each gene as the correlation between a the gene expression and the module eigengene
  # 如 brown 模块内:kM Ebrown(i) = cor(xi, MEbrown) , xi is the gene expression pro le of gene i and M Ebrown is the module eigengene of the brown module
  # 而 module membership 与内部连接度不同。MM 衡量了基因在全局网络中的位置。
  datKME=signedKME(exprset, mergedMEs, outputColumnName="MM.")
  
  which.color=substring(names(Max_cor == Min_p_value),3); 
  which.color ##which.color就是返回感兴趣的形状所对应的最有意义的module颜色
  restrictGenes=colorh1==which.color 
  # pdf(file = 'MM VS connec.pdf')
  verboseScatterplot(Alldegrees1$kWithin[ restrictGenes], 
                     (datKME[restrictGenes, paste("MM.", which.color, sep="")])^4, 
                     col=which.color, 
                     xlab="Intramodular Connectivity", 
                     ylab="(Module Membership)^4")
  # dev.off()
  
}

####step7 挑选module中的hub gene
if(T){
  ph <- 'weight_g'
  GS_spe=as.numeric(cor(datTraits[,ph],exprset, use="p")) #选择的样品性状datTraits$weight_g与基因之间的相关性
  GeneSignificance_spe <- abs(GS_spe)
  #基于显著性和MM计算每个基因与 指定trait 的关联,结果包括p, q, cor, z, 
  NS1=networkScreening(y=datTraits[,ph], 
                       datME=mergedMEs, 
                       datExpr=exprset, 
                       oddPower=3, 
                       blockSize=1000, 
                       minimumSampleSize=4, 
                       addMEy=TRUE, 
                       removeDiag=FALSE, 
                       weightESy=0.5) 
  rownames(NS1) <- colnames(exprset)
  
  #根据基因与指定性状的直接相关性(biserial.cor),模块身份,和加权相关性筛选基因:
  #网上的教程推荐使用的是使用基因与临床形状的直接相关性函数:biserial.cor;但是我这里用的是Pearson相关系数算的
  FilterGenes_spe = ((GeneSignificance_spe > 0.2) & (abs(datKME[paste("MM.",which.color,sep="")])>0.8) & (NS1$q.Weighted < 0.01) ) 
  table(FilterGenes_spe)
  # 找到满足上面条件的基因:
  trait_hubGenes_spe <- colnames(exprset)[FilterGenes_spe] 
  # hub 基因热图:
  plotNetworkHeatmap(exprset,
                     plotGenes = trait_hubGenes_spe,
                     networkType = "unsigned",
                     useTOM = TRUE,
                     power=softPower,
                     main="unsigned correlations")
}

####step8 enrichment analysis 
if(T){
  #hub genes GO and KEGG analysis.
  library(clusterProfiler)
  library(org.Hs.eg.db)
  library(ggplot2)
  # GO 分析:
  ego <- enrichGO(gene          = trait_hubGenes_spe,
                  # universe      = names(geneList),  #background genes. If missing, the all genes listed in the database (eg TERM2GENE table) will be used as background.
                  OrgDb         = org.Hs.eg.db,
                  ont           = "BP",
                  pAdjustMethod = "BH",
                  pvalueCutoff  = 0.01,
                  qvalueCutoff  = 0.05,
                  readable      = TRUE)
  
  GO_BP <- as.data.frame(ego)
  GO_BP$point_shape<-"0"
  GO_BP$point_size<-"15"
  # write.xlsx(GO_BP,"./results/392_genes_GO_BP.xlsx")
  
  ggplot(data=GO_BP)+
    geom_bar(aes(x=reorder(Description,Count),y=Count, fill=-log10(qvalue)), stat='identity') + 
    coord_flip() +
    scale_fill_gradient(expression(-log["10"]("q value")),low="red", high = "blue") +
    xlab("") +
    ylab("Gene count") +
    scale_y_continuous(expand=c(0, 0))+
    theme_bw()+
    theme(
      axis.text.x=element_text(color="black",size=rel(1.5)),
      axis.text.y=element_text(color="black", size=rel(1.6)),
      axis.title.x = element_text(color="black", size=rel(1.6)),
      legend.text=element_text(color="black",size=rel(1.0)),
      legend.title = element_text(color="black",size=rel(1.1))
      # legend.position=c(0,1),legend.justification=c(-1,0)
      # legend.position="top",
    )
  
  # KEGG:
  kk <- enrichKEGG(gene         = trait_hubGenes_spe,
                   organism     = 'hsa',
                   pvalueCutoff = 0.05)
  kegg_DF <- as.data.frame(kk)
  
}

####step9 export
if(T){
  # 导出整个模块基因到 VisANT 
  modTOM <- TOM[mergedColors==which.color,mergedColors==which.color]
  dimnames(modTOM) = list(colnames(exprset)[mergedColors==which.color], colnames(exprset)[which.color]) 
  vis = exportNetworkToVisANT(modTOM, 
                              file = paste("./WGCNA/ADR_drug_exp_new/VisANTInput-Mod-", which.color, ".txt", sep=""),
                              weighted = TRUE, 
                              threshold = 0
                              # probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) 
  ) 
  
  # 导出枢纽基因到 Cytoscape 
  hubGene_TOM <- TOM[FilterGenes_spe,FilterGenes_spe]
  dimnames(hubGene_TOM) = list(colnames(exprset)[FilterGenes_spe], colnames(exprset)[FilterGenes_spe]) 
  cyt = exportNetworkToCytoscape(hubGene_TOM, 
                                 edgeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-edges-", paste(which.color, collapse="-"), ".txt", sep=""), 
                                 nodeFile = paste("./WGCNA/ADR_drug_exp_new/CytoscapeInput-nodes-", paste(which.color, collapse="-"), ".txt", sep=""), 
                                 weighted = TRUE, 
                                 threshold = 0.02, 
                                 nodeNames = trait_hubGenes_spe, 
                                 altNodeNames = trait_hubGenes_spe, 
                                 nodeAttr = mergedColors[FilterGenes_spe]
  )
  
}

你可能感兴趣的:(WGCNA)