WGCNA

WGCNA(加权基因共表达网络分析) -

WGCNA分析,简单全面的最新教程 -

http://pages.stat.wisc.edu/~yandell/statgen/ucla/WGCNA/wgcna.html

STEP6:WGCNA相关性分析 -

学习WGCNA总结 | Public Library of Bioinformatics



模块数,54个模块,0为不包括在模块内的基因


WGCNA_第1张图片
模块对应的颜色及基因数

rm(list = ls())#清空环境变量

getwd()

setwd("D:/AAA-台式机-LY-WorkSpace/data-wrangling-with-R/")

library(WGCNA)

library(reshape2)

library(stringr)

options(stringsAsFactors = F)

#打开多路线程

allowWGCNAThreads()#enableWGCNAThreads()

mycount <- read.csv("D:/AAA-台式机-LY-WorkSpace/data-wrangling-with-R/htseq-count-readcount - 无注释.csv", row.names=1, header=T, check.names=F)

dim(mycount)

data <- log2(mycount+1)

##筛选方差前25%的基因,上一步是为了减少运算量,因为一个测序数据可能会有好几万个探针,

#而可能其中很多基因在各个样本中的表达情况并没有什么太大变化,为了减少运算量,这里我们筛选方差前25%的基因。

m.vars=apply(data,1,var)

data.upper <- data[which(m.vars > quantile(m.vars, probs = seq(0,1,0.25))[4]),]

datExpr <- as.data.frame(t(data.upper))#行和列转换位置

nGenes <- ncol(datExpr)

nSamples <- nrow(datExpr)

##样本聚类检查离群值##

gsg <- goodSamplesGenes(datExpr, verbose = 3)

gsg$allOK

sampleTree <- hclust(dist(datExpr), method = "average")

plot(sampleTree, main = "sample clustering to detect outliers", sub = "", xlab = "")

save(datExpr, file = "readcount-01-dataInput.RData")

##软阈值筛选##

# 横轴是Soft threshold (power),纵轴是无标度网络的评估参数,数值越高,

# 网络越符合无标度特征 (non-scale)

powers <- c(seq(1,10,by=1), seq(12,20,by=2))

sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)

par(mfrow=c(1,2))

cex1 = 0.9

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],

    xlab = "Soft Threshold (power)", ylab = "Scale Free Topology Model Fit,signed R^2", type = "n",

    main = paste("scale independence"))

text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],

    labels = powers, cex = cex1, col = "red")

abline(h=0.90, col="red")

plot(sft$fitIndices[,1], sft$fitIndices[,5],

    xlab = "Soft Threshold (power)", ylab = "Mean Connectivity", type = "n",

    main = paste("Mean Connectivity"))

text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, cex = cex1, col = "red")

# 运行下列代码,让程序推荐你一个power, 数据质量太差啦,程序给了我"NA",自己设定power=14

sft$powerEstimate

#推荐的power是4

##一步法网络构建:One-step network construction and module detection##

net <- blockwiseModules(datExpr,power = 4,maxBlockSize = nGenes,

                        TOMType = "unsigned", minModuleSize = 30,

                        reassignThreshold = 0, mergeCutHeight = 0.25,

                        numericLabels = T, pamRespectsDendro = F,

                        saveTOM = T,

                        saveTOMFileBase = "AS-green-readcount-TOM",

                        verbose = 3)

table(net$colors)

#层级聚类树展示各个模块

## 灰色的为**未分类**到模块的基因。

# Convert labels to colors for plotting

moduleLabels = net$colors

moduleColors = labels2colors(moduleLabels)

# Plot the dendrogram and the module colors underneath

# 如果对结果不满意,还可以recutBlockwiseTrees,节省计算时间

plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],

                    "Module colors",

                    dendroLabels = F, hang = 0.03,

                    addGuide = T, guideHang = 0.05)

table(moduleColors)

MEs = net$MEs

geneTree = net$dendrograms[[1]]

save(MEs, moduleLabels, geneTree,

    file = "AS-green-readcount-02-networkConstruction-auto.RData")##结果保存###

##表型与模块相关性##(没有,下一步)

moduleLabelsAutomatic = net$colors

moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)

moduleColorsWW = moduleColorsAutomatic

MEs0 = moduleEigengenes(datExpr, moduleColorsWW)$eigengenes

MEsWW = orderMEs(MEs0)

samples <- read.csv()

modTraitor = cor(MEsWW, )

#绘制模块之间相关性图

# module eigengene, 可以绘制线图,作为每个模块的基因表达趋势的展示

MEs = net$MEs

### 不需要重新计算,改下列名字就好

### 官方教程是重新计算的,其实可以不用这么麻烦

MEs_col = MEs

colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME", ""))))

MEs_col = orderMEs(MEs_col)

# 根据基因间表达量进行聚类所得到的各模块间的相关性图

# marDendro/marHeatmap 设置下、左、上、右的边距

plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",

                      marDendro = c(3,3,2,4),

                      marHeatmap = c(3,4,2,2), plotDendrograms = T,

                      xLabelsAngle = 90)

#可视化基因网络 (TOM plot)

# 如果采用分步计算,或设置的blocksize>=总基因数,直接load计算好的TOM结果

# 否则需要再计算一遍,比较耗费时间

# TOM = TOMsimilarityFromExpr(dataExpr, power=power, corType=corType, networkType=type)

load(net$TOMFiles[1], verbose = T)

TOM <- as.matrix(TOM)

dissTOM = 1-TOM

# Transform dissTOM with a power to make moderately strong

# connections more visible in the heatmap

plotTOM = dissTOM^7

# Set diagonal to NA for a nicer plot

diag(plotTOM) = NA

#call the plot function

# 这一部分特别耗时,行列同时做层级聚类

#TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")

#all genes 耗时太长了,随便选取1000个基因来可视化

nSelect = 1000

# For reproducibility, we set the random seed

set.seed(10)

select = sample(nGenes, size = nSelect)

selectTOM = dissTOM[select, select]

# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster

selectTree = hclust(as.dist(selectTOM), method = "average")

selectColors = moduleColors[select]

# Open a graphical window

sizeGrWindow(9,9)

# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing

# the color palette; setting the diagonal to NA also improves the clarity of the plot

plotDiss = selectTOM^7;

diag(plotDiss) = NA;

TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

你可能感兴趣的:(WGCNA)