GSE125042+WGCNA

数据下载+预处理

rm(list=ls())
options(stringsAsFactors = F)
library(GEOmirror)
library(AnnoProbe)
library(WGCNA)
library(GEOquery)
gset <- getGEO("GSE125042", GSEMatrix =TRUE, getGPL=FALSE)
if (length(gset) > 1) idx <- grep("GPL22145", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
class(gset)
# extract the expression matrix and phenotype data
probes_expr <- exprs(gset);dim(probes_expr)
head(probes_expr[,1:4])
# GSM3561593 GSM3561594 GSM3561595 GSM3561596
#5    5.713150   5.197305   5.044597   5.066939
#8    8.369724   9.270594   9.166942   9.405062
#9    5.801384   6.428157   6.506716   7.019864
#11   5.520870   5.023730   5.163475   5.578887
#12  14.737638  15.357270  15.471276  15.523424
#13   9.883928   9.808856  10.133667   9.971844
boxplot(probes_expr,las=2)

image-20200208131654645

数据可用，已经是log后的数据了

## pheno info
phenoDat <- pData(gset)
head(phenoDat[,1:4])
###探针
a=getGEO(filename='GPL22145_family.soft.gz')
y=a@dataTable@table[,c('ID','GENE_SYMBOL')]
y=na.omit(y)
ids=y
##id转换
ids=ids[ids$ID %in% rownames(probes_expr),]
ids$ID=as.character(ids$ID)
ids$median=apply(probes_expr,1,median) 
ids=ids[order(ids$GENE_SYMBOL,ids$median,decreasing = T),]
ids=ids[!duplicated(ids$GENE_SYMBOL),]
probes_expr_1=probes_expr_1[ids$ID,] 
rownames(probes_expr_1)=ids$GENE_SYMBOL
probes_expr_1[1:4,1:4]  
#       GSM3561593 GSM3561594 GSM3561595 GSM3561596
#Zzz3     8.867514   8.827228   9.115763   9.433099
#Zzef1    9.916135   9.638419   9.798047   9.747008
#Zyx     11.574405  11.690470  11.856163  11.778055
#Zyg11b  11.540248  12.038339  11.892782  12.005465
save(probes_expr_1,phenoDat,file = 'raw.Rdata')

WGCNA

log2FPKM

###log2FPKM
rm(list=ls())
options(stringsAsFactors = F)
library(WGCNA)
library(knitr)
load(file="raw.Rdata")
#计算FPKM
BiocManager::install("TxDb.Mmusculus.UCSC.mm9.knownGene")
library(TxDb.Mmusculus.UCSC.mm9.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
## 下面是定义基因长度为 非冗余exon长度之和
  exon_txdb=exons(txdb)
  genes_txdb=genes(txdb)
  o = findOverlaps(exon_txdb,genes_txdb)###找overlap
  t1=exon_txdb[queryHits(o)] #取子集
  t2=genes_txdb[subjectHits(o)]#取子集
  t1=as.data.frame(t1)
  t1$geneid=mcols(t2)[,1]  
  g_l = lapply(split(t1,t1$geneid),function(x){
    # x=split(t1,t1$geneid)[[1]]
    head(x)
    tmp=apply(x,1,function(y){
      y[2]:y[3]
    })
    length(unique(unlist(tmp)))
    # sum(x[,4])
  })
  head(g_l)
#$`100009600`
#[1] 1842

#$`100009609`
#[1] 2538

#$`100009614`
#[1] 553

#$`100012`
#[1] 1854

#$`100017`
#[1] 2736

#$`100019`
#[1] 21654
  g_l=data.frame(gene_id=names(g_l),length=as.numeric(g_l)) #将结果变为矩阵，gene_id与长度对应的矩阵
save(g_l,file = 'step2-g_l.Rdata')
  load(file = 'step7-g_l.Rdata')
  ## 下面是定义基因长度为最长转录本长度
  if(F){
    
    t_l=transcriptLengths(txdb)#转录本长度
    head(t_l)
    t_l=na.omit(t_l)#去除na值
    head(t_l)
    t_l=t_l[order(t_l$gene_id,t_l$tx_len,decreasing = T),] #排序，长度长的转录本在前
    head(t_l)
    str(t_l)
    t_l=t_l[!duplicated(t_l$gene_id),]#去重复
    g_l=t_l[,c(3,5)] #3，5列
    
  }
> head(g_l)
#    gene_id length
#1 100009600   1842
#2 100009609   2538
#3 100009614    553
#4    100012   1854
#5    100017   2736
#6    100019  21654
##得到gene_id与gene symbol
  library(org.Mm.eg.db)
  s2g=toTable(org.Mm.egSYMBOL)
  head(s2g)
# gene_id symbol
#1   11287    Pzp
#2   11298  Aanat
#3   11302   Aatk
#4   11303  Abca1
#5   11304  Abca4
#6   11305  Abca2
library(org.Mm.eg.db)
  s2g=toTable(org.Mm.egSYMBOL)
  head(s2g)
  g_l=merge(g_l,s2g,by='gene_id') #把g_l,s2g两个数据框以'gene_id'为连接进行拼接
  probes_expr_1[1:4,1:4]
  ng=intersect(rownames(probes_expr_1),g_l$symbol) #取genes_expr数据框的行名与g_l数据框的symbol列的交集
  #intersect()取交集
  
  # 有了counts矩阵和对应的基因长度信息，就很容易进行各种计算了：
  exprSet=probes_expr_1[ng,]
  lengths=g_l[match(ng,g_l$symbol),2]
  head(lengths)
  head(rownames(exprSet))
  exprSet[1:4,1:4]
#       GSM3561593 GSM3561594 GSM3561595 GSM3561596
#Zzz3     8.867514   8.827228   9.115763   9.433099
#Zzef1    9.916135   9.638419   9.798047   9.747008
#Zyx     11.574405  11.690470  11.856163  11.778055
#Zyg11b  11.540248  12.038339  11.892782  12.005465
exprSet_1=na.omit(exprSet)
  total_count<- colSums(exprSet_1)
  head(total_count)
  head(lengths)
  rpkm <- t(do.call( rbind,
                     lapply(1:length(total_count),
                            function(i){
                              10^9*exprSet[,i]/lengths/total_count[i]
                            }) ))
  colnames(rpkm)=colnames(exprSet_1)
  rpkm[1:4,1:4] 
#        GSM3561593 GSM3561594 GSM3561595 GSM3561596
# Zzz3    10.603589  10.481371  10.833489  11.203173
# Zzef1    7.447683   7.188318   7.313792   7.270859
# Zyx     28.771974  28.856634  29.291355  29.079051
# Zyg11b   9.216708   9.547067   9.439923   9.523035

提取表达矩阵

  ####提取表达矩阵
  RNAseq_voom <- rpkm
  WGCNA_matrix = t(RNAseq_voom[order(apply(RNAseq_voom,1,mad), decreasing = T)[1:5000],])
  datExpr0 <- WGCNA_matrix  ## top 5000 mad genes
  datExpr <- datExpr0 
  ## 下面主要是为了防止临床表型与样本名字对不上
  sampleNames = rownames(datExpr)
  traitRows = match(sampleNames, rownames(phenoDat)) ##配对
  rownames(phenoDat) = phenoDat[traitRows, 1] #按照顺序排序

筛选sft值

选择合适的“软阀值（soft thresholding power）”

MAD（Median absolute deviation, 中位数绝对偏差）是单变量数据集中样本差异性的稳健度量。mad是一个健壮的统计量，对于数据集中异常值的处理比标准差更具有弹性，可以大大减少异常值对于数据集的影响。

###sft筛选
powers = c(c(1:10), seq(from = 12, to=20, by=2))##随便设置，1：20也可以
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)##选取软阈值
#参数含义为表达矩阵；一系列数值，从中选择最优值；整数的详细程度。零表示沉默，较高的值使输出越来越多，更详细
###画图
png("step2-beta-value.png",width = 800,height = 600)
# Plot the results:
##sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
best_beta=sft$powerEstimate
best_beta
#[1] 9 9是最佳阈值
dev.off()

image-20200208144258413

网络构建

 ###网络构建
  net = blockwiseModules(
    datExpr,
    power = sft$powerEstimate,
    maxBlockSize = 6000,
    TOMType = "unsigned", minModuleSize = 30,
    reassignThreshold = 0, mergeCutHeight = 0.25,
    numericLabels = TRUE, pamRespectsDendro = FALSE,
    saveTOMs = F, 
    verbose = 3
  )
  ###power=sft$powerEstimate=9 即上一步得到的最佳软阈值；maxBlockSize 表示在这个数值内的基因将整体被计算；TOMType为不同的算法，具体参见帮助文档；minModuleSize为最小模块尺寸；reassignThreshold，pvalue值得阈值；mergeCutHeight，合并的阈值；numerricLabels 默认为返回颜色，设置为TRUE则返回数字；pamRespectsDendro，逻辑值，算法判断，通常F；saveTOMs是否存储TOM网络；
  table(net$colors) 
# 0    1    2    3    4    5    6    7    8    9   10   11   12  0为无法分配到任何模块的基因
# 217 1740  895  624  453  358  158  147  110  100   75   62   61

模块可视化

###模块可视化
  # 将标签用颜色表示
  mergedColors = labels2colors(net$colors)
  table(mergedColors)
  moduleColors=mergedColors
  # 画出树状图与模块颜色
  png("step4-genes-modules.png",width = 800,height = 600)
  plotDendroAndColors(dendro=net$dendrograms[[1]], colors=mergedColors[net$blockGenes[[1]]],
                      "Module colors",
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05)
  #参数含义分别为dendro为输入的树状图；colors对应的颜色，颜色模块label为"Module colors"； dendroLabels为树状图的标签；addGuide 为逻辑值，是否添加垂直的“指导线”；guideHang = 0.05为树状图高度的小数，如果指导线与树状图标签重叠，就调到参数
  dev.off()

image-20200208145206591

上半部分就是树状图，下半部分就是模块颜色

模块和性状的关系

当我们得到基因网络与基因模块之后，就可以计算模块与性状之间的相关性

主要通过计算MS（module significance,Ms）来说明

MS: 模块包含的所有基因显著性的平均值。MS越高，说明这个模块与疾病之间的关联度越高

  #明确样本数和基因
  nGenes = ncol(datExpr)
  nSamples = nrow(datExpr)
  nGenes
  #[1] 5000 ##5000个基因
  nSamples
  #[1] 32 #32个样本
  table(phenoDat$source_name_ch1) ##分组
  group_list=c(rep("SHAM_M",5),
               rep("CLP_M",5),
               rep("CLP+LI_M",6),
               rep("SHAM_F",5),
               rep("CLP_F",5),
               rep("CLP+LI_F",6))
 ###将分类变量变为连续变量
  design=model.matrix(~0+ group_list)
  colnames(design)=levels(factor(group_list))
  moduleColors <- labels2colors(net$colors)
  ###计算MS值
  MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes##不同颜色的模块的ME值矩 (样本vs模块)
  MEs = orderMEs(MEs0); ##排序，将相似的模块放在一起
  moduleTraitCor = cor(MEs, design , use = "p");#计算相关性
  moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)#计算P.value值
  # Will display correlations and their p-values
  textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                     signif(moduleTraitPvalue, 1), ")", sep = "");
  dim(textMatrix) = dim(moduleTraitCor)

image-20200208152502838

前面为相关性，后面为p值

###结果可视化
png("step5-Module-trait-relationships.png",width = 1500,height = 1500,res = 120)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = colnames(design),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.3,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

dev.off()

image-20200208153023639

感兴趣性状的模块的具体基因分析

以SHAM_F为例子，最相关的是MEred模块，所以接下来就分析这两个

主要是计算GS（gene significance,GS）、MM（Module Membership MM）

GS：基因与形状之间的相关性，是用t-test计算每个基因在不同组之间的基因差异表达显著性检验p值，并将显著性p值的以10为底的对数值

MM: 给定基因表达谱与给定模型的eigengene的相关性

  ##计算MM与GS
  ##MM
  SHAM_F = as.data.frame(design[,5])#提取SHAM_F
  names(SHAM_F) = "SHAM_F"
  module = "red"
  modNames = substring(names(MEs), 3)#提取MEred
  geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
  ## 算出每个模块跟基因的皮尔森相关系数矩
  MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
  names(geneModuleMembership) = paste("MM", modNames, sep="")
  names(MMPvalue) = paste("p.MM", modNames, sep="")
  ##GS
  SHAM_F = as.data.frame(design[,5])
  names(SHAM_F) = "SHAM_F"
  geneTraitSignificance = as.data.frame(cor(datExpr, SHAM_F, use = "p"))
  GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
  names(geneTraitSignificance) = paste("GS.", names(SHAM_F), sep="")
  names(GSPvalue) = paste("p.GS.", names(SHAM_F), sep="")
  module = "red"
  column = match(module, modNames)
  moduleGenes = moduleColors==module

将结果可视化

png("step6-Module_membership-gene_significance.png",width = 800,height = 600)
#sizeGrWindow(7, 7);
par(mfrow = c(1,1));
  verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                     abs(geneTraitSignificance[moduleGenes, 1]),
                     xlab = paste("Module Membership in", module, "module"),
                     ylab = "Gene significance for SHAM_F",
                     main = paste("Module membership vs. gene significance\n"),
                     cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
  dev.off()

image-20200208154121339

横坐标为MM,纵坐标为GS。我们需要找出的就是高相关的基因,也就是靠近右上角的基因

相关性越高、越说明基因可能与性状相关

TOM热图

###重新聚类
  geneTree = net$dendrograms[[1]]; 
  dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6)
  plotTOM = dissTOM^7
  diag(plotTOM) = NA
  nSelect = 400
  set.seed(10)
  select = sample(nGenes, size = nSelect)
  selectTOM = dissTOM[select, select]
  selectTree = hclust(as.dist(selectTOM), method = "average")
  selectColors = moduleColors[select]
plotDiss = selectTOM^7
  diag(plotDiss) = NA 
  png("step7-Network-heatmap.png",width = 800,height = 600)
  TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
  dev.off()

image-20200208154518870

通过聚类可以发现对角线是TOP重叠性比较高的点，聚集的色块是TOP重叠性比较高的基因。

可以通过色块判断结果分析的效果

找出具体的基因

通过cytoscape做出网络图，再找出具体的基因

# Recalculate topological overlap
  TOM = TOMsimilarityFromExpr(datExpr, power = 6); 
  # Select module
  module = "red";
  # Select module probes
  probes = colnames(datExpr) 
  inModule = (moduleColors==module);
  modProbes = probes[inModule]; 
  modTOM = TOM[inModule, inModule];
  dimnames(modTOM) = list(modProbes, modProbes)
  ## 模块对应的基因关系矩
  cyt = exportNetworkToCytoscape(
    modTOM,
    edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""),
    nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""),
    weighted = TRUE,
    threshold = 0.1,#通过weight值筛选
    nodeNames = modProbes, 
    nodeAttr = moduleColors[inModule]
  )

筛选weight值大于等于0.1的值基因做网络图

Cytoscape+cytohubba

image-20200208160355299

用方法取交集

###结果取交集
symbols_list = lapply(list.files(pattern = '*.csv'), function(f){
  
  data = read.csv(f, h = T,quote = '', check.names = T)
  
  symbols = as.character(data$X.name.)
  
  return(symbols)
  
})
Reduce(intersect, symbols_list)
#[1] "\"Apoc4\""

2020-02-10