GSE125042+WGCNA
数据下载+预处理
rm(list=ls())
options(stringsAsFactors = F)
library(GEOmirror)
library(AnnoProbe)
library(WGCNA)
library(GEOquery)
gset <- getGEO("GSE125042", GSEMatrix =TRUE, getGPL=FALSE)
if (length(gset) > 1) idx <- grep("GPL22145", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
class(gset)
# extract the expression matrix and phenotype data
probes_expr <- exprs(gset);dim(probes_expr)
head(probes_expr[,1:4])
# GSM3561593 GSM3561594 GSM3561595 GSM3561596
#5 5.713150 5.197305 5.044597 5.066939
#8 8.369724 9.270594 9.166942 9.405062
#9 5.801384 6.428157 6.506716 7.019864
#11 5.520870 5.023730 5.163475 5.578887
#12 14.737638 15.357270 15.471276 15.523424
#13 9.883928 9.808856 10.133667 9.971844
boxplot(probes_expr,las=2)
数据可用,已经是log后的数据了
## pheno info
phenoDat <- pData(gset)
head(phenoDat[,1:4])
###探针
a=getGEO(filename='GPL22145_family.soft.gz')
y=a@dataTable@table[,c('ID','GENE_SYMBOL')]
y=na.omit(y)
ids=y
##id转换
ids=ids[ids$ID %in% rownames(probes_expr),]
ids$ID=as.character(ids$ID)
ids$median=apply(probes_expr,1,median)
ids=ids[order(ids$GENE_SYMBOL,ids$median,decreasing = T),]
ids=ids[!duplicated(ids$GENE_SYMBOL),]
probes_expr_1=probes_expr_1[ids$ID,]
rownames(probes_expr_1)=ids$GENE_SYMBOL
probes_expr_1[1:4,1:4]
# GSM3561593 GSM3561594 GSM3561595 GSM3561596
#Zzz3 8.867514 8.827228 9.115763 9.433099
#Zzef1 9.916135 9.638419 9.798047 9.747008
#Zyx 11.574405 11.690470 11.856163 11.778055
#Zyg11b 11.540248 12.038339 11.892782 12.005465
save(probes_expr_1,phenoDat,file = 'raw.Rdata')
WGCNA
log2FPKM
###log2FPKM
rm(list=ls())
options(stringsAsFactors = F)
library(WGCNA)
library(knitr)
load(file="raw.Rdata")
#计算FPKM
BiocManager::install("TxDb.Mmusculus.UCSC.mm9.knownGene")
library(TxDb.Mmusculus.UCSC.mm9.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
## 下面是定义基因长度为 非冗余exon长度之和
exon_txdb=exons(txdb)
genes_txdb=genes(txdb)
o = findOverlaps(exon_txdb,genes_txdb)###找overlap
t1=exon_txdb[queryHits(o)] #取子集
t2=genes_txdb[subjectHits(o)]#取子集
t1=as.data.frame(t1)
t1$geneid=mcols(t2)[,1]
g_l = lapply(split(t1,t1$geneid),function(x){
# x=split(t1,t1$geneid)[[1]]
head(x)
tmp=apply(x,1,function(y){
y[2]:y[3]
})
length(unique(unlist(tmp)))
# sum(x[,4])
})
head(g_l)
#$`100009600`
#[1] 1842
#$`100009609`
#[1] 2538
#$`100009614`
#[1] 553
#$`100012`
#[1] 1854
#$`100017`
#[1] 2736
#$`100019`
#[1] 21654
g_l=data.frame(gene_id=names(g_l),length=as.numeric(g_l)) #将结果变为矩阵,gene_id与长度对应的矩阵
save(g_l,file = 'step2-g_l.Rdata')
load(file = 'step7-g_l.Rdata')
## 下面是定义基因长度为最长转录本长度
if(F){
t_l=transcriptLengths(txdb)#转录本长度
head(t_l)
t_l=na.omit(t_l)#去除na值
head(t_l)
t_l=t_l[order(t_l$gene_id,t_l$tx_len,decreasing = T),] #排序,长度长的转录本在前
head(t_l)
str(t_l)
t_l=t_l[!duplicated(t_l$gene_id),]#去重复
g_l=t_l[,c(3,5)] #3,5列
}
> head(g_l)
# gene_id length
#1 100009600 1842
#2 100009609 2538
#3 100009614 553
#4 100012 1854
#5 100017 2736
#6 100019 21654
##得到gene_id与gene symbol
library(org.Mm.eg.db)
s2g=toTable(org.Mm.egSYMBOL)
head(s2g)
# gene_id symbol
#1 11287 Pzp
#2 11298 Aanat
#3 11302 Aatk
#4 11303 Abca1
#5 11304 Abca4
#6 11305 Abca2
library(org.Mm.eg.db)
s2g=toTable(org.Mm.egSYMBOL)
head(s2g)
g_l=merge(g_l,s2g,by='gene_id') #把g_l,s2g两个数据框以'gene_id'为连接进行拼接
probes_expr_1[1:4,1:4]
ng=intersect(rownames(probes_expr_1),g_l$symbol) #取genes_expr数据框的行名与g_l数据框的symbol列的交集
#intersect()取交集
# 有了counts矩阵和对应的基因长度信息,就很容易进行各种计算了:
exprSet=probes_expr_1[ng,]
lengths=g_l[match(ng,g_l$symbol),2]
head(lengths)
head(rownames(exprSet))
exprSet[1:4,1:4]
# GSM3561593 GSM3561594 GSM3561595 GSM3561596
#Zzz3 8.867514 8.827228 9.115763 9.433099
#Zzef1 9.916135 9.638419 9.798047 9.747008
#Zyx 11.574405 11.690470 11.856163 11.778055
#Zyg11b 11.540248 12.038339 11.892782 12.005465
exprSet_1=na.omit(exprSet)
total_count<- colSums(exprSet_1)
head(total_count)
head(lengths)
rpkm <- t(do.call( rbind,
lapply(1:length(total_count),
function(i){
10^9*exprSet[,i]/lengths/total_count[i]
}) ))
colnames(rpkm)=colnames(exprSet_1)
rpkm[1:4,1:4]
# GSM3561593 GSM3561594 GSM3561595 GSM3561596
# Zzz3 10.603589 10.481371 10.833489 11.203173
# Zzef1 7.447683 7.188318 7.313792 7.270859
# Zyx 28.771974 28.856634 29.291355 29.079051
# Zyg11b 9.216708 9.547067 9.439923 9.523035
提取表达矩阵
####提取表达矩阵
RNAseq_voom <- rpkm
WGCNA_matrix = t(RNAseq_voom[order(apply(RNAseq_voom,1,mad), decreasing = T)[1:5000],])
datExpr0 <- WGCNA_matrix ## top 5000 mad genes
datExpr <- datExpr0
## 下面主要是为了防止临床表型与样本名字对不上
sampleNames = rownames(datExpr)
traitRows = match(sampleNames, rownames(phenoDat)) ##配对
rownames(phenoDat) = phenoDat[traitRows, 1] #按照顺序排序
筛选sft值
选择合适的“软阀值(soft thresholding power)”
MAD(Median absolute deviation, 中位数绝对偏差)是单变量数据集中样本差异性的稳健度量。mad是一个健壮的统计量,对于数据集中异常值的处理比标准差更具有弹性,可以大大减少异常值对于数据集的影响。
###sft筛选
powers = c(c(1:10), seq(from = 12, to=20, by=2))##随便设置,1:20也可以
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)##选取软阈值
#参数含义为表达矩阵;一系列数值,从中选择最优值;整数的详细程度。零表示沉默,较高的值使输出越来越多,更详细
###画图
png("step2-beta-value.png",width = 800,height = 600)
# Plot the results:
##sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
best_beta=sft$powerEstimate
best_beta
#[1] 9 9是最佳阈值
dev.off()
网络构建
###网络构建
net = blockwiseModules(
datExpr,
power = sft$powerEstimate,
maxBlockSize = 6000,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = F,
verbose = 3
)
###power=sft$powerEstimate=9 即上一步得到的最佳软阈值;maxBlockSize 表示在这个数值内的基因将整体被计算;TOMType为不同的算法,具体参见帮助文档;minModuleSize为最小模块尺寸;reassignThreshold,pvalue值得阈值;mergeCutHeight,合并的阈值;numerricLabels 默认为返回颜色,设置为TRUE则返回数字;pamRespectsDendro,逻辑值,算法判断,通常F;saveTOMs是否存储TOM网络;
table(net$colors)
# 0 1 2 3 4 5 6 7 8 9 10 11 12 0为无法分配到任何模块的基因
# 217 1740 895 624 453 358 158 147 110 100 75 62 61
模块可视化
###模块可视化
# 将标签用颜色表示
mergedColors = labels2colors(net$colors)
table(mergedColors)
moduleColors=mergedColors
# 画出树状图与模块颜色
png("step4-genes-modules.png",width = 800,height = 600)
plotDendroAndColors(dendro=net$dendrograms[[1]], colors=mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
#参数含义分别为dendro为输入的树状图;colors对应的颜色,颜色模块label为"Module colors"; dendroLabels为树状图的标签;addGuide 为逻辑值,是否添加垂直的“指导线”;guideHang = 0.05为树状图高度的小数,如果指导线与树状图标签重叠,就调到参数
dev.off()
上半部分就是树状图,下半部分就是模块颜色
模块和性状的关系
当我们得到基因网络与基因模块之后,就可以计算模块与性状之间的相关性
主要通过计算MS(module significance,Ms)来说明
MS: 模块包含的所有基因显著性的平均值。MS越高,说明这个模块与疾病之间的关联度越高
#明确样本数和基因
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
nGenes
#[1] 5000 ##5000个基因
nSamples
#[1] 32 #32个样本
table(phenoDat$source_name_ch1) ##分组
group_list=c(rep("SHAM_M",5),
rep("CLP_M",5),
rep("CLP+LI_M",6),
rep("SHAM_F",5),
rep("CLP_F",5),
rep("CLP+LI_F",6))
###将分类变量变为连续变量
design=model.matrix(~0+ group_list)
colnames(design)=levels(factor(group_list))
moduleColors <- labels2colors(net$colors)
###计算MS值
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes##不同颜色的模块的ME值矩 (样本vs模块)
MEs = orderMEs(MEs0); ##排序,将相似的模块放在一起
moduleTraitCor = cor(MEs, design , use = "p");#计算相关性
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)#计算P.value值
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
前面为相关性,后面为p值
###结果可视化
png("step5-Module-trait-relationships.png",width = 1500,height = 1500,res = 120)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(design),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.3,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
感兴趣性状的模块的具体基因分析
以SHAM_F为例子,最相关的是MEred模块,所以接下来就分析这两个
主要是计算GS(gene significance,GS)、MM(Module Membership MM)
GS:基因与形状之间的相关性,是用t-test计算每个基因在不同组之间的基因差异表达显著性检验p值,并将显著性p值的以10为底的对数值
MM: 给定基因表达谱与给定模型的eigengene的相关性
##计算MM与GS
##MM
SHAM_F = as.data.frame(design[,5])#提取SHAM_F
names(SHAM_F) = "SHAM_F"
module = "red"
modNames = substring(names(MEs), 3)#提取MEred
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
## 算出每个模块跟基因的皮尔森相关系数矩
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
##GS
SHAM_F = as.data.frame(design[,5])
names(SHAM_F) = "SHAM_F"
geneTraitSignificance = as.data.frame(cor(datExpr, SHAM_F, use = "p"))
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(SHAM_F), sep="")
names(GSPvalue) = paste("p.GS.", names(SHAM_F), sep="")
module = "red"
column = match(module, modNames)
moduleGenes = moduleColors==module
将结果可视化
png("step6-Module_membership-gene_significance.png",width = 800,height = 600)
#sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for SHAM_F",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()
横坐标为MM,纵坐标为GS。我们需要找出的就是高相关的基因,也就是靠近右上角的基因
相关性越高、越说明基因可能与性状相关
TOM热图
###重新聚类
geneTree = net$dendrograms[[1]];
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6)
plotTOM = dissTOM^7
diag(plotTOM) = NA
nSelect = 400
set.seed(10)
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
plotDiss = selectTOM^7
diag(plotDiss) = NA
png("step7-Network-heatmap.png",width = 800,height = 600)
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
dev.off()
通过聚类可以发现对角线是TOP重叠性比较高的点,聚集的色块是TOP重叠性比较高的基因。
可以通过色块判断结果分析的效果
找出具体的基因
通过cytoscape做出网络图,再找出具体的基因
# Recalculate topological overlap
TOM = TOMsimilarityFromExpr(datExpr, power = 6);
# Select module
module = "red";
# Select module probes
probes = colnames(datExpr)
inModule = (moduleColors==module);
modProbes = probes[inModule];
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
## 模块对应的基因关系矩
cyt = exportNetworkToCytoscape(
modTOM,
edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.1,#通过weight值筛选
nodeNames = modProbes,
nodeAttr = moduleColors[inModule]
)
筛选weight值大于等于0.1的值基因做网络图
Cytoscape+cytohubba
用方法取交集
###结果取交集
symbols_list = lapply(list.files(pattern = '*.csv'), function(f){
data = read.csv(f, h = T,quote = '', check.names = T)
symbols = as.character(data$X.name.)
return(symbols)
})
Reduce(intersect, symbols_list)
#[1] "\"Apoc4\""