差异表达基因提取limma+WGCNA分析全代码

用limma包和WGCNA包进行RNA-seq数据分析

#数据提取#
GE<-read.table('TCGA-COAD.htseq_counts.tsv',header=T,sep='\t',stringsAsFactors = F)
#60488*513 512个样本,其中对照组41个
# group_data<-data.frame(colnames(GE)[-1])
group<-rep('tumor',512)
group[grep("11A", colnames(GE)[-1], ignore.case = FALSE, perl = FALSE)]<-"normal"
# group_data<-cbind(group_data,group)
# colnames(group_data)<-c("sample","group")
#提取差异表达基因
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("limma")
library("limma")
#制作分组矩阵
design <- model.matrix(~0+factor(group))
colnames(design)=levels(factor(group))
rownames(design)=colnames(GE)[-1]
head(design)
exprSet<-GE[,-1]
rownames(exprSet)<-GE[,1]
# limma包中voom()函数用于标准化RNA-Seq或芯片数据。该函数将计数(所有计数加0.5以避免对数取零)转换为
#ogCPM值矩阵进行标准化。通过估计对数计数的均值-方差趋势,然后根据其预测方差为每个观察值分配权重,
#用于在线性建模过程中使用权重来调整异方差。
# 数据标准化后,根据每个基因的平均log2CPM构建线性模型,计算残差,并拟合log2CPM与残差标准差平方根的关系,得到的趋势线(红色平滑曲线)可为样本和基因分配权重。
# 对于该趋势线,若左侧0起点处所示残差标准差明显偏高,或者0起点处出现上式趋势,
# 则表明数据中存在较多的低表达(low counts数)基因。若觉得它们可能会对后续的计算带来较大的误差,不妨在标准化前手动过滤下。
norm <- voom(exprSet, design, plot = TRUE)
tag<-apply(exprSet,1,function(x) {
  if (length(which(x==0))>512*0.8) {
     re=0#去除在少于50%的样本中表达的基因
  }else {
     re=1
  }
})
table(tag)
exprSet<-exprSet[which(tag==1),]#36169*512 基因数降低到不到三万
norm <- voom(exprSet, design, plot = TRUE)
#标准化前后数据比较
par(mfrow = c(2, 2))
boxplot(exprSet)
plotDensities(exprSet)
boxplot(norm$E)
plotDensities(norm$E)
#线性拟合,详见 ?lmFit
fit <- lmFit(norm, design, method = 'ls')
#确定比较的两组
#后续将计算标记为 1 的组相对于 -1 的组,基因表达值的上调/下调状态
contrast <- makeContrasts('treat-control', levels = design)
contrast 
#使用经验贝叶斯模型拟合标准误差,详见 ?eBayes
fit2 <- contrasts.fit(fit, contrast)
fit2 <- eBayes(fit2)
qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
abline(0,1) 
#p 值校正、提取差异分析结果,详见 ?topTable
diff_gene <- topTable(fit2, number = Inf, adjust.method = 'fdr')
head(diff_gene, 10)
write.table(diff_gene, 'gene_diff.txt', col.names = NA, sep = '\t', quote = FALSE)
#这里根据 |log2FC| >= 1 & FDR p-value < 0.01 定义“差异”
diff_gene[which(diff_gene$logFC >= 1 & diff_gene$adj.P.Val < 0.01),'sig'] <- 'red'
diff_gene[which(diff_gene$logFC <= -1 & diff_gene$adj.P.Val < 0.01),'sig'] <- 'blue'
diff_gene[which(abs(diff_gene$logFC) < 1 | diff_gene$adj.P.Val >= 0.01),'sig'] <- 'gray' 
log2FoldChange <- diff_gene$logFC
FDR <- diff_gene$adj.P.Val
plot(log2FoldChange, -log10(FDR), pch = 20, col = diff_gene$sig)
abline(v = 1, lty = 2)
abline(v = -1, lty = 2)
abline(h = -log(0.01, 10), lty = 2)
upGenes<-diff_gene[which(diff_gene$logFC >= 1 & diff_gene$adj.P.Val < 0.01),]#549
downGenes<-diff_gene[which(diff_gene$logFC <= -1 & diff_gene$adj.P.Val < 0.01),]#1563
sig_genes<-rbind(upGenes,downGenes)#2112
dim(upGenes)
dim(downGenes)
dim(sig_genes)

#读入注释文件
gencode<-read.table('gencode.v22.annotation.gene.probeMap',header = T,sep = '\t',stringsAsFactors = F)
FPKM<-read.table('TCGA-COAD.htseq_fpkm.tsv',header=T,sep='\t',stringsAsFactors=F)
sig_genes<-data.frame(rownames(sig_genes))
colnames(sig_genes)<-'Ensembl_ID'
#先提取出感兴趣基因的表达
sigE<-merge(FPKM,sig_genes,by.x = 'Ensembl_ID',by.y = 'Ensembl_ID')
#转换基因ID#
sigSymbolE<-merge(gencode[,1:2],sigE,by.x='id',by.y='Ensembl_ID')
sigSymbolE<-sigSymbolE[,-1]
#处理多个探针对应一个基因的情况#
probe2gene = aggregate(sigSymbolE[,-1], by=list(sigSymbolE[,"gene"]),mean)#2107
write.table(probe2gene,'DEG.txt',sep='\t',row.names = F,quote = F)#the genes that we are interested has been exacted sucessfully.

# #deal with the phenotype and survival data
# survival<-read.table('TCGA-OV.survival.tsv',header=T,stringsAsFactors=F)
# #phenotype<-read.table('TCGA-OV.GDC_phenotype.tsv',header=T,stringsAsFactors=F)#有问题,先放这儿
# samples<-data.frame(colnames(probe2gene)[-1])
# samples<-data.frame(apply(samples,1,function(x){re=gsub('.','-',x,fixed=T)}))
# colnames(samples)<-'sample ID'
# survival1<-merge(samples,survival,by.x='sample ID',by.y='sample',all.x=T)#只有378个患者有相应的生存数据,因此剔除表达谱中该患者的数据
# survival1<-survival1[-9,]

# index<-which(samples=='TCGA-04-1357-01A')
# probe2gene<-probe2gene[,-(index+1)]
# #更新输出
# write.table(probe2gene,'sig_genes_exp.txt',sep='\t',row.names = F,quote = F)#the genes that we are interested has been exacted sucessfully.
save.image('.Rdata')

#WGCNA分析
###WGCNA###########
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("WGCNA")
#导入的时候可能存在部分依赖包没有导进来的情况,把这些包单独导进来就好
#如果因为网络原因一直无法成功下载,试试本地安装
#############################


# #首先构造表型相关矩阵
# survival<-read.table('TCGA-OV.survival.tsv',header=T,sep='\t')
# dim(survival)
# survival<-survival[,c(1,4)]
# survival[,1]<-as.data.frame(gsub('-','.',survival[,1]))
# samples_name<-as.data.frame(row.names(datExpr))
# colnames(samples_name)<-'samples'
# survival_OS<-merge(survival,samples_name,by.y='samples',by.x='sample',all.y=T)
# #缺少TCGA.04.1357.01A的临床数据,因此把它删掉




###start##

library(WGCNA)
library(stringr)
library(flashClust)
library(iterators)


femData <- read.table("DEG.txt",header=TRUE,stringsAsFactors=F) #加载数据
datExpr0 = as.data.frame(t(log2(femData[,-1]+1)))#这里我用的时FPKM的数据,根据WGCNA的官方说明,需要对其进行log2(X+1)转换
colnames(datExpr0) = femData[,1] #列标签添加基因名称
rownames(datExpr0) = names(femData[,-1]) #行标签添加为样本名称 
# datExpr<-datExpr[-which(row.names(datExpr)=='TCGA.04.1357.01A'),]
gsg = goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK #结果为TRUE,则所有选定基因都用于后续WGCNA
#datExpr=datExpr[gsg$goodGenes] #如果gsg$allOK结果为FALSE,则后续选择好的基因用于WGCNA。


sampleTree = hclust(dist(datExpr0), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(12,9)
# pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
     cex.axis = 1.5, cex.main = 2)
# Plot a line to show the cut
abline(h = 32, col = "red");
# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 32, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#The variable datExpr now contains the expression data ready for network analysis.

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

net = blockwiseModules(datExpr, power = 6,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs = TRUE,
                       saveTOMFileBase = "coadTOM", 
                       verbose = 3)

table(net$colors)
# open a graphics window
sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];


#加入表型信息-肿瘤还是正常,进行分析
sample<-rownames(datExpr)
tumor<-rep(1,length(sample))
normal<-rep(0,length(sample))
tumor[grep("11A", sample)]<-0
normal[grep("11A", sample)]<-1
table(tumor)
table(normal)
datTraits<-data.frame(tumor,normal)
rownames(datTraits)<-sample

# Define numbers of genes and samples
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);

sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix =  paste(signif(moduleTraitCor, 2), "\n(",
                           signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

# Define variable weight containing the weight column of datTrait
# weight = as.data.frame(datTraits$tumor);
# names(weight) = "weight"
# # names (colors) of the modules
# modNames = substring(names(MEs), 3)

# geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
# MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));

# names(geneModuleMembership) = paste("MM", modNames, sep="");
# names(MMPvalue) = paste("p.MM", modNames, sep="");

# geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p"));
# GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));

# names(geneTraitSignificance) = paste("GS.", names(weight), sep="");
# names(GSPvalue) = paste("p.GS.", names(weight), sep="");

# Calculate topological overlap anew: this could be done more efficiently by saving the TOM
# calculated during module detection, but let us do it again here.
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);
# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap
plotTOM = dissTOM^7;
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA;
# Call the plot function
sizeGrWindow(9,9)
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")

关于WGCNA包中还有更丰富的功能,请参见Bioconductor中的官方说明。

你可能感兴趣的:(生信技能,数据分析)