3大差异分析r包:DESeq2、edgeR和limma

做差异分析需要的数据:表达矩阵和分组信息
TCGA的数据只要表达矩阵就够了,因为TCGA的样本ID比较特殊:样本ID的第14和15位组成的数字<10代表肿瘤样本,>=10代表正常样本,所以分组信息可以直接从样本ID中得到。

三大差异分析的R包起点都是count矩阵(reads计数矩阵),count矩阵是不能直接拿来做差异分析的,因此三个R包都对count矩阵有自己的处理方法 。

如果拿不到count矩阵

  • RSEM:三大R包都可 https://www.jianshu.com/p/46b048220b88
  • tpm:用limma包做差异分析(迫不得已)
  • fpkm, rpkm:转换为tpm,用limma做差异分析(迫不得已)https://mp.weixin.qq.com/s/_DtkxSfLGQHcRju66J4yTQ

1. 使用三大R包进行差异分析

准备工作
表达矩阵和分组信息来自:TCGA数据下载和整理的三种方法

# Install any missing dependency, then attach it.
# require() only returns FALSE when a package is missing, so a package that
# was just installed would stay unattached; check with requireNamespace()
# and attach with library(), which fails loudly if the install did not work.
cran_pkgs <- c("stringr", "ggplotify", "patchwork", "cowplot")
bioc_pkgs <- c("DESeq2", "edgeR", "limma")

for (pkg in cran_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

for (pkg in bioc_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    # BiocManager is needed to install Bioconductor packages and may be absent.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    BiocManager::install(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Clear workspace objects (attached packages stay attached), then load the
# prepared TCGA-CHOL data.
rm(list = ls())
# NOTE(review): the code below uses exp, Group and proj, presumably provided
# by this file; confirm its contents.
load("TCGA-CHOL_gdc.Rdata")
table(Group) # number of samples per group
1.1 DESeq2

Deseq2分析需要两步:
1. DESeqDataSetFromMatrix()函数是从矩阵生成Deseq2要求的数据类型;
2. DESeq()进行差异分析

library(DESeq2)

# Sample annotation: one row per sample (row names are the column names of
# exp) with the grouping stored in a single column called "condition".
colData <- data.frame(condition = Group, row.names = colnames(exp))

# Step 1: wrap the count matrix and the annotation into the object DESeq2
# works on. The design formula refers to the "condition" column of colData.
dds <- DESeqDataSetFromMatrix(
  countData = exp,
  colData = colData,
  design = ~condition
)

# Step 2: run the differential analysis. The result is still a DESeq2 object,
# from which a results table can be extracted afterwards.
dds <- DESeq(dds)

# Cache the fitted object on disk.
dds_file <- paste0(proj, "_dd.Rdata")
save(dds, file = dds_file)

从差异分析结果中提取差异表达矩阵

load(file = paste0(proj, "_dd.Rdata"))

# results() extracts the differential expression table from dds. The contrast
# must be a three-element character vector (factor name, then the two levels
# to compare) and the order matters; see ?results. rev(levels(Group)) puts the
# last factor level first.
res <- results(dds, contrast = c("condition", rev(levels(Group))))

# Sort by p-value (needed for DESeq2 only; the edgeR and limma tables used
# later come back already sorted).
resOrdered <- res[order(res$pvalue), ]
DEG <- as.data.frame(resOrdered)

# Drop rows containing NA (reported for some very lowly expressed genes) so
# that later thresholding and plotting do not have to deal with them.
DEG <- na.omit(DEG)

# View() needs an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(DEG)
}

添加change列标记基因上调下调

# Fold-change threshold: mean + 2 * sd of the absolute log2 fold changes.
abs_lfc <- abs(DEG$log2FoldChange)
logFC_cutoff <- mean(abs_lfc) + 2 * sd(abs_lfc)
logFC_cutoff # the cutoff differs between the three packages
# [1] 3.909132

# A gene is DOWN / UP when p < 0.05 and its log2 fold change lies beyond the
# negative / positive cutoff; everything else is NOT.
significant <- DEG$pvalue < 0.05
DEG$change <- ifelse(
  significant & (DEG$log2FoldChange < -logFC_cutoff),
  "DOWN",
  ifelse(significant & (DEG$log2FoldChange > logFC_cutoff), "UP", "NOT")
)
table(DEG$change)
# DOWN   NOT    UP
#  783 28724   841
head(DEG)

DESeq2_DEG <- DEG # keep a copy of the DESeq2 result
1.2 edgeR

进行差异表达分析并提取差异表达矩阵

library(edgeR)

# The hard-coded contrast c(-1, 1) below is only meaningful for a grouping
# factor with exactly two levels.
stopifnot(nlevels(Group) == 2L)

# Count matrix plus grouping. DGEList() already sets lib.size to the column
# sums of the counts, so it does not have to be recomputed by hand.
dge <- DGEList(counts = exp, group = Group)
dge <- calcNormFactors(dge)

# Design without intercept: one column per group, named after levels(Group).
# The "0 +" is required here: the contrast c(-1, 1) used below assumes one
# coefficient per group, and would mean something different for a design
# with an intercept.
design <- model.matrix(~ 0 + Group)
rownames(design) <- colnames(dge)
colnames(design) <- levels(Group)

dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
# Second design column minus first design column. Assuming levels(Group) is
# c("normal", "tumor"), -1 is the control (normal) and 1 the tumor group.
fit2 <- glmLRT(fit, contrast = c(-1, 1))

# Full result table (one row per gene) as a data frame.
DEG <- topTags(fit2, n = nrow(exp))
DEG <- as.data.frame(DEG)

# View() needs an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(DEG)
}
FDR是p.adj的一种

添加change列标记基因上调下调

# Fold-change threshold: mean + 2 * sd of the absolute logFC values.
abs_lfc <- abs(DEG$logFC)
logFC_cutoff <- mean(abs_lfc) + 2 * sd(abs_lfc)
logFC_cutoff
# [1] 4.290788

# DOWN / UP require p < 0.05 and a logFC beyond the negative / positive
# cutoff; all remaining genes are labelled NOT.
significant <- DEG$PValue < 0.05
DEG$change <- ifelse(
  significant & (DEG$logFC < -logFC_cutoff),
  "DOWN",
  ifelse(significant & (DEG$logFC > logFC_cutoff), "UP", "NOT")
)

head(DEG)
table(DEG$change)
# DOWN   NOT    UP
#  533 28643  1172
edgeR_DEG <- DEG # keep a copy of the edgeR result
1.3 limma

使用limma对TCGA的基因表达count矩阵做差异分析和limma对芯片数据进行差异分析的最主要差别在于做了voom标准化

library(limma)

# Design without intercept: one column per group, named after levels(Group).
design <- model.matrix(~ 0 + Group)
colnames(design) <- levels(Group)
rownames(design) <- colnames(exp)

# DGEList() and calcNormFactors() come from edgeR, which is attached earlier
# in this script.
dge <- DGEList(counts = exp)
dge <- calcNormFactors(dge)

# voom transformation; the argument is spelled out in full instead of relying
# on partial matching of `normalize`.
v <- voom(dge, design, normalize.method = "quantile")
fit <- lmFit(v, design)

# As with the other two packages, state which group is compared to which:
# levels in reverse order, joined by "-".
contrast_name <- paste(rev(levels(Group)), collapse = "-")
contrast_name
# [1] "tumor-normal"
cont.matrix <- makeContrasts(contrasts = contrast_name, levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

# Full result table for the contrast, with rows containing NA removed.
DEG <- topTable(fit2, coef = contrast_name, number = Inf)
DEG <- na.omit(DEG)

# View() needs an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(DEG)
}
# Fold-change threshold: mean + 2 * sd of the absolute logFC values.
abs_lfc <- abs(DEG$logFC)
logFC_cutoff <- mean(abs_lfc) + 2 * sd(abs_lfc)

# DOWN / UP require p < 0.05 and a logFC beyond the negative / positive
# cutoff; all remaining genes are labelled NOT.
significant <- DEG$P.Value < 0.05
DEG$change <- ifelse(
  significant & (DEG$logFC < -logFC_cutoff),
  "DOWN",
  ifelse(significant & (DEG$logFC > logFC_cutoff), "UP", "NOT")
)
table(DEG$change)
# DOWN   NOT    UP
# 1060 28915   373
head(DEG)
limma_voom_DEG <- DEG # keep a copy of the limma-voom result
# Number of DOWN / NOT / UP genes per package.
# The labels are tabulated as a factor with fixed levels so that an empty
# category is counted as 0; a plain table() of the character column would
# drop it and the counts would no longer line up with the row names.
change_levels <- c("DOWN", "NOT", "UP")
count_changes <- function(deg) {
  as.integer(table(factor(deg$change, levels = change_levels)))
}
tj <- data.frame(
  deseq2 = count_changes(DESeq2_DEG),
  edgeR = count_changes(edgeR_DEG),
  limma_voom = count_changes(limma_voom_DEG),
  row.names = c("down", "not", "up")
)
tj

# Store the three result tables, the grouping and the summary for later steps.
save(DESeq2_DEG, edgeR_DEG, limma_voom_DEG, Group, tj,
     file = paste0(proj, "_DEG.Rdata"))
1.4 查看3个R包各自得到了多少上调和下调基因
# Number of DOWN / NOT / UP genes per package (the original call was missing
# its closing parenthesis and did not parse).
# Fixed factor levels keep an empty category as 0 so the counts always line
# up with the row names.
change_levels <- c("DOWN", "NOT", "UP")
count_changes <- function(deg) {
  as.integer(table(factor(deg$change, levels = change_levels)))
}
tj <- data.frame(
  deseq2 = count_changes(DESeq2_DEG),
  edgeR = count_changes(edgeR_DEG),
  limma_voom = count_changes(limma_voom_DEG),
  row.names = c("down", "not", "up")
)
tj
#      deseq2 edgeR limma_voom
# down    783   533       1060
# not   28724 28643      28915
# up      841  1172        373

结果还是有相当大的差异,但是不分对错,只是算法不同。

2. 差异分析结果可视化

# Start the visualisation section from the saved data.
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")

# edgeR provides cpm(), which is used below to build the plotting matrix;
# attach it here so this section also runs in a fresh session.
library(edgeR)

# Install tinyarray from the local archive only when it is missing, then
# attach it (require() would leave it unattached right after installing).
if (!requireNamespace("tinyarray", quietly = TRUE)) {
  devtools::install_local("tinyarray-master.zip", upgrade = FALSE)
}
library(tinyarray)
library(ggplot2)

exp[1:4, 1:4] # quick look at the top-left corner of the matrix
2.1 PCA图

这里的矩阵仍然是count数,count矩阵只用来做差异分析,原因详见:Read count、CPM、 RPKM、FPKM和TPM的区别

# cpm() removes the effect of library size; log2(cpm + 1) is the matrix used
# for plotting.
cpm_mat <- cpm(exp)
dat <- log2(cpm_mat + 1)

# PCA plot of the samples, shown and then stored for the final figure.
pca.plot <- draw_pca(dat, Group)
pca.plot
save(pca.plot, file = paste0(proj, "_pcaplot.Rdata"))
cpm的数据做的pca图
2.2 热图

分别选出三个R包做出来的差异基因并绘制热图

# Row names of every gene whose label is not "NOT" in a DEG table.
changed_genes <- function(deg) {
  rownames(deg)[deg$change != "NOT"]
}

cg1 <- changed_genes(DESeq2_DEG)
cg2 <- changed_genes(edgeR_DEG)
cg3 <- changed_genes(limma_voom_DEG)

# One heatmap per package. Per the original note, n_cutoff = 2 maps colours
# to the range -2..2 and shows values outside it in the same colour as the
# nearest limit, which limits the influence of extreme values.
h1 <- draw_heatmap(dat[cg1, ], Group, n_cutoff = 2)
h2 <- draw_heatmap(dat[cg2, ], Group, n_cutoff = 2)
h3 <- draw_heatmap(dat[cg3, ], Group, n_cutoff = 2)
2.3 火山图

计算mean+2sd阈值(前面三个R包计算的都叫做logFC_cutoff,而且没保存,这里重新计算一下)

# Mean plus two standard deviations of the absolute values of x; used as the
# logFC cutoff.
m2d <- function(x) {
  magnitude <- abs(x)
  mean(magnitude) + 2 * sd(magnitude)
}

# The pkg argument tells draw_volcano() which package produced the table:
# 1, 2, 3, 4 stand for "DESeq2", "edgeR", "limma(voom)", "limma".
cutoff_deseq2 <- m2d(DESeq2_DEG$log2FoldChange)
cutoff_edger <- m2d(edgeR_DEG$logFC)
cutoff_limma <- m2d(limma_voom_DEG$logFC)

v1 <- draw_volcano(DESeq2_DEG, pkg = 1, logFC_cutoff = cutoff_deseq2)
v2 <- draw_volcano(edgeR_DEG, pkg = 2, logFC_cutoff = cutoff_edger)
v3 <- draw_volcano(limma_voom_DEG, pkg = 3, logFC_cutoff = cutoff_limma)
拼图
library(patchwork)

# Heatmaps on the top row, volcano plots on the bottom row; legends are
# collected and then hidden on every panel.
heat_vo <- (h1 + h2 + h3) / (v1 + v2 + v3) +
  plot_layout(guides = "collect") &
  theme(legend.position = "none")
print(heat_vo)

# Pass the plot explicitly: by default ggsave() saves the last plot that was
# displayed, which is not this figure when the code is run without
# auto-printing (e.g. via source()).
ggsave(paste0(proj, "_heat_vo.png"), plot = heat_vo, width = 15, height = 10)

3. 三大R包差异基因对比

# Start the comparison section from the saved data.
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
load("TCGA-CHOL_pcaplot.Rdata")

# This section calls cpm(), draw_heatmap(), draw_venn() and ggsave(); attach
# the packages they come from so it also runs in a fresh session.
library(edgeR)
library(tinyarray)
library(ggplot2)
# Row names (ENSEMBL IDs) of the genes labelled "UP" in a DEG table.
#
# df: data frame with a `change` column and gene IDs as row names.
# Returns a character vector of row names. which() drops rows whose label is
# NA, so no NA entries end up in the result.
UP <- function(df) {
  rownames(df)[which(df$change == "UP")]
}
# Row names (ENSEMBL IDs) of the genes labelled "DOWN" in a DEG table.
#
# df: data frame with a `change` column and gene IDs as row names.
# Returns a character vector of row names. which() drops rows whose label is
# NA, so no NA entries end up in the result.
DOWN <- function(df) {
  rownames(df)[which(df$change == "DOWN")]
}

# Genes called UP (respectively DOWN) by all three packages: fold intersect()
# over the three per-package gene lists.
up <- Reduce(
  intersect,
  list(UP(DESeq2_DEG), UP(edgeR_DEG), UP(limma_voom_DEG))
)
down <- Reduce(
  intersect,
  list(DOWN(DESeq2_DEG), DOWN(edgeR_DEG), DOWN(limma_voom_DEG))
)

绘制共同差异基因的热图

# Plotting matrix: log2(cpm + 1) of the count matrix.
cpm_mat <- cpm(exp)
dat <- log2(cpm_mat + 1)

# Heatmap restricted to the genes shared by all three packages
# (up-regulated first, then down-regulated).
common_genes <- c(up, down)
hp <- draw_heatmap(dat[common_genes, ], Group, n_cutoff = 2)

上调、下调基因分别画维恩图

# The plotting function needs a named list; the names become the set labels
# shown in the diagram.
deg_tables <- list(
  Deseq2 = DESeq2_DEG,
  edgeR = edgeR_DEG,
  limma = limma_voom_DEG
)
up_genes <- lapply(deg_tables, UP)
down_genes <- lapply(deg_tables, DOWN)

# The second argument is the title of the diagram.
up.plot <- draw_venn(up_genes, "UPgene")
down.plot <- draw_venn(down_genes, "DOWNgene")

拼图

library(patchwork)
# up.plot + down.plot

# Combine the PCA plot, the common-gene heatmap and the two Venn diagrams.
heat_ve_pca <- pca.plot + hp + up.plot + down.plot +
  plot_layout(guides = "collect")
print(heat_ve_pca)

# Pass the plot explicitly: by default ggsave() saves the last plot that was
# displayed, which is not this figure when the code is run without
# auto-printing (e.g. via source()).
ggsave(paste0(proj, "_heat_ve_pca.png"), plot = heat_ve_pca, width = 15, height = 10)

代码来自2021生信技能树数据挖掘课

你可能感兴趣的:(3大差异分析r包:DESeq2、edgeR和limma)