edgeR、limma、DESeq2三种差异表达包比较(RNA-seq数据)

文章目录

    • 1. 加载R包和输入数据
    • 2. 表达数据整理
    • 3.edgeR包做差异表达
    • 4.limma包做差异表达
    • 5.DESeq2包做差异表达
    • 6.比较三种包差异表达基因筛选结果
    • 总结:

1. 加载R包和输入数据

rm(list = ls())
library("DESeq2")
library("limma")
library("edgeR")
expr = read.csv("mRNA_exprSet.csv",sep = ',',header=T)  
head(expr)
  • TCGA-mRNA数据链接
    链接:https://pan.baidu.com/s/1b6l4qg40NjDNCkyiEow4aA
    提取码:7zii

2. 表达数据整理

  • 对重复基因名取平均表达量,然后将基因名作为行名
expr = avereps(expr[,-1],ID = expr$X) # 自定义
  • 去除低表达的基因
expr = expr[rowMeans(expr)>1,] # 自定义
  • 表达矩阵分组(癌症组织和癌旁组织)
library(stringr)
tumor <- colnames(expr)[as.integer(substr(colnames(expr),14,15)) < 10]
normal <- colnames(expr)[as.integer(substr(colnames(expr),14,15)) >= 10]

tumor_sample <- expr[,tumor]
normal_sample <- expr[,normal]

exprSet_by_group <- cbind(tumor_sample,normal_sample)
group_list <- c(rep('tumor',ncol(tumor_sample)),rep('normal',ncol(normal_sample)))

save(exprSet_by_group, group_list, file = 'exprSet_by_group_list.Rdata')

3.edgeR包做差异表达

  • 表达矩阵
data = exprSet_by_group
  • 分组矩阵
group_list = factor(group_list)
design <- model.matrix(~0+group_list)
rownames(design) = colnames(data)
colnames(design) <- levels(group_list)
  • 差异表达矩阵
DGElist <- DGEList( counts = data, group = group_list)
## Counts per Million or Reads per Kilobase per Million
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2 ## 自定义
table(keep_gene)
DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]

DGElist <- calcNormFactors( DGElist )
DGElist <- estimateGLMCommonDisp(DGElist, design)
DGElist <- estimateGLMTrendedDisp(DGElist, design)
DGElist <- estimateGLMTagwiseDisp(DGElist, design)

fit <- glmFit(DGElist, design)
results <- glmLRT(fit, contrast = c(-1, 1)) 
nrDEG_edgeR <- topTags(results, n = nrow(DGElist))
nrDEG_edgeR <- as.data.frame(nrDEG_edgeR)
head(nrDEG_edgeR)
  • 提取基因差异显著的差异矩阵
padj = 0.01 # 自定义
foldChange= 2 # 自定义
nrDEG_edgeR_signif  = nrDEG_edgeR[(nrDEG_edgeR$FDR < padj & 
                       (nrDEG_edgeR$logFC>foldChange | nrDEG_edgeR$logFC<(-foldChange))),]
nrDEG_edgeR_signif = nrDEG_edgeR_signif[order(nrDEG_edgeR_signif$logFC),]
save(nrDEG_edgeR_signif,file = 'nrDEG_edgeR_signif.Rdata')

4.limma包做差异表达

  • 表达矩阵
data = exprSet_by_group
  • 分组矩阵
group_list = factor(group_list)
design <- model.matrix(~0+group_list)
rownames(design) = colnames(data)
colnames(design) <- levels(group_list)
  • 差异表达矩阵
DGElist <- DGEList( counts = data, group = group_list )
keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2 # 自定义
table(keep_gene)
DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]

DGElist <- calcNormFactors( DGElist )
v <- voom(DGElist, design, plot = TRUE, normalize = "quantile")
fit <- lmFit(v, design)
cont.matrix <- makeContrasts(contrasts = c('tumor-normal'), levels = design)

fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

nrDEG_limma_voom = topTable(fit2, coef = 'tumor-normal', n = Inf)
nrDEG_limma_voom = na.omit(nrDEG_limma_voom)
head(nrDEG_limma_voom)
  • 提取基因差异显著的差异矩阵
padj = 0.01 # 自定义
foldChange= 2 # 自定义
nrDEG_limma_voom_signif = nrDEG_limma_voom[(nrDEG_limma_voom$adj.P.Val < padj & 
                          (nrDEG_limma_voom$logFC>foldChange | nrDEG_limma_voom$logFC<(-foldChange))),]
nrDEG_limma_voom_signif = nrDEG_limma_voom_signif[order(nrDEG_limma_voom_signif$logFC),]
save(nrDEG_limma_voom_signif, file = 'nrDEG_limma_voom_signif')

5.DESeq2包做差异表达

  • 表达矩阵
data = exprSet_by_group
  • 分组矩阵
condition = factor(group_list)
coldata <- data.frame(row.names = colnames(data), condition)
dds <- DESeqDataSetFromMatrix(countData = data,
                              colData = coldata,
                              design = ~condition)
dds$condition<- relevel(dds$condition, ref = "normal") # 指定哪一组作为对照组
  • 差异表达矩阵
dds <- DESeq(dds)  
allDEG2 <- as.data.frame(results(dds))
  • 提取基因差异显著的差异矩阵
padj = 0.01 # 自定义
foldChange= 2 # 自定义
nrDEG_DESeq2_signif = allDEG2[(allDEG2$padj < padj & 
                          (allDEG2$log2FoldChange>foldChange | allDEG2$log2FoldChange<(-foldChange))),]
nrDEG_DESeq2_signif = nrDEG_DESeq2_signif[order(nrDEG_DESeq2_signif$log2FoldChange),]
save(nrDEG_DESeq2_signif, file = 'nrDEG_DESeq2_signif')

6.比较三种包差异表达基因筛选结果

edgeR = rownames(nrDEG_edgeR_signif)
dim(nrDEG_edgeR_signif)
limma = rownames(nrDEG_limma_voom_signif)
dim(nrDEG_limma_voom_signif)
DESeq2 = rownames(nrDEG_DESeq2_signif)
dim(nrDEG_DESeq2_signif)
library(VennDiagram)
venn.diagram(
  x = list(
    'edgeR(2675)' = edgeR,
    'limma(2618)' = limma,
    'DESeq2(2913)' = DESeq2
  ),
  filename = 'VN.png',
  col = "black",
  fill = c("dodgerblue", "goldenrod1", "darkorange1"),
  alpha = 0.5,
  cex = 0.8,
  cat.col = 'black',
  cat.cex = 0.8,
  cat.fontface = "bold",
  margin = 0.05,
  main = "三种包的差异表达基因比较",
  main.cex = 1.2
)

edgeR、limma、DESeq2三种差异表达包比较(RNA-seq数据)_第1张图片

总结:

三种包共同筛选出的差异表达基因有1989个,总体来说筛选效果相差不是很大。

本博客内容将同步更新到个人微信公众号生信玩家。欢迎大家关注~~~
在这里插入图片描述

你可能感兴趣的:(转录组学)