最近因为工作关系,需要重现一篇文章的基因芯片数据分析、查找差异基因,花了一天时间跑通了limma流程,供大家参考。
论文名字为 Identification of inflammatory mediators in patients with Crohn’s disease unresponsive to anti-TNFα therapy, 是三组之间的差异基因比较。想复现这个图:
我图省事,直接从GEO数据库中下载了基因表达量的矩阵文件GSE52746_series_matrix.txt.gz,直接从matrix开始分析,用的是RStudio,R 4.0.3。
# ---- Session setup ----
# Remove every visible object from the global environment.
# NOTE: this does not detach packages or reset options; restart R for a
# truly clean session.
rm(list = ls())
Sys.setenv(LANGUAGE = "en") # show error messages in English
options(stringsAsFactors = FALSE) # do not convert character columns to factors
# Use mirror servers for CRAN and Bioconductor downloads.
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")

# Switch to the project directory. The path is a placeholder ("...") and must
# be edited to a real location; fail early with a clear message otherwise.
work_dir <- "/Users/.../Documents/code_and_graph/gut_microbiome_diff_R"
if (!dir.exists(work_dir)) {
  stop("Working directory does not exist: ", work_dir, call. = FALSE)
}
setwd(work_dir)

# Create the output folder inside the project directory. Previously this ran
# before setwd(), so the folder landed in whatever directory R was started
# from. showWarnings = FALSE keeps re-runs quiet when it already exists.
dir.create("GSE52746_DEG1", showWarnings = FALSE)

# List the files in the working directory (the input files read later are
# expected here).
list.files()
# ---- Packages ----
# Install BiocManager from CRAN if it is not available yet.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Install only the Bioconductor packages that are missing, so re-running the
# script does not reinstall everything. limma is included here; it was
# loaded below but never installed before.
bioc_pkgs <- c("Biobase", "limma", "GEOquery")
is_installed <- vapply(bioc_pkgs, requireNamespace, logical(1), quietly = TRUE)
if (!all(is_installed)) {
  BiocManager::install(bioc_pkgs[!is_installed])
}

# Load the packages.
library(Biobase)
library(limma)
library(GEOquery)
# ---- Read the expression series matrix ----
# Lines starting with "!" are skipped as comments; the first column becomes
# the row names.
matrix_file <- "GSE52746_series_matrix.txt.gz"
exprset <- read.table(
  file = matrix_file,
  header = TRUE,
  row.names = 1,
  comment.char = "!",
  stringsAsFactors = FALSE
)

# Show the table dimensions (rows x columns) and the first few rows.
dim(exprset)
head(exprset)
# ---- Read the group design table ----
# The design is kept in a comma-separated file (it can be prepared in Excel);
# the first column is used as row names.
# NOTE(review): presumably the rows are samples in the same order as the
# columns of the expression table - confirm before trusting the results.
design <- read.table(
  file = "design.csv",
  sep = ",",
  header = TRUE,
  row.names = 1
)
design
# ---- Build the contrast matrix ----
# Each of the three groups is compared against the `con` column of the
# design; the order here defines coef = 1, 2, 3 used when extracting results.
contrast.matrix <- makeContrasts(
  aCDwithoutTNF - con,
  aCDwithTNF - con,
  iCDwithTNF - con,
  levels = design
)
contrast.matrix
# ---- Fit the model and extract per-contrast results ----
# Fit the linear model with lmFit.
fit <- lmFit(exprset, design)
# Re-express the fit in terms of the contrasts with contrasts.fit.
fit2 <- contrasts.fit(fit, contrast.matrix)
# Empirical Bayes moderation with eBayes.
fit2 <- eBayes(fit2)

# Extract the full result table for each contrast: coef = 1 is the first
# column of contrast.matrix, coef = 2 the second, and so on. number = Inf
# returns every row instead of only the top ones.
# The argument is spelled out as `adjust.method` rather than relying on
# partial matching of `adjust`.
deg1_aCDwithoutTNF <- topTable(fit2, coef = 1, number = Inf, adjust.method = "fdr")
deg2_aCDwithTNF <- topTable(fit2, coef = 2, number = Inf, adjust.method = "fdr")
deg3_iCDwithTNF <- topTable(fit2, coef = 3, number = Inf, adjust.method = "fdr")
# ---- Select differentially expressed genes ----
# Keep genes with p < 0.05 and a fold change above 1.5x.
# NOTE(review): the filter uses the raw `P.Value` column, not `adj.P.Val`,
# even though an FDR adjustment was requested upstream - confirm that this
# matches the paper being reproduced.

# Keep the rows of a topTable result whose raw p-value is below `p_cutoff`.
# which() drops NA comparisons, so rows with a missing p-value are excluded
# instead of turning into all-NA rows.
keep_significant <- function(tt, p_cutoff = 0.05) {
  tt[which(tt[["P.Value"]] < p_cutoff), , drop = FALSE]
}

# Keep the rows whose absolute log2 fold change exceeds log2(min_fold_change).
keep_large_change <- function(tt, min_fold_change = 1.5) {
  tt[which(abs(tt[["logFC"]]) > log2(min_fold_change)), , drop = FALSE]
}

# Contrast 1: significance filter, then fold-change filter.
dif1 <- keep_significant(deg1_aCDwithoutTNF)
head(dif1)
dim(dif1)
summary(dif1) # inspect the range of logFC
dif1_gene <- keep_large_change(dif1)
nrow(dif1_gene) # number of genes with more than a 1.5x change

# Contrast 2.
dif2 <- keep_significant(deg2_aCDwithTNF)
dim(dif2)
summary(dif2)
dif2_gene <- keep_large_change(dif2)
nrow(dif2_gene)

# Contrast 3.
dif3 <- keep_significant(deg3_iCDwithTNF)
dim(dif3)
summary(dif3)
dif3_gene <- keep_large_change(dif3)
nrow(dif3_gene)

# Dimensions of the final gene tables for each contrast.
dim(dif1_gene)
dim(dif2_gene)
dim(dif3_gene)

# The gene identifiers are the row names of the filtered tables; these
# vectors feed the Venn diagram.
ge_a <- row.names(dif1_gene)
ge_b <- row.names(dif2_gene)
ge_c <- row.names(dif3_gene)
# ---- Venn diagram of the three gene sets ----
library(VennDiagram) # install the package first if it is missing

# One named set of gene identifiers per contrast.
gene_sets <- list(A = ge_a, B = ge_b, C = ge_c)

# Draw the diagram and save it to a TIFF file.
venn.diagram(
  x = gene_sets,
  filename = "venn figure.tiff", # output file name (TIFF format)
  # circle appearance
  col = "transparent",
  fill = c("red", "blue", "green"),
  alpha = 0.5,
  # count labels inside the regions
  label.col = c(
    "darkred", "white", "darkblue", "white",
    "white", "white", "darkgreen"
  ),
  cex = 2.5,
  fontfamily = "serif",
  fontface = "bold",
  # set (category) labels
  cat.default.pos = "text",
  cat.col = c("darkred", "darkblue", "darkgreen"),
  cat.cex = 2.5,
  cat.fontfamily = "serif",
  cat.dist = c(0.06, 0.06, 0.03),
  cat.pos = 0
)