前言
我们统一选择p<0.05而且abs(logFC)大于1的基因为显著差异表达基因集,对这个基因集用R包做KEGG/GO超几何分布检验分析。然后把表达矩阵和分组信息分别作出cls和gct文件,导入到GSEA软件分析。基本任务是完成这个分析。生信技能树
实验操作
首先根据上一节的结果,按p<0.05且abs(logFC)>1作为筛选标准取差异表达基因。
# Drop genes whose log2 fold change is NA.
DEG <- DEG[!is.na(DEG$log2FoldChange), ]
# Keep significant DEGs: p < 0.05 and |log2FC| > 1.
# which() discards comparisons that evaluate to NA (e.g. a gene whose p-value
# is NA); indexing with the raw logical vector would turn each of them into an
# all-NA row of DEG_filt.
DEG_filt <- DEG[which(abs(DEG$log2FoldChange) > 1 & DEG$pvalue < 0.05), ]
用Y叔的clusterProfiler包进行富集分析。
GO
library(clusterProfiler)
# org.Mm.eg.db is passed as an object to keys()/select() further down, not only
# by name, so the annotation package itself has to be attached.
library(org.Mm.eg.db)
# Provides the %>% and %$% pipe operators.
library(magrittr)
# Convert the DEG identifiers: gene symbol -> Entrez ID.
eg <- bitr(
  rownames(DEG_filt),
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = "org.Mm.eg.db"
)
#### Explore org.Mm.eg.db: collect every Entrez ID that has a GO annotation
ids <- keys(org.Mm.eg.db, keytype = "ENTREZID")
# Query the GO annotation once and drop genes without any GO term.
# select() is namespace-qualified so that a select() from another attached
# package cannot mask the AnnotationDbi method.
id_GO <- subset(
  AnnotationDbi::select(
    org.Mm.eg.db,
    keys = ids,
    columns = c("ENTREZID", "GO")
  ),
  !is.na(GO)
)
length(unique(id_GO$ENTREZID)) # 24139 unique IDs remain (author's run)
org.Mm.eg() # 68314 Entrez IDs in total, 24139 of them with a GO annotation
# org.Mm.egGO has 24139 mapped keys (of 68314 keys)
# Preview of what str(ego) reports once `ego` has been computed below
# (kept as a comment because `ego` does not exist at this point):
#   @ universe : chr [1:23454]
###
# GO enrichment (ontology "BP") of the DEGs with the default background:
# `universe` is deliberately not supplied here.
ego <- enrichGO(
  gene          = eg$ENTREZID,
  OrgDb         = "org.Mm.eg.db",
  keyType       = "ENTREZID",
  ont           = "BP",
  pvalueCutoff  = 0.01,
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)
# Tabular copy of the result (the original note lists ego@result as an
# alternative).
ego_df <- as.data.frame(ego)
# Dot plot limited to 20 categories.
dotplot(ego, showCategory = 20)
在上面的enrichGO调用中加上universe参数后报错,提示强制类型转换错误。
str(ego) # print the structure of `ego`, including the length of its universe slot
结果显示universe有23454个gene,而org.Mm.eg.db里有GO注释的gene有24139个。搜索之后发现了原因:默认的背景基因集只保留在所选GO分类(这里是BP)下有注释的基因,而不是全部有GO注释的基因。
# The universe used by the enrichGO() call above equals the set of IDs
# annotated under "BP".
setequal(subset(id_GO, ONTOLOGY == "BP")$ENTREZID, ego@universe) # TRUE
# Use every detected gene as the background set.
# Symbol -> Entrez ID for all genes in DEG.
DEG_id <- bitr(
  rownames(DEG),
  OrgDb    = "org.Mm.eg.db",
  fromType = "SYMBOL",
  toType   = "ENTREZID"
)
# Restricting the background to GO-annotated genes first is not needed:
# DEG_ALL_GO <- select(org.Mm.eg.db,keys = DEG_id$ENTREZID,columns = c("ENTREZID","GO")) %>% subset(!is.na(GO))
ego2 <- enrichGO(
  gene          = eg$ENTREZID,
  universe      = DEG_id$ENTREZID,
  OrgDb         = "org.Mm.eg.db",
  keyType       = "ENTREZID",
  ont           = "BP",
  pvalueCutoff  = 0.01,
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)
# Analyse all GO ontologies in a single call, with the same background.
ego3 <- enrichGO(
  gene          = eg$ENTREZID,
  universe      = DEG_id$ENTREZID,
  OrgDb         = "org.Mm.eg.db",
  keyType       = "ENTREZID",
  ont           = "ALL",
  pvalueCutoff  = 0.01,
  pAdjustMethod = "BH",
  qvalueCutoff  = 0.05,
  readable      = TRUE
)
ego3_df <- as.data.frame(ego3)
# Pull out the rows of a single ontology.
ego3_BP <- ego3_df[which(ego3_df$ONTOLOGY == "BP"), ]
# library() stops with an error when ggplot2 is missing, whereas require()
# only returns FALSE and lets the script fail later at xlim().
library(ggplot2)
# Dot plot of ego2 limited to 20 categories; the upper x-axis limit is fixed
# at 0.058 and the lower one is left to ggplot2 (NA).
dotplot(ego2, showCategory = 20) + xlim(NA, 0.058) # + scale_size(range = c(2, 6))
# dotplot(ego3, showCategory = 20)
KEGG
# KEGG pathway enrichment of the DEGs against the detected-gene background.
# NOTE(review): use_internal_data = TRUE presumably requires a locally
# installed KEGG.db instead of querying KEGG online - confirm it is available.
ekk <- enrichKEGG(
  gene              = eg$ENTREZID,
  universe          = DEG_id$ENTREZID,
  organism          = "mmu", # "hsa"
  keyType           = "kegg",
  pvalueCutoff      = 0.01,
  pAdjustMethod     = "BH",
  qvalueCutoff      = 0.05,
  use_internal_data = TRUE
)
GSEA
先将基因按不同标准(比如GO、KEGG、癌症特征基因等)分类成多个基因集(S),依据表达值或者变化倍数等对基因排序(L),计算ES(富集得分)、NES(标准化富集得分)再进行统计检验。
GSEA的目的就在于判断S的成员是随机的分布于L上,还是有序的分布于顶部与尾部。
ES计算基本规则是扫描排序后基因序列L,每出现一个基因集(S)中的基因,则增加ES值,反之则减少ES值。ES>0,表明S基因集排在L中的前方,反之则在后方。
# gseGO() expects a named numeric vector, sorted in decreasing order and
# without repeated names.
# Ranking statistic: log2 fold change, named by the row names of DEG_filt
# (gene symbols, the same identifiers that are fed to bitr() as SYMBOL).
genelist <- DEG_filt$log2FoldChange
names(genelist) <- rownames(DEG_filt)
# Remove NA scores and repeated gene names before ranking.
genelist <- genelist[!is.na(genelist) & !duplicated(names(genelist))]
genelist <- sort(genelist, decreasing = TRUE)
# NOTE(review): only the pre-filtered DEGs are ranked here; GSEA is usually
# run on the complete ranked gene list - confirm this is intended.
gsea_BP <- gseGO(
  geneList = genelist,
  OrgDb = "org.Mm.eg.db",
  # The vector is named by gene symbol, so the key type must be SYMBOL;
  # "ENTREZID" would leave no gene that can be matched to a gene set.
  keyType = "SYMBOL",
  ont = "BP"
)
# The exported plotting function is ridgeplot(); original note: joyplot(gsea_BP)
ridgeplot(gsea_BP)
gseaplot(gsea_BP, geneSetID = "GO:0050767", by = "runningScore")