clusterProfiler进行GSEA富集分析

不同于ORA富集分析(仅需要差异表达基因列表),GSEA富集分析需要基因排序列表(a ranked list of genes),一般根据logFC对基因排序。

1. 载入包,读入数据

# Start from an empty workspace and switch to the project directory.
# NOTE(review): rm(list = ls()) and setwd() tie the script to one session
# and one machine; consider project-relative paths instead.
rm(list = ls())
setwd("your_dir")

# Load the differential-expression table; the first CSV column is used
# as row names.
mydata <- read.csv(file = "your_diff_expr_file", row.names = 1)
head(mydata)

# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("msigdbr")

library(clusterProfiler)
library(enrichplot)
library(msigdbr)  # Molecular Signatures Database
library(org.Hs.eg.db)  #人类GO注释数据

2.  准备基因排序列表

# Build the ranked gene list for GSEA: a numeric vector of logFC values,
# named by Entrez ID and sorted in decreasing order.

# The steps below read these two columns; fail early if either is missing.
stopifnot(all(c("symbol", "logFC") %in% colnames(mydata)))

# Map gene symbols to Entrez IDs.
map_df <- bitr(mydata$symbol, fromType = "SYMBOL", toType = "ENTREZID",
               OrgDb = org.Hs.eg.db)
# head(map_df)
# head(mydata)
# merge() defaults to an inner join, so rows whose symbol has no mapping
# are not carried over.
merged_df <- merge(mydata, map_df, by.x = "symbol", by.y = "SYMBOL")
# head(merged_df)

# The ranking needs one finite score per gene ID. Drop rows with a missing
# logFC, then keep a single row per Entrez ID: the one with the largest
# absolute logFC. (Averaging duplicates is an equally valid choice.)
rank_df <- merged_df[!is.na(merged_df$logFC), ]
rank_df <- rank_df[order(abs(rank_df$logFC), decreasing = TRUE), ]
rank_df <- rank_df[!duplicated(rank_df$ENTREZID), ]

FCgenelist <- rank_df$logFC
names(FCgenelist) <- rank_df$ENTREZID  # named vector
FCgenelist <- sort(FCgenelist, decreasing = TRUE)  # descending order

3. MSigDb数据富集分析

# Show the species msigdbr supports, then fetch the human gene sets.
msigdbr_species()
Hs_msigdbr <- msigdbr(species = "Homo sapiens")
colnames(Hs_msigdbr)

# Keep the gene-set name plus the two gene identifier columns.
Hs_df <- as.data.frame(
  Hs_msigdbr[, c("gs_name", "entrez_gene", "gene_symbol")]
)
head(Hs_df)

# GSEA against MSigDB; TERM2GENE pairs each gene-set name with an Entrez ID.
em_msig <- GSEA(
  geneList = FCgenelist,
  TERM2GENE = Hs_df[, c("gs_name", "entrez_gene")]
)
# barplot(em_msig, showCategory = 10)
head(em_msig, 20)  # sorted by pvalue

# enriched in high risk group: 5 gene sets
p1 <- gseaplot2(em_msig, geneSetID = 1:5, subplots = 1:2, base_size = 20)
p2 <- gseaplot2(
  em_msig,
  geneSetID = c(6:7, 11:13),
  subplots = 1:2,
  base_size = 20
)

p3 <- gseaplot2(
  em_msig,
  geneSetID = c(6:8, 12:13),
  subplots = 1:2,
  base_size = 1,
  color = "white"
)
# Result columns of interest:
# Description, enrichmentScore, pvalue, p.adjust, qvalues, rank
# cowplot::plot_grid(p4, p5, p6, ncol = 1, labels = LETTERS[1:3])

GSEA是通用的富集分析函数,可以通过TERM2GENE自定义基因集(ontologies/pathways)。富集结果已按p值排序。

4.  gseGO 富集分析

# gseGO: GSEA over Gene Ontology terms, using the human annotation database.
egseGO <- gseGO(FCgenelist, OrgDb = org.Hs.eg.db)
class(egseGO)

# Inspect term descriptions and enrichment scores at both ends of the table.
egseGO[1:5, c("Description", "enrichmentScore")]
head(egseGO[, c("Description", "enrichmentScore")])

tail(egseGO[, c("Description", "enrichmentScore")])
dim(egseGO)
# plotting the first five
gseaplot2(egseGO, 1:5, subplots = 1:2, base_size = 20, pvalue_table = TRUE)

gseaplot2(egseGO, 1:5)
dev.new()  # open a new graphics device for the next plot
gseaplot2(egseGO, 1)

# `subplots` is an argument of gseaplot2(), not gseaplot(); passed to
# gseaplot() it is absorbed by `...` and both calls draw the same figure.
# gseaplot2() is used so the two panel selections actually differ.
gseaplot2(egseGO, geneSetID = 1, subplots = 2:3,
          title = egseGO$Description[1])
gseaplot2(egseGO, geneSetID = 1, subplots = 1:2,
          title = egseGO$Description[1])

# Single-panel views of gene set 1, the default view of gene set 4, and a
# combined grid of the three.
p1 <- gseaplot(egseGO, geneSetID = 1, by = "runningScore",
               title = egseGO$Description[1])
p2 <- gseaplot(egseGO, geneSetID = 1, by = "preranked",
               title = egseGO$Description[1])
p3 <- gseaplot(egseGO, geneSetID = 4, title = egseGO$Description[4])
cowplot::plot_grid(p1, p2, p3, ncol = 2, labels = LETTERS[1:3])

5. gseKEGG 富集分析

# gseKEGG: GSEA over KEGG pathways. "hsa" is the default organism and is
# spelled out here for clarity.
egseKEGG <- gseKEGG(FCgenelist, organism = "hsa")

# Inspect pathway descriptions and enrichment scores.
egseKEGG[, c("Description", "enrichmentScore")]
egseKEGG[1:5, c("Description", "enrichmentScore")]

head(egseKEGG, 20)
dim(egseKEGG)
# plotting the first five
# TRUE rather than T: T is an ordinary variable that can be reassigned.
gseaplot2(egseKEGG, 1:5, subplots = 1:2, base_size = 20,
          pvalue_table = TRUE)
dev.new()  # open a new graphics device for the next plot
gseaplot2(egseKEGG, 1:5)

你可能感兴趣的:(r语言,生物信息学)