准备工作
## ----echo=FALSE, results='hide'
library(DOSE)
library(GO.db)
library(org.Hs.eg.db)
library(GSEABase)
library(clusterProfiler)
这里进行包的导入
基因ID类型的转换
bitr转换
# Example gene symbols whose identifiers will be converted below.
x <- c(
  "GPX3", "GLRX", "LBP", "CRYAB", "DEFB1", "HCLS1", "SOD2", "HSPA2",
  "ORM1", "IGFBP1", "PTHLH", "GPC3", "IGFBP3", "TOB1", "MITF", "NDRG1",
  "NR1H4", "FGFR3", "PVR", "IL6", "PTPRM", "ERBB2", "NID2", "LAMB1",
  "COMP", "PLS3", "MCAM", "SPP1", "LAMC1", "COL4A2", "COL4A1", "MYOC",
  "ANXA4", "TFPI2", "CST6", "SLPI", "TIMP2", "CPM", "GGT1", "NNMT",
  "MAL", "EEF1A2", "HGD", "TCN2", "CDA", "PCCA", "CRYM", "PDXK",
  "STC1", "WARS", "HMOX1", "FXYD2", "RBP4", "SLC6A12", "KDELR3", "ITM2B"
)

# Show the key types available in the annotation database.
keytypes(org.Hs.eg.db)

# Gene symbol -> Entrez gene ID.
eg <- bitr(
  x,
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = "org.Hs.eg.db"
)

# Gene symbol -> several target ID types in a single call.
ids <- bitr(
  x,
  fromType = "SYMBOL",
  toType = c("UNIPROT", "ENSEMBL"),
  OrgDb = "org.Hs.eg.db"
)
参数:
- x:基因ID向量
- fromType:目前基因ID类型
- toType:目的转换ID类型,可通过向量模式表示多ID类型
- OrgDb:使用的注释数据包,通过keytypes()函数查看支持的类型
返回:
- 各类型ID组成的数据框
注:
- 对于groupGO, enrichGO和gseGO函数,可以通过keyType参数指定类型,从而省去类型转换的过程(后续)。
bitr_kegg转换
# Load the gcSample example data and take its first gene cluster.
data(gcSample)
hg <- gcSample[[1]]

# Search the KEGG organism table for "human" by common name.
# The search term must use plain ASCII quotes; typographic quotes
# are a parse error in R.
search_kegg_organism("human", by = "common_name")

# Convert the KEGG gene IDs in hg to NCBI protein IDs for organism "hsa".
eg2np <- bitr_kegg(
  hg,
  fromType = "kegg",
  toType = "ncbi-proteinid",
  organism = "hsa"
)
参数:
- hg:基因的ID向量
- fromType:目前基因ID类型
- toType:目标基因ID类型
- organism:支持的物种类型
返回:
- 各类型ID组成的数据框
注:
- 两个Type参数取值必须为以下四种:
kegg:KEGG库中的ID
ncbi-geneid:NCBI中对应的基因ID
ncbi-proteinid:NCBI中对应的蛋白ID
uniprot:UniProt库中对应的蛋白ID
- organism参数可以通过search_kegg_organism()函数获取取值,常用的“hsa”为人类
GO分析
GO分类
# Load the geneList example data set from the DOSE package.
data(geneList, package = "DOSE")

# Keep the names of entries whose absolute value is greater than 2.
gene <- names(geneList)[abs(geneList) > 2]

# Map the selected Entrez IDs to Ensembl IDs and gene symbols.
gene.df <- bitr(
  gene,
  fromType = "ENTREZID",
  toType = c("ENSEMBL", "SYMBOL"),
  OrgDb = org.Hs.eg.db
)

# Group the selected genes by GO term ("CC" ontology, level 3).
ggo <- groupGO(
  gene = gene,
  OrgDb = org.Hs.eg.db,
  ont = "CC",
  level = 3,
  readable = TRUE
)
参数:
- gene:基因ID组成的向量
- OrgDb:注释数据库
- ont:GO描述功能的方式
- level:GO术语的水平
- readable:T时,以symbols形式输出
返回:
- 数据框
- count:基因数目
- GeneRatio:基因的比例
GO过表达测试
# GO over-representation test on the selected genes ("CC" ontology),
# using every gene named in geneList as the background universe.
ego <- enrichGO(
  gene = gene,
  universe = names(geneList),
  OrgDb = org.Hs.eg.db,
  ont = "CC",
  pAdjustMethod = "BH",  # multiple-testing correction method
  pvalueCutoff = 0.01,
  qvalueCutoff = 0.05,
  readable = TRUE
)
参数:
- gene:基因向量
- universe:背景基因,默认为GO数据库,给定后也是和GO取交集
- OrgDb:数据库
- ont:GO术语方面
- pAdjustMethod:p值校正方法
- pvalueCutoff:p值阈值
- qvalueCutoff:q值阈值
- readable:是否使用symbols形式输出
返回:
- 数据框
- GeneRatio:样本基因中对应功能基因占样本基因总体的比例(总体是与对应库取交集的结果数目)
- BgRatio:数据库中对应功能基因占数据库基因总体的比例(总体是与对应库取交集的结果数目)
- count:基因数目
注:
- dropGO(),simplify()丢掉想去掉的GO
- gofilter()将GO分析限制在具体的level
- 可以添加keyType参数,不建议使用,有的类型会有重复
GO基因集富集分析
# GO gene set enrichment analysis over geneList ("CC" ontology).
# Only gene sets with 100 to 500 members are tested.
ego3 <- gseGO(
  geneList = geneList,
  OrgDb = org.Hs.eg.db,
  ont = "CC",
  nPerm = 1000,
  minGSSize = 100,
  maxGSSize = 500,
  pvalueCutoff = 0.05,
  verbose = FALSE
)
参数:
- geneList:进行富集的基因列表
- OrgDb:注释数据库
- ont:GO术语方面
- nPerm:置换的数目
- minGSSize:用于测试的功能集最小容量
- maxGSSize:用于测试的功能集最大容量
- pvalueCutoff:p值阈值
- verbose:是否打印信息
返回:
- 数据框
- setSize:基因集的基因数目
- enrichmentScore:ES分数
- NES:标准化后的ES分数
- rank:?
- leading_edge:前沿集
- core_enrichment:富集的主要基因
KEGG分析
KEGG过表达分析
# KEGG pathway over-representation test for the selected genes (organism "hsa").
kk <- enrichKEGG(
  gene = gene,
  organism = "hsa",
  pvalueCutoff = 0.05
)
KEGG基因集富集分析
# KEGG pathway gene set enrichment analysis over geneList (organism "hsa").
kk2 <- gseKEGG(
  geneList = geneList,
  organism = "hsa",
  nPerm = 1000,
  minGSSize = 120,
  pvalueCutoff = 0.05,
  verbose = FALSE
)
KEGG模块过表达测试
# KEGG module over-representation test for the selected genes (organism "hsa").
mkk <- enrichMKEGG(
  gene = gene,
  organism = "hsa"
)
KEGG模块基因集富集分析
# KEGG module gene set enrichment analysis over geneList (organism "hsa").
# A single, complete call: a duplicated call opener that referenced the
# undefined object `mydata` left the parentheses unbalanced and is removed.
mkk2 <- gseMKEGG(
  geneList = geneList,
  organism = "hsa"
)
疾病分析
可视化
就是各种函数
# Bar plot of the GO classification result (12 categories shown).
barplot(ggo, drop = TRUE, showCategory = 12)

# Plots of the GO over-representation result.
dotplot(ego)
emapplot(ego)
cnetplot(ego, categorySize = "pvalue", foldChange = geneList)
goplot(ego)

# Running-score plot for one gene set of the KEGG GSEA result.
gseaplot(kk2, geneSetID = "hsa04145")

# Open pathway hsa04110 from the KEGG enrichment result in the browser.
browseKEGG(kk, "hsa04110")
library("pathview")

# Render pathway hsa04110 with the values in geneList as gene data.
# The gene limit is set to the largest absolute value in geneList.
hsa04110 <- pathview(
  gene.data = geneList,
  pathway.id = "hsa04110",
  species = "hsa",
  limit = list(gene = max(abs(geneList)), cpd = 1)
)
多基因集功能对比
# Run enrichKEGG on each gene cluster in gcSample and collect the results.
ck <- compareCluster(
  geneCluster = gcSample,
  fun = "enrichKEGG"
)
参数:
- geneCluster:带有命名的列表
- fun:方法取值可以是groupGO,enrichGO,enrichKEGG,enrichDO或者enrichPathway
分类交叉(公式接口)
# Build a table of IDs and values from geneList, keeping |FC| > 1 only.
mydf <- data.frame(Entrez = names(geneList), FC = geneList)
mydf <- mydf[abs(mydf$FC) > 1, ]

# First grouping: sign of FC.
mydf$group <- "upregulated"
mydf$group[mydf$FC < 0] <- "downregulated"

# Second grouping: "B" when |FC| > 2, otherwise "A".
mydf$othergroup <- "A"
mydf$othergroup[abs(mydf$FC) > 2] <- "B"

# Compare KEGG enrichment across every group/othergroup combination.
formula_res <- compareCluster(
  Entrez ~ group + othergroup,
  data = mydf,
  fun = "enrichKEGG"
)
参数:
- :通过公式指定分类组合方法
- data:进行分析的数据
- fun:采取的分析方法
功能对比的可视化
# Dot plot of the list-based cluster comparison.
dotplot(ck)

# Dot plot of the formula-based cluster comparison.
dotplot(formula_res)
参数:
- showCategory:每个基因集绘制其多少功能类
- by:点大小的含义,可以设置为geneRatio,count,rowPercentage
注:
- 点的颜色代表p值,越红证明越富集
参考资料:Guangchuang Yu.2018."Statistical analysis and visualization of functional profiles for genes and gene clusters".http://www.bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.html#go-analysis