有时候我们想看某一特定通路的富集情况,这时就需要获取该通路的基因集。
一、从 GO 中获取
下载脚本 getGoTerm.R,获取所有的 GO geneSet,然后保存为RData,因为 get_GO_data
这一步非常耗时。
source("getGoTerm.R")
GO_DATA <- get_GO_data("org.Hs.eg.db", "ALL", "SYMBOL")
save(GO_DATA, file = "GO_DATA.RData")
然后,我写了两个小函数 findGO
和 getGO
分别用于寻找 GO ID 和获取 GO geneSet。
findGO <- function(pattern, method = "key"){
if(!exists("GO_DATA"))
load("GO_DATA.RData")
if(method == "key"){
pathways = cbind(GO_DATA$PATHID2NAME[grep(pattern, GO_DATA$PATHID2NAME)])
} else if(method == "gene"){
pathways = cbind(GO_DATA$PATHID2NAME[GO_DATA$EXTID2PATHID[[pattern]]])
}
colnames(pathways) = "pathway"
if(length(pathways) == 0){
cat("No results!\n")
} else{
return(pathways)
}
}
getGO <- function(ID){
if(!exists("GO_DATA"))
load("GO_DATA.RData")
allNAME = names(GO_DATA$PATHID2EXTID)
if(ID %in% allNAME){
geneSet = GO_DATA$PATHID2EXTID[ID]
names(geneSet) = GO_DATA$PATHID2NAME[ID]
return(geneSet)
} else{
cat("No results!\n")
}
}
用法示例
load("GO_DATA.RData") # 载入数据 GO_DATA
findGO("insulin") # 寻找 含有指定关键字的 pathway name 的 pathway
findGO("INS", method = "gene") # 寻找含有指定基因名的 pathway
getGO("GO:1901142") # 获取指定 GO ID 的 gene set
$`insulin metabolic process`
[1] "CEACAM1" "CPE" "ERN1" "IDE" "PCSK2" "ERO1B"
二、从KEGG 中获取
1. 获取目标通路的ID号
以人类糖尿病相关通路为例,访问 https://www.kegg.jp/kegg/pathway.html ,在 Select prefix
中输入人类的缩写 hsa
,Enter keywords
中输入关键词 diabetes
,检索到了8个相关通路,点进第一个 Type II diabetes mellitus
,可以查看详细信息,ID号为 hsa04930。
2. 开始获取
# 方法一
if(!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("KEGGREST", version = "3.10")
library("KEGGREST")
gsInfo = keggGet('hsa04930')[[1]]
names(gsInfo)
geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])
geneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])
names(geneSet) = gsInfo$NAME
> geneSet
$`Type II diabetes mellitus - Homo sapiens (human)`
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
# 方法二
library("rjson")
download.file("http://togows.dbcls.jp/entry/pathway/hsa04930/genes.json", "hsa04930.json")
json = fromJSON(file ="hsa04930.json")
geneSet = list(as.character(sapply(json[[1]], function(x) sapply(strsplit(x[1], ";"), function(x) x[1]))))
> geneSet
[[1]]
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
不过第二种方法不能获取geneSet的 NAME
,因此更推荐第一种方法。
同样将以上步骤包装为函数:
getKEGG <- function(ID){
library("KEGGREST")
gsList = list()
for(xID in ID){
gsInfo = keggGet(xID)[[1]]
if(!is.null(gsInfo$GENE)){
geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])
xgeneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])
NAME = sapply(strsplit(gsInfo$NAME, " - "), function(x) x[1])
names(xgeneSet) = NAME
gsList[NAME] = xgeneSet
} else{
cat(" ", xID, "No corresponding gene set in specific database.\n")
}
}
return(gsList)
}
> getKEGG('hsa04930')
$`Type II diabetes mellitus - Homo sapiens (human)`
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
> getKEGG(c("hsa04930", "hsa04940", "hsa04950"))
$`Type II diabetes mellitus`
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
$`Type I diabetes mellitus`
[1] "INS" "GAD1" "GAD2" "PTPRN" "PTPRN2" "CPE"
[7] "HSPD1" "ICA1" "HLA-DMA" "HLA-DMB" "HLA-DOA" "HLA-DOB"
[13] "HLA-DPA1" "HLA-DPB1" "HLA-DQA1" "HLA-DQA2" "HLA-DQB1" "HLA-DRA"
[19] "HLA-DRB1" "HLA-DRB3" "HLA-DRB4" "HLA-DRB5" "CD80" "CD86"
[25] "CD28" "IL12A" "IL12B" "IL2" "IFNG" "HLA-A"
[31] "HLA-B" "HLA-C" "HLA-F" "HLA-G" "HLA-E" "FASLG"
[37] "FAS" "PRF1" "GZMB" "LTA" "TNF" "IL1A"
[43] "IL1B"
$`Maturity onset diabetes of the young`
[1] "HHEX" "MNX1" "ONECUT1" "PDX1" "NR5A2" "NEUROG3" "NKX2-2"
[8] "NKX6-1" "PAX6" "PAX4" "NEUROD1" "RFX6" "HES1" "HNF1B"
[15] "FOXA2" "MAFA" "HNF4A" "HNF1A" "HNF4G" "FOXA3" "PKLR"
[22] "SLC2A2" "INS" "IAPP" "GCK" "BHLHA15"
Ref
用clusterProfiler做GSEA(一)
从KEGG的pathway中提取gene symbol
R-下载某一条通路的所有基因名字(KEGG)