R语言获取 GO/KEGG 中某一通路的基因集

有时候我们想看某一特定通路的富集情况,这时就需要获取该通路的基因集。

一、从 GO 中获取

下载脚本 getGoTerm.R,获取所有的 GO geneSet,然后保存为RData,因为 get_GO_data 这一步非常耗时。

source("getGoTerm.R")
GO_DATA <- get_GO_data("org.Hs.eg.db", "ALL", "SYMBOL")
save(GO_DATA, file = "GO_DATA.RData")

然后,我写了两个小函数 findGOgetGO 分别用于寻找 GO ID 和获取 GO geneSet。

findGO <- function(pattern, method = "key"){

    if(!exists("GO_DATA"))
        load("GO_DATA.RData")
    if(method == "key"){
        pathways = cbind(GO_DATA$PATHID2NAME[grep(pattern, GO_DATA$PATHID2NAME)])
    } else if(method == "gene"){
        pathways = cbind(GO_DATA$PATHID2NAME[GO_DATA$EXTID2PATHID[[pattern]]])
    }

    colnames(pathways) = "pathway"

    if(length(pathways) == 0){
        cat("No results!\n")
    } else{
        return(pathways)
    }
}

getGO <- function(ID){

    if(!exists("GO_DATA"))
        load("GO_DATA.RData")
    allNAME = names(GO_DATA$PATHID2EXTID)
    if(ID %in% allNAME){
        geneSet = GO_DATA$PATHID2EXTID[ID]
        names(geneSet) = GO_DATA$PATHID2NAME[ID]
        return(geneSet)     
    } else{
        cat("No results!\n")
    }
}

用法示例

load("GO_DATA.RData") # 载入数据 GO_DATA
findGO("insulin") # 寻找 含有指定关键字的 pathway name 的 pathway
findGO("INS", method = "gene") # 寻找含有指定基因名的 pathway

getGO("GO:1901142") # 获取指定 GO ID 的 gene set
$`insulin metabolic process`
[1] "CEACAM1" "CPE"     "ERN1"    "IDE"     "PCSK2"   "ERO1B"

二、从KEGG 中获取

1. 获取目标通路的ID号

以人类糖尿病相关通路为例,访问 https://www.kegg.jp/kegg/pathway.html ,在 Select prefix 中输入人类的缩写 hsaEnter keywords 中输入关键词 diabetes,检索到了8个相关通路,点进第一个 Type II diabetes mellitus,可以查看详细信息,ID号为 hsa04930。

2. 开始获取
# 方法一
if(!requireNamespace("BiocManager", quietly = TRUE))      
   install.packages("BiocManager") 
BiocManager::install("KEGGREST", version = "3.10")

library("KEGGREST")
gsInfo = keggGet('hsa04930')[[1]]
names(gsInfo)

geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])
geneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])
names(geneSet) = gsInfo$NAME
> geneSet
$`Type II diabetes mellitus - Homo sapiens (human)`
 [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
 [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
[15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
[22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
[29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
[36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"

# 方法二
library("rjson")
download.file("http://togows.dbcls.jp/entry/pathway/hsa04930/genes.json", "hsa04930.json")
json = fromJSON(file ="hsa04930.json")
geneSet = list(as.character(sapply(json[[1]], function(x) sapply(strsplit(x[1], ";"), function(x) x[1]))))
> geneSet
[[1]]
 [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
 [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
[15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
[22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
[29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
[36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"

不过第二种方法不能获取geneSet的 NAME,因此更推荐第一种方法。

同样将以上步骤包装为函数:

getKEGG <- function(ID){

    library("KEGGREST")

    gsList = list()
    for(xID in ID){

        gsInfo = keggGet(xID)[[1]]
        if(!is.null(gsInfo$GENE)){
            geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])   
            xgeneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])          
            NAME = sapply(strsplit(gsInfo$NAME, " - "), function(x) x[1])
            names(xgeneSet) = NAME
            gsList[NAME] = xgeneSet 
        } else{
            cat(" ", xID, "No corresponding gene set in specific database.\n")
        }
    }
    return(gsList)
}

> getKEGG('hsa04930')
$`Type II diabetes mellitus - Homo sapiens (human)`
 [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
 [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
[15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
[22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
[29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
[36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"

> getKEGG(c("hsa04930", "hsa04940", "hsa04950"))
$`Type II diabetes mellitus`
 [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
 [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
[15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
[22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
[29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
[36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"

$`Type I diabetes mellitus`
 [1] "INS"      "GAD1"     "GAD2"     "PTPRN"    "PTPRN2"   "CPE"
 [7] "HSPD1"    "ICA1"     "HLA-DMA"  "HLA-DMB"  "HLA-DOA"  "HLA-DOB"
[13] "HLA-DPA1" "HLA-DPB1" "HLA-DQA1" "HLA-DQA2" "HLA-DQB1" "HLA-DRA"
[19] "HLA-DRB1" "HLA-DRB3" "HLA-DRB4" "HLA-DRB5" "CD80"     "CD86"
[25] "CD28"     "IL12A"    "IL12B"    "IL2"      "IFNG"     "HLA-A"
[31] "HLA-B"    "HLA-C"    "HLA-F"    "HLA-G"    "HLA-E"    "FASLG"
[37] "FAS"      "PRF1"     "GZMB"     "LTA"      "TNF"      "IL1A"
[43] "IL1B"

$`Maturity onset diabetes of the young`
 [1] "HHEX"    "MNX1"    "ONECUT1" "PDX1"    "NR5A2"   "NEUROG3" "NKX2-2"
 [8] "NKX6-1"  "PAX6"    "PAX4"    "NEUROD1" "RFX6"    "HES1"    "HNF1B"
[15] "FOXA2"   "MAFA"    "HNF4A"   "HNF1A"   "HNF4G"   "FOXA3"   "PKLR"
[22] "SLC2A2"  "INS"     "IAPP"    "GCK"     "BHLHA15"

Ref

用clusterProfiler做GSEA(一)

从KEGG的pathway中提取gene symbol

R-下载某一条通路的所有基因名字(KEGG)

你可能感兴趣的:(R语言获取 GO/KEGG 中某一通路的基因集)