简介
bioconductor介绍
Github介绍
clusterProfiler-book
分析
模式生物
Bioconductor提供下表所列20种模式物种的OrgDb注释包,安装后即可配合clusterProfiler直接进行GO与KEGG功能富集与注释。
Package | Maintainer | Title | 物种 | Index |
---|---|---|---|---|
org.Hs.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Human | 人 | 3 |
org.Mm.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Mouse | 小鼠 | 5 |
org.Rn.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Rat | 大鼠 | 19 |
org.Sc.sgd.db | Bioconductor Package Maintainer | Genome wide annotation for Yeast | 酵母 | 28 |
org.Dm.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Fly | 果蝇 | 31 |
org.At.tair.db | Bioconductor Package Maintainer | Genome wide annotation for Arabidopsis | 拟南芥 | 32 |
org.Dr.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Zebrafish | 斑马鱼 | 37 |
org.Ce.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Worm | 秀丽隐杆线虫 | 44 |
org.Bt.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Bovine | 牛 | 53 |
org.Gg.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Chicken | 鸡 | 56 |
org.Cf.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Canine | 犬 | 61 |
org.Ss.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Pig | 猪 | 64 |
org.Mmu.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Rhesus | 恒河猴 | 70 |
org.EcK12.eg.db | Bioconductor Package Maintainer | Genome wide annotation for E coli strain K12 | 大肠杆菌菌株K12 | 76 |
org.Xl.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Xenopus | 非洲爪蟾 | 111 |
org.Ag.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Anopheles | 按蚊 | 114 |
org.Pt.eg.db | Bioconductor Package Maintainer | Genome wide annotation for Chimp | 黑猩猩 | 121 |
org.Pf.plasmo.db | Bioconductor Package Maintainer | Genome wide annotation for Malaria | 疟原虫 | 132 |
org.EcSakai.eg.db | Bioconductor Package Maintainer | Genome wide annotation for E coli strain Sakai | 大肠杆菌菌株Sakai | 137 |
org.Mxanthus.db | Eduardo Illueca Fernández | Genome wide annotation for Myxococcus xanthus DK 1622 | 黄色粘球菌DK1622 | 944 |
安装及加载
# Install the human annotation package org.Hs.eg.db through BiocManager
# (installing BiocManager itself first when it is not available), then
# attach it to the session.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("org.Hs.eg.db")
library(org.Hs.eg.db)
非模式生物分为两种情况:一种是可以在AnnotationHub上在线获取OrgDb的物种;另一种是在AnnotationHub上找不到对应OrgDb的物种,此时需要自行构建。
非模式生物(一)
AnnotationHub
# Fetch an annotation object for a non-model organism from AnnotationHub.
# BiocManager is installed on demand, then AnnotationHub and clusterProfiler.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("AnnotationHub")
BiocManager::install("clusterProfiler")
library(AnnotationHub)

# Create the hub object and list the records matching the species name.
hub <- AnnotationHub()
query(hub, "Rosa chinensis")

# Retrieve one record by its hub ID.
# NOTE(review): "AH85494" is hard-coded -- confirm it against the query()
# output above before relying on it.
Rosa_chinensis <- hub[["AH85494"]]

# Inspect the object: number of keys and the available columns.
length(keys(Rosa_chinensis))
columns(Rosa_chinensis)
library(clusterProfiler)

# Example (kept commented out; assumes `data` is a data frame whose column V1
# holds gene symbols -- TODO confirm against your own input):
# gene <- as.character(data$V1)
# gene_trans <- mapIds(x = Rosa_chinensis, keys = gene, keytype = "SYMBOL", column = "ENTREZID")
# gene_trans <- na.omit(gene_trans)
# erich.go.BP <- enrichGO(gene = gene, OrgDb = Rosa_chinensis, keyType = "SYMBOL", ont = "BP", pvalueCutoff = 0.01, qvalueCutoff = 0.05, readable = TRUE)
非模式生物(二)
利用EggNOG构建
# Session setup: show the current directory, move to the folder containing the
# eggNOG-mapper output, clear the workspace and attach the packages used below.
# NOTE(review): the setwd() path is machine-specific and rm(list = ls())
# deletes every object in the calling workspace -- adjust before reuse.
getwd()
setwd("E:/script/R/eggNOG前置文件")
rm(list = ls())
library(tidyr)
library(stringr)
library(dplyr)

# STEP 1: read the annotation table (tab-separated, first row is the header).
egg_f <- "diamond.emapper.annotations"
egg <- read.delim(egg_f)

# Replace empty cells with NA so the na.omit() calls further down can drop
# rows that lack the selected annotation.
egg[egg == ""] <- NA
# STEP 2: keep the query name (as GID) together with the eggNOG free-text
# description (as GENENAME); rows where either value is NA are dropped.
gene_info <- na.omit(
  dplyr::select(egg, GID = query_name, GENENAME = `eggNOG.free.text.desc.`)
)

# STEP 3-1: keep the query name together with its GO annotation string;
# rows without a GO annotation are dropped.
gterms <- na.omit(dplyr::select(egg, query_name, GOs))
# STEP 3-2: build the gene-to-GO table in long format.
# Columns: GID (the query_name), GO (one GO ID per row) and EVIDENCE.
# A query can carry several comma-separated GO IDs, so each ID gets its own
# row while GID and EVIDENCE repeat.
#
# EVIDENCE is always "IEA" (Inferred from Electronic Annotation), i.e. the
# annotation was assigned automatically without manual review:
# http://wiki.geneontology.org/index.php/Inferred_from_Electronic_Annotation_(IEA)
# IEA applies to (1) manually constructed mappings between external
# classification systems and GO terms and (2) automatic transfer of
# annotation to orthologous gene products.
#
# The split is done on the whole column at once instead of growing the data
# frame with rbind() inside a 1:nrow() loop; this is linear in the number of
# rows and also yields a correct empty table when gterms has no rows.
go_terms <- str_split(as.character(gterms[["GOs"]]), ",")
go_counts <- lengths(go_terms)
gene2go <- data.frame(
  GID = rep(gterms[["query_name"]], times = go_counts),
  # as.character() keeps the column when there is nothing to unlist (NULL).
  GO = as.character(unlist(go_terms, use.names = FALSE)),
  EVIDENCE = rep("IEA", sum(go_counts))
)
# Drop exact duplicate rows before the table is used to build the package.
gene2go <- unique(gene2go)
# STEP 4-1: keep the query name (as GID) together with its KEGG KO annotation
# (as KO); rows without a KO annotation are dropped.
gene2ko <- na.omit(dplyr::select(egg, GID = query_name, KO = KEGG_ko))
# STEP 4-2: build the lookup tables pathway2name and ko2pathway.
# Disabled by default; change the condition to TRUE to regenerate
# "kegg_info.RData" from a freshly downloaded JSON file.
if (FALSE) {
  # Requires the KEGG Orthology hierarchy as JSON (updated frequently):
  # https://www.genome.jp/kegg-bin/get_htext?ko00001
  # Code adapted from: http://www.genek.tv/course/225/task/4861/show
  library(jsonlite)
  library(purrr)
  library(RCurl)

  # Parse the KEGG hierarchy in `json` and save two tibbles to
  # "kegg_info.RData":
  #   pathway2name: columns Pathway (e.g. "ko00010") and Name
  #   ko2pathway:   columns Ko and Pathway
  # The hierarchy is walked three levels deep; the third level holds the
  # pathway entries and their children hold the KO entries.
  update_kegg <- function(json = "ko00001.json") {
    kegg <- fromJSON(json)
    level_a <- kegg[["children"]]

    # Collect the pieces in lists and bind once at the end instead of
    # re-copying the tables on every iteration. Seeding with typed empty
    # tibbles keeps the columns when no pathway is found.
    pathway_rows <- list(tibble(Pathway = character(), Name = character()))
    ko_rows <- list(tibble(Ko = character(), Pathway = character()))

    for (a in seq_along(level_a[["children"]])) {
      level_b <- level_a[["children"]][[a]]
      for (b in seq_along(level_b[["children"]])) {
        level_c <- level_b[["children"]][[b]]
        for (c in seq_along(level_c[["children"]])) {
          pathway_info <- level_c[["name"]][[c]]
          pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
          # Strip the trailing "[PATH:koNNNNN]" tag and the leading 5-digit code.
          pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>%
            str_replace("[0-9]{5} ", "")
          pathway_rows[[length(pathway_rows) + 1L]] <-
            tibble(Pathway = pathway_id, Name = pathway_name)

          kos_info <- level_c[["children"]][[c]][["name"]]
          kos <- str_match(kos_info, "K[0-9]*")[, 1]
          ko_rows[[length(ko_rows) + 1L]] <-
            tibble(Ko = kos, Pathway = rep(pathway_id, length(kos)))
        }
      }
    }

    # The object names matter: save() stores them under these names and the
    # later load() expects them.
    pathway2name <- do.call(rbind, pathway_rows)
    ko2pathway <- do.call(rbind, ko_rows)
    save(pathway2name, ko2pathway, file = "kegg_info.RData")
  }
  update_kegg(json = "ko00001.json")
}
# STEP 5: link genes to pathways through their KO identifiers, then keep the
# gene (GID) and pathway columns.
# kegg_info.RData provides pathway2name and ko2pathway (written by the
# update_kegg() helper in the disabled block of this script).
load(file = "kegg_info.RData")
colnames(ko2pathway) <- c("KO", "Pathway")
colnames(gene2ko) <- c("GID", "KO")

# One row per (gene, KO): split the comma-separated KO lists.
gene2ko_1 <- gene2ko %>% separate_rows(KO, sep = ",")

# Remove the "ko:" prefix by name rather than by fixed character positions and
# positional column indexing; the latter only works when the split result is a
# base data.frame and every entry carries the prefix.
gene2ko_2 <- gene2ko_1 %>%
  dplyr::mutate(KO = str_replace(KO, "^ko:", "")) %>%
  dplyr::distinct() %>%
  as.data.frame()

# Attach pathways; KOs without a pathway yield NA and are dropped.
# A gene whose KOs share a pathway would otherwise appear several times with
# the same (GID, Pathway) pair, so exact duplicates are removed.
gene2pathway <- gene2ko_2 %>%
  left_join(ko2pathway, by = "KO") %>%
  dplyr::select(GID, Pathway) %>%
  na.omit() %>%
  dplyr::distinct()
# STEP 6: build a custom OrgDb package from the tables prepared above.
# Look up the NCBI Taxonomy entry of the species first, e.g. for sesame:
# https://www.ncbi.nlm.nih.gov/taxonomy/?term=sesame
#
# The three values below are placeholders and must be replaced. Avoid stray
# whitespace: genus and species are pasted into the package name at the end
# of this block.
tax_id <- "XXX"
genus <- "XXX"
species <- "XX "

library(AnnotationForge)
makeOrgPackage(
  gene_info = gene_info,
  go = gene2go,
  # Pass the cleaned table (one KO per row, "ko:" prefix removed) rather than
  # the raw comma-joined annotation strings held in gene2ko.
  ko = gene2ko_2,
  pathway = gene2pathway,
  version = "0.0.1",
  outputDir = ".",
  tax_id = tax_id,
  genus = genus,
  species = species,
  goTable = "go",
  # Placeholder in "Name <email>" form; replace with the real maintainer.
  maintainer = "XXX <xxx@example.com>",
  author = "XXX"
)

# Name of the generated package: "org." + first letter of the genus in upper
# case + species + ".eg.db".
X.orgdb <- str_c("org.", str_to_upper(str_sub(genus, 1, 1)), species, ".eg.db", sep = "")