#kegg:基因功能存储在pathway数据库里
rm(list = ls()) ## 魔幻操作,一键清空~
load(file = 'deg.Rdata')
head(deg)
## 不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
logFC_t=1.5
deg=nrDEG
deg$g=ifelse(deg$P.Value>0.05,'stable',
ifelse( deg$logFC > logFC_t,'UP',
ifelse( deg$logFC < -logFC_t,'DOWN','stable') )
)
table(deg$g)
head(deg)
deg$symbol=rownames(deg)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(df)
DEG=deg
head(DEG)
#merge函数:将两个数据集合并
#用于指定依据哪个列合并,常用于当两个数据集公共列名不一样的时候;
DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
head(DEG)
#标注好的差异基因
save(DEG,file = 'anno_DEG.Rdata')
gene_up= DEG[DEG$g == 'UP','ENTREZID']
gene_down=DEG[DEG$g == 'DOWN','ENTREZID']
gene_diff=c(gene_up,gene_down)
gene_all=as.character(DEG[ ,'ENTREZID'] )
data(geneList, package="DOSE")
head(geneList)
boxplot(geneList)
boxplot(DEG$logFC)
geneList=DEG$logFC
names(geneList)=DEG$ENTREZID
geneList=sort(geneList,decreasing = T)
## KEGG pathway analysis
### 做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
library(org.Hs.eg.db)
library("clusterProfiler")
if(T){
### over-representation test 超几何分布检验分析
#kegg基因富集
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.up)[,1:6]
dotplot(kk.up )
library(ggplot2)
ggsave('kk.up.dotplot.png')
#分析下调基因
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
head(kk.down)[,1:6]
dotplot(kk.down );ggsave('kk.down.dotplot.png')
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,]
down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,]
up_kegg$group=1
#调用其他文件中的函数
source('functions.R')
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down.png')
### GSEA 基因富集
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.9,
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
#处理数据,挑选一部分数据
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,]
down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,]
up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
}
### GO database analysis
### 做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
{
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
if(F){
go_enrich_results <- lapply( g_list , function(gene) {
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
return(ego)
})
})
save(go_enrich_results,file = 'go_enrich_results.Rdata')
load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
print( dotplot(go_enrich_results[[i]][[j]] ))
dev.off()
}
}
}
#Step5:GSEA基因富集
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
### 对 MigDB中的全部基因集 做GSEA分析。
# http://www.bio-info-trainee.com/2105.html
# http://www.bio-info-trainee.com/2102.html
{
load(file = 'anno_DEG.Rdata')
geneList=DEG$logFC
names(geneList)=DEG$symbol
geneList=sort(geneList,decreasing = T)
#选择gmt文件(MigDB中的全部基因集)
d='./GEO-master/MsigDB/symbols'
gmts <- list.files(d,pattern = 'all')
gmts
#GSEA分析
library(GSEABase) # BiocManager::install('GSEABase')
## 下面使用lapply循环读取每个gmt文件,并且进行GSEA分析
## 如果存在之前分析后保存的结果文件,就不需要重复进行GSEA分析。
f='gsea_results.Rdata'
if(!file.exists(f)){
#laaply循环读取,要求输入数据必须是list
#laaply 高批量处理函数, 遍历列表向量内的每个元素,并且使用指定函数来对其元素进行处理。返回列表向量。
gsea_results <- lapply(gmts, function(gmtfile){
# gmtfile=gmts[2]
geneset <- read.gmt(file.path(d,gmtfile))
print(paste0('Now process the ',gmtfile))
egmt <- GSEA(geneList, TERM2GENE=geneset, verbose=FALSE)
head(egmt)
# gseaplot(egmt, geneSetID = rownames(egmt[1,]))
return(egmt)
})
# 上面的代码耗时,所以保存结果到本地文件
save(gsea_results,file = f)
}
load(file = f)
#提取gsea结果,熟悉这个对象
gsea_results_list<- lapply(gsea_results, function(x){
cat(paste(dim(x@result)),'\n')
x@result
})
gsea_results_df <- do.call(rbind, gsea_results_list)
gseaplot(gsea_results[[2]],'KEGG_CELL_CYCLE')
gseaplot(gsea_results[[2]],'FARMER_BREAST_CANCER_CLUSTER_6')
gseaplot(gsea_results[[5]],'GO_CONDENSED_CHROMOSOME_OUTER_KINETOCHORE')
}