KEGG基因富集分析

#kegg:基因功能存储在pathway数据库里
rm(list = ls())  ## 魔幻操作,一键清空~

load(file = 'deg.Rdata')

head(deg)

## 不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。

logFC_t=1.5
deg=nrDEG

deg$g=ifelse(deg$P.Value>0.05,'stable',
             
             ifelse( deg$logFC > logFC_t,'UP',
                     
                     ifelse( deg$logFC < -logFC_t,'DOWN','stable') )
             
)

table(deg$g)

head(deg)

deg$symbol=rownames(deg)

library(ggplot2)

library(clusterProfiler)

library(org.Hs.eg.db)

df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
           
           toType = c( "ENTREZID"),
           
           OrgDb = org.Hs.eg.db)

head(df)

DEG=deg

head(DEG)

#merge函数:将两个数据集合并
#用于指定依据哪个列合并,常用于当两个数据集公共列名不一样的时候;
DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')

head(DEG)
#标注好的差异基因
save(DEG,file = 'anno_DEG.Rdata')





gene_up= DEG[DEG$g == 'UP','ENTREZID'] 

gene_down=DEG[DEG$g == 'DOWN','ENTREZID'] 

gene_diff=c(gene_up,gene_down)

gene_all=as.character(DEG[ ,'ENTREZID'] )

data(geneList, package="DOSE")

head(geneList)

boxplot(geneList)

boxplot(DEG$logFC)



geneList=DEG$logFC

names(geneList)=DEG$ENTREZID

geneList=sort(geneList,decreasing = T)





## KEGG pathway analysis

### 做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
library(org.Hs.eg.db)
library("clusterProfiler")
if(T){
  
  ###   over-representation test 超几何分布检验分析
  #kegg基因富集
  kk.up <- enrichKEGG(gene         = gene_up,
                      
                      organism     = 'hsa',
                      
                      universe     = gene_all,
                      
                      pvalueCutoff = 0.9,
                      
                      qvalueCutoff =0.9)
  
  head(kk.up)[,1:6]
  
  dotplot(kk.up )
  library(ggplot2)
  ggsave('kk.up.dotplot.png')
  #分析下调基因
  kk.down <- enrichKEGG(gene         =  gene_down,
                        
                        organism     = 'hsa',
                        
                        universe     = gene_all,
                        
                        pvalueCutoff = 0.9,
                        
                        qvalueCutoff =0.9)
  
  head(kk.down)[,1:6]
  
  dotplot(kk.down );ggsave('kk.down.dotplot.png')
  
  kk.diff <- enrichKEGG(gene         = gene_diff,
                        
                        organism     = 'hsa',
                        
                        pvalueCutoff = 0.05)
  
  head(kk.diff)[,1:6]
  
  dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
  

  kegg_diff_dt <- as.data.frame(kk.diff)
  
  kegg_down_dt <- as.data.frame(kk.down)
  
  kegg_up_dt <- as.data.frame(kk.up)
  
  down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,]
  down_kegg$group=-1
  
  up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,]
  up_kegg$group=1
  #调用其他文件中的函数
  source('functions.R')
  
  g_kegg=kegg_plot(up_kegg,down_kegg)
  
  print(g_kegg)
  
  
  
  ggsave(g_kegg,filename = 'kegg_up_down.png')
  
  
  
  ###  GSEA 基因富集
  
  kk_gse <- gseKEGG(geneList     = geneList,
                    
                    organism     = 'hsa',
                    
                    nPerm        = 1000,
                    
                    minGSSize    = 120,
                    
                    pvalueCutoff = 0.9,
                    
                    verbose      = FALSE)
  
  head(kk_gse)[,1:6]
  
  gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
  
  #处理数据,挑选一部分数据
  
  down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,]
  down_kegg$group=-1
  
  up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,]
  up_kegg$group=1
  
  
  
  g_kegg=kegg_plot(up_kegg,down_kegg)
  
  print(g_kegg)
  
  ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
  
  
  
  
  
}



### GO database analysis 

### 做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。

{
  
  
  
  g_list=list(gene_up=gene_up,
              
              gene_down=gene_down,
              
              gene_diff=gene_diff)
  
  
  
  if(F){
    
    go_enrich_results <- lapply( g_list , function(gene) {
      
        ego <- enrichGO(gene          = gene,
                        
                        universe      = gene_all,
                        
                        OrgDb         = org.Hs.eg.db,
                        
                        ont           = ont ,
                        
                        pAdjustMethod = "BH",
                        
                        pvalueCutoff  = 0.99,
                        
                        qvalueCutoff  = 0.99,
                        
                        readable      = TRUE)
        
        
        
        print( head(ego) )
        
        return(ego)
        
      })
      
    })
    
    save(go_enrich_results,file = 'go_enrich_results.Rdata')
    
    
    
  
    
  
  
  
  
  
  load(file = 'go_enrich_results.Rdata')
  
  
  
  n1= c('gene_up','gene_down','gene_diff')
  
  n2= c('BP','MF','CC') 
  
  for (i in 1:3){
    
    for (j in 1:3){
      
      fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
      
      cat(paste0(fn,'\n'))
      
      png(fn,res=150,width = 1080)
      
      print( dotplot(go_enrich_results[[i]][[j]] ))
      
      dev.off()
      
    }
    
  }
  
  }
 
#Step5:GSEA基因富集 
library(ggplot2)

library(clusterProfiler)

library(org.Hs.eg.db)



### 对 MigDB中的全部基因集 做GSEA分析。

# http://www.bio-info-trainee.com/2105.html

# http://www.bio-info-trainee.com/2102.html 

{
  
  load(file = 'anno_DEG.Rdata')
  
  geneList=DEG$logFC
  
  names(geneList)=DEG$symbol
  
  geneList=sort(geneList,decreasing = T)
  
  #选择gmt文件(MigDB中的全部基因集)
  
  d='./GEO-master/MsigDB/symbols'
  
  gmts <- list.files(d,pattern = 'all')
  
  gmts
  
  #GSEA分析
  
  library(GSEABase) # BiocManager::install('GSEABase')
  
  ## 下面使用lapply循环读取每个gmt文件,并且进行GSEA分析
  
  ## 如果存在之前分析后保存的结果文件,就不需要重复进行GSEA分析。
  
  f='gsea_results.Rdata'
  
  if(!file.exists(f)){
    #laaply循环读取,要求输入数据必须是list
    #laaply 高批量处理函数, 遍历列表向量内的每个元素,并且使用指定函数来对其元素进行处理。返回列表向量。
    
    gsea_results <- lapply(gmts, function(gmtfile){
      
      # gmtfile=gmts[2]
      
      geneset <- read.gmt(file.path(d,gmtfile)) 
      
      print(paste0('Now process the ',gmtfile))
      
      egmt <- GSEA(geneList, TERM2GENE=geneset, verbose=FALSE)
      
      head(egmt)
      
      # gseaplot(egmt, geneSetID = rownames(egmt[1,]))
      
      
      
      return(egmt)
      
    })
    
    # 上面的代码耗时,所以保存结果到本地文件
    
    save(gsea_results,file = f)
    
  }
  
  load(file = f)
  
  
  #提取gsea结果,熟悉这个对象
  
  gsea_results_list<- lapply(gsea_results, function(x){
    
    cat(paste(dim(x@result)),'\n')
    
    x@result
    
  })
  
  gsea_results_df <- do.call(rbind, gsea_results_list)
  
  gseaplot(gsea_results[[2]],'KEGG_CELL_CYCLE') 
  
  gseaplot(gsea_results[[2]],'FARMER_BREAST_CANCER_CLUSTER_6') 
  
  gseaplot(gsea_results[[5]],'GO_CONDENSED_CHROMOSOME_OUTER_KINETOCHORE') 
  
  
  
}

 

你可能感兴趣的:(生物信息)