转录组DEGs聚类热图和功能富集分析

写在前面：经常做转录组分析，就是把差异基因搞个火山图和Venn图看各组差异基因的共有和特有情况。看见有个比较好的选择，能直观比较各种处理带来的影响，如下：

image.png

来自Nature plants的一篇文章
Ref:
https://github.com/YulongNiu/MPIPZ_microbe-host_homeostasis
https://www.nature.com/articles/s41477-021-00920-2
这个图很牛逼啊，表示的信息量很全，值得学习。去扒作者的代码，复现出了大部分

所需文件：
总的基因丰度表，即各个基因在每个样品中的丰度

image.png

各个样品的基因差异表达情况
举一个例子，这是Deseq2算出来的：

image.png

1. 导入数据并做一些处理

setwd("C:/Users/yjk/Desktop/cluster_DEGs")
library("dplyr")
library("ComplexHeatmap")
library("tibble")
library("RColorBrewer")
library("ggplot2")
my_theme()
all_genes <- read.table("all_genes.txt", header = TRUE) # fpkm

# DESeq2获得的差异表达基因(DEGs), |log2foldchange| > 1, padj < 0.05
HF12N_Rs <- read.table("HF12N_Rs.txt", header = TRUE, sep = "\t")
HF12N_S <- read.table("HF12N_S.txt", header = TRUE, sep = "\t")
HF12Rs_N_S <- read.table("HF12Rs_N_S.txt", header = TRUE, sep = "\t")
HF12S_Rs <- read.table("HF12S_Rs.txt", header = TRUE, sep = "\t")
HG64N_Rs <- read.table("HG64N_Rs.txt", header = TRUE, sep = "\t")
HG64N_S <- read.table("HG64N_S.txt", header = TRUE, sep = "\t")
HG64Rs_N_S <- read.table("HG64Rs_N_S.txt", header = TRUE, sep = "\t")
HG64S_Rs <- read.table("HG64S_Rs.txt", header = TRUE, sep = "\t")
# 合并差异表达和基因丰度
all <- HF12N_Rs %>% 
    full_join(HF12N_S) %>% 
    full_join(HF12Rs_N_S) %>% 
    full_join(HF12S_Rs) %>% 
    full_join(HG64N_Rs) %>% 
    full_join(HG64N_S) %>% 
    full_join(HG64Rs_N_S) %>% 
    full_join(HG64S_Rs) %>% 
    left_join(all_genes)
all[is.na(all)] <- "No"
## mean func
meanGp <- function(v) {
    require('magrittr')
    res <- v %>%
        split(rep(1 : 8, each = 3)) %>%
        sapply(mean, na.rm = TRUE)
    return(res)
}

all_for_cluster <- select(all, -contains("vs")) # 只选择原始fpkm数据
rownames(all_for_cluster) <- all_for_cluster$id
all_for_cluster <- all_for_cluster[-1]

## sample name
sampleN <- c("HG64NRs","HG64SRs","HG64N","HG64S",
             "HF12NRs","HF12SRs","HF12N","HF12S")

meanCount <- all_for_cluster %>%
    apply(1, meanGp) %>%
    t

colnames(meanCount) <- sampleN
## scale
scaleCount <- meanCount %>%
    t %>%
    scale %>%
    t
scaleCount %<>% .[complete.cases(.), ]

2. 然后要先计算多少个cluster合适：

# set.seed(123) 聚类结果一致
set.seed(123)
## choose cluster num
## 1. sum of squared error
wss <- (nrow(scaleCount) - 1) * sum(apply(scaleCount, 2, var))

for (i in 2:20) {
    wss[i] <- sum(kmeans(scaleCount,
                         centers=i,
                         algorithm = 'MacQueen')$withinss)
}

ggplot(tibble(k = 1:20, wss = wss), aes(k, wss)) +
    geom_point(colour = '#D55E00', size = 3) +
    geom_line(linetype = 'dashed') +
    xlab('Number of clusters') +
    ylab('Sum of squared error')

# ggsave("Sum_of_squared_error.pdf", height = 3, width = 4)
## 2. Akaike information criterion
kmeansAIC = function(fit){
    m = ncol(fit$centers)
    n = length(fit$cluster)
    k = nrow(fit$centers)
    D = fit$tot.withinss
    return(D + 2*m*k)
}

aic <- numeric(20)
for (i in 1:20) {
    fit <- kmeans(x = scaleCount, centers = i, algorithm = 'MacQueen')
    aic[i] <- kmeansAIC(fit)
}

ggplot(tibble(k = 1:20, aic = aic), aes(k, wss)) +
    geom_point(colour = '#009E73', size = 3) +
    geom_line(linetype = 'dashed') +
    xlab('Number of clusters') +
    ylab('Akaike information criterion')

# ggsave("Akaike_information_criterion.pdf", height = 3, width = 4)
# choose cluster 10

withinss: Vector of within-cluster sum of squares, one component per cluster.

我们上面计算的是withinss，即cluster内的误差平方和，同一个cluster趋势越一致则越小。所以cluster越多则这个值越小，这和我们认知一致。但是，cluster太多了信息很杂，太少了就成了生拉硬扯成cluster了

image.png

可以看出选则10比较合适
利用另外一个参数：赤池信息量准则（Akaike information criterion，简称AIC）来衡量

AIC介绍，https://www.biaodianfu.com/aic-bic.html
理论上来讲，增加自由参数的数目可以提高拟合的优良性，但是过多，模型过于复杂容易造成过拟合现象。因此，AIC不仅要提高模型拟合度（极大似然），而且引入了惩罚项，使模型参数尽可能少，有助于降低过拟合的可能性。

image.png

可以看出，两种方法，同样结果。选择10

3. 然后聚类：

kclust10 <- kmeans(scaleCount, centers = 10, algorithm = "MacQueen", nstart = 1000, iter.max = 20)
cl <- as.data.frame(kclust10$cluster) %>% 
    rownames_to_column("id") %>% 
    set_colnames(c("id","cl"))

heat_all <- all %>% left_join(cl)


# 把所有DEGs的cluster信息保存，为后面富集分析所用
degs_cl <- heat_all %>%
    select(c("id","cl"))
write.table(degs_cl, "./enrichment/degs_cl.txt", sep = "\t", quote = FALSE)


scaleC <- heat_all %>% 
    select(-contains("vs")) %>% 
    select(-c("id","cl")) %>% 
    t %>% 
    scale %>%
    t %>%
    as_tibble %>%
    bind_cols(heat_all %>% select(id, cl))
# 排序
scaleC <- scaleC[,c("HG64S1", "HG64S2", "HG64S3",
                 "HG64N1", "HG64N2", "HG64N3",
                 "HG64SRs1", "HG64SRs2", "HG64SRs3",
                 "HG64NRs1", "HG64NRs2", "HG64NRs3",
                 "HF12S1", "HF12S2", "HF12S3",
                 "HF12N1", "HF12N2", "HF12N3",
                 "HF12SRs1", "HF12SRs2", "HF12SRs3",
                 "HF12NRs1", "HF12NRs2", "HF12NRs3",
                 "id", "cl")]


## col annotation
NatSoil <- HeatmapAnnotation(NatSoil = c(rep(rep(c("No", "Yes", "No", "Yes"), each = 3),2)),
                             col = list(NatSoil = c("Yes" = "grey", "No" = "white")),
                             gp = gpar(col = "black"))
Rs <- HeatmapAnnotation(Rs = c(rep(rep(c("No", "Yes"), each = 6),2)),
                               col = list(Rs = c("Yes" = "grey", "No" = "white")),
                        gp = gpar(col = "black"))

## DEG annotation
deg <- heat_all %>% select(matches("vs"))
# order
deg <- deg[,c("HG64N_vs_HG64S", "HF12N_vs_HF12S", "HG64NRs_vs_HG64SRs", "HF12RsN_vs_HF12RsS", 
              "HG64NRs_vs_HG64N", "HF12NRs_vs_HF12N", "HG64SRs_vs_HG64S", "HF12SRs_vs_HF12S")]
Heatmap(matrix = scaleC[1:24],
        name = 'Scaled Counts',
        row_split = scaleC$cl,
        row_gap = unit(2, "mm"),
        column_order = 1:24,
        # column_split = rep(c("HG64S","HG64N","HG64SRs","HG64NRs",
                             # "HF12S","HF12N","HF12SRs","HF12NRs"), each = 3),
        column_split = factor(rep(c("HG64","HF12"), each = 12), levels = c("HG64","HF12")),
        show_column_names = FALSE,
        col = colorRampPalette(rev(brewer.pal(n = 10, name = 'Spectral'))[c(-3,-8)])(100),
        top_annotation = c(NatSoil, Rs),
        row_title_gp = gpar(fontsize = 10),
        use_raster = FALSE) +
        Heatmap(deg,
                col = c('Down' = '#00bbf9', 'No' = 'white', 'Up' = '#f94144'),
                column_names_gp = gpar(fontsize = 6),
                heatmap_legend_param = list(title = 'DEGs'),
                cluster_columns = FALSE,
                column_names_rot = 45,
                use_raster = FALSE)

image.png

4. 然后每个cluster有什么功能呢？富集分析

#####################
setwd("C:/Users/yjk/Desktop/cluster_DEGs/enrichment")
library("clusterProfiler")
library("magrittr")
library("tidyverse")
library("RColorBrewer")
library("AnnotationHub")
my_theme()

# > packageVersion("AnnotationHub")
# [1] ‘3.0.2’
# packageVersion("clusterProfiler")
# [1] ‘4.0.5’

hub <- AnnotationHub()
# snapshotDate(): 2021-05-18
query(hub, "solanum")
# AnnotationHub with 9 records
# # snapshotDate(): 2021-05-18
# # $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/, Inparanoid8, WikiPathways
# # $species: Solanum lycopersicum, Solanum tuberosum, Solanum pennellii, Solanum pennelli, Sola...
# # $rdataclass: OrgDb, Inparanoid8Db, Tibble
# # additional mcols(): taxonomyid, genome, description, coordinate_1_based, maintainer,
# #   rdatadateadded, preparerclass, tags, rdatapath, sourceurl, sourcetype 
# # retrieve records with, e.g., 'object[["AH10593"]]' 
# 
# title                                             
# AH10593 | hom.Solanum_lycopersicum.inp8.sqlite              
# AH10606 | hom.Solanum_tuberosum.inp8.sqlite                 
# AH91815 | wikipathways_Solanum_lycopersicum_metabolites.rda 
# AH94101 | org.Solanum_pennellii.eg.sqlite                   
# AH94102 | org.Solanum_pennelli.eg.sqlite                    
# AH94160 | org.Solanum_tuberosum.eg.sqlite                   
# AH94246 | org.Solanum_esculentum.eg.sqlite                  
# AH94247 | org.Solanum_lycopersicum.eg.sqlite                
# AH94248 | org.Solanum_lycopersicum_var._humboldtii.eg.sqlite

sly.db <- hub[["AH94247"]]

这里如果遇到提示

hub <- AnnotationHub()
snapshotDate(): 2021-05-18
Warning message:DEPRECATION: As of AnnotationHub (>2.23.2), default caching location has changed.
Problematic cache: C:\Users\yjk\AppData\Local/AnnotationHub/AnnotationHub/Cache
See https://bioconductor.org/packages/devel/bioc/vignettes/AnnotationHub/inst/doc/TroubleshootingTheCache.html#default-caching-location-update

运行下面的命令即可

 moveFiles<-function(package){
      olddir <- path.expand(rappdirs::user_cache_dir(appname=package))
      newdir <- tools::R_user_dir(package, which="cache")
      dir.create(path=newdir, recursive=TRUE)
      files <- list.files(olddir, full.names =TRUE)
      moveres <- vapply(files,
                        FUN=function(fl){
                            filename = basename(fl)
                            newname = file.path(newdir, filename)
                            file.rename(fl, newname)
                        },
                        FUN.VALUE = logical(1))
      if(all(moveres)) unlink(olddir, recursive=TRUE)
  }

下面就可以进行GO和KEGG富集分析了

kmeansRes <- read.table("degs_cl.txt")

prefix <- 'kmeans10'
savepath <- "C:/Users/yjk/Desktop/cluster_DEGs/enrichment"

for (i in kmeansRes$cl %>% unique) {
    ## BP
    goBP <- enrichGO(gene = kmeansRes %>% filter(cl == i) %>% .$id,
                     OrgDb = sly.db,
                     keyType= 'SYMBOL',
                     ont = 'BP',
                     pAdjustMethod = 'BH',
                     pvalueCutoff = 0.05,
                     qvalueCutoff = 0.1)
    
    goBPSim <- clusterProfiler::simplify(goBP,
                                         cutoff = 0.5,
                                         by = 'p.adjust',
                                         select_fun = min)
    ## check and plot
    write.table(as.data.frame(goBPSim),
              paste0(prefix, '_cl', i, '_cp_BP.txt') %>% file.path(savepath, .),
              quote = FALSE,
              sep = "\t")
    
    ## KEGG
    kk2 <- enrichKEGG(gene = kmeansRes %>% filter(cl == i) %>% .$id %>%
                      bitr(.,"SYMBOL", "ENTREZID", sly.db) %>% dplyr::select("ENTREZID") %>%
                          unlist(),
                      organism = 'sly',
                      pvalueCutoff = 0.05)
    
    write.table(as.data.frame(kk2),
              paste0(prefix, '_cl', i, '_cp_KEGG.txt') %>% file.path(savepath, .),
              quote = FALSE,
              sep = "\t")
}

kall <- lapply(kmeansRes$cl %>% unique, function(x) {
    
    eachG <- kmeansRes %>% filter(cl == x) %>% .$id
    
    return(eachG)
    
}) %>%
    set_names(kmeansRes$cl %>% unique %>% paste0('cl', .))

kallGOBP <- compareCluster(geneCluster = kall,
                           fun = 'enrichGO',
                           OrgDb = sly.db,
                           keyType= 'SYMBOL',
                           ont = 'BP',
                           pAdjustMethod = 'BH',
                           pvalueCutoff=0.01,
                           qvalueCutoff=0.1)

kallGOBPSim <- clusterProfiler::simplify(kallGOBP,
                                         cutoff = 0.9,
                                         by = 'p.adjust',
                                         select_fun = min)
dotplot(kallGOBPSim, showCategory = 10) + 
    ggtitle("Biological process")
# ggsave("kallGOBPSim.pdf", width = 6.4, height = 5.4)

kallGOCC <- compareCluster(geneCluster = kall,
                           fun = 'enrichGO',
                           OrgDb = sly.db,
                           keyType= 'SYMBOL',
                           ont = "CC",
                           pAdjustMethod = 'BH',
                           pvalueCutoff=0.01,
                           qvalueCutoff=0.1)

kallGOCCSim <- clusterProfiler::simplify(kallGOCC,
                                         cutoff = 0.9,
                                         by = 'p.adjust',
                                         select_fun = min)
dotplot(kallGOCCSim, showCategory = 20) + 
    ggtitle("Cellular component")
# ggsave("kallGOCCSim.pdf", width = 6.4, height = 4)


kallGOMF <- compareCluster(geneCluster = kall,
                           fun = 'enrichGO',
                           OrgDb = sly.db,
                           keyType= 'SYMBOL',
                           ont = "MF",
                           pAdjustMethod = 'BH',
                           pvalueCutoff=0.01,
                           qvalueCutoff=0.1)

kallGOMFSim <- clusterProfiler::simplify(kallGOMF,
                                         cutoff = 0.9,
                                         by = 'p.adjust',
                                         select_fun = min)
dotplot(kallGOMFSim, showCategory = 10) + 
    ggtitle("Molecular function")
ggsave("kallGOMFSim.pdf", width = 6.9, height = 4)

kallGOBP %>%
    as.data.frame %>%
    write.table('kmeans10_GOBP.txt', quote = FALSE, sep = "\t")

emapplot(kallGOBP,
         showCategory = 5,
         pie='count',
         pie_scale=1,
         layout='kk')


kallKEGG_input <- lapply(kmeansRes$cl %>% unique, function(x) {
    
    eachG <- kmeansRes %>% filter(cl == x) %>% 
        .$id %>% 
        bitr(.,"SYMBOL", "ENTREZID", sly.db) %>% 
        dplyr::select("ENTREZID") %>% 
        unlist()
    
    return(eachG)
    
}) %>%
    set_names(kmeansRes$cl %>% unique %>% paste0('cl', .))
kallKEGG <- compareCluster(geneCluster = kallKEGG_input,   
                           fun = 'enrichKEGG',
                           organism = "sly",
                           pvalueCutoff = 0.05)

dotplot(kallKEGG)
# ggsave('kmeans10_KEGGALL.pdf', width = 8, height = 4)

kallKEGG %>% 
    as.data.frame %>%
    write.table('kmeans10_KEGG.txt', quote = FALSE, sep = "\t")

列一个例子，GO富集的生物学过程

image.png

到此结束

转录组DEGs聚类热图和功能富集分析

1. 导入数据并做一些处理

2. 然后要先计算多少个cluster合适：

withinss: Vector of within-cluster sum of squares, one component per cluster.

3. 然后聚类：

4. 然后每个cluster有什么功能呢？富集分析

你可能感兴趣的:(转录组DEGs聚类热图和功能富集分析)