单细胞富集分析系列:
- 单细胞之富集分析-1:单细胞GSEA分析流程
- 单细胞之富集分析-2:批量GSEA和GSVA分析
单细胞富集分析我最常用的是分组GSVA,但最近用到了GO分析,就复习一下GO和KEGG富集分析及绘图。
1. 数据集准备
library(Seurat)
library(patchwork)
library(clusterProfiler)
library(org.Mm.eg.db) ##加载小鼠
library(org.Hs.eg.db) ##加载人类
library(tidyverse)
载入无比熟悉的pbmc.3k数据集 (已注释好,数据准备见monocle)
# Load the pre-annotated pbmc3k Seurat object from disk.
pbmc <- readRDS(file = "pbmc.rds")

# Number of cells per annotated cell type.
table(pbmc$cell_type)
#  Naive CD4 T Memory CD4 T   CD14+ Mono            B        CD8 T FCGR3A+ Mono
#          711          480          472          344          279          162
#           NK           DC     Platelet
#          144           32           14
pbmc3k数据集只有1个样本,无法区分健康对照(HC)组和病例组。
若有分组,可以使用subset函数将某种细胞取出,来做这种细胞病例组和对照组相比的差异基因和富集分析
2. 计算差异基因
- 使用Seurat包的`FindMarkers`函数来计算差异基因。
ident.1是病例组,ident.2是对照组。(这里只做演示,计算的是和Naive CD4 T相比,Memory CD4 T的差异基因)
# Differential expression of Memory CD4 T (ident.1) against Naive CD4 T
# (ident.2). Both thresholds are set to 0 so that no gene is pre-filtered.
dge.celltype <- FindMarkers(
  object = pbmc,
  ident.1 = "Memory CD4 T",
  ident.2 = "Naive CD4 T",
  group.by = "cell_type",
  logfc.threshold = 0,
  min.pct = 0
)
saveRDS(object = dge.celltype, file = "deg.rds")

# All significant DEGs: adjusted p < 0.05 and |log2FC| > 0.15.
# which() drops NA comparisons, matching what subset() does.
sig_dge.all <- dge.celltype[
  which(with(dge.celltype, p_val_adj < 0.05 & abs(avg_log2FC) > 0.15)),
]
View(sig_dge.all)
- 分组可视化
# Heatmap of the strongest up- and down-regulated genes between the two groups.

# Up-regulated genes, largest positive log2FC first.
sig_dge.up <- subset(dge.celltype, p_val_adj < 0.05 & avg_log2FC > 0.15)
sig_dge.up <- sig_dge.up[order(sig_dge.up$avg_log2FC, decreasing = TRUE), ]
# head() instead of [1:30, ]: with fewer than 30 hits, [1:30, ] pads the
# result with NA rows whose row names are not gene symbols.
sig_dge.up_TOP30 <- rownames(head(sig_dge.up, 30))

# Down-regulated genes, most negative log2FC first. Sorting in decreasing
# order here would select the 30 *weakest* down-regulated genes.
sig_dge.down <- subset(dge.celltype, p_val_adj < 0.05 & avg_log2FC < -0.15)
sig_dge.down <- sig_dge.down[order(sig_dge.down$avg_log2FC, decreasing = FALSE), ]
sig_dge.down_TOP30 <- rownames(head(sig_dge.down, 30))

diffall <- c(sig_dge.up_TOP30, sig_dge.down_TOP30)

# Keep only the two compared cell types.
Idents(pbmc) <- "cell_type"
# `idents` spelled out in full rather than relying on partial matching.
pbmc_sub <- subset(pbmc, idents = c("Memory CD4 T", "Naive CD4 T"))
Idents(pbmc_sub) <- "cell_type"
View(pbmc_sub)

# Per-group average of the scaled expression, restricted to the selected genes.
# NOTE: the variable name shadows base::matrix; kept for continuity.
matrix <- AverageExpression(object = pbmc_sub, assays = "RNA", slot = "scale.data")[[1]]
# drop = FALSE keeps a matrix even if only one gene matches.
matrix <- matrix[rownames(matrix) %in% diffall, , drop = FALSE]

# Clip to [-2, 2] so a few extreme values do not dominate the colour scale.
matrix[matrix > 2] <- 2
matrix[matrix < -2] <- -2

# pheatmap is never attached with library() in this script, so call it with an
# explicit namespace. `cluster_rows` is spelled out in full.
p <- pheatmap::pheatmap(
  matrix,
  show_colnames = TRUE,
  show_rownames = TRUE,
  cluster_cols = TRUE,
  cluster_rows = TRUE,
  border_color = NA,
  color = colorRampPalette(c("navy", "white", "firebrick3"))(50)
)
#' Save a pheatmap result to a PDF file
#'
#' Opens a PDF device, draws the `gtable` element of `x` on a fresh grid page
#' and closes the device again.
#'
#' @param x An object with a `gtable` element that `grid::grid.draw()` can
#'   draw, such as the value returned by `pheatmap()`.
#' @param filename Path of the PDF file to write.
#' @param width,height Page size passed to `pdf()`.
#' @return The value of `dev.off()`.
save_pheatmap_pdf <- function(x, filename, width=8, height=15) {
  stopifnot(!missing(x))
  stopifnot(!missing(filename))
  # Fail before opening a device, so a bad input leaves no empty file behind.
  stopifnot(!is.null(x$gtable))
  pdf(filename, width = width, height = height)
  pdf_dev <- dev.cur()
  # If drawing fails, still close the device opened above. On the normal path
  # the device is already closed and this is a no-op.
  on.exit(if (pdf_dev %in% dev.list()) dev.off(pdf_dev), add = TRUE)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}
# Write the heatmap to disk with the default page size.
save_pheatmap_pdf(x = p, filename = "diff_heatmap.pdf")
3. GO富集分析(分为BP, CC和MF)
# Enrich BP, CC and MF together in a single call.
ego_ALL <- enrichGO(
  gene = row.names(sig_dge.all),
  # universe = row.names(dge.celltype),  # optional: use the tested genes as background
  OrgDb = "org.Hs.eg.db",
  keyType = "SYMBOL",
  ont = "ALL", # "ALL" computes BP, CC and MF at once
  pAdjustMethod = "BH",
  pvalueCutoff = 0.01,
  qvalueCutoff = 0.05
)
ego_all <- data.frame(ego_ALL)
# Write the data frame rather than the S4 result object, consistent with the
# per-ontology exports (enrichGO_cc.csv, enrichGO_mf.csv, enrichGO_bp.csv).
write.csv(ego_all, "enrichGO_all.csv")
View(ego_all)
# Enrich CC, MF and BP separately. The three calls differ only in `ont`, so
# they share one small helper.
run_enrich_go <- function(genes, ontology) {
  enrichGO(
    gene = genes,
    # universe = row.names(dge.celltype),
    OrgDb = "org.Hs.eg.db",
    keyType = "SYMBOL",
    ont = ontology,
    pAdjustMethod = "BH",
    pvalueCutoff = 0.01,
    qvalueCutoff = 0.05
  )
}

# Cellular component
ego_CC <- run_enrich_go(row.names(sig_dge.all), "CC")
ego_cc <- data.frame(ego_CC)
write.csv(ego_cc, "enrichGO_cc.csv")

# Molecular function
ego_MF <- run_enrich_go(row.names(sig_dge.all), "MF")
ego_mf <- data.frame(ego_MF)
write.csv(ego_mf, "enrichGO_mf.csv")

# Biological process
ego_BP <- run_enrich_go(row.names(sig_dge.all), "BP")
ego_bp <- data.frame(ego_BP)
write.csv(ego_bp, "enrichGO_bp.csv")
绘图
- 最普通的图,也是一般生信公司出报告的图,略丑。
# One bar chart per ontology, showing the top 10 terms each.
p_BP <- barplot(ego_BP, showCategory = 10) +
  ggtitle("barplot for Biological process")
p_CC <- barplot(ego_CC, showCategory = 10) +
  ggtitle("barplot for Cellular component")
p_MF <- barplot(ego_MF, showCategory = 10) +
  ggtitle("barplot for Molecular function")

# Stack the three panels vertically with patchwork and save them as one PDF.
plotc <- p_BP / p_CC / p_MF
ggsave(filename = "enrichGO.pdf", plot = plotc, width = 12, height = 10)
- 使用ggplot绘图(更灵活)
# I usually plot only the BP ontology, which tends to be the most informative.
ego_bp <- ego_bp[order(ego_bp$p.adjust), ]
# head() instead of [1:30, ]: with fewer than 30 enriched terms, [1:30, ]
# pads the table with all-NA rows that would then be passed on to ggplot.
ego_bp_top30 <- head(ego_bp, 30)
ggplot(data = ego_bp_top30, aes(x = Description, y = Count)) +
  geom_bar(stat = "identity", width = 0.8, fill = "salmon1") +
  coord_flip() +
  xlab("GO term") +
  ylab("Num of Genes") +
  theme_bw()
条形之所以长短不齐、不按顺序排列,是因为没有排序:ggplot默认按Description的字母顺序排列条目,需要先按p值排序,再把Description转换为按该顺序排列的因子。
# Sort by p-value, largest first. The ordering vector must come from ego_bp
# itself; indexing ego_bp with an order computed on ego_all (a different
# table) scrambles the rows and can introduce NA rows.
ego_bp <- ego_bp[order(ego_bp$pvalue, decreasing = TRUE), ]
# Freeze the sorted order as factor levels so ggplot keeps it. unique() avoids
# the "factor level is duplicated" error if two terms share a description.
ego_bp$Description <- factor(ego_bp$Description, levels = unique(ego_bp$Description))
# Refresh the plotting subset: after the descending sort, the 30 most
# significant terms are the last 30 rows. With coord_flip() the last level is
# drawn at the top, so the most significant term ends up at the top.
ego_bp_top30 <- tail(ego_bp, 30)
排完序之后再重新画图,条形就会按p值的顺序排列了。
4. KEGG富集分析
# enrichKEGG() is given Entrez IDs here, so map the DEG symbols first.
genelist <- bitr(
  row.names(sig_dge.all),
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = "org.Hs.eg.db"
)
# Keep only the Entrez ID column as a plain vector.
genelist <- genelist[["ENTREZID"]]

# KEGG pathway enrichment for human ("hsa").
ekegg <- enrichKEGG(gene = genelist, organism = "hsa")

# Bar chart on top of dot plot, top 20 pathways each, saved as one PNG.
p1 <- barplot(ekegg, showCategory = 20)
p2 <- dotplot(ekegg, showCategory = 20)
plotc <- p1 / p2
ggsave(filename = "enrichKEGG.png", plot = plotc, width = 12, height = 10)
附:单细胞测序数据的差异表达分析方法总结