原文链接
10、DA(Differential abundance) -
1、overview
- the design of replicated multi-condition experiments to detect changes in composition or expression between conditions.
即通过设计多个实验样本(不同conition的多次重复,一般来说 two conditions and three replications) - 本节主要学习DE(differential expression),基因表达差异。这与之前学习的RNAseq的差异分析最主要的区别在于single cell DE一般是基于细胞注释之后,进行不同细胞之间的差异基因分析。
2、setup data
- 示例数据来自
MouseGastrulationData
包,我们从中提取6个sample - design:a paired design with three replicate batches of two samples each.
- conditions(two samples):td-Tomato positive cells VS negative cells
2.1 下载
library(MouseGastrulationData)
sce.chimera <- WTChimeraData(samples=5:10)
sce.chimera
#将近一个G大小,赶紧保存为Rdata
save(sce.chimera, file="sce.chimera.Rdata")
rm(list=ls())
load(sce.chimera.Rdata)
2.2初步探索数据
#two conditions
table(colData(sce.chimera)$tomato)
# three replication
table(colData(sce.chimera)$tomato,
colData(sce.chimera)$pool)
#six samples
table(colData(sce.chimera)$sample)
#annotated celltype
table(colData(sce.chimera)$celltype.mapped,
colData(sce.chimera)$tomato)
2.3 前处理 preprocessing
if (T){
#--- feature-annotation ---# ensemble→symbol
library(scater)
rownames(sce.chimera) <- uniquifyFeatureNames(
rowData(sce.chimera)$ENSEMBL, rowData(sce.chimera)$SYMBOL)
#--- quality-control ---#
drop <- sce.chimera$celltype.mapped %in% c("stripped", "Doublet")
sce.chimera <- sce.chimera[,!drop] #dim 29453 19426
#--- normalization ---#
sce.chimera <- logNormCounts(sce.chimera)
#--- variance-modelling ---#
library(scran)
dec.chimera <- modelGeneVar(sce.chimera, block=sce.chimera$sample)
chosen.hvgs <- dec.chimera$bio > 0
#--- merging ---#
library(batchelor)
set.seed(01001001)
#Apply a correction to multiple SingleCellExperiment objects,
#while also combining the assay data and column metadata for easy use.
merged <- correctExperiments(sce.chimera,
batch=sce.chimera$sample, #共六个样本(3*2)
subset.row=chosen.hvgs,
PARAM=FastMnnParam( #MNN算法去除指定两组的批次效应
merge.order=list(
list(1,3,5), # WT (3 replicates)
list(2,4,6) # td-Tomato (3 replicates)
)
)
)
#--- clustering ---#
g <- buildSNNGraph(merged, use.dimred="corrected")
clusters <- igraph::cluster_louvain(g)
merged$new.clusters <- factor(clusters$membership
#--- dimensionality-reduction ---#
merged <- runTSNE(merged, dimred="corrected", external_neighbors=TRUE)
merged <- runUMAP(merged, dimred="corrected", external_neighbors=TRUE)
}
merged
2.4 可视化
- 图1:二维可视化,有无明显批次效应
gridExtra::grid.arrange(
plotTSNE(merged, colour_by="tomato", text_by="new.clusters"),
plotTSNE(merged, colour_by=data.frame(pool=factor(merged$pool))),
ncol=2
)
- 图2:new.clusters and annotated celltype
by.label <- table(merged$new.clusters, merged$celltype.mapped)
pheatmap::pheatmap(log2(by.label+1), cluster_cols=FALSE, cluster_rows=FALSE,
color=viridis::viridis(101))
Ordinarily, we would be obliged to perform marker detection to assign biological meaning to these clusters.
For simplicity, we will skip this step by directly using the cell type labels have annotated.
3、detailed DE with one celltype
3.1 Creating pseudo-bulk samples
简单来说就是在已经注释celltype的基础上,将每个样本中聚类到一个celltype中的gene counts加在一起。
summed <- aggregateAcrossCells(merged,
id=colData(merged)[,c("celltype.mapped", "sample")])
#将六个样本中各个样本所有cell的各个celltype.mapped的cell gene counts加在一起
dim(summed)
counts(summed)[1:4,1:4]
head(colData(summed),2)
-
如下图注释,样本5的第一列counts即为注释到在Mesenchyme celltype上的cell所有表达情况概况。
3.2 添加ncells
- 教程示例中,
colnames(colData(summed))
有一列ncells
,为统计相应样本里,注释到相应细胞类型的total cells。在后面DE中,过滤时会用到。 - 但在操作中并没有发现,推测可能是版本更新的原因,我的R还是3.6,而现在教程应该是基于R4.0的对应的R包才可以。有时间去升级一下。
- 现在就手动添加吧
dim(colData(summed))
#各个sample的celltype共有186个
table(merged$celltype.mapped,merged$sample)
#转换成表格
tmp <- as.data.frame(table(merged$celltype.mapped,merged$sample))
tmp <- tmp[!(tmp$Freq==0),]
dim(tmp)
#也是186个,应该对应上了,Freq就是目标ncells
tmp1 <- as.data.frame(colData(summed))[,c("celltype.mapped","sample")]
- 先paste确定公共列,再merge
#paste
tmp$x <- paste(tmp$Var1,tmp$Var2,sep = "-")
tmp1$x <- paste(tmp1$celltype.mapped,tmp1$sample,sep = "-")
#merge
tmp1 <- merge(tmp1,tmp,by="x",sort=F)
#注意sort=F,不要改变顺序
head(tmp1)
#添加到colData
summed$ncells <- tmp1$Freq
head(colData(summed))
3.3 以一个celltype进行差异分析
(1)筛选指定celltype数据
label <- "Mesenchyme"
current <- summed[,label==summed$celltype.mapped]
head(colData(current),10)
(3) edgeR
- DEGList
library(edgeR)
y <- DGEList(counts(current), samples=colData(current))
y
#将counts(current)矩阵与colData(current)组成一个list
####### 小插曲:质控
remove samples with very low library sizes due to failed library preparation or sequencing.
- 思路1:remove combinations containing fewer than 20 cells(筛选cells)
discarded <- current$ncells < 20
summary(discarded)
#丢弃sample里只有不到20个cell的sample,这里的数据都大于20
y <- y[,!discarded]
- 思路2:remove genes that are lowly expressed(筛选gene)
keep <- filterByExpr(y, group=current$tomato)
summary(keep)
y <- y[keep,]
#丢弃了9000+个gene
- matrics design
design <- model.matrix(~factor(pool) + factor(tomato), y$samples)
factor(y$samples$pool)
factor(y$samples$tomato)
design
- 差异分析
y <- estimateDisp(y, design)
summary(y$trended.dispersion)
fit <- glmQLFit(y, design, robust=TRUE)
summary(fit$var.prior)
res <- glmQLFTest(fit, coef=ncol(design))
summary(decideTests(res))
topTags(res)
3.4 all DE
- 上述是以具体的一个celltype进行以edgeR的差异分析。
-
scran
包也提供了一次性的封装好的函数以供使用。
同样可能是因为版本的原因,未找到pseudoBulkDGE()
这个函数。故仅将代码录于此处,之后再尝试一下
在写上述草稿的第二天后,下载了最新版的R4.02,再安装相关Bioconductor包进行分析,的确就成功了(但主要还是使用3.6,需要用到4.0版本时再切换到该版本即可)
save(merged,file="merged.Rdata")
rm(list=ls())
#切换R4.0
load("merged.Rdata")
library(scater)
summed <- aggregateAcrossCells(merged,
id=colData(merged)[,c("celltype.mapped", "sample")])
summed$ncells #果然基于4.0版本的scater会添加ncells列
summed.filt <- summed[,summed$ncells >= 20]
summed.filt$ncells
dim(summed.filt)
# Pulling out a sample-level 'targets' data.frame:
targets <- colData(merged)[!duplicated(merged$sample),]
# Constructing the design matrix:
design <- model.matrix(~factor(pool) + factor(tomato), data=targets)
rownames(design) <- targets$sample
design
library(scran)
de.results <- pseudoBulkDGE(summed.filt,
sample=summed.filt$sample,
label=summed.filt$celltype.mapped,
design=design,
coef=ncol(design),
# 'condition' sets the group size for filterByExpr(),
# to perfectly mimic our previous manual analysis.
condition=targets$tomato
)
is.de <- decideTestsPerLabel(de.results, threshold=0.05)
summarizeTestsPerLabel(is.de)
# Upregulated across most cell types.
up.de <- is.de > 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)
# Downregulated across cell types.
down.de <- is.de < 0 & !is.na(is.de)
head(sort(rowMeans(down.de), decreasing=TRUE), 10)
以上是第十五章differential-expression-between-conditions第一部分的简单流程笔记,主要学习了single cell DEG详见Chapter 14 Multi-sample comparisons
本系列笔记基于OSCA全流程的大致流程梳理,详细原理可参考原文。如有错误,恳请指正!
此外还有刘小泽老师整理的全文翻译笔记,详见目录。
最后本来想绘制个火山图,但结果并没有预想的那么好,草稿先放在这,以后再看看吧。有什么想法的朋友,欢迎留言。
tmp3 <- as.data.frame(do.call(rbind, lapply(tmp, as.matrix)),stringsAsFactors=F)
cell_26 <- names(de.results)
cell_26
cells <- rep(cell_26,each=nrow(tmp3)/26)
tmp3 <- cbind(tmp3,cells,stringsAsFactors=F)
test <- na.omit(tmp3)
test$change=ifelse(test$PValue>0.01, 'stable',
ifelse(test$logFC>1,'UP',
ifelse(test$logFC< -1,'DOWN','stable'))
)
table(test$cells,test$change)
head(test)
test$color <- test$cells
test$color[which(test$change=="stable")] <- "stable"
test$color=factor(test$color)
length(unique(test$cells))
test$'-log10(pvalue)' <- -log10(test$PValue)
p <- ggpubr::ggscatter(test, x="logFC", y="-log10(pvalue)",
color="color",size = 0.7)
save(p,file = "1.pdf")
rm(merged)
pdf("1.pdf")
ggpubr::ggscatter(test, x="logFC", y="-log10(pvalue)",
color="color",size = 0.7)
dev.off()