原文链接
10、DA(Differential abundance) -

1、overview

the design of replicated multi-condition experiments to detect changes in composition or expression between conditions.
即通过设计多个实验样本（不同conition的多次重复，一般来说 two conditions and three replications）
本节主要学习DE（differential expression），基因表达差异。这与之前学习的RNAseq的差异分析最主要的区别在于single cell DE一般是基于细胞注释之后，进行不同细胞之间的差异基因分析。

2、setup data

示例数据来自MouseGastrulationData包，我们从中提取6个sample
design：a paired design with three replicate batches of two samples each.
conditions(two samples)：td-Tomato positive cells VS negative cells

2.1 下载

library(MouseGastrulationData)
sce.chimera <- WTChimeraData(samples=5:10)
sce.chimera

#将近一个G大小，赶紧保存为Rdata
save(sce.chimera, file="sce.chimera.Rdata")
rm(list=ls())
load(sce.chimera.Rdata)

2.2初步探索数据

#two conditions
table(colData(sce.chimera)$tomato)
# three replication
table(colData(sce.chimera)$tomato,
      colData(sce.chimera)$pool)
#six samples
table(colData(sce.chimera)$sample)
#annotated celltype
table(colData(sce.chimera)$celltype.mapped,
      colData(sce.chimera)$tomato)

2.3 前处理 preprocessing

if (T){
#--- feature-annotation ---# ensemble→symbol
  library(scater)
  rownames(sce.chimera) <- uniquifyFeatureNames(
    rowData(sce.chimera)$ENSEMBL, rowData(sce.chimera)$SYMBOL)
  
  #--- quality-control ---#
  drop <- sce.chimera$celltype.mapped %in% c("stripped", "Doublet")
  sce.chimera <- sce.chimera[,!drop] #dim 29453 19426 
  
  #--- normalization ---#
  sce.chimera <- logNormCounts(sce.chimera)
  
  #--- variance-modelling ---#
  library(scran)
  dec.chimera <- modelGeneVar(sce.chimera, block=sce.chimera$sample)
  chosen.hvgs <- dec.chimera$bio > 0
  
  #--- merging ---#
  library(batchelor)
  set.seed(01001001)
  #Apply a correction to multiple SingleCellExperiment objects, 
  #while also combining the assay data and column metadata for easy use.
  merged <- correctExperiments(sce.chimera, 
                               batch=sce.chimera$sample,  #共六个样本（3*2）
                               subset.row=chosen.hvgs,
                               PARAM=FastMnnParam( #MNN算法去除指定两组的批次效应
                                 merge.order=list(
                                   list(1,3,5), # WT (3 replicates)
                                   list(2,4,6)  # td-Tomato (3 replicates)
                                 )
                               )
  )
  
  #--- clustering ---#
  g <- buildSNNGraph(merged, use.dimred="corrected")
  clusters <- igraph::cluster_louvain(g)
  merged$new.clusters <- factor(clusters$membership
  
  #--- dimensionality-reduction ---#
  merged <- runTSNE(merged, dimred="corrected", external_neighbors=TRUE)
  merged <- runUMAP(merged, dimred="corrected", external_neighbors=TRUE)
}
merged

2-1

2.4 可视化

图1：二维可视化，有无明显批次效应

gridExtra::grid.arrange(
  plotTSNE(merged, colour_by="tomato", text_by="new.clusters"),
  plotTSNE(merged, colour_by=data.frame(pool=factor(merged$pool))),
  ncol=2
)

2-2

图2：new.clusters and annotated celltype

by.label <- table(merged$new.clusters, merged$celltype.mapped)
pheatmap::pheatmap(log2(by.label+1), cluster_cols=FALSE, cluster_rows=FALSE,
                   color=viridis::viridis(101))

Ordinarily, we would be obliged to perform marker detection to assign biological meaning to these clusters.
For simplicity, we will skip this step by directly using the cell type labels have annotated.

2-3

3、detailed DE with one celltype

3.1 Creating pseudo-bulk samples

简单来说就是在已经注释celltype的基础上，将每个样本中聚类到一个celltype中的gene counts加在一起。

summed <- aggregateAcrossCells(merged, 
                               id=colData(merged)[,c("celltype.mapped", "sample")])
#将六个样本中各个样本所有cell的各个celltype.mapped的cell gene counts加在一起
dim(summed)
counts(summed)[1:4,1:4]
head(colData(summed),2)

如下图注释，样本5的第一列counts即为注释到在Mesenchyme celltype上的cell所有表达情况概况。

3-1

3.2 添加ncells

教程示例中，colnames(colData(summed))有一列ncells，为统计相应样本里，注释到相应细胞类型的total cells。在后面DE中，过滤时会用到。
但在操作中并没有发现，推测可能是版本更新的原因，我的R还是3.6，而现在教程应该是基于R4.0的对应的R包才可以。有时间去升级一下。
现在就手动添加吧

dim(colData(summed))
#各个sample的celltype共有186个

table(merged$celltype.mapped,merged$sample)
#转换成表格
tmp <- as.data.frame(table(merged$celltype.mapped,merged$sample))
tmp <- tmp[!(tmp$Freq==0),]
dim(tmp)
#也是186个，应该对应上了，Freq就是目标ncells
tmp1 <- as.data.frame(colData(summed))[,c("celltype.mapped","sample")]

先paste确定公共列，再merge

#paste
tmp$x <- paste(tmp$Var1,tmp$Var2,sep = "-")
tmp1$x <- paste(tmp1$celltype.mapped,tmp1$sample,sep = "-")
#merge
tmp1 <- merge(tmp1,tmp,by="x",sort=F)
#注意sort=F，不要改变顺序
head(tmp1)
#添加到colData
summed$ncells <- tmp1$Freq
head(colData(summed))

3-2

3.3 以一个celltype进行差异分析

（1）筛选指定celltype数据

label <- "Mesenchyme"
current <- summed[,label==summed$celltype.mapped]
head(colData(current),10)

(3) edgeR

DEGList

library(edgeR)
y <- DGEList(counts(current), samples=colData(current))
y
#将counts(current)矩阵与colData(current)组成一个list

####### 小插曲：质控
remove samples with very low library sizes due to failed library preparation or sequencing.

思路1：remove combinations containing fewer than 20 cells（筛选cells）

discarded <- current$ncells < 20
summary(discarded)
#丢弃sample里只有不到20个cell的sample，这里的数据都大于20
y <- y[,!discarded]

思路2：remove genes that are lowly expressed（筛选gene）

keep <- filterByExpr(y, group=current$tomato)
summary(keep)
y <- y[keep,]
#丢弃了9000+个gene

matrics design

design <- model.matrix(~factor(pool) + factor(tomato), y$samples)
factor(y$samples$pool)
factor(y$samples$tomato)
design

差异分析

y <- estimateDisp(y, design)
summary(y$trended.dispersion)
fit <- glmQLFit(y, design, robust=TRUE)
summary(fit$var.prior)
res <- glmQLFTest(fit, coef=ncol(design))
summary(decideTests(res))
topTags(res)

3-3

3.4 all DE

上述是以具体的一个celltype进行以edgeR的差异分析。
scran包也提供了一次性的封装好的函数以供使用。
同样可能是因为版本的原因，未找到pseudoBulkDGE()这个函数。故仅将代码录于此处，之后再尝试一下

在写上述草稿的第二天后，下载了最新版的R4.02，再安装相关Bioconductor包进行分析，的确就成功了（但主要还是使用3.6，需要用到4.0版本时再切换到该版本即可）

3-4

save(merged,file="merged.Rdata")
rm(list=ls())
#切换R4.0
load("merged.Rdata")
library(scater)
summed <- aggregateAcrossCells(merged, 
                               id=colData(merged)[,c("celltype.mapped", "sample")])
summed$ncells #果然基于4.0版本的scater会添加ncells列
summed.filt <- summed[,summed$ncells >= 20]
summed.filt$ncells
dim(summed.filt)
# Pulling out a sample-level 'targets' data.frame:
targets <- colData(merged)[!duplicated(merged$sample),]

# Constructing the design matrix:
design <-  model.matrix(~factor(pool) + factor(tomato), data=targets)
rownames(design) <- targets$sample
design
library(scran)
de.results <- pseudoBulkDGE(summed.filt, 
                            sample=summed.filt$sample,
                            label=summed.filt$celltype.mapped,
                            design=design,
                            coef=ncol(design),
                            
                            # 'condition' sets the group size for filterByExpr(),
                            # to perfectly mimic our previous manual analysis.
                            condition=targets$tomato 
)

3-5

is.de <- decideTestsPerLabel(de.results, threshold=0.05)
summarizeTestsPerLabel(is.de)

# Upregulated across most cell types.
up.de <- is.de > 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)

# Downregulated across cell types.
down.de <- is.de < 0 & !is.na(is.de)
head(sort(rowMeans(down.de), decreasing=TRUE), 10)

3-6

以上是第十五章differential-expression-between-conditions第一部分的简单流程笔记，主要学习了single cell DEG详见Chapter 14 Multi-sample comparisons
本系列笔记基于OSCA全流程的大致流程梳理，详细原理可参考原文。如有错误，恳请指正！
此外还有刘小泽老师整理的全文翻译笔记，详见目录。

最后本来想绘制个火山图，但结果并没有预想的那么好，草稿先放在这，以后再看看吧。有什么想法的朋友，欢迎留言。

tmp3 <- as.data.frame(do.call(rbind, lapply(tmp, as.matrix)),stringsAsFactors=F)
cell_26 <- names(de.results)
cell_26
cells <- rep(cell_26,each=nrow(tmp3)/26)
tmp3 <- cbind(tmp3,cells,stringsAsFactors=F)
test <- na.omit(tmp3)
test$change=ifelse(test$PValue>0.01, 'stable', 
         ifelse(test$logFC>1,'UP',
                ifelse(test$logFC< -1,'DOWN','stable'))
  )
table(test$cells,test$change)

head(test)
test$color <- test$cells
test$color[which(test$change=="stable")] <- "stable"
test$color=factor(test$color)
length(unique(test$cells))
test$'-log10(pvalue)' <- -log10(test$PValue)
p <- ggpubr::ggscatter(test, x="logFC", y="-log10(pvalue)",
          color="color",size = 0.7)
save(p,file = "1.pdf")
rm(merged)
pdf("1.pdf") 
ggpubr::ggscatter(test, x="logFC", y="-log10(pvalue)",
                  color="color",size = 0.7)
dev.off()

9、DEG(Differential expressed genes)