library(dplyr)
library(Seurat)
library(patchwork)
# Read the PBMC 3k dataset from the 10X output directory.
pbmc.data <- Read10X(
  data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/"
)
# Build the Seurat object from the raw (non-normalized) data, passing
# min.cells and min.features as creation-time thresholds.
pbmc <- CreateSeuratObject(
  counts = pbmc.data,
  project = "pbmc3k",
  min.cells = 3,
  min.features = 200
)
# Show a summary of the freshly created object.
pbmc
Read10X函数读取数据后返回的是UMI count矩阵,接下来用count矩阵创建Seurat对象。在这一步就可以做初步质控:min.cells = n是指一个基因至少在n个细胞里表达才会被保留,min.features = m是指一个细胞至少表达m个基因才会被保留。
这一步包括QC、数据标准化、确定高变异基因、缩放(归一化)。
细胞低质量的指标
# Per-cell percentage for the features whose names match "^MT-",
# stored on the object under the name "percent.mt".
pbmc[["percent.mt"]] <- PercentageFeatureSet(
  pbmc,
  pattern = "^MT-"
)
PercentageFeatureSet用来计算每个细胞中名称匹配某个pattern的基因所占的百分比。
# Violin plots of the three per-cell QC metrics, laid out in three columns.
VlnPlot(
  pbmc,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)
还可以看不同特征的线性关系
# Scatter plots relating pairs of per-cell QC metrics.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
# Render both plots side by side (patchwork `+`); without this the scatter
# plot is only assigned and never drawn.
plot1 + plot2
# Draw a scatter plot of two features.
# NOTE(review): `object`, `feature1` and `feature2` are placeholders, so this
# looks like the usage signature with argument defaults rather than a call
# meant to be run — confirm.
FeatureScatter(
object,# Seurat object
feature1,# first feature
feature2,
cells = NULL,# which cells to use
group.by = NULL,
cols = NULL,
pt.size = 1,
shape.by = NULL,
span = NULL,
smooth = FALSE,
combine = TRUE,
slot = "data",# data source: 'counts', 'data' or 'scale.data'
plot.cor = TRUE,# show the correlation in the title
raster = NULL
)
# Normalize the data with the "LogNormalize" method and a scale factor of 10000.
pbmc <- NormalizeData(
  pbmc,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)
鉴定在细胞之间表达水平变异性高的基因,并将这些基因用于下游降维分析。函数默认返回2000个feature,不知道这个参数对下游有什么影响?
# Select 2000 variable features with the "vst" method.
pbmc <- FindVariableFeatures(
  pbmc,
  selection.method = "vst",
  nfeatures = 2000
)

# Keep the names of the 10 most highly variable genes for labelling.
top10 <- head(VariableFeatures(pbmc), 10)

# Variable-feature plot, once plain and once with the top 10 genes labelled,
# shown side by side.
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(
  plot = plot1,
  points = top10,
  repel = TRUE
)
plot1 + plot2
对于seurat对象:
# Usage of FindVariableFeatures() for a Seurat object, with argument defaults.
# NOTE(review): `object` and `...` are placeholders, so this looks like a
# reference signature rather than a call meant to be run — confirm.
FindVariableFeatures(
object,
assay = NULL,
selection.method = "vst",
loess.span = 0.3,
clip.max = "auto",
mean.function = FastExpMean,
dispersion.function = FastLogVMR,
num.bin = 20,
binning.method = "equal_width",
nfeatures = 2000,
mean.cutoff = c(0.1, 8),
dispersion.cutoff = c(1, Inf),
verbose = TRUE,
...
)
在PCA降维前对数据做线性转换。ScaleData()有如下作用:1.转换每个基因的表达值,使细胞间平均表达值为0。2.缩放每个基因的表达值,使细胞间的variance为1,这个步骤让所有基因在下游分析中有相同的权重,避免高表达基因占主导。3.结果存储在pbmc[["RNA"]]@scale.data中。
默认的ScaleData函数只使用上一步返回的高变异基因,但是可以自己选择参与ScaleData的基因。DoHeatmap()用的是scale.data,为了不对heatmap造成影响,可以对所有基因都scale。
# Scale every gene in the object rather than a subset of features.
all.genes <- rownames(pbmc)
pbmc <- ScaleData(
  pbmc,
  features = all.genes
)
在ScaleData()
这一步可以消去细胞周期或线粒体污染来源的异质性对细胞分群造成的影响。用vars.to.regress
参数,例如pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")
。
# S3 method for Seurat
# NOTE(review): `object` and `...` are placeholders, so this looks like the
# usage signature with argument defaults rather than a call meant to be run
# — confirm.
ScaleData(
object,
features = NULL,# defaults to the variable features
assay = NULL,
vars.to.regress = NULL,# e.g. nUMI or percent.mito
split.by = NULL,
model.use = "linear",
use.umi = FALSE,
do.scale = TRUE,
do.center = TRUE,
scale.max = 10,
block.size = 1000,
min.cells.to.block = 3000,
verbose = TRUE,
...
)
默认情况下,只使用前面确定的高变异基因作为input,但是可以使用features参数定义自己想用的基因集。
# Run PCA, restricted to the variable features of the object.
pbmc <- RunPCA(
  pbmc,
  features = VariableFeatures(object = pbmc)
)
Seurat提供了几个可视化的方法,如VizDimLoadings()、DimPlot()和DimHeatmap()。在这几个可视化中,DimHeatmap()的结果参考价值最大,它允许我们快速查看异质性来源,比如通过每个维度图看细胞类型marker。图中细胞和基因将会依据PCA分数排序,通过设置cells参数能快速查看表达谱两端的极端细胞。
# Feature loadings for the first two dimensions of the "pca" reduction.
VizDimLoadings(
  pbmc,
  dims = 1:2,
  reduction = "pca"
)
# Cells plotted in the "pca" reduction.
DimPlot(
  pbmc,
  reduction = "pca"
)
# Usage of DimHeatmap() with argument defaults.
# NOTE(review): `object` is a placeholder, so this looks like a reference
# signature rather than a call meant to be run — confirm.
DimHeatmap(
object,
dims = 1,# dimensions to plot, given as a vector
nfeatures = 30,
cells = NULL,# number of cells to plot; if a number is given, the extreme cells at both ends are shown
reduction = "pca",
disp.min = -2.5,
disp.max = NULL,
balanced = TRUE,# show genes with both positive and negative extreme values
projected = FALSE,
ncol = NULL,
fast = TRUE,
raster = TRUE,
slot = "scale.data",
assays = NULL,
combine = TRUE
)
关于维度选择
除了上述的DimHeatmap,还有JackStraw和Elbow方法。对于大型数据,推荐使用速度较快的Elbow方法。
# Elbow plot for the pbmc object, used here to help choose the number of dimensions.
ElbowPlot(pbmc)
# Usage with argument defaults.
# NOTE(review): `object` is a placeholder, so this line looks like a reference
# signature rather than a call meant to be run — confirm.
ElbowPlot(object, ndims = 20, reduction = "pca")
待续