new_cell_data_set(expression_data, cell_metadata = NULL, gene_metadata = NULL)
expression_data |
expression data matrix for an experiment, can be a sparseMatrix. |
cell_metadata |
data frame containing attributes of individual cells, where |
gene_metadata |
data frame containing attributes of features (e.g. genes), where |
# 设置工作目录
setwd("your/working/path")
# 载入包
library(monocle3)
# packageVersion('monocle3') # monocle版本
# ls("package:monocle3")
library(ggplot2)
library(dplyr)
# 1.分别读入表达矩阵,细胞注释数据框以及基因注释数据框,并构建new_cell_data_set对象
# Load the data
expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_expression.rds"))
cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_colData.rds"))
gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_rowData.rds"))
# Make the CDS object
cds <- new_cell_data_set(expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
# 如果表达矩阵不是"dgCMatrix",要转为sparseMatrix,加快下游的数据运算。
# cds <- new_cell_data_set(as(umi_matrix, "sparseMatrix"),
# cell_metadata = cell_metadata,
# gene_metadata = gene_metadata)
## 2. 从CellRanger的输出构建new_cell_data_set对象
## Provide the path to the Cell Ranger output.
#cds <- load_cellranger_data("~/Downloads/10x_data")
#cds <- load_mm_data(mat_path = "~/Downloads/matrix.mtx",
# feature_anno_path = "~/Downloads/features.tsv",
# cell_anno_path = "~/Downloads/barcodes.tsv")
## 3. 合并new_cell_data_set对象
## combine cds objects
# make a fake second cds object for demonstration
#cds2 <- cds[1:100,]
#big_cds <- combine_cds(list(cds, cds2))
可以先降维看是否有批次效应,消除批次效应后要重新进行降维运算。
plot_cells( cds, x = 1, y = 2, reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"), color_cells_by = "cluster", group_cells_by = c("cluster", "partition"), genes = NULL, show_trajectory_graph = TRUE, trajectory_graph_color = "grey28", trajectory_graph_segment_size = 0.75, norm_method = c("log", "size_only"), label_cell_groups = TRUE, label_groups_by_cluster = TRUE, group_label_size = 2, labels_per_group = 1, label_branch_points = TRUE, label_roots = TRUE, label_leaves = TRUE, graph_label_size = 2, cell_size = 0.35, cell_stroke = I(cell_size/2), alpha = 1, min_expr = 0.1, rasterize = FALSE, scale_to_range = FALSE, label_principal_points = FALSE )
# 表型数据
pData(cds)
# 表达矩阵
exprs(cds)
# preprocess_cds首先通过log和size factor (或仅仅size factor )对数据进行标准化,以解决深度差# 异。接下来,preprocess_cds计算一个低维空间,该空间将用作进一步降维的输入,如tSNE和UMAP。
cds <- preprocess_cds(cds, num_dim = 100)
# whether the PCs capture most of the variation
# in gene expression across all the cells in the data set?
plot_pc_variance_explained(cds)
# reduce_dimension,用pca(n=100)数据降维
# 单细胞数量 >10,000, 设置umap.fast_sgd=TRUE,加快计算速度
cds <- reduce_dimension(cds)
#cds <- reduce_dimension(cds,reduction_method="tSNE")
#获得降纬度后的数据矩阵
reducedDim(cds, "PCA")
reducedDim(cds, "UMAP")
# 根据UMAP作图
plot_cells(cds)
#plot_cells(cds,reduction_method="tSNE")
#check for batch effects
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)
# 去除批次效应
cds = align_cds(cds, num_dim = 100, alignment_group = "plate")
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)
# 根据表型加上颜色
#colnames(pData(cds))
plot_cells(cds, color_cells_by="cao_cell_type")
## 不加label
#plot_cells(cds, color_cells_by="cao_cell_type",label_cell_groups=FALSE)
#plot_cells(cds, reduction_method="tSNE",color_cells_by="cao_cell_type")
# 显示各细胞中特定基因的表达量
plot_cells(cds, genes=c("cpna-2", "egl-21", "ram-2", "inos-1"))
cluster_cells( cds, reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"), k = 20, cluster_method = c("leiden", "louvain"), num_iter = 2, partition_qval = 0.05, weight = FALSE, resolution = NULL, random_seed = NULL, verbose = F, ... )
### 聚类
cds = cluster_cells(cds, resolution=1e-5)
# 根据cluster标记颜色
plot_cells(cds)
# cds = cluster_cells(cds, reduction_method="tSNE",resolution=1e-5)
# plot_cells(cds,reduction_method="tSNE")
# 根据partition标记颜色
plot_cells(cds, color_cells_by="partition", group_cells_by="partition")
# plot_cells(cds, color_cells_by="cao_cell_type")
### Find marker genes expressed by each cluster
marker_test_res <- top_markers(cds, group_cells_by="partition",
reference_cells=1000, cores=8)
# colnames(marker_test_res)
top_specific_markers <- marker_test_res %>%
filter(fraction_expressing >= 0.10) %>%
group_by(cell_group) %>%
top_n(1, pseudo_R2)
top_specific_marker_ids <- unique(top_specific_markers %>% pull(gene_id))
# plot the expression and fraction of cells that express each marker in each group
# 比较不同partition/cluster中marker基因的平均表达量
plot_genes_by_group(cds,
top_specific_marker_ids,
group_cells_by="partition",
ordering_type="maximal_on_diag",
max.size=3)
细胞类型的注释是研究中最难的部分,可能含有新发现的细胞亚群,需要专家知识。细胞亚群往往需要进一步聚类,注释,最后合并一张图上(包含所有的细胞群及精细的注释信息)。
### 1. 手工注释(根据marker基因的表达...)。本例中仅仅把colData中的细胞类型注释到不同的cluster
colData(cds)$assigned_cell_type <- as.character(partitions(cds))
# rename细胞类型
colData(cds)$assigned_cell_type = dplyr::recode(colData(cds)$assigned_cell_type,
"1"="Germline",
"2"="Body wall muscle",
"3"="Unclassified neurons",
"4"="Vulval precursors",
"5"="Failed QC",
"6"="Seam cells",
"7"="Pharyngeal epithelia",
"8"="Coelomocytes",
"9"="Am/PH sheath cells",
"10"="Failed QC",
"11"="Touch receptor neurons",
"12"="Intestinal/rectal muscle",
"13"="Pharyngeal neurons",
"14"="NA",
"15"="flp-1(+) interneurons",
"16"="Canal associated neurons",
"17"="Ciliated sensory neurons",
"18"="Other interneurons",
"19"="Pharyngeal gland",
"20"="Failed QC",
"21"="Ciliated sensory neurons",
"22"="Oxygen sensory neurons",
"23"="Ciliated sensory neurons",
"24"="Ciliated sensory neurons",
"25"="Ciliated sensory neurons",
"26"="Ciliated sensory neurons",
"27"="Oxygen sensory neurons",
"28"="Ciliated sensory neurons",
"29"="Unclassified neurons",
"30"="Socket cells",
"31"="Failed QC",
"32"="Pharyngeal gland",
"33"="Ciliated sensory neurons",
"34"="Ciliated sensory neurons",
"35"="Ciliated sensory neurons",
"36"="Failed QC",
"37"="Ciliated sensory neurons",
"38"="Pharyngeal muscle")
plot_cells(cds, group_cells_by="partition", color_cells_by="assigned_cell_type")
# 交互选择子细胞群
cds_subset <- choose_cells(cds)
## 对子细胞群进行聚类
cds_subset = cluster_cells(cds_subset, resolution=1e-2)
plot_cells(cds_subset, color_cells_by="cluster")
plot_cells(cds, color_cells_by="cluster")
# 注释子细胞群中的细胞类型
colData(cds_subset)$assigned_cell_type <- as.character(clusters(cds_subset))
colData(cds_subset)$assigned_cell_type <- dplyr::recode(colData(cds_subset)$assigned_cell_type,
"1"="Somatic gonad precursors",
"2"="Somatic gonad precursors",
"3"="Vulval precursors",
"4"="Sex myoblasts",
"5"="Sex myoblasts",
"6"="Vulval precursors",
"7"="Failed QC",
"8"="Vulval precursors",
"10"="Unclassified neurons",
"11"="Distal tip cells")
plot_cells(cds_subset, group_cells_by="cluster",
color_cells_by="assigned_cell_type")
# 在所有细胞群图中显示子细胞群的注释信息
colData(cds)[colnames(cds_subset),]$assigned_cell_type <- colData(cds_subset)$assigned_cell_type
cds <- cds[,colData(cds)$assigned_cell_type != "Failed QC" | is.na(colData(cds)$assigned_cell_type )]
plot_cells(cds, group_cells_by="partition",
color_cells_by="assigned_cell_type",
labels_per_group=5)
### 2. Automated annotation with Garnett
assigned_type_marker_test_res <- top_markers(cds,
group_cells_by="assigned_cell_type",
reference_cells=1000,
cores=8)
# Require that markers have at least JS specificty score > 0.5 and
# be significant in the logistic test for identifying their cell type:
garnett_markers <- assigned_type_marker_test_res %>%
filter(marker_test_q_value < 0.01 & specificity >= 0.5) %>%
group_by(cell_group) %>%
top_n(5, marker_score)
# Exclude genes that are good markers for more than one cell type:
garnett_markers <- garnett_markers %>%
group_by(gene_short_name) %>%
filter(n() == 1)
# 生成用于细胞类型注释的文件,可以根据专业知识修改用于细胞类型识别的基因
generate_garnett_marker_file(garnett_markers, file="./marker_file.txt")
colData(cds)$garnett_cluster <- clusters(cds)
# 安装相应的包
BiocManager::install(c("org.Mm.eg.db", "org.Hs.eg.db")) # garnett依赖的包
devtools::install_github("cole-trapnell-lab/garnett", ref="monocle3")
BiocManager::install("org.Ce.eg.db")
library(garnett)
# 训练分类器,训练好的分类器可以保存起来用于其他相似数据的注释
worm_classifier <- train_cell_classifier(cds = cds,
marker_file = "./marker_file.txt",
db=org.Ce.eg.db::org.Ce.eg.db,
cds_gene_id_type = "ENSEMBL",
num_unknown = 50,
marker_file_gene_id_type = "SYMBOL",
cores=8)
#saveRDS(worm_classifier,"worm_classifier.RDS")
#worm_classifier <- readRDS("worm_classifier.RDS")
# 根据分类器进行细胞类型注释
cds <- classify_cells(cds, worm_classifier,
db = org.Ce.eg.db::org.Ce.eg.db,
cluster_extend = TRUE,
cds_gene_id_type = "ENSEMBL")
plot_cells(cds,
group_cells_by="partition",
color_cells_by="cluster_ext_type")
monocle3的差异分析有两种方法:
回归分析:使用fit_models(),您可以评估每个基因是否取决于时间、治疗等变量。
图形自相关分析:使用Graph_test(),您可以找到在轨迹上或簇之间变化的基因。
# 演示数据,取子集,对部分基因做差异表达分析
ciliated_genes <- c("che-1",
"hlh-17",
"nhr-6",
"dmd-6",
"ceh-36",
"ham-1")
cds_subset <- cds[rowData(cds)$gene_short_name %in% ciliated_genes,]
### 1.回归分析差异表达基因
gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time")
# fit_models:该函数适用于细胞数据集中每个基因的广义线性模型。
# 可以提供公式来解释额外的协变量 ~embryo.time + batch
# (例如收集的天数、细胞基因型、培养基条件等)。
# 主要参数1: model_formula_str 可以是~cluster or ~partition等colData列名
# 主要参数2:expression_family:Specifies the family function used for expression responses.
#Can be one of "quasipoisson", "negbinomial", "poisson", "binomial", "gaussian",
#"zipoisson", or "zinegbinomial". Default is "quasipoisson".
class(gene_fits) # "tbl_df" "tbl" "data.frame"
fit_coefs <- coefficient_table(gene_fits)
emb_time_terms <- fit_coefs %>% filter(term == "embryo.time")
emb_time_terms %>% filter (q_value < 0.05) %>%
select(gene_short_name, term, q_value, estimate)
plot_genes_violin(cds_subset, group_cells_by="embryo.time.bin", ncol=2) +
theme(axis.text.x=element_text(angle=45, hjust=1))
# control batch effect
gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time + batch")
# gene_fits2 <- fit_models(cds_subset, model_formula_str = "~cluster+ batch")
fit_coefs <- coefficient_table(gene_fits)
fit_coefs %>% filter(term != "(Intercept)") %>%
select(gene_short_name, term, q_value, estimate)
evaluate_fits(gene_fits)
# 模型比较,是否存在批次效应
time_batch_models <- fit_models(cds_subset,
model_formula_str = "~embryo.time + batch",
expression_family="negbinomial")
time_models <- fit_models(cds_subset,
model_formula_str = "~embryo.time",
expression_family="negbinomial")
compare_models(time_batch_models, time_models) %>% select(gene_short_name, q_value)
# 比较结果:所有基因的似然比检验都是显著的,这表明数据中存在大量的批量效应。
# 因此,我们有理由将批处理项添加到我们的模型中。
### 2.图形自相关分析差异表达基因
# 依据子细胞群在低维空间的位置,运用Moran’s I 检验,检测空间相关的差异表达基因
pr_graph_test_res <- graph_test(cds_subset, neighbor_graph="knn", cores=8)
pr_deg_ids <- row.names(subset(pr_graph_test_res,
morans_I > 0.01 & q_value < 0.05))
#将基因聚集成跨细胞共表达的模块
gene_module_df <- find_gene_modules(cds_subset[pr_deg_ids,], resolution=1e-3)
plot_cells(cds_subset, genes=gene_module_df,
show_trajectory_graph=FALSE,
label_cell_groups=FALSE)
####### 单细胞伪时间轨迹分析#######
###1.构建cell_data_set对象
expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_expression.rds"))
cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_colData.rds"))
gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_rowData.rds"))
cds <- new_cell_data_set(expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
### 2.预处理
#normalization and dimension reduction.
cds <- preprocess_cds(cds, num_dim = 50)
# plot_pc_variance_explained(cds) # 查看pca的分量解释多少变异
# remove batch effects
head(colData(cds))
cds <- align_cds(cds, alignment_group = "batch",
residual_model_formula_str = "~ bg.300.loading + bg.400.loading + bg.500.1.loading + bg.500.2.loading + bg.r17.loading + bg.b01.loading + bg.b02.loading")
# alignment_group:指定用于对齐单元格组的colData列的字符串。
# 指定的列必须是一个因子。对齐可用于以非线性方式减去批次效果。
# 要更正连续效果,请使用残差模型公式。默认值为空。
# residual_model_formula_str:NULL或字符串模型公式,
# 指定在降维之前从数据中减去的任何影响。使用线性模型减去效果。
# 对于非线性效果,请使用对齐组。默认值为空。
# 使用pca处理的数据进一步降维以便聚类和轨迹推断
cds <- reduce_dimension(cds)
plot_cells(cds, label_groups_by_cluster=FALSE, color_cells_by = "cell.type")
# 图中显示基因表达情况
ciliated_genes <- c("che-1",
"hlh-17",
"nhr-6",
"dmd-6",
"ceh-36",
"ham-1")
plot_cells(cds,
genes=ciliated_genes,
label_cell_groups=FALSE,
show_trajectory_graph=FALSE)
### 3.单细胞聚类
cds <- cluster_cells(cds)
plot_cells(cds, color_cells_by = "partition")
# partition和cluster有区别
# plot_cells(cds, color_cells_by = "cluster")
### 4. 学习伪时间轨迹图
# 伪时间是衡量单个细胞在细胞分化等过程中所取得进展的指标。
cds <- learn_graph(cds)
plot_cells(cds,
color_cells_by = "cell.type",
label_groups_by_cluster=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE)
# 在轨迹图标记胚胎时间段,
# 黑线表示图形的结构,黑圈表示分支节点,浅灰色圆圈表示细胞状态
plot_cells(cds,
color_cells_by = "embryo.time.bin",
label_cell_groups=FALSE,
label_leaves=TRUE,
label_branch_points=TRUE,
graph_label_size=1.5)
# 交互选择根节点(最初状态的细胞)
cds <- order_cells(cds)
# 伪时间轨迹图作图,灰色表示无限的伪时间,因为从拾取的根节点无法访问它们。
# 一般一个partition选择一个root,就能解决无限伪时间问题
plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE,
graph_label_size=1.5)
# # 不显示轨迹图
# plot_cells(cds,
# color_cells_by = "pseudotime",
# label_cell_groups=FALSE,
# label_leaves=FALSE,
# label_branch_points=FALSE,
# show_trajectory_graph = FALSE)
## 自动选择根节点(root)
# a helper function to identify the root principal points:
get_earliest_principal_node <- function(cds, time_bin="130-170"){
cell_ids <- which(colData(cds)[, "embryo.time.bin"] == time_bin)
closest_vertex <-
cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
root_pr_nodes <-
igraph::V(principal_graph(cds)[["UMAP"]])$name[as.numeric(names
(which.max(table(closest_vertex[cell_ids,]))))]
root_pr_nodes
}
cds <- order_cells(cds, root_pr_nodes=get_earliest_principal_node(cds))
plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE,
show_trajectory_graph = FALSE)
# 沿轨迹图交互选择起始节点以及终节点的单细胞
cds_sub <- choose_graph_segments(cds)
### 5. 单细胞轨迹图上显示差异表达基因的表达量
ciliated_cds_pr_test_res <- graph_test(cds, neighbor_graph="principal_graph", cores=4)
pr_deg_ids <- row.names(subset(ciliated_cds_pr_test_res, q_value < 0.05))
plot_cells(cds, genes=c("hlh-4", "gcy-8", "dac-1", "oig-8"),
show_trajectory_graph=FALSE,
label_cell_groups=FALSE,
label_leaves=FALSE)
### 6. 将沿单细胞轨迹图的差异表达基因进行模块聚类分析,共调节模块分析
gene_module_df <- find_gene_modules(cds[pr_deg_ids,], resolution=c(0,10^seq(-6,-1)))
cell_group_df <- tibble::tibble(cell=row.names(colData(cds)),
cell_group=colData(cds)$cell.type)
# 聚合表达矩阵:行为module,列为cell.type
agg_mat <- aggregate_gene_expression(cds, gene_module_df, cell_group_df)
row.names(agg_mat) <- stringr::str_c("Module ", row.names(agg_mat))
# 可视化:热图
pheatmap::pheatmap(agg_mat,
scale="column", clustering_method="ward.D2")
# 比较不同module的基因聚合表达值
plot_cells(cds,
genes=gene_module_df %>% filter(module %in% c(27, 10, 7, 30)),
label_cell_groups=FALSE,
show_trajectory_graph=FALSE)
# 特定基因在特定细胞类型中的伪时间轨迹图中的表达量作图
AFD_genes <- c("gcy-8", "dac-1", "oig-8")
AFD_lineage_cds <- cds[rowData(cds)$gene_short_name %in% AFD_genes,
colData(cds)$cell.type %in% c("AFD")]
plot_genes_in_pseudotime(AFD_lineage_cds,
color_cells_by="embryo.time.bin",
min_expr=0.5)
### 7. 3d单细胞伪时间轨迹图
cds_3d <- reduce_dimension(cds, max_components = 3)
cds_3d <- cluster_cells(cds_3d)
cds_3d <- learn_graph(cds_3d)
cds_3d <- order_cells(cds_3d, root_pr_nodes=get_earliest_principal_node(cds))
cds_3d_plot_obj <- plot_cells_3d(cds_3d, color_cells_by="partition")