小鼠全脑单细胞数据细胞类型鉴定

最近在处理小鼠全脑的单细胞数据,归纳总结下细胞类型鉴定的过程吧。我用的算法是Seurat label transfer function, 抽取了Tubula Muris部分已知数据,用这个算法进行了鉴定,准确率可以达到98%。后来我做细胞类型鉴定用的都是这个算法加已知marker双重验证的方法。

1.首先下载两个小鼠全脑的数据

a.Tabula Muris

下载的是Tabula Muris中脑的数据
facs_Brain_Non-Myeloid_seurat_tiss.Robj
facs_Brain_Myeloid_seurat_tiss.Robj
其中Myeloid主要是Microglia 以及Macrophage细胞类型
Non-Myeloid主要是Neuron, Astrocyte, Oligodendrocyte, Endothelial等

b.Mouse Cell Atlas

找这套数据的脑的数据找了我半天
后来发现可以在他们文章里找到数据的下载链接
数据存储在figshare
但是我的数据 我是从Tabula muris github网址上的data路径中下载下来的数据
从Tabula muris github网址中可以找到Seurat处理好的MCA的数据可以直接导入到R里面分析,很方便
这个压缩包里包含两个文件
MCA_CellAssignments.csv
MCA_All-batch-removed-assignments.csv

MCA_All-batch-removed-assignments.csv数据内容

我可纳闷了为啥不添加细胞类型信息 有毒
但是这个下载下来的数据是没有细胞类型的,需要自己到这个网址再下载一下,我下载的是MCA_CellAssignment.csv
MCA_CellAssignments.csv

,然后用R代码处理一下就可以得到对应的细胞类型。
代码如下

MCA_exp <- readRDS("MCA_merged_mat.rds")
MCA_celltypeinfo <- read.csv("MCA_CellAssignments.csv")
MCA_cellname <- read.csv("MCA_All-batch-removed-assignments.csv")
index <- match(MCA_cellname$Cell.name, MCA_celltypeinfo$Cell.name)
selected_cellname <- MCA_celltypeinfo[na.omit(index), ]
dim(selected_cellname)
brain_cellname <- selected_cellname[selected_cellname$Tissue=="Brain", ]
brain_exp <- MCA_exp[, as.character(brain_cellname$Cell.name)]

all.equal(as.character(brain_cellname$Cell.name), colnames(brain_exp))
MCA_brain_rds <- list()
MCA_brain_rds$exp <- brain_exp
MCA_brain_rds$meta.data <- brain_cellname
saveRDS(MCA_brain_rds, "MCA_brain_exp.rds")

2.构建Seurat Reference Object

这里遇到一个问题 就是用SCTtransform的时候 没办法运行成功 后来我用NormalizeData方法就可以的

a.Tabula Muris

load("facs_Brain_Non-Myeloid_seurat_tiss.Robj")
selected_cells <- rownames([email protected])
Non_Myeloid_exp <- [email protected][, selected_cells]
Non_Myeloid_label <- [email protected]$cell_ontology_class
all.equal(colnames(Non_Myeloid_exp), rownames([email protected]))
dim(Non_Myeloid_exp) #23341  3401
# write.csv([email protected], "Tabula_Muris_Non_Myeloid_metadata1.csv")

load("facs_Brain_Myeloid_seurat_tiss.Robj")
selected_cells <- rownames([email protected])
Myeloid_exp <- [email protected][, selected_cells]
Myeloid_label <- [email protected]$cell_ontology_class
all.equal(colnames(Myeloid_exp), rownames([email protected]))
dim(Myeloid_exp) #23341  4455
# write.csv([email protected], "Tabula_Muris_Myeloid_metadata1.csv")
## 我在表格中去掉了Rik结尾的基因 以及4-Sep这样子的基因 基因名字写在了"rownames.csv"
## 是因为这些基因感觉没什么实质性作用 可以跳过这里 不做任何处理

genes <- as.character(read.csv("rownames.csv", row.names= 1)$x)
Brain_exp <- cbind(Non_Myeloid_exp, Myeloid_exp)
Cell_ontology_class <- data.frame(celltype = c(Non_Myeloid_label, Myeloid_label))
rownames(Cell_ontology_class) <- colnames(Brain_exp)

Brain_exp <- Brain_exp[genes, ]
Brain_exp <- Brain_exp[rowSums(Brain_exp>5) >= 20 & rowSums(Brain_exp) >= 20, ]
Brain_exp <- na.omit(Brain_exp)

Tabula_Muris_rds <- list()
Tabula_Muris_rds$exp <- Brain_exp
Tabula_Muris_rds$celltype <- Cell_ontology_class$celltype
saveRDS(Tabula_Muris_rds, "Tabula_Muris_rawdata_with_cellinfo.rds")

# 2) 构建NormalizeData的ref.seurat用来做reference data
ref_exp <- Brain_exp
ref.seurat <- CreateSeuratObject(counts = ref_exp)
ref.seurat <- NormalizeData(ref.seurat, verbose = FALSE)
ref.seurat <- FindVariableFeatures(ref.seurat, selection.method = "vst", 
                                             nfeatures = 2000, verbose = FALSE)
ref.seurat <- ScaleData(ref.seurat)
ref.seurat <- RunPCA(object = ref.seurat, npcs = 30, verbose = FALSE)
ref.seurat$celltype <- Cell_ontology_class$celltype
saveRDS(ref.seurat, file = "Tabula_Muris_NormalizeData_ref_seurat.rds")

# 3) 选取了部分数据 做reference 推断 准确率可以达到98.8%
index <- sample(colnames(Brain_exp), 500)
test_data <- Brain_exp[, index]
test_celltype <- Cell_ontology_class[index, 1]
test.seurat <- CreateSeuratObject(counts = test_data)
test.seurat <- NormalizeData(test.seurat, verbose = FALSE)
test.seurat <- FindVariableFeatures(test.seurat, selection.method = "vst", 
                                   nfeatures = 2000, verbose = FALSE)
test.seurat <- ScaleData(test.seurat)
test.seurat <- RunPCA(object = test.seurat, npcs = 30, verbose = FALSE)
test.seurat$original_celltype <- test_celltype

test.anchors <- FindTransferAnchors(reference = ref.seurat, 
                                        query = test.seurat, 
                                        dims = 1:30)
predictions <- TransferData(anchorset = test.anchors, 
                            refdata = ref.seurat$celltype, 
                            dims = 1:30)
test.seurat <- AddMetaData(test.seurat, metadata = predictions)

b.Mouse Cell Atlas

MCA_exp <- readRDS("MCA_merged_mat.rds")
MCA_celltypeinfo <- read.csv("MCA_CellAssignments.csv")
MCA_cellname <- read.csv("MCA_All-batch-removed-assignments.csv")
index <- match(MCA_cellname$Cell.name, MCA_celltypeinfo$Cell.name)
selected_cellname <- MCA_celltypeinfo[na.omit(index), ]
dim(selected_cellname)
brain_cellname <- selected_cellname[selected_cellname$Tissue=="Brain", ]
brain_exp <- MCA_exp[, as.character(brain_cellname$Cell.name)]

all.equal(as.character(brain_cellname$Cell.name), colnames(brain_exp))
MCA_brain_rds <- list()
MCA_brain_rds$exp <- brain_exp
MCA_brain_rds$meta.data <- brain_cellname
saveRDS(MCA_brain_rds, "MCA_brain_exp.rds")

# 2.构建seurat reference 
library(Seurat)
ref_exp <- brain_exp
ref.seurat <- CreateSeuratObject(counts = ref_exp)
ref.seurat <- NormalizeData(ref.seurat, verbose = FALSE)
ref.seurat <- FindVariableFeatures(ref.seurat, selection.method = "vst", 
                                   nfeatures = 2000, verbose = FALSE)
ref.seurat <- ScaleData(ref.seurat)
ref.seurat <- RunPCA(object = ref.seurat, npcs = 30, verbose = FALSE)
ref.seurat$celltype <- as.character(brain_cellname$Annotation)
saveRDS(ref.seurat, file = "MCA_NormalizeData_ref_seurat.rds")

写在最后 佛系更文 很开心 终于有一百个粉丝了开心快乐的一天啊

你可能感兴趣的:(小鼠全脑单细胞数据细胞类型鉴定)