参考:公众号:生信会客厅
library(Seurat)
library(tidyverse)
library(patchwork)
# 下载并解压数据
dir.create("Vignette01")
download.file("https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz",
"Vignette01/pbmc.tar.gz")
untar("Vignette01/pbmc.tar.gz", exdir="Vignette01")
# 创建表达矩阵
pbmc.data <- Read10X(data.dir = "Vignette01/filtered_gene_bc_matrices/hg19/")
#查看基因数和细胞数
dim(pbmc.data)
#[1] 32738 2700
# 创建seurat对象,保留最少在3个细胞中表达的基因以及最少表达200个基因的细胞
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
#查看基因数和细胞数
dim(pbmc)
#[1] 13714 2700
#可以发现过滤了一大半的基因
head([email protected], 2)
# orig.ident nCount_RNA nFeature_RNA
#AAACATACAACCAC pbmc3k 2419 779
#AAACATTGAGCTAC pbmc3k 4903 1352
#可以发现CreateSeuratObject函数中的project参数与orig.ident是对应关系
#细胞质控
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
p=VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size=0.1, ncol = 3)
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
p=VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size=0.1, ncol = 3)
pbmc.sc <- pbmc
数据标准化
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 3000)
pbmc <- ScaleData(pbmc, features = rownames(pbmc)) #对所有基因执行scale
pbmc <- ScaleData(pbmc, features = VariableFeatures(pbmc)) #只对高变基因执行scale
library(sctransform)
pbmc.sc <- SCTransform(pbmc.sc, verbose = FALSE)
##对比它们选择的高变基因
library(VennDiagram)
library(gplots)
v1 <- VariableFeatures(pbmc)
v2 <- VariableFeatures(pbmc.sc)
p <- venn.diagram(list(v1,v2), col=c("red","green"), alpha = c(0.5, 0.5),
category.names=c('pbmc','pbmc.sc'), filename=NULL, margin=0.15)
png(file='Vignette01/compare_variablefeatures.png', width=300, height=300)
grid.draw(p)
dev.off()
##降维结果对比
#作者介绍三步法(Log-normalization)的后续分析选择的pc轴数量增加对结果影响不大,一步法则不同(sctransform)
#我们尝试三种不同的参数nPCs=10,nPCs=20,nPCs=30来验证一下
#先画出ElbowPlot
pbmc <- RunPCA(pbmc, features = VariableFeatures(pbmc))
pbmc.sc<- RunPCA(pbmc.sc, features = VariableFeatures(pbmc.sc))
p1 = ElbowPlot(pbmc, ndims=30,reduction = 'pca')
p2 = ElbowPlot(pbmc.sc, ndims=30,reduction = 'pca')
plotc = p1+p2
ggsave('Vignette01/compare_ElbowPlot.png', plotc, width=8, height=4)
#三个参数都分析看看
nPC.list=list(1:10,1:20,1:30)
set.seed(822)
for(nPCs in nPC.list){
pbmc <- RunPCA(pbmc, verbose = FALSE) %>% RunUMAP(dims = nPCs) %>% FindNeighbors(dims = nPCs) %>%
FindClusters()
pbmc.sc <- RunPCA(pbmc.sc, verbose = FALSE) %>% RunUMAP(dims = nPCs) %>% FindNeighbors(dims = nPCs) %>%
FindClusters()
p1 <- DimPlot(object = pbmc, label = TRUE) + NoLegend() + ggtitle(paste0('Log-normalization_',max(nPCs)))
p2 <- DimPlot(object = pbmc.sc, label = TRUE) + NoLegend() + ggtitle(paste0('sctransform_',max(nPCs)))
plotc <- p1 + p2
ggsave(paste0('Vignette01/compare_umap_',max(nPCs),'.png'), plotc, width=8, height=4)
}
plotc
#pc.num=1:30
#pbmc.sc <- RunPCA(pbmc.sc, verbose = FALSE) %>% RunUMAP(dims = pc.num) %>% FindNeighbors(dims = pc.num) %>%
FindClusters()
补充说明:官方介绍sctransform挑选出来的高变基因和标准化之后的数据,更能去除技术噪音并保留细微的生物学差异。三步法标准化之后的数据进行后续分析,选择的pc轴达到一定数量(拐点)之后再增加就没有意义了;但是一步法标准化之后的数据用于后续分析,选择的pc轴越多越有利于发现细微的生物学差异。实测聚类效果都没有明显的差异,详见上图!
sctransform函数运行之后的结果保存在pbmc@ASSAY@SCT,分析结果是@scale.data,@counts和@data都是用@scale.data反向计算回去的,@counts相当于细胞数据量相同情况下的值(也就是均一化了的)。差异表达分析和数据整合最好用@scale.data,但是seurat目前不支持,因此还得用@data的数据
##后续分析采用sctransform处理过的数据,log_normalization的结果备份
saveRDS(pbmc, 'pbmc_log_normalization.rds')
pbmc = pbmc.sc
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
colnames(pbmc.markers)[2] <- 'avg_logFC'
top10markers <- subset(pbmc.markers, p_val<0.05&p_val_adj<0.05) %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
write.csv(top10markers,'Vignette01/top10markers.scv', row.names=F)
##细胞类型的鉴定我综合了SingleR和cellmarker数据库,以及原教程的biomarker,过程就不赘述了
##将判断结果输入pbmc
celltype <- c('CD4+ T','CD14+ Mono','CD4+ T','B','CD8+ T','NK','FCGR3A+ Mono',
'NK','CD8+ T','DC','CD4+ T','Platelet')
celltype <- as.data.frame(celltype)
[email protected]$celltype = "NA"
for(i in 1:nrow(celltype)){
[email protected][which([email protected]$seurat_clusters == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table([email protected]$celltype)
library(SingleR)
load("D:\\genetic_r\\singer-cell-learning\\SingleR_ref\\SingleR_ref\\ref_Human_all.RData")
refdata <- ref_Human_all
testdata <- GetAssayData(pbmc, slot="data")
###把scRNA数据中的seurat_clusters提取出来,注意这里是因子类型的
clusters <- [email protected]$seurat_clusters
###开始用singler分析
cellpred <- SingleR(test = testdata, ref = refdata, labels = refdata$label.main,
method = "cluster", clusters = clusters,
assay.type.test = "logcounts", assay.type.ref = "logcounts")
###制作细胞类型的注释文件
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = FALSE)
celltype$celltype <- c('CD4+ T','CD4+ T','CD14+ Mono','CD4+ T','B','CD8+ T','NK','FCGR3A+ Mono',
'NK','CD8+ T','DC','CD4+ T','Platelet')
names(cell.type) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, cell.type) #改变了Idents变量,没有改变meta.data
pbmc$celltype <- Idents(pbmc) #将变量Idents赋值给了meta.data新的列
p1=DimPlot(pbmc, group.by="seurat_clusters", repel=T, label=T,
label.size=5, reduction='umap') + NoLegend()
p2=DimPlot(pbmc, group.by="celltype", repel=T, label=T, label.size=3, reduction='umap') + NoLegend()
p3=p1+p2
#ggsave("Vignette01/celltype_UMAP.png", p2, width=7 ,height=6)
ggsave("Vignette01/celltype.png", p3, width=10 ,height=5)
##保存pbmc
saveRDS(pbmc, 'pbmc_Vignette01.rds')
后面的代码有点问题 暂时还不知带怎么解决 菜狗哭泣。。。。