# install.packages('Seurat')
library(Seurat)
library(dplyr)
library(clustree)
# The expression matrix comes from the Cell Ranger analysis output.
data_dir <- "/run_count_1kpbmcs/outs/filtered_feature_bc_matrix"

### Build the Seurat object
# The matrix read here is a sparse Matrix of class "dgCMatrix"
# (rows: genes, columns: cell barcodes).
expr_dgCMatrix <- Read10X(data.dir = data_dir)
# `counts` is either a matrix-like object with unnormalized data
# (cells as columns, features as rows) or an Assay-derived object.
seurat_object <- CreateSeuratObject(counts = expr_dgCMatrix)
# Per-cell metadata, stored as a data.frame in the meta.data slot.
seurat_object@meta.data
### Preprocessing and quality control
# Store, per cell, the percentage of counts coming from features whose
# names match "^MT-" (mitochondrial genes) in a new metadata column.
seurat_object[["percent.mt"]] <- PercentageFeatureSet(
  seurat_object,
  pattern = "^MT-"
)
head(seurat_object@meta.data, 5)

# Inspect the distribution of the QC metrics with violin plots (figure 1).
VlnPlot(
  seurat_object,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)
plot1 <- FeatureScatter(
  seurat_object,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
)
plot2 <- FeatureScatter(
  seurat_object,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)
plot1 + plot2

# The filtering thresholds below were chosen from the violin plots above.
seurat_object <- subset(
  seurat_object,
  subset = nFeature_RNA > 500 & nFeature_RNA < 6000 & percent.mt < 25
)
saveRDS(seurat_object, file = "subset.rds")
### Normalize expression values
seurat_object_norm <- NormalizeData(object = seurat_object)
# # Compare the data before and after normalization
# head(colMeans(seurat_object))
# head(colMeans(seurat_object_norm))

### Identify highly variable genes
seurat_object_norm <- FindVariableFeatures(
  object = seurat_object_norm,
  nfeatures = 2000
)
# The first 10 entries of the variable-feature list are labelled on the plot.
top10_genes <- head(VariableFeatures(object = seurat_object_norm), n = 10)
plot1 <- VariableFeaturePlot(object = seurat_object_norm)
# Plotting failed when repel = TRUE was used (cause not determined), so the
# labelled plot is built with repel = FALSE instead of:
# plot2 <- LabelPoints(plot = plot1, points = top10_genes,
#                      repel = TRUE, xnudge = 0, ynudge = 0)
plot2 <- LabelPoints(
  plot = plot1,
  points = top10_genes,
  repel = FALSE
)
plot1 + plot2
### Scale expression values
# Scale every gene and regress out percent.mt in a single call.
# A second ScaleData() call without `features` falls back to the variable
# features only and replaces the all-gene scaled matrix, so plots that read
# scaled data for non-variable genes later (e.g. DoHeatmap) would lose them.
all.genes <- rownames(seurat_object_norm)
seurat_object_scaled <- ScaleData(
  seurat_object_norm,
  features = all.genes,
  vars.to.regress = "percent.mt"
)
# Author's note, the plain (no regression) scaling written out by hand:
# pbmc@assays$RNA@scale.data <- t(scale(t(pbmc@assays$RNA@data)))
### Linear dimensionality reduction
# PCA is computed on the variable features of the scaled object.
seurat_object_scaled <- RunPCA(
  object = seurat_object_scaled,
  features = VariableFeatures(object = seurat_object_scaled)
)
print(seurat_object_scaled[["pca"]], dims = 1:5, nfeatures = 5)
# These are the PCA feature loadings. Author's note: by the PCA algorithm,
#   pbmc@reductions$pca@cell.embeddings =
#     t(pbmc@assays$RNA@scale.data[VariableFeatures(pbmc), ])
#     %*% pbmc@reductions$pca@feature.loadings
VizDimLoadings(object = seurat_object_scaled, dims = 1:2, reduction = "pca")
#graph2ppt(file="4.PC1nPC2_value.pptx", width=9, aspectr=1.5)

DimPlot(object = seurat_object_scaled, reduction = "pca")
#graph2ppt(file="5.PCA降维.pptx", width=9, aspectr=1.5)

# Heatmaps of individual PCs, 500 cells each: PC 1 alone, then PCs 1-27 in
# three batches of nine.
DimHeatmap(
  object = seurat_object_scaled,
  dims = 1,
  cells = 500,
  balanced = TRUE
)
#graph2ppt(file="6.单个PC维度_heatmap.pptx", width=9, aspectr=1.5)
DimHeatmap(
  object = seurat_object_scaled,
  dims = 1:9,
  cells = 500,
  balanced = TRUE
)
#graph2ppt(file="7-1.1到9个PC维度_heatmap.pptx", width=12, aspectr=1.5)
DimHeatmap(
  object = seurat_object_scaled,
  dims = 10:18,
  cells = 500,
  balanced = TRUE
)
#graph2ppt(file="7-2.9到18个PC维度_heatmap.pptx", width=12, aspectr=1.5)
DimHeatmap(
  object = seurat_object_scaled,
  dims = 19:27,
  cells = 500,
  balanced = TRUE
)
#graph2ppt(file="7-3.18到27个PC维度_heatmap.pptx", width=12, aspectr=1.5)

# Checkpoint the object after PCA; the rest of the script works on `pbmc`.
saveRDS(object = seurat_object_scaled, file = "PCA.rds")
pbmc <- seurat_object_scaled
### Determine the number of PCA dimensions to keep
pbmc <- JackStraw(pbmc, num.replicate = 100)
# Score the p-values of the first 20 PCs. Author's note: dims cannot exceed
# 20 here (JackStraw() above was run with its default number of dimensions).
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
# Plot the p-value distribution curves of the first 20 PCs.
JackStrawPlot(pbmc, dims = 1:20)
#graph2ppt(file="8.PCA维度_JackStraw.pptx", width=12, aspectr=1.5)
# Elbow plot of the same 20 PCs; the export placeholder below refers to it,
# but the plot call itself was missing.
ElbowPlot(pbmc, ndims = 20)
#graph2ppt(file="9.PCA维度_ElbowPlot.pptx", width=12, aspectr=1.5)
### Cell clustering
pbmc <- FindNeighbors(pbmc, dims = 1:10)
#pbmc <- FindClusters(pbmc, resolution = 0.05)
# Cluster at a range of resolutions so the results can be compared with
# clustree below (it reads the metadata columns prefixed "RNA_snn_res.").
pbmc <- FindClusters(
  pbmc,
  resolution = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
)
head(Idents(pbmc), 5)
#pbmc@meta.data %>% View()
# Creates a plot of a clustering tree showing the relationship
# between clusterings at different resolutions.
clustree(pbmc@meta.data, prefix = "RNA_snn_res.")
#graph2ppt(file="10.clustree.pptx", width=9, aspectr=1.5)
# Re-run at the chosen resolution so the active identities are the 0.3 result.
pbmc <- FindClusters(pbmc, resolution = 0.3)
### Non-linear dimensionality reduction (UMAP / tSNE)
# NOTE(review): both embeddings use PCs 1-20 while FindNeighbors() was run
# with dims = 1:10 — confirm the mismatch is intentional.
pbmc <- RunUMAP(object = pbmc, dims = 1:20)
DimPlot(object = pbmc, reduction = "umap")
#graph2ppt(file="11.umap.pptx", width=9, aspectr=1.5)

# tSNE on PC1-PC20. Author's note: pbmc[["pca"]] reports
# "Number of dimensions: 50".
pbmc <- RunTSNE(object = pbmc, dims = 1:20)
DimPlot(object = pbmc, reduction = "tsne")
#graph2ppt(file="12.tsne.pptx", width=9, aspectr=1.5)

# Checkpoint the clustered and embedded object.
saveRDS(object = pbmc, file = "ForDoublets.rds")
### Differential expression analysis
# Idents(pbmc)
# Find all markers of clusters 1, 2 and 3 (each cluster vs. all other cells).
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
cluster3.markers <- FindMarkers(pbmc, ident.1 = 3, min.pct = 0.25)
head(cluster3.markers, n = 5)

# Use a different test (test.use = "roc") for differential expression;
# the default is test.use = "wilcox".
# This call targets cluster 0, so the result is stored as cluster0.markers;
# assigning it to cluster1.markers would overwrite the cluster 1 result above.
cluster0.markers <- FindMarkers(
  pbmc,
  ident.1 = 0,
  logfc.threshold = 0.25,
  test.use = "roc",
  only.pos = TRUE
)

# Find all markers distinguishing cluster 5 from clusters 0 and 3.
cluster5.markers <- FindMarkers(
  pbmc,
  ident.1 = 5,
  ident.2 = c(0, 3),
  min.pct = 0.25
)
head(cluster5.markers, n = 5)

# Find markers for every cluster, reporting only the positive ones.
pbmc.markers <- FindAllMarkers(
  pbmc,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)
#pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
head(pbmc.markers, n = 10)
write.table(pbmc.markers, file = 'pbmc.markers.txt', sep = '\t')
# Visualize differentially expressed genes. RidgePlot, CellScatter, DotPlot
# etc. can also be used; pick one marker per cluster, or whatever is needed.
# The same marker set is shown in every plot below, so it is defined once.
marker_genes <- c("IL7R", "IL32", "TRAC", "CD3D")

VlnPlot(pbmc, features = marker_genes)
#graph2ppt(file="13.clusterMarker_VlnPlot.pptx", width=20, aspectr=1.5)
# Same plot drawn from the raw counts, on a log scale.
VlnPlot(pbmc, features = marker_genes, slot = "counts", log = TRUE)
#graph2ppt(file="14.clusterMarker_VlnPlot_count.pptx", width=20, aspectr=1.5)
FeaturePlot(pbmc, features = marker_genes)
#graph2ppt(file="15.clusterMarker_FeaturePlot.pptx", width=15, aspectr=1.5)
RidgePlot(pbmc, features = marker_genes)
#graph2ppt(file="16.clusterMarker_RidgePlot.pptx", width=18, aspectr=1.5)
#CellScatter(pbmc, features = c("IL7R", "IL32", "TRAC", "CD3D")
DotPlot(pbmc, features = marker_genes)
#graph2ppt(file="17.clusterMarker_DotPlot.pptx", width=12, aspectr=1.5)

# Top 10 markers per cluster by avg_log2FC (ties kept, as top_n() did).
# slice_max() replaces the superseded top_n(); ungroup() drops the grouping
# so it does not leak into later use of `top10`.
top10 <- pbmc.markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 10) %>%
  ungroup()
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
#graph2ppt(file="18.markersHeatmap.pptx", width=9, aspectr=1.5)
saveRDS(pbmc, file = "ClustersPlot_beyond.rds")
### Identify cell types
# Name the cells of each cluster according to marker genes of known cell
# types. Placeholder labels "group1", "group2", ... are used here, one per
# current identity level. The count is derived from levels(pbmc) rather than
# hard-coded, so the names<- assignment cannot fail when the number of
# clusters differs from 11.
cluster_levels <- levels(pbmc)
new.cluster.ids <- paste0("group", seq_along(cluster_levels))
names(new.cluster.ids) <- cluster_levels
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
#graph2ppt(file="鉴定细胞类型.pptx", width=9, aspectr=1.5)

### Save the result
saveRDS(pbmc, file = "final.rds")