GSE117570
1.下载和读取数据
# Clear every object from the current workspace before starting.
# NOTE(review): this also removes anything the caller had loaded beforehand.
rm(list = ls())

# Packages used by the steps below.
library(stringr)
library(Seurat)
library(dplyr)

# List the files found in the GSE117570_RAW/ directory and echo their names.
f <- list.files("GSE117570_RAW/")
f
## [1] "GSM3304007_P1_Tumor_processed_data.txt.gz"
## [2] "GSM3304011_P3_Tumor_processed_data.txt.gz"
## [3] "GSM3304013_P4_Tumor_processed_data.txt.gz"
# Fail early with a readable message instead of a subscript error further down.
if (length(f) == 0) {
  stop("No input files found in GSE117570_RAW/", call. = FALSE)
}

# Sample IDs are the first "_"-separated field of each file name.
s <- str_split(f, "_", simplify = TRUE)[, 1]
s
## [1] "GSM3304007" "GSM3304011" "GSM3304013"

# Build one filtered Seurat object per input file. The list is preallocated
# and indexed with seq_along() rather than grown with 1:length(f).
scelist <- vector("list", length(f))
for (i in seq_along(f)) {
  # check.names = FALSE keeps the column names exactly as written in the file.
  tmp <- read.table(file.path("GSE117570_RAW", f[[i]]), check.names = FALSE)
  tmp <- as.matrix(tmp)
  scelist[[i]] <- CreateSeuratObject(
    counts = tmp,
    project = s[[i]],
    min.cells = 3,
    min.features = 50
  )
  # Percentage of counts from features whose names start with "MT-";
  # only cells below 5% are kept.
  scelist[[i]][["percent.mt"]] <- PercentageFeatureSet(
    scelist[[i]],
    pattern = "^MT-"
  )
  scelist[[i]] <- subset(scelist[[i]], subset = percent.mt < 5)
}
names(scelist) <- s

# Dimensions (features x cells) of the count matrix kept for each sample.
lapply(scelist, function(x) dim(x@assays$RNA@counts))
## $GSM3304007
## [1] 6611 1466
##
## $GSM3304011
## [1] 3211 116
##
## $GSM3304013
## [1] 7492 281
2.合并到一起,找高变基因
# Merge the per-sample objects into a single Seurat object; add.cell.ids
# prefixes every cell name with its sample ID.
sce.all <- merge(
  scelist[[1]],
  y = scelist[-1],
  add.cell.ids = s
)
# Inspect the first rows of the per-cell metadata.
head(sce.all@meta.data, 10)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## GSM3304007_AAACCTGGTACAGACG-1 GSM3304007 4338 1224 2.512679
## GSM3304007_AAACGGGGTAGCGCTC-1 GSM3304007 11724 2456 2.021494
## GSM3304007_AAACGGGGTCCTCTTG-1 GSM3304007 3353 726 2.117507
## GSM3304007_AAACGGGTCCAAACAC-1 GSM3304007 2811 986 2.312344
## GSM3304007_AAACGGGTCTTTAGTC-1 GSM3304007 6095 1590 3.002461
## GSM3304007_AAAGATGCATCTACGA-1 GSM3304007 7683 2049 2.824418
## GSM3304007_AAAGATGCATTGAGCT-1 GSM3304007 2428 832 2.800659
## GSM3304007_AAAGATGGTTCAGTAC-1 GSM3304007 2939 1052 3.028241
## GSM3304007_AAAGATGTCTGGTATG-1 GSM3304007 6834 1452 3.906936
## GSM3304007_AAAGCAAAGACTACAA-1 GSM3304007 3429 737 2.391368
# Number of cells contributed by each sample.
table(sce.all@meta.data$orig.ident)
##
## GSM3304007 GSM3304011 GSM3304013
## 1466 116 281
# QC overview: distributions of nFeature_RNA, nCount_RNA and percent.mt,
# followed by a scatter of nCount_RNA against nFeature_RNA.
VlnPlot(
  object = sce.all,
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
  ncol = 3
)
FeatureScatter(
  object = sce.all,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)

# Normalise, then select variable features (default arguments for both).
sce.all <- NormalizeData(object = sce.all)
sce.all <- FindVariableFeatures(object = sce.all)

# First 10 entries of the variable-feature list.
top10 <- head(x = VariableFeatures(object = sce.all), n = 10)

# Variable-feature plot without labels (plot1) and with top10 labelled (plot2).
plot1 <- VariableFeaturePlot(object = sce.all)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
print(plot2)
3. PCA
完成PCA线性降维,是后续其他降维方法的基础
# Scale the data with default arguments ahead of PCA.
sce.all <- ScaleData(object = sce.all)
# Peek at the top-left 4 x 4 corner of the RNA assay's count matrix.
sce.all@assays$RNA@counts[seq_len(4), seq_len(4)]
## 4 x 4 sparse Matrix of class "dgCMatrix"
## GSM3304007_AAACCTGGTACAGACG-1 GSM3304007_AAACGGGGTAGCGCTC-1
## FO538757.2 . .
## AP006222.2 . .
## NOC2L . .
## ISG15 11 3
## GSM3304007_AAACGGGGTCCTCTTG-1 GSM3304007_AAACGGGTCCAAACAC-1
## FO538757.2 . .
## AP006222.2 . .
## NOC2L . .
## ISG15 . 4
# PCA restricted to the variable features, then a plot of the embedding.
sce.all <- RunPCA(
  object = sce.all,
  features = VariableFeatures(object = sce.all)
)
DimPlot(object = sce.all, reduction = "pca")

# How many principal components should be used downstream?
ElbowPlot(object = sce.all)
sce.all <- JackStraw(object = sce.all, num.replicate = 100)
sce.all <- ScoreJackStraw(object = sce.all, dims = seq_len(20))
JackStrawPlot(object = sce.all, dims = seq_len(20))
4.harmony和UMAP
harmony用于整合数据时消除样本间的差异,上一篇是用Seurat自带的CCA方法整合。
library(harmony)
# Run Harmony with orig.ident (the sample ID) as the grouping variable.
sce.all <- RunHarmony(sce.all, "orig.ident")
names(sce.all@reductions)
## [1] "pca" "harmony"
# UMAP and the neighbour graph both use dimensions 1-15 of the "harmony"
# reduction.
sce.all <- RunUMAP(sce.all, dims = 1:15, reduction = "harmony")
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
DimPlot(sce.all, reduction = "umap", label = TRUE)
sce.all <- FindNeighbors(sce.all, reduction = "harmony", dims = 1:15)
sce.all <- FindClusters(sce.all, resolution = 0.2) # clustering resolution
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1863
## Number of edges: 64311
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9515
## Number of communities: 9
## Elapsed time: 0 seconds
# Number of clusters found, read from the active identities via Idents().
length(levels(Idents(sce.all)))
## [1] 9
# Number of cells in each cluster.
table(Idents(sce.all))
##
## 0 1 2 3 4 5 6 7 8
## 667 382 246 172 127 101 81 57 30
DimPlot(sce.all, reduction = "umap", label = TRUE)
5.SingleR注释
# Cell-type annotation with SingleR
library(celldex)
library(SingleR)
# ref <- celldex::HumanPrimaryCellAtlasData()
# Downloading the reference was too slow, so a copy saved locally beforehand
# is used instead. load() returns the name of the restored object and get()
# fetches it. NOTE(review): assumes the .RData file holds exactly one object.
ref <- get(load("../single_ref/ref_Hematopoietic.RData"))
library(BiocParallel)
# Passing the cluster identities via `clusters` annotates per cluster, so the
# result has one row per cluster rather than one per cell.
pred.scRNA <- SingleR(
  test = sce.all@assays$RNA@data,
  ref = ref,
  labels = ref$label.main,
  clusters = Idents(sce.all)
)
pred.scRNA$pruned.labels
## [1] "Monocytes" "CD4+ T cells" "Erythroid cells" "B cells"
## [5] "Monocytes" "CD4+ T cells" "CD8+ T cells" "HSCs"
## [9] "HSCs"
# Heatmap of SingleR scores, with columns annotated by cluster ID.
plotScoreHeatmap(
  pred.scRNA,
  clusters = rownames(pred.scRNA),
  fontsize.row = 9,
  show_colnames = TRUE
)
# pruned.labels is NA wherever SingleR pruned a low-confidence call; fall back
# to the unpruned label so no cluster is renamed to NA.
new.cluster.ids <- dplyr::coalesce(
  pred.scRNA$pruned.labels,
  pred.scRNA$labels
)
# Name each label by the cluster ID carried in the result's rownames instead
# of assuming the rows are in the same order as levels(sce.all).
names(new.cluster.ids) <- rownames(pred.scRNA)
levels(sce.all)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8"
# Clusters that share a label end up under the same identity.
sce.all <- RenameIdents(sce.all, new.cluster.ids)
levels(sce.all)
## [1] "Monocytes" "CD4+ T cells" "Erythroid cells" "B cells"
## [5] "CD8+ T cells" "HSCs"
UMAPPlot(object = sce.all, pt.size = 0.5, label = TRUE)