2021-05-26 单细胞分析之harmony与Seurat

参考:生信会客厅

harmony原理

Harmony需要输入低维空间的坐标值(embedding),一般使用PCA的降维结果。Harmony导入PCA的降维数据后,会采用soft k-means clustering算法将细胞聚类。常用的聚类算法仅考虑细胞在低维空间的距离,但是soft clustering算法会考虑我们提供的校正因素。这就好比我们的高考加分制度,小明高考成绩本来达不到A大学的录取分数线,但是他有一项省级竞赛一等奖加10分就够线了。同样的道理,细胞c2距离cluster1有点远,本来不能算作cluster1的一份子;但是c2和cluster1的细胞来自不同的数据集,因为我们期望不同的数据集融合,所以破例让它加入cluster1了。聚类之后先计算每个cluster内各个数据集的细胞的中心点,然后根据这些中心点计算各个cluster的中心点。最后通过算法让cluster内的细胞向中心聚集,实在收敛不了的离群细胞就过滤掉。调整之后的数据重复:聚类—计算cluster中心点—收敛细胞—聚类的过程,不断迭代直至聚类效果趋于稳定。

  • 从第三方的评测结果来看,harmony的整合效果可以与seurat的锚点整合方法媲美,并且运行时间方面具有明显优势,不过内存消耗好像没有那么夸张的优秀。(哈哈我的电脑用seurat锚点整合被卡住了,试了一下harmony,时间挺快的,而且电脑也带得动(记录一下学习笔记)
library(harmony)
install_github("immunogenomics/harmony")
library(devtools)
##==准备seurat对象列表==##
library(Seurat)
library(tidyverse)
library(patchwork)
rm(list=ls())
dir = c('Data/GSE139324/GSE139324_SUB/GSM4138110', 
        'Data/GSE139324/GSE139324_SUB/GSM4138111', 
        'Data/GSE139324/GSE139324_SUB/GSM4138128',
        'Data/GSE139324/GSE139324_SUB/GSM4138129',
        'Data/GSE139324/GSE139324_SUB/GSM4138148',
        'Data/GSE139324/GSE139324_SUB/GSM4138149',
        'Data/GSE139324/GSE139324_SUB/GSM4138162',
        'Data/GSE139324/GSE139324_SUB/GSM4138163',
        'Data/GSE139324/GSE139324_SUB/GSM4138168',
        'Data/GSE139324/GSE139324_SUB/GSM4138169')       #GSE139324是存放数据的目录
sample_name <- c('HNC01PBMC', 'HNC01TIL', 'HNC10PBMC', 'HNC10TIL', 'HNC20PBMC',
                 'HNC20TIL',  'PBMC1', 'PBMC2', 'Tonsil1', 'Tonsil2')
scRNAlist <- list()
for(i in 1:length(dir)){
  counts <- Read10X(data.dir = dir[i])
  scRNAlist[[i]] <- CreateSeuratObject(counts, project=sample_name[i], min.cells=3, min.features = 200)
  scRNAlist[[i]][["percent.mt"]] <- PercentageFeatureSet(scRNAlist[[i]], pattern = "^MT-")
  scRNAlist[[i]] <- subset(scRNAlist[[i]], subset = percent.mt < 10) 
}   
saveRDS(scRNAlist, "scRNAlist.rds")




##==harmony整合多样本==##
library(harmony)
scRNAlist <- readRDS("scRNAlist.rds")
##PCA降维
scRNA_harmony <- merge(scRNAlist[[1]], y=c(scRNAlist[[2]], scRNAlist[[3]], scRNAlist[[4]], scRNAlist[[5]], 
                                           scRNAlist[[6]], scRNAlist[[7]], scRNAlist[[8]], scRNAlist[[9]], scRNAlist[[10]]))
 scRNA_harmony <- NormalizeData(scRNA_harmony) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose=FALSE)
##整合
system.time({scRNA_harmony <- RunHarmony(scRNA_harmony, group.by.vars = "orig.ident")})
#   用户   系统   流逝 
# 34.308  0.024 34.324

#user  system elapsed 
#62.90    4.06   72.08 

#降维聚类
scRNA_harmony <- RunUMAP(scRNA_harmony, reduction = "harmony", dims = 1:30)
scRNA_harmony <- FindNeighbors(scRNA_harmony, reduction = "harmony", dims = 1:30) %>% FindClusters()
##作图
#group_by_cluster
plot1 = DimPlot(scRNA_harmony, reduction = "umap", label=T) 
#group_by_sample
plot2 = DimPlot(scRNA_harmony, reduction = "umap", group.by='orig.ident') 
#combinate
plotc <- plot1+plot2
plotc

练习另外一个数据

#harmonny
library(harmony)
library(Seurat)
library(tidyverse)
library(dplyr)
library(patchwork)
setwd("D:\\genetic_r\\study\\geo\\lesson7")
library(harmony)
scRNA.2=Read10X("D:\\genetic_r\\study\\geo\\lesson7\\GSE152048_BC2.matrix\\BC2")
scRNA.3=Read10X("D:\\genetic_r\\study\\geo\\lesson7\\GSE152048_BC3.matrix\\BC3")
scRNA.2 = CreateSeuratObject(scRNA.2 ,project="sample_2",min.cells = 3, min.features = 200)
scRNA.3 = CreateSeuratObject(scRNA.3 ,project="sample_3",min.cells = 3, min.features = 200)
scRNA_harmony <- merge(scRNA.2, y=c(scRNA.3 ))
Warning message:
  In CheckDuplicateCellNames(object.list = objects) :
  Some cell names are duplicated across objects provided. Renaming to enforce unique cell names.
scRNA_harmony <- NormalizeData(scRNA_harmony) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose=FALSE)
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
  [----|----|----|----|----|----|----|----|----|----|
     **************************************************|
     Calculating gene variances
   0%   10   20   30   40   50   60   70   80   90   100%
     [----|----|----|----|----|----|----|----|----|----|
        **************************************************|
        Calculating feature variances of standardized and clipped values
      0%   10   20   30   40   50   60   70   80   90   100%
        [----|----|----|----|----|----|----|----|----|----|
           **************************************************|
           Centering and scaling data matrix
         |============================================================| 100%
#开始整合
system.time({scRNA_harmony <- RunHarmony(scRNA_harmony, group.by.vars = "orig.ident")})
Harmony 1/10
0%   10   20   30   40   50   60   70   80   90   100%
  [----|----|----|----|----|----|----|----|----|----|
     **************************************************|
     Harmony 2/10
   0%   10   20   30   40   50   60   70   80   90   100%
     [----|----|----|----|----|----|----|----|----|----|
        **************************************************|
        Harmony 3/10
      0%   10   20   30   40   50   60   70   80   90   100%
        [----|----|----|----|----|----|----|----|----|----|
           **************************************************|
           Harmony 4/10
         0%   10   20   30   40   50   60   70   80   90   100%
           [----|----|----|----|----|----|----|----|----|----|
              **************************************************|
              Harmony 5/10
            0%   10   20   30   40   50   60   70   80   90   100%
              [----|----|----|----|----|----|----|----|----|----|
                 **************************************************|
                 Harmony 6/10
               0%   10   20   30   40   50   60   70   80   90   100%
                 [----|----|----|----|----|----|----|----|----|----|
                    **************************************************|
                    Harmony converged after 6 iterations
                  Warning: Invalid name supplied, making object name syntactically valid. New object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details on syntax validity
                  用户  系统  流逝 
                  67.39  5.28 88.92 
降维聚类
scRNA_harmony <- FindNeighbors(scRNA_harmony, reduction = "harmony", dims = 1:15) %>% FindClusters(resolution = 0.5)
                  Computing nearest neighbor graph
                  Computing SNN
                  Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
                  
                  Number of nodes: 14760
                  Number of edges: 529809
                  
                  Running Louvain algorithm...
                  0%   10   20   30   40   50   60   70   80   90   100%
                    [----|----|----|----|----|----|----|----|----|----|
                       **************************************************|
                       Maximum modularity in 10 random starts: 0.9244
                     Number of communities: 12
                     Elapsed time: 3 seconds
降维可视化
scRNA_harmony <- RunUMAP(scRNA_harmony, reduction = "harmony", dims = 1:15)
                     11:41:02 UMAP embedding parameters a = 0.9922 b = 1.112
                     11:41:02 Read 14760 rows and found 15 numeric columns
                     11:41:02 Using Annoy for neighbor search, n_neighbors = 30
                     11:41:02 Building Annoy index with metric = cosine, n_trees = 50
                     0%   10   20   30   40   50   60   70   80   90   100%
                       [----|----|----|----|----|----|----|----|----|----|
                          **************************************************|
                          11:41:09 Writing NN index file to temp file C:\Users\ASUS\AppData\Local\Temp\RtmpO8LMz0\file2da4113d7aa5
                        11:41:09 Searching Annoy index using 1 thread, search_k = 3000
                        11:41:16 Annoy recall = 100%
                        11:41:39 Commencing smooth kNN distance calibration using 1 thread
                        11:41:45 Initializing from normalized Laplacian + noise
                        11:41:47 Commencing optimization for 200 epochs, with 640848 positive edges
                        0%   10   20   30   40   50   60   70   80   90   100%
                          [----|----|----|----|----|----|----|----|----|----|
                             **************************************************|
                             11:42:07 Optimization finished
画图看看整合效果
#group_by_cluster
plot1 = DimPlot(scRNA_harmony, reduction = "umap", label=T) 
#group_by_sample
plot2 = DimPlot(scRNA_harmony, reduction = "umap", group.by='orig.ident') 
#combinate
plotc <- plot1+plot2
plotc
对比一下Seurat整合样本,Seurat是采用锚定的方法来整合。
数据归一化处理
scRNA.list <- list(scRNA.2,scRNA.3)
for (i in 1:length(scRNA.list)) {
scRNA.list[[i]] <- NormalizeData(scRNA.list[[i]])
scRNA.list[[i]] <- FindVariableFeatures(scRNA.list[[i]], selection.method = "vst")
}
Performing log-normalization
0%   10   20   30   40   50   60   70   80   90   100%
  [----|----|----|----|----|----|----|----|----|----|
     **************************************************|
     Calculating gene variances
   0%   10   20   30   40   50   60   70   80   90   100%
     [----|----|----|----|----|----|----|----|----|----|
        **************************************************|
        Calculating feature variances of standardized and clipped values
      0%   10   20   30   40   50   60   70   80   90   100%
        [----|----|----|----|----|----|----|----|----|----|
           **************************************************|
           Performing log-normalization
         0%   10   20   30   40   50   60   70   80   90   100%
           [----|----|----|----|----|----|----|----|----|----|
              **************************************************|
              Calculating gene variances
            0%   10   20   30   40   50   60   70   80   90   100%
              [----|----|----|----|----|----|----|----|----|----|
                 **************************************************|
                 Calculating feature variances of standardized and clipped values
               0%   10   20   30   40   50   60   70   80   90   100%
                 [----|----|----|----|----|----|----|----|----|----|
                    **************************************************|

寻找锚点                    
system.time({scRNA.anchors <- FindIntegrationAnchors(object.list = scRNA.list)})
Computing 2000 integration features
Scaling features for provided objects
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=13s  
Finding all pairwise anchors
|                                                  | 0 % ~calculating  Running CCA
Merging objects
Finding neighborhoods
Finding anchors
Found 12929 anchors
Filtering anchors
Retained 4266 anchors
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=15m 55s
user  system elapsed 
365.04  207.64  973.92 
#锚定后整合
system.time({scRNA_seurat <- IntegrateData(anchorset = scRNA.anchors)})
scRNA_seurat<- IntegrateData(anchorset = scRNA.anchors)
Merging dataset 1 into 2
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
  [----|----|----|----|----|----|----|----|----|----|
     **************************************************|
     Integrating data
save(scRNA.anchors,file='scRNA.anchors.RData')
rm(list = ls())
降维聚类
scRNA_seurat <- ScaleData(scRNA_seurat) %>% RunPCA(verbose=FALSE)
Centering and scaling data matrix
|===========================================================| 100%
scRNA_seurat <- FindNeighbors(scRNA_seurat, dims = 1:15) %>% FindClusters(resolution = 0.5)
Computing nearest neighbor graph
Computing SNN
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 14760
Number of edges: 523184

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
  [----|----|----|----|----|----|----|----|----|----|
     **************************************************|
     Maximum modularity in 10 random starts: 0.9068
   Number of communities: 12
   Elapsed time: 3 seconds
scRNA_seurat <- ScaleData(scRNA_seurat, features = VariableFeatures(scRNA_seurat))
scRNA_seurat <- RunPCA(scRNA_seurat, features = VariableFeatures(scRNA_seurat))
plot1 <- DimPlot(scRNA_seurat, reduction = "pca", group.by="orig.ident")
plot2 <- ElbowPlot(scRNA_seurat, ndims=30, reduction="pca") 
plotc <- plot1+plot2
pc.num=1:20
scRNA_seurat <- FindNeighbors(scRNA_seurat, dims = pc.num) 
scRNA_seurat <- FindClusters(scRNA_seurat, resolution = 0.5)
table([email protected]$seurat_clusters)
metadata <- [email protected]
cell_cluster <- data.frame(cell_ID=rownames(metadata), cluster_ID=metadata$seurat_clusters)
#tSNE
scRNA_seurat = RunTSNE(scRNA_seurat, dims = pc.num)
#group_by_cluster
plot1 = DimPlot(scRNA_seurat, reduction = "tsne", label=T) 

#group_by_sample
plot2 = DimPlot(scRNA_seurat, reduction = "tsne", group.by='orig.ident') 

#combinate
plotc <- plot1+plot2

#UMAP
scRNA_seurat <- RunUMAP(scRNA_seurat, dims = pc.num)
#group_by_cluster
plot3 = DimPlot(scRNA_seurat, reduction = "umap", label=T) 
#group_by_sample
plot4 = DimPlot(scRNA_seurat, reduction = "umap", group.by='orig.ident')
#combinate
plotc <- plot3+plot4


plotc <- plot3+plot4+ plot_layout(guides = 'collect')

哈哈 暂时还看不出harmony与Seurat谁的整合效果好。。。

你可能感兴趣的:(2021-05-26 单细胞分析之harmony与Seurat)