Seurat单细胞分析常见代码-02

1.R程序并行运算和调整R内部允许的对象大小的限制

默认是500 * 1024 ^ 2 = 500 Mb

suppressPackageStartupMessages(library(future)) 
suppressPackageStartupMessages(library(future.apply)) 
plan("multicore", workers = 10) 
options(future.globals.maxSize = 40000 * 1024^2)

future在seurat的具体应用详见:https://satijalab.org/seurat/archive/v3.0/future_vignette.html

2.检查基因名称是否存在基因表达矩阵中

# 查询单个基因IGKV4
rownames(seurat_obj)[grep('IGKV4', rownames(seurat_obj))]
# 查询genelist

# 方式1:区分基因大小写
marker.genes.filte <- gene_list[gene_list %in% rownames(seurat_obj)]
gene.no.find <- gene_list[!gene_list %in% rownames(seurat_obj)]

# 方式2:不区分基因大小写
data("pbmc_small")
cd_genes <- c('Cd79b', 'Cd19', 'Cd200')
CaseMatch(search = cd_genes, match = rownames(x = pbmc_small))


# 方式3:不区分基因大小写,自定义函数
CheckSpecies <- function(subject){ 
  if(class(subject)=="Seurat"){ 
    subject = rownames(subject)[1] 
  } 
  # Human gene 
  if(subject == toupper(subject)) 
    return("Human") 
  # Mouse gene 
  if(subject == Hmisc::capitalize(tolower(subject))) 
    return("Mouse")  
   
} 
FilterGenes <- function(object, marker.genes, unique= TRUE, verbose = T){ 
  if(missing(object))  
    stop("A seurat object must be provided first") 
  if(class(object) != "Seurat") 
    stop("A seurat object must be provided first") 
  if(missing(marker.genes))  
    stop("A list of marker genes must be provided") 
   
  marker.genes <- as.character(marker.genes) 
  marker.genes <- unlist(strsplit(marker.genes,",")) 
  marker.genes <- gsub(" ","",marker.genes) 
  species = CheckSpecies(object) 
  if(species == "Human") marker.genes <- toupper(marker.genes) 
  if(species == "Mouse") marker.genes <- Hmisc::capitalize(tolower(marker.genes))  
   
  if(verbose) print(paste("Before filtration:",length(marker.genes))) 
  marker.genes.filter <- CaseMatch(search = marker.genes, match = rownames(object)) 
  if(unique) marker.genes.filter <- unique(marker.genes.filter) 
  if(verbose){ 
    print(paste("After filtration:",length(marker.genes.filter))) 
    gene.no.find <- setdiff(marker.genes, marker.genes.filter) 
    if (length(gene.no.find)>0) print(paste("Not found genes:",paste(gene.no.find, collapse=" "))) 
  } 
  return(as.character(marker.genes.filter)) 
}

#-------------------------------
DefaultAssay(seurat_obj) <- "SCT" 
gene_list <- c("Sca1", "Cd34",  "Thy1", "Endoglin","CD73","Lepr","VCAM1","Cxcl12","Grem1") 
gene_features <- FilterGenes(seurat_obj, gene_list) 
# [1] "Before filtration: 9" 
# [1] "After filtration: 6" 
# [1] "Not found genes: Sca1 Endoglin Cd73"

CaseMatch()为seurat包内部函数,https://github.com/satijalab/seurat/blob/master/R/utilities.R

3.Dimplot绘图调整

# 去除坐标和Legend
umap_theme <- theme( 
  axis.line=element_blank(), 
  axis.text.x=element_blank(), 
  axis.text.y=element_blank(), 
  axis.ticks=element_blank(), 
  axis.title.x=element_blank(), 
  axis.title.y=element_blank(), 
  panel.background=element_blank(), 
  panel.border=element_blank(), 
  panel.grid.major=element_blank(), 
  panel.grid.minor=element_blank() 
) 
png('figures/basic_umap_clusters.png', width=7, height=7, res=200, units='in') 
DimPlot(seurat_obj, reduction = "umap", group.by='seurat_clusters', label=TRUE) + 
  umap_theme + NoLegend() + ggtitle('UMAP colored by seurat clusters') 
dev.off() 
png('figures/basic_tsne_clusters.png', width=7, height=7, res=200, units='in') 
DimPlot(seurat_obj, reduction = "tsne", group.by='seurat_clusters', label=TRUE) + 
 umap_theme + NoLegend() + ggtitle('t-SNE colored by seurat clusters') 
dev.off()

如果细胞太密,有重叠,可以设置透明度,我一般alpha.use设置0.8或者0.9

Idents(seurat_obj) <- "celltype"
alpha.use <- 0.9
p1 <- DimPlot(object = seurat_obj, reduction = "tsne", label = TRUE, label.size = 3, pt.size=0.3, raster=FALSE)+labs(x="TSNE1", y="TSNE2")
p1$layers[[1]]$mapping$alpha <- alpha.use
p1 <- p1 + scale_alpha_continuous(range = alpha.use, guide = F)
save_plot("celltype_TSNE_clustering.png", p1, base_height = 8, base_aspect_ratio = 1.2, base_width = NULL, dpi=600)
save_plot("celltype_TSNE_clustering.pdf", p1, base_height = 8, base_aspect_ratio = 1.2, base_width = NULL)

4.FeaturePlot绘图配色

一般我常用的配色是:c("lightgrey", "red")或者c("lightgrey", "blue")

plot1 <- FeaturePlot(seurat_obj, features = c("CD3E","CXCR5","PDCD1"), pt.size=0.5, reduction="tsne", cols =  c("lightgrey", "red"),label = T)
plot1 <- FeaturePlot(seurat_obj, features = c("CD3E","CXCR5","PDCD1"), pt.size=0.5, reduction="tsne", cols =  c("lightgrey", "blue"),label = T)

为了对比细胞间的基因表达差异,常使用不同的配色;
使用viridis调色板

# set up list of canonical cell type markers
canonical_markers <- list(
  'Astrocyte' = c('GFAP', 'AQP4', 'SLC1A2'),
  'Pan-neuronal' = c('SNAP25', 'SYT1'),
  'Excitatory Neuron' = c('SLC17A7', 'SATB2'),
  'Inhibitory Neuron' = c('GAD1', 'GAD2'),
  'Microglia' = c('CSF1R', 'CD74', 'P2RY12'),
  'Oligodendrocyte' = c('MOBP', 'MBP', 'MOG'),
  'Olig. Progenitor' = c('PDGFRA', 'CSPG4')
)

library(viridis)
# create feature plots, cutoff expression values for the 98th and 99th percentile
plot_list <- FeaturePlot(
  seurat_obj,
  features=unlist(canonical_markers),
  combine=FALSE, cols=viridis(256),
  max.cutoff='q98'
)

# apply theme to each feature plot
for(i in 1:length(plot_list)){
  plot_list[[i]] <- plot_list[[i]] + umap_theme + NoLegend()
}

png('figures/basic_canonical_marker_featurePlot.png', width=10, height=10, units='in', res=200)
CombinePlots(plot_list)
dev.off()
image.png

可参考的配色有

library(ggplot2)
library(RColorBrewer)
FeaturePlot(pbmc_small,"LYZ") + scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "RdBu")))
FeaturePlot(pbmc_small,"LYZ") + scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))

5.FeaturePlot基因表达均值

我们常常用一些小的特征基因(3-5 个基因)使用 FeaturePlot() 来辅助celltype的鉴定,文献中也有此操作(https://www.nature.com/articles/s42003-020-0922-4/figures/1)

image.png

##### Plot signature of genes in Feature plot #######
Plot_sign <- function(Seraut.object, signature, operator = sum) {
    x <- Seraut.object
    DefaultAssay(x) <- "RNA"
    x[["Sign_exp"]] <- apply(FetchData(object = x, 
                                       vars = signature),
                             1,
                             operator)
    FP <- FeaturePlot(x, reduction = "umap", 
                      features = 'Sign_exp', 
                      label = T, 
                      order=T,
                      cols = c("lightgrey", "red")) +
                      #cols = as.vector(coolwarm(5))) +
    theme(plot.title = element_text(color="black", size=18, face="bold.italic"),
          plot.subtitle = element_text(color="black", size=10, face="italic"),
          axis.text.x = element_text(angle = 90, face = "bold", color = 'black', size=16, hjust =1), 
          axis.title.x = element_text(face = "bold", color = "black", size = 18),
          axis.text.y = element_text(angle = 0, face = "bold", color = 'black', size=16),
          axis.title.y = element_text(face = "bold", color = "black", size = 18),
          legend.text = element_text(face = "bold", color = "black", size = 12),
          panel.background = element_rect(fill = "white",colour = "black", size = 1, linetype = "solid")) +
    labs(title = "Signature plot", subtitle = paste('MarkerGenes: ',toString(signature), sep=''), 
         x = "UMAP 1", y = "UMAP 2") 
    return(FP)
    }
# 查看CFD,DLK1,LUM基因的平均表达量
DefaultAssay(seurat_obj) <- 'RNA'
pL <- Plot_sign(seurat_obj,
                  signature= c('CFD','DLK1','LUM'), 
                  operator = mean, title = 'LEY')

#------------------------------------------
# 也可不用自定义函数
EpithelialCells <-  c("EPCAM","KRT8","KRT18")
StromalCells <-  c("COL1A1","COL1A2","COL6A1","COL6A2","VWF","PLVAP","CDH5","S100B")
ImmuneCells <- c("CD52","CD2","CD3D","CD3G","CD3E","CD79A","CD79B","CD14","FCGR3A","CD68","CD83","CSF1R","FCER1G")
MyeloidCells <- c("CD68", "XCR1", "CLEC9A", "CLEC10A", "CD1C", "S100A8", "S100A9", "TPSAB1", "OSM")
TCells <- c("NKG7", "KLRC1", "CCR7", "FOXP3", "CTLA4", "CD8B", "CXCR6", "CD3D")
BCells<- c("MZB1", "IGHA1", "SELL", "CD19", "AICDA")

# 计算基因集的平均表达量
[email protected]$EpithelialCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% EpithelialCells,],2,mean)
[email protected]$StromalCells<- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% StromalCells,],2,mean)
[email protected]$ImmuneCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% ImmuneCells,],2,mean)
[email protected]$MyeloidCells<- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% MyeloidCells,],2,mean)
[email protected]$TCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% TCells,],2,mean)
[email protected]$BCells <- apply(seurat_obj@assays$SCT@data[rownames(seurat_obj@assays$SCT@data) %in% BCells,],2,mean)

p1 <- FeaturePlot(seurat_obj,reduction = "umap",features = Types, cols = c("gray","orange","orange","red","red"),combine = FALSE)
for(i in 1:length(p1)) { 
  p1[[i]] <- p1[[i]] + NoLegend() + NoAxes() 

}

pdf("Cluster_Score_UMAP.pdf",width = 8,height =8)
cowplot::plot_grid(plotlist = p1) 
dev.off()

6.AddModuleScore()使用

AddModuleScore()的算法跟上面的计算基因集均值的方法不同,后面细究下源码。
AddModuleScore()为Seurat包中自带函数,需要先计算基因集中所有基因的平均值,再根据平均值把表达矩阵切割成若干份,然后从切割后的每一份中随机抽取对照基因(基因集外的基因)作为背景值。因此,在整合不同样本的情况下,即使使用相同基因集为相同细胞打分,也会产生不同的富集评分;

# 官方示例
cell_typeA_marker_gene_list <- list(c("Gene1", "Gene2", "Gene3", "Gene4"))
object <- AddModuleScore(object = object, features = cell_typeA_marker_gene_list, name = "cell_typeA_score")
FeaturePlot(object = object, features = "cell_typeA_score1")

# 给出一段GitHub文献中的代码,学习下大佬的使用方法
b_cell <- c("MS4A1") # ref5
macrophage <- c("CD68", "IDO1") # ref5
plasmacytoid_dendritic_cell <- c("CLEC4C","NRP1") # ref5
erythrocyte <- c("HBB", "HBA1", "HBA2") # ref6
cytotoxic_t_cell <- c("GZMA", "GZMK", "IFNG") # ref5
regulatory_t_cell <- c("FOXP3", "IL2RA", "IKZF2") # ref5
t_helper <- c("CXCL13","PDCD1","FABP5") # ref5
naive_t_cell <- c("CCR7", "IL7R", "LEF1") # ref5
progenitor <- c("CD34") # ref5
mast_cell <- c("TPSAB1","TPSB2", "KIT", "GATA1", "GATA2") # ref7

# marker基因打分函数
score_marker_genes <- function(seurat_object, nbins=24){
  AddModuleScore(seurat_object, 
                 features = list(b_cell, macrophage, plasmacytoid_dendritic_cell, erythrocyte, cytotoxic_t_cell, regulatory_t_cell, t_helper, naive_t_cell, progenitor, mast_cell),
                 name=c("b_cell", "macrophage", "plasmacytoid_dendritic_cell", "erythrocyte", "cytotoxic_t_cell", "regulatory_t_cell", "t_helper", "naive_t_cell", "progenitor", "mast_cell"),
                 nbin=nbins)
}
hl <- score_marker_genes(hl)

# marker基因可视化函数
plot_marker_genes <- function(seurat_object, image_name){
  ggsave(file = str_glue('../figures/{image_name}.pdf'),
         plot = FeaturePlot(seurat_object,
                            features = c("b_cell1", "macrophage2", "plasmacytoid_dendritic_cell3", "erythrocyte4", "cytotoxic_t_cell5", "regulatory_t_cell6", "t_helper7", "naive_t_cell8", "progenitor9", "mast_cell10"),
                            min.cutoff = "q10", max.cutoff = "q90",
                            ncol=4, label=TRUE, order = TRUE),
         device = "pdf", width = 50, height = 40, units = "cm")
}
plot_marker_genes(hl, "hl_markers")

7.DotPlot按基因集分隔显示

DotPlot的features为list时,list组间会有间隔,便于比较不同celltype的marker基因差异,可视化效果更好;

cgs = list(
  Epi = c("EPCAM","PAX8","KRT18","CD24","KRT19","SCGB2A2","KRT5","KRT15"),
  Meyloid = c("CD68","LYZ","MARCO","AIF1","TYROBR","MS4A6A","CD1E","IL3RA","LAMP3"),
  T_cell = c("CD3D",'CD3E','TRAG','CD3G','CD2'),
  B_cell = c("CD79A","CD79B","IGKC","CD19","MZB1","MS4A1"),
  Endo = c("CLDN5","PECAM1","VWF","FLT1","RAMP2"),
  Fibro = c("COL1A1","COL1A2","COL3A1","BGN","DCN","POSTN","C1R")
)

p_umap <- DimPlot(seurat_obj, reduction = "umap", group.by = "AllTypes", label = T)
p <- DotPlot(seurat_obj, features = cgs, assay = "SCT") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + p_umap + plot_layout(widths = c(2, 1))
save_plot("Dotplot_Celltype_UMAP_clustering.png", p, base_height = 8, base_aspect_ratio = 2.4, base_width = NULL, dpi=600)
save_plot("Dotplot_Celltype_UMAP_clustering.pdf", p, base_height = 8, base_aspect_ratio = 2.4, base_width = NULL)
image.png
# 横坐标和纵坐标互换,水平翻转
DotPlot(pbmc_small, features = unique(top5$gene), cols = "RdYlBu", col.min = 0, dot.scale = 5) + coord_flip()
# x轴标签倾斜
DotPlot(pbmc_small, features = unique(top5$gene), cols ="Spectral", col.min = 0, dot.scale = 5) + RotatedAxis()
DotPlot(pbmc_small, features = unique(top5$gene), cols ="Spectral", col.min = 0, dot.scale = 5) + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

8.DoHeatmap自定义颜色

library(viridis)
DoHeatmap(seurat_obj, features=top_DEGs, group.by='seurat_clusters', label=FALSE) + scale_fill_gradientn(colors=viridis(256)) + NoLegend()

DoHeatmap(pbmc_small) + scale_fill_viridis()
DoHeatmap(pbmc_small) + scale_fill_gradientn(colors = c("blue", "white", "red"))

mapal <- colorRampPalette(RColorBrewer::brewer.pal(11,"RdBu"))(256)
DoHeatmap(pbmc_small,  angle = 90, size = 3) + scale_fill_gradientn(colours = rev(mapal))

#------------------------------------------
seurat_obj <- subset(initial_object, idents = 1:16)

levels(Idents(seurat_obj))

my_cols <- c('3'='#F68282','15'='#31C53F','5'='#1FA195','1'='#B95FBB','13'='#D4D915',
  '14'='#28CECA','9'='#ff9a36','8'='#2FF18B','11'='#aeadb3','6'='#faf4cf',
  '2'='#CCB1F1','12'='#25aff5','7'='#A4DFF2','4'='#4B4BF7','16'='#AC8F14',
  '10'='#E6C122')

my_cols2 <- my_cols[order(as.integer(names(my_cols)))]
scales::show_col(my_cols2)

# 如修改配色,DimPlot和DoHeatmap同步修改
DimPlot(seurat_obj,
cols = my_cols2, label=TRUE , repel=TRUE)
DoHeatmap(seurat_obj,  features = c("gene1", "gene2"),  group.colors = my_cols2)

9.小鼠基因转人基因

# Function to convert human gene symbols into mouse gene symbols
#https://gist.github.com/FloWuenne/f8fc922477df04c1642e9d8945c48d47
#https://rjbioinformatics.com/2016/10/14/converting-mouse-to-human-gene-names-with-biomart-package/
convertHumanGeneList <- function(x){
  require("biomaRt")
  human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
  mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
  
  genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
  
  humanx <- unique(genesV2[, 2])
  
  no_mouse_genes <- length(x)
  no_human_genes <- length(humanx)
  
  if(no_human_genes != no_mouse_genes){
    print("Some genes could not be translated!")
    genes_not_trans <- setdiff(x,genesV2$HGNC.symbol)
    print("These genes could not be translated:")
    print(genes_not_trans)
    print(paste("A total number of ",length(genes_not_trans),"genes could not be translated!"),sep=" ")
  }else{
    print("All genes were translated successfully!")
  }
  
  # Print all gene names that could not be translated and the number of genes that were not translated
  return(humanx)
}  
# 示例
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
s.genes <- convertHumanGeneList(s.genes)
g2m.genes <- convertHumanGeneList(g2m.genes)

10.统计给定基因在cluster的数目

计算表达基因的细胞比例的函数,在https://github.com/satijalab/seurat/issues/371中有网友给出了一个自定义函数,如下:

# Function that can calculate proportion of cells expressing a gene
# calculates total cells expressing a gene (raw counts > 0) by metadata groups
# can be grouped by different samples types or cluster_# based on metadata
# 'ncells' counts to total number of cells, can be passed to have percentages in calc_helper
# you can adjust the threshold for RNA count to select for cells with more 'higher' expression
TotalCellExpringGene <- function(object, genes, group.by = "all"){
  if(group.by == "all"){
    prct = unlist(lapply(genes,calc_helper, object=object))
    result = data.frame(Markers = genes, Cell_proportion = prct)
    return(result)
  }
  else{        
    list = SplitObject(object, group.by)
    factors = names(list)
    
    results = lapply(list, PrctCellExpringGene, genes=genes)
    for(i in 1:length(factors)){
      results[[i]]$Feature = factors[i]
    }
    combined = do.call("rbind", results)
    return(combined)
  }
}
# for total cells:
calc_helper <- function(object,genes){
  counts = object[['RNA']]@counts
  ncells = ncol(counts)
  if(genes %in% row.names(counts)){
    sum(counts[genes,]>0)
  }else{return(NA)}
}
# 示例
PrctCellExpringGene(seurat_object ,genes =c("geneA","geneB"), group.by = "sample.ID")
#Markers Cell_proportion
#MS1.1 geneA 0.022727273
#MS1.2 geneB 0.045337995
#MS2.1 geneA 0.000000000 
#MS2.2 geneB 0.030033951 
#MS3.1 geneA 0.001821125 
#MS3.2 geneB 0.035410765

同样,也可以计算在cluster的占比

PrctCellExpringGene <- function(object, genes, group.by = "all"){
  if(group.by == "all"){
    prct = unlist(lapply(genes,calc_helper.2, object=object))
    result = data.frame(Markers = genes, Cell_proportion = prct)
    return(result)
  }
  else{        
    list = SplitObject(object, group.by)
    factors = names(list)
    
    results = lapply(list, PrctCellExpringGene, genes=genes)
    for(i in 1:length(factors)){
      results[[i]]$Feature = factors[i]
    }
    combined = do.call("rbind", results)
    return(combined)
  }
}
# for percentage of cells use this function:
calc_helper.2 <- function(object,genes){
  counts = object[['RNA']]@counts
  ncells = ncol(counts)
  if(genes %in% row.names(counts)){
    sum(counts[genes,]>0)/ncells
  }else{return(NA)}
}
#Example for finding all cells expressing a gene:
#performs the counts based on the cluster assignment
NumberCellperGene <- PrctCellExpringGene(seurat_object,genes = c("Vim","Pdgfra","Pdgfrb","Acta2","Cnn1"), group.by = "seurat_clusters")

你可能感兴趣的:(Seurat单细胞分析常见代码-02)