封装函数-用R包Seurat跑单套数据

呜呜最近发现我工作效率低的一个原因就是重复性工作没有流程化，一气之下，把seurat分析单套数据的流程封装了起来，步骤包含数据质控、数据标准化、聚类以及初步的细胞类型鉴定。细胞类型鉴定是用每个cluster的top marker来标注的。之后再更新整合多套数据的流程，希望与君分享

1. 用到的所有函数放在了SeuratWrapperFunction.R中了

这个需要用source()函数导入到下面封装好的代码中的

### Time: 20221025
###  Author: zhengyiyi
## load function 
library(Seurat)
library(SingleCellExperiment)
library(scater)
library(MAST)
library(ggplot2)
library(patchwork)
library(RColorBrewer)
library(Seurat)
library(openxlsx)
library("grid")
library("ggplotify")
library(magrittr)
library(dplyr)
library(ggsignif)
library("ggplot2")
library(cowplot)
library(openxlsx)

CreateFirstFilterSeuratObject <- function(exp_count, pr_name, Or_ident, 
                                          min_cell, min_gene, mt_pattern,rb_pattern){
  # CreateSeuratObject : filter gene and cell by min.cells(Include features detected in at least this many cells) and min.features(Include cells where at least this many features are detected.)
  exp_count.seurat <- CreateSeuratObject(exp_count, 
                                         project = pr_name, 
                                         min.cells = min_cell,
                                         min.features = min_gene)
  [email protected]$orig.ident <- as.factor(Or_ident)
  [email protected] <- exp_count.seurat$orig.ident
  ## Check the mt gene and rp gene
  exp_count.seurat[["percent.mt"]] <- PercentageFeatureSet(object = exp_count.seurat, pattern = mt_pattern)
  exp_count.seurat[["percent.rb"]] <- PercentageFeatureSet(object = exp_count.seurat, pattern = rb_pattern)
  
  print(levels([email protected]))
  print(table(exp_count.seurat$orig.ident))
  print(head(x = [email protected], 5))
  print(dim([email protected]))
  return(exp_count.seurat)
}

SeuratQC <- function(exp_count.seurat, name_pre){
  folder_name="1.QC"
  if (file.exists(folder_name)){
    print("1.QC existed.")
  }
  else{
    dir.create(folder_name)
  } 
  file_name=paste(name_pre,"Violinplot.pdf",sep="_")
  
  reads.drop <- isOutlier(as.numeric(exp_count.seurat$nCount_RNA), nmads = 3, type = "lower")
  feature.drop <- isOutlier(as.numeric(exp_count.seurat$nFeature_RNA), nmads = 3, type = "lower")
  qc.mito2 <- isOutlier(exp_count.seurat$percent.mt, nmads = 3, type="higher")
  qc.ribo2 <- isOutlier(exp_count.seurat$percent.rb, nmads = 3, type="higher")
  
  ### 保留下那些质量差的细胞，看看这些细胞的情况 
  low_q_cells <- (reads.drop | feature.drop | qc.mito2 | qc.ribo2)
  exp_count.seurat$lowquality_cells <- "High_qualitycells"
  exp_count.seurat$lowquality_cells[low_q_cells] <- "Low_qualitycells"
  
  plot1=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "lowquality_cells")
  plot2=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "lowquality_cells")
  plot3=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "percent.rb", group.by = "lowquality_cells")
  
  pdf(file = file_name, height = 10, width = 20)
  print(VlnPlot(object = exp_count.seurat, features = c("nFeature_RNA", "nCount_RNA", 
                                                        "percent.mt","percent.rb"), ncol = 2, pt.size=0)) 
  
  opar <- par(no.readonly = TRUE)
  par(mfrow=c(2,2))
  print(hist(exp_count.seurat$nCount_RNA, breaks = 100, main = "Reads Count Distribution", col = "grey80", 
             xlab = "library size", ylab = "frequency",
             cex.lab = 1.4, cex.axis = 1.4))
  print(abline(v = 10000, col = "red", lwd = 2, lty = 2))
  
  print(hist(exp_count.seurat$nFeature_RNA, breaks = 100, main = "Gene Count Distribution",
             col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
  print(abline(v = 1500, col = "red", lwd = 2, lty = 2))
  
  print(hist(exp_count.seurat$percent.mt, breaks = 10, main = "Mitochondrial Percentage Distribution",
             col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
  print(abline(v = 20, col = "red", lwd = 2, lty = 2))
  
  print(hist(exp_count.seurat$percent.rb, breaks = 10, main = "Ribosome Percentage Distribution",
             col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
  print(abline(v = 20, col = "red", lwd = 2, lty = 2))
  par(opar)
  
  print(plot1 + plot2 + plot3)
  dev.off()
  
  #copy files to 1.QC
  result_files <- c(file_name)
  file.copy(result_files, folder_name ,overwrite = T)
  file.remove(result_files) 
  print("Based on the Violin plot, you can filter the cell by the nFeature_RNA, nCount_RNA and/or  percent.mt rb")
}


SubsetSeuratData <- function(exp_count.seurat, min_gene, max_gene,
                             min_ncountRNA,  max_ncountRNA,
                             max_mt_percent, max_rb_percent){
  exp_count.seurat <- subset(exp_count.seurat,  subset = nCount_RNA < max_ncountRNA &  nCount_RNA > min_ncountRNA & nFeature_RNA < max_gene & nFeature_RNA > min_gene  & percent.mt < max_mt_percent & percent.rb < max_rb_percent )
  # QC
  qc_pdf_name <- paste("filtered_by_gene", min_gene, "UMI", min_ncountRNA, "percent.mt", max_mt_percent, "percent.rb", max_rb_percent, sep="_")
  SeuratQC(exp_count.seurat, qc_pdf_name)
  dim(exp_count.seurat)
  print(levels(exp_count.seurat$orig.ident))
  return(exp_count.seurat)
}


SelectPCsByLogNormalize <- function(exp_count.seurat, file_prefix, scale_factor, 
                                    nfeatures, npc_plot){
  
  folder_name="2.RunPCA"
  if (file.exists(folder_name)){
    print("2.RunPCA existed.")
  }
  else{
    dir.create(folder_name)
  } 
  
  DefaultAssay(exp_count.seurat) <- "RNA"
  # Normalize Find VariableGene and Scale Data
  exp_count.seurat <- NormalizeData(exp_count.seurat, normalization.method = "LogNormalize",
                                    scale.factor = scale_factor)
  exp_count.seurat <- FindVariableFeatures(exp_count.seurat, selection.method = "vst", nfeatures = nfeatures)
  exp_count.seurat <- ScaleData(exp_count.seurat, vars.to.regress = c("percent.rb","percent.mt"))
  
  # RunPCA
  exp_count.seurat <- RunPCA(exp_count.seurat, pc.genes = [email protected], 
                             npcs = npc_plot, verbose = FALSE,)
  
  # Select PCs:1:30pcs
  selected_pcs_name <- paste0(file_prefix, "_Selected_PCs.pdf")
  pdf(file = selected_pcs_name, height = 8, width = 8)
  print(ElbowPlot(object = exp_count.seurat, ndims = npc_plot, reduction = "pca"))
  dev.off()
  
  # Plot 
  pca_plot_name <- paste0(file_prefix, "_PCA_plot",".pdf")
  pdf(pca_plot_name,8,8)
  print(DimPlot(exp_count.seurat, reduction="pca", label = TRUE, pt.size=0.5, group.by="ident"))
  dev.off()
  
  #Copy files to 2.RunPCA
  result_files <- c(selected_pcs_name,pca_plot_name)
  file.copy(result_files, folder_name ,overwrite = T)#拷贝文件
  file.remove(result_files) #移除拷贝完的文件
  
  return(exp_count.seurat)
}

SelectPCsBySCTransform <- function(exp_count.seurat, file_prefix, npc_plot){
  
  folder_name="2.RunPCA"
  if (file.exists(folder_name)){
    print("2.RunPCA existed.")
  }
  else{
    dir.create(folder_name)
  } 
  
  # Normalize Find VariableGene and Scale Data
  exp_count.seurat <- SCTransform(exp_count.seurat, vars.to.regress = c("percent.rb","percent.mt"))
  # RunPCA
  exp_count.seurat <- RunPCA(exp_count.seurat, verbose = FALSE,  npcs = npc_plot)
  
  # Select PCs:1:30pcs
  selected_pcs_name <- paste0(file_prefix, "_Selected_PCs.pdf")
  pdf(file = selected_pcs_name, height = 8, width = 8)
  print(ElbowPlot(object = exp_count.seurat, ndims = npc_plot, reduction = "pca"))
  dev.off()
  
  # Plot 
  pca_plot_name <- paste0(file_prefix, "_PCA_plot",".pdf")
  pdf(pca_plot_name, 8, 8)
  print(DimPlot(exp_count.seurat,reduction="pca", label = TRUE, pt.size= 0.5,group.by="ident",shape.by="orig.ident"))
  dev.off()
  
  #Copy files to 2.RunPCA
  result_files <- c(selected_pcs_name,pca_plot_name)
  file.copy(result_files, folder_name ,overwrite = T)#拷贝文件
  file.remove(result_files) #移除拷贝完的文件
  
  return(exp_count.seurat)
}

# RunUMAP and RunTSNE and Then find cluster
PlotCluster <- function(exp_count.seurat, file_prefix, npc_used, k_param, resolution_number){
  
  folder_name="3.PlotCluster"
  if (file.exists(folder_name)){
    print("3.PlotCluster existed.")
  }
  else{
    dir.create(folder_name)
  } 
  
  # FinderNeighbors 
  exp_count.seurat <- FindNeighbors(exp_count.seurat, dims = 1:npc_used, verbose = FALSE, k.param = k_param)
  exp_count.seurat <- FindClusters(exp_count.seurat, verbose = FALSE, resolution = resolution_number)
  
  # RunTSNE and RunUMAP
  exp_count.seurat <- RunTSNE(exp_count.seurat, dims = 1:npc_used, verbose = FALSE, check_duplicates = FALSE)
  exp_count.seurat <- RunUMAP(exp_count.seurat, dims = 1:npc_used, verbose = FALSE)
  
  # RenameIdents from zero to one
  levels_define <- as.numeric(levels(exp_count.seurat))
  new.cluster.ids <- levels_define + 1
  names(new.cluster.ids) <- levels(exp_count.seurat)
  exp_count.seurat <- RenameIdents(exp_count.seurat, new.cluster.ids)
  
  combined_cluster_plotname <- paste0(file_prefix, "_combined_cluster_resolution_",resolution_number,".pdf")
  pdf(combined_cluster_plotname,7,7)
  print(DimPlot(exp_count.seurat,reduction="umap",label = TRUE, pt.size = 0.5, 
                label.size = 4.5, repel=TRUE, group.by="ident"))
  print(DimPlot(exp_count.seurat,reduction="tsne", label = TRUE, pt.size = 0.5, 
                label.size = 4.5, repel=TRUE, group.by="ident"))
  dev.off()
  
  split_clustering_plot_name <- paste0(file_prefix, "_split_clustering_resolution_", resolution_number, ".pdf")
  pdf(split_clustering_plot_name,14,7)
  print(DimPlot(exp_count.seurat,reduction="umap",label = TRUE, pt.size = 0.5, 
                label.size = 4.5, group.by="ident", repel=TRUE, split.by="orig.ident"))
  print(DimPlot(exp_count.seurat,reduction="tsne", label = TRUE, pt.size = 0.5, 
                label.size = 4.5, group.by="ident", repel=TRUE, split.by="orig.ident")) 
  dev.off()
  
  # Copy files to 2.Cluster
  file.copy(combined_cluster_plotname, folder_name ,overwrite = T)
  file.remove(combined_cluster_plotname) 
  file.copy(split_clustering_plot_name, folder_name ,overwrite = T)
  file.remove(split_clustering_plot_name) 
  return(exp_count.seurat)
}

# Find markers for each cluster
FindmarkerForCluster <- function(exp_count.seurat, file_prefix, min.pct, logfc.threshold, p_val_adj, mt_rb_pattern){
  folder_name="4.MarkersInCluster"
  if (file.exists(folder_name)){
    print("4.MarkersInCluster file existed.")
  }
  else{
    dir.create(folder_name)
  } 
  
  cluster.markers <- FindAllMarkers(object = exp_count.seurat, only.pos = TRUE, 
                                    min.pct = min.pct, logfc.threshold = logfc.threshold)
  index <- cluster.markers$p_val_adj < p_val_adj
  cluster.markers <- cluster.markers[index,]
  # remove mt and rb gene
  index <- grep(mt_rb_pattern, cluster.markers$gene)
  if (length(index)  > 0){
    cluster.markers <- cluster.markers[-index, ]
  }
  
  save_name <- paste0(file_prefix,"_MarkersInClusters.csv")
  write.csv(cluster.markers,save_name)
  
  file.copy(save_name, folder_name,overwrite = T)
  file.remove(save_name)
  return(cluster.markers)
}

# Top markers for each cluster
TopMarkersInCluster <- function(cluster.markers, file_prefix, top_num){
  library(dplyr)
  folder_name="4.MarkersInCluster"
  if (file.exists(folder_name)){
    print("4.MarkersInCluster file existed.")
  }
  else{
    dir.create(folder_name)
  } 
  #将readsCountSM.markers传给group_by，group_by按cluster 排序，再将结果传给top_n，top_n按avg_logFC排序，显示每个类中的前两个
  top_marker <- cluster.markers %>% group_by(cluster) %>% top_n(n = top_num, wt = avg_log2FC)
  file_name=paste(file_prefix, "_top_marker", top_num,".csv",sep="")
  write.csv(top_marker, file =file_name)
  file.copy(file_name, folder_name,overwrite = T)
  return(top_marker)
  file.remove(file_name)
}

# Rename each cluster with top2 markers
MapTop2MarkerEachCluster <- function(exp_count.seurat, cluster.markers, file_prefix){
  library(dplyr)
  library(plyr)
  exp_count.seurat$seurat_clusters <- [email protected]
  
  top2 <- cluster.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
  top2 <- top2 %>%  group_by(cluster) %>%  
    dplyr::mutate(markers = paste0(gene, collapse = "/")) %>% dplyr::slice(1)  
  marker.names <- top2$markers
  current.cluster.ids <- as.character(1:(length(unique(Idents(exp_count.seurat))))) 
  new.cluster.ids <- marker.names
  Idents(exp_count.seurat) <- plyr::mapvalues(Idents(exp_count.seurat),
                                              from = current.cluster.ids, 
                                              to = new.cluster.ids)
  return(exp_count.seurat)
}

2.DataQualityOverviewSingle.R: 数据质控的函数是放在了这个脚本

使用的话需要把source()那行代码改成自己的路径
这边有个特别注意的点就是这个脚本里的第一行
“#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript”
之前我是把脚本放环境变量里了，每次source环境的时候总是就不能直接使用，必须全路径，只要把这个Rscript所在位置换成你用的conda环境中的Rscript就可以了。另外需要chmod +x DataQualityOverviewSingle.R 这个代码，才可以执行

#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
# parameter

library(optparse)
library(getopt)

option_list <- list(
  make_option(c("-o", "--output"), type = "character", default = FALSE,
              action = "store", help = "This is the output directory."
  ),
  make_option(c("-t", "--Type"), type = "character", default = "10X",
              action = "store", help = "This is type of exp matrix, default is 10X, the other is .csv which row is gene, column is cell."
  ),
  make_option(c("--datapath"), type = "character", default = FALSE,
              action = "store", help = "This is the path of exp matrix."
  ),
  
  make_option(c("--project_name"), type = "character", default = FALSE,
              action = "store", help = "This is project name built in seurat object and the pre_fix of files generated by this function."
  ),
  make_option(c("--mt_pattern"), type = "character", default = "^mt-",
              action = "store", help = "This is mitochondria pattern used to calculate its percentage, default is the ^mt-."
  ),
  make_option(c("--rb_pattern"), type = "character", default = "^Rpl|^Rps",
              action = "store", help = "This is ribosome pattern used to calculate its percentage, default is ^Rpl|^Rps"
  ),
  
  make_option(c("--min_cell"), type = "integer", default = 3,
              action = "store", help = "This is min cells number to filter, default is 3."
  ),
  make_option(c("--min_gene"), type = "integer", default = 0,
              action = "store", help = "This is min gene number to filter, default is 0."
  )
  )

# -help 
opt = parse_args(OptionParser(option_list = option_list, 
                              usage = "This Script is general overview data quality of single cell exp matrix!"))
print(opt)

#### step0.load data ####
source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")

print("Step0: Load Data")
##### step00. load functions ####
# read in the functions
timestart<-Sys.time() 
# set the output path
cat("Current working dir: ", opt$output)
cat("\n")
setwd(opt$output)
# load ref object

#### step01. load exp matrix ####

print("Step0: Load exp matrix data")
if(opt$Type=="10X"){
  print("Read exp matrix from 10X cellranger output")
  exp_count <- Read10X(data.dir = opt$datapath)
  print("The rownum and colnum of  exp_count is ")
  print(dim(exp_count))
}else{
  print("Read exp matrix from csv file")
  exp_count <- read.csv(opt$datapath, header = T, row.names = 1) 
  print("The rownum and colnum of normal exp is ")
  print(dim(exp_count))
}


#### step1: Create_seurat_filter_cell ######
print("Step1: Creat Seuart Object with filtering lower genes and lower cells")
Or_ident <- rep(opt$project_name, ncol(exp_count))
exp_count.seurat <- CreateFirstFilterSeuratObject(exp_count, opt$project_name, Or_ident,
                                                   opt$min_cell, opt$min_gene,
                                                   opt$mt_pattern, opt$rb_pattern)

print("The row_num and col_num of the first filtered matrix:")
print(dim(exp_count.seurat))
print("The cellnumber of samples:")
print(table(exp_count.seurat$orig.ident))
name_pre <- paste0("filter_gene_expressed_", opt$min_cell, "cells", opt$min_gene, "genes") 

#### step2: Overview of the data quality ######
print("Step2: Overview of the data quality")
SeuratQC(exp_count.seurat, name_pre)
rds_name <- paste0(opt$project_name, "_dataquality_overview_seurat.rds")
saveRDS(exp_count.seurat, file = rds_name)

print("Happy~Finished!!!")
timeend<-Sys.time()
runningtime<-timeend-timestart
print(runningtime)

3.LogNormalizeClusteringSingle.R脚本: LogNormalize数据并聚类

#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
# parameter

library(optparse)
library(getopt)

option_list <- list(
  make_option(c("-o", "--output"), type = "character", default = FALSE,
              action = "store", help = "This is the output directory."
  ),
  make_option(c( "--seuratobject"), type = "character", default = "10X",
              action = "store", help = "This is absoulte path of seuratobject."
  ),
  
  make_option(c("--min_gene"), type = "integer", default = 50,
              action = "store", help = "This is min gene number to filter, default is 50."
  ),
  make_option(c("--max_gene"), type = "integer", default = 10000,
              action = "store", help = "This is max gene number to filter, default is 10000."
  ),
  make_option(c("--min_ncountRNA"), type = "integer", default = 500,
              action = "store", help = "This is min ncountRNA to filter, default is 5000."
  ),
  make_option(c("--max_ncountRNA"), type = "integer", default = 100000,
              action = "store", help = "This is min ncountRNA to filter, default is 100000."
  ),
  make_option(c("--max_mt_percent"), type = "integer", default = 60,
              action = "store", help = "This is min mitochondria percentage  to filter, default is 60."
  ),
  make_option(c("--max_rb_percent"), type = "integer", default = 60,
              action = "store", help = "This is min ribosome percentage to filter, default is 60."
  ),
  make_option(c("--file_prefix"), type = "character",
              action = "store", help = "This is number of file prefix  to used in the following analysis."
  ),
  make_option(c("--scale_factor"), type = "integer", default = 10000,
              action = "store", help = "This is scale factor used in Seurat NormalizeData function, default is 10000."
  ),
  make_option(c("--nfeatures"), type = "integer", default = 2000,
              action = "store", help = "This is number of features used in Seurat FindVariableFeatures function, default is 2000."
  ),
  make_option(c("--npc_plot"), type = "integer", default = 50,
              action = "store", help = "This is number of principles  to plot, default is 50."
  ),
  make_option(c("--npc_used"), type = "integer", default = 20,
              action = "store", help = "This is number of principles to use in RunTSNE and RunUMAP function, default is 20."
  ),
  
  make_option(c("--k_parameter"), type = "integer", default = 20,
              action = "store", help = "This is k parameter used in FindNeighbors function which will influence the clusering number, default is 20."
  ),
  make_option(c("--resolution_number"), type = "double", default = 0.5,
              action = "store", help = "This is resolution number used in FindClusters function , default is 0.5"
  ),
  make_option(c("--min_pct"), type = "double", default = 0.25,
              action = "store", help = "This is min.pct used in FindAllMarkers function , default is 0.25"
  ),
  make_option(c("--logfc_threshold"), type = "double", default = 0.25,
              action = "store", help = "This is logfc.threshold used in FindClusters function , default is 0.25"
  ),
  make_option(c("--p_val_adj"), type = "double", default = 0.05,
              action = "store", help = "This is p_val_adj used to filter marker , default is 0.05"
  ),
  make_option(c("--top_num"), type = "integer", default = 10,
              action = "store", help = "This is top num marker of each cluster, default is 10"
  ),
  make_option(c("--mt_rb_pattern"), type = "character", default = "^mt-|^Rpl-|^Rps-",
              action = "store", help = "This is mt_rb_pattern used to filter mt/rb genes found in cluster markers, default is ^mt-|^Rpl-|^Rps-"
  )
  
)

# -help 
opt = parse_args(OptionParser(option_list = option_list, usage = "This Script is general processing of single cell exp matrix!"))
print(opt)

source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")

###### step0. subset seurat object ####
timestart<-Sys.time() 

# set the output path
setwd(opt$output)

exp_count.seurat <- readRDS(opt$seuratobject)
print("Step0: Subset Seurat Object")
cat(opt$min_gene, "< gene number <", opt$max_gene, "\n")
cat(opt$min_ncountRNA, "< ncountRNA <", opt$max_ncountRNA, "\n")
cat("max_mt_percent ", opt$max_mt_percent,  "\n")
cat("max_rb_percent ", opt$max_rb_percent,  "\n")
exp_count.seurat <- SubsetSeuratData(exp_count.seurat = exp_count.seurat, 
                                     min_gene = opt$min_gene, max_gene = opt$max_gene, 
                                      min_ncountRNA = opt$min_ncountRNA, max_ncountRNA = opt$max_ncountRNA, 
                                      max_mt_percent = opt$max_mt_percent, max_rb_percent = opt$max_rb_percent)

#### step1. normalize data and run pca ####
print("Step1: LogNormalize Data and Run PCA")
cat("scale factor used in NormalizeData function is ", opt$scale_factor, "\n")
cat("nfeatures used in Seurat FindVariableFeatures function is ", opt$nfeatures, "\n")
cat("npc_plot used in ElbowPlot is ", opt$npc_plot, "\n")
exp_count.seurat <- SelectPCsByLogNormalize(exp_count.seurat = exp_count.seurat, 
                                            file_prefix = opt$file_prefix, 
                                            scale_factor = opt$scale_factor,
                                            nfeatures =  opt$nfeatures,
                                            npc_plot = opt$npc_plot)

#### step2. find clusters ####
print("Step2: FindNeighbors and clusters and then run tsne and umap reduction")
cat("k_param used in Seurat FindNeighbors function is ", opt$k_param, "\n")
cat("resolution_number used in Seurat FindClusters function is ", opt$resolution_number, "\n")
cat("npc_used used in RunTSNE and RUNUMAP is ", opt$npc_used, "\n")
exp_count.seurat <- PlotCluster(exp_count.seurat = exp_count.seurat, 
                             file_prefix = opt$file_prefix, 
                             npc_used = opt$npc_used, 
                             k_param = opt$k_param,
                             resolution_number = opt$resolution_number)
  

#### step3. find top markers #####
print("Step3: find top markers and then map cluster with top2 marke")
cat("min.pct used in Seurat FindAllMarkers function is ", opt$min.pct, "\n")
cat("logfc.threshold used in Seurat FindAllMarkers function is ", opt$logfc.threshold, "\n")
cat("p_val_adj used to filter markers is ", opt$p_val_adj, "\n")

cluster.markers <- FindmarkerForCluster(exp_count.seurat = exp_count.seurat, 
                                         file_prefix = opt$file_prefix, 
                                         min.pct = opt$min_pct, 
                                         logfc.threshold = opt$logfc_threshold, 
                                         p_val_adj = opt$p_val_adj,
                                         mt_rb_pattern = opt$mt_rb_pattern)

TopMarker <- TopMarkersInCluster(cluster.markers = cluster.markers, 
                                 file_prefix = opt$file_prefix, 
                                 top_num = opt$top_num)
# Top markers heatmap
P_heatmap_top_marker <- DoHeatmap(exp_count.seurat, features = TopMarker$gene)
filename <- paste0(opt$file_prefix, "_Heatmap_plot_top_markers.pdf")
folder_name <- "4.MarkersInCluster"
pdf(filename, 30, 15)
print(P_heatmap_top_marker)
dev.off()

file.copy(filename, folder_name, overwrite = T) #copy files
file.remove(filename)


#### step4. map top2 marker to each cluster #####
print("step4: map top2 marker to each cluster")
exp_count.seurat <- MapTop2MarkerEachCluster(exp_count.seurat = exp_count.seurat, 
                                             cluster.markers = cluster.markers,
                                             file_prefix = opt$file_prefix)

#### step5. save the clustering plots aftering assign top2 markers ###
print("step5. save the clustering plots aftering assign top2 markers")

p1 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
              pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident") + NoLegend()
p1 <- p1 + labs(title="Clustering UMAP Reduction")
p1 <- p1 + theme(plot.title = element_text(hjust = 0.5)) 

p2 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
              pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident")
p2 <- p2 + labs(title="Clustering TSNE Reduction")
p2 <- p2 + theme(plot.title = element_text(hjust = 0.5)) 

plots <- plot_grid(p1, p2, ncol=2,rel_widths = c(1.7, 2.2))

cluster_pdfname <- paste0(opt$file_prefix, "_Clustering_TSNE_UMAP.pdf")
folder_name <- "3.PlotCluster"
pdf(cluster_pdfname, width = 18, height = 6)
print(plots)
dev.off()

file.copy(cluster_pdfname, folder_name, overwrite = T)#拷贝文件
file.remove(cluster_pdfname)

rds_name <- paste0(opt$file_prefix, "_lognormalize_clustering_seurat.rds.rds")
saveRDS(exp_count.seurat, file = rds_name)

print("Happy~Finished!!")
timeend<-Sys.time()
runningtime<-timeend-timestart
print(runningtime)

4. SCTransformClusteringSingle.R 脚本: SCTtransform数据并聚类

#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
# parameter

library(optparse)
library(getopt)

option_list <- list(
  make_option(c("-o", "--output"), type = "character", default = FALSE,
              action = "store", help = "This is the output directory."
  ),
  make_option(c( "--seuratobject"), type = "character", default = "10X",
              action = "store", help = "This is absoulte path of seuratobject."
  ),
  
  make_option(c("--min_gene"), type = "integer", default = 50,
              action = "store", help = "This is min gene number to filter, default is 50."
  ),
  make_option(c("--max_gene"), type = "integer", default = 10000,
              action = "store", help = "This is max gene number to filter, default is 10000."
  ),
  make_option(c("--min_ncountRNA"), type = "integer", default = 500,
              action = "store", help = "This is min ncountRNA to filter, default is 5000."
  ),
  make_option(c("--max_ncountRNA"), type = "integer", default = 100000,
              action = "store", help = "This is min ncountRNA to filter, default is 100000."
  ),
  make_option(c("--max_mt_percent"), type = "integer", default = 60,
              action = "store", help = "This is min mitochondria percentage  to filter, default is 60."
  ),
  make_option(c("--max_rb_percent"), type = "integer", default = 60,
              action = "store", help = "This is min ribosome percentage to filter, default is 60."
  ),
  make_option(c("--file_prefix"), type = "character",
              action = "store", help = "This is number of file prefix  to used in the following analysis."
  ),
  make_option(c("--npc_plot"), type = "integer", default = 50,
              action = "store", help = "This is number of principles  to plot, default is 50."
  ),
  make_option(c("--npc_used"), type = "integer", default = 20,
              action = "store", help = "This is number of principles to use in RunTSNE and RunUMAP function, default is 20."
  ),
  
  make_option(c("--k_parameter"), type = "integer", default = 20,
              action = "store", help = "This is k parameter used in FindNeighbors function which will influence the clusering number, default is 20."
  ),
  make_option(c("--resolution_number"), type = "double", default = 0.5,
              action = "store", help = "This is resolution number used in FindClusters function , default is 0.5"
  ),
  make_option(c("--min_pct"), type = "double", default = 0.25,
              action = "store", help = "This is min.pct used in FindAllMarkers function , default is 0.25"
  ),
  make_option(c("--logfc_threshold"), type = "double", default = 0.25,
              action = "store", help = "This is logfc.threshold used in FindClusters function , default is 0.25"
  ),
  make_option(c("--p_val_adj"), type = "double", default = 0.05,
              action = "store", help = "This is p_val_adj used to filter marker , default is 0.05"
  ),
  make_option(c("--top_num"), type = "integer", default = 10,
              action = "store", help = "This is top num marker of each cluster, default is 10"
  ),
  make_option(c("--mt_rb_pattern"), type = "character", default = "^mt-|^Rpl-|^Rps-",
              action = "store", help = "This is mt_rb_pattern used to filter mt/rb genes found in cluster markers, default is ^mt-|^Rpl-|^Rps-"
  )
  
)

# -help 
opt = parse_args(OptionParser(option_list = option_list, usage = "This Script is general processing of single cell exp matrix!"))
print(opt)

source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")

###### step0. subset seurat object ####
timestart<-Sys.time() 

# set the output path
setwd(opt$output)

exp_count.seurat <- readRDS(opt$seuratobject)
print("Step0: Subset Seurat Object")
cat(opt$min_gene, "< gene number <", opt$max_gene, "\n")
cat(opt$min_ncountRNA, "< ncountRNA <", opt$max_ncountRNA, "\n")
cat("max_mt_percent ", opt$max_mt_percent,  "\n")
cat("max_rb_percent ", opt$max_rb_percent,  "\n")
exp_count.seurat <- SubsetSeuratData(exp_count.seurat = exp_count.seurat, 
                                     min_gene = opt$min_gene, max_gene = opt$max_gene, 
                                      min_ncountRNA = opt$min_ncountRNA, max_ncountRNA = opt$max_ncountRNA, 
                                      max_mt_percent = opt$max_mt_percent, max_rb_percent = opt$max_rb_percent)

#### step1. normalize data and run pca ####
print("Step1: SCTtransform Data and Run PCA")
cat("scale factor used in NormalizeData function is ", opt$scale_factor, "\n")
cat("nfeatures used in Seurat FindVariableFeatures function is ", opt$nfeatures, "\n")
cat("npc_plot used in ElbowPlot is ", opt$npc_plot, "\n")
exp_count.seurat <- SelectPCsBySCTransform(exp_count.seurat = exp_count.seurat, 
                                            file_prefix = opt$file_prefix, 
                                            npc_plot = opt$npc_plot)

#### step2. find clusters ####
print("Step2: FindNeighbors and clusters and then run tsne and umap reduction")
cat("k_param used in Seurat FindNeighbors function is ", opt$k_param, "\n")
cat("resolution_number used in Seurat FindClusters function is ", opt$resolution_number, "\n")
cat("npc_used used in RunTSNE and RUNUMAP is ", opt$npc_used, "\n")
exp_count.seurat <- PlotCluster(exp_count.seurat = exp_count.seurat, 
                             file_prefix = opt$file_prefix, 
                             npc_used = opt$npc_used, 
                             k_param = opt$k_param,
                             resolution_number = opt$resolution_number)
  

#### step3. find top markers #####
print("Step3: find top markers and then map cluster with top2 marke")
cat("min.pct used in Seurat FindAllMarkers function is ", opt$min.pct, "\n")
cat("logfc.threshold used in Seurat FindAllMarkers function is ", opt$logfc.threshold, "\n")
cat("p_val_adj used to filter markers is ", opt$p_val_adj, "\n")

cluster.markers <- FindmarkerForCluster(exp_count.seurat = exp_count.seurat, 
                                         file_prefix = opt$file_prefix, 
                                         min.pct = opt$min_pct, 
                                         logfc.threshold = opt$logfc_threshold, 
                                         p_val_adj = opt$p_val_adj,
                                         mt_rb_pattern = opt$mt_rb_pattern)

TopMarker <- TopMarkersInCluster(cluster.markers = cluster.markers, 
                                 file_prefix = opt$file_prefix, 
                                 top_num = opt$top_num)

# Top markers heatmap
P_heatmap_top_marker <- DoHeatmap(exp_count.seurat, features = TopMarker$gene)
filename <- paste0(opt$file_prefix, "_Heatmap_plot_top_markers.pdf")
folder_name <- "4.MarkersInCluster"
pdf(filename, 30, 15)
print(P_heatmap_top_marker)
dev.off()

file.copy(filename, folder_name, overwrite = T)# copy files
file.remove(filename)

#### step4. map top2 marker to each cluster #####
print("step4: map top2 marker to each cluster")
exp_count.seurat <- MapTop2MarkerEachCluster(exp_count.seurat = exp_count.seurat, 
                                             cluster.markers = cluster.markers,
                                             file_prefix = opt$file_prefix)

#### step5. save the clustering plots aftering assign top2 markers ###
print("step5. save the clustering plots aftering assign top2 markers")

p1 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
              pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident") + NoLegend()
p1 <- p1 + labs(title="Clustering UMAP Reduction")
p1 <- p1 + theme(plot.title = element_text(hjust = 0.5)) 

p2 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
              pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident")
p2 <- p2 + labs(title="Clustering TSNE Reduction")
p2 <- p2 + theme(plot.title = element_text(hjust = 0.5)) 

plots <- plot_grid(p1, p2, ncol=2,rel_widths = c(1.7, 2.2))

cluster_pdfname <- paste0(opt$file_prefix, "_Clustering_TSNE_UMAP.pdf")
folder_name <- "3.PlotCluster"
pdf(cluster_pdfname, width = 18, height = 6)
print(plots)
dev.off()

file.copy(cluster_pdfname, folder_name, overwrite = T)#拷贝文件
file.remove(cluster_pdfname)

rds_name <- paste0(opt$file_prefix, "_sctransform_clustering_seurat.rds.rds")
saveRDS(exp_count.seurat, file = rds_name)

print("Happy~Finished!!")

timeend<-Sys.time()
runningtime<-timeend-timestart
print(runningtime)

5.附上使用说明

1) 把SeuratWrapperFunction.R模块下面的代码复制到文件SeuratWrapperFunction.R，并存储

2) 把DataQualityOverviewSingle.R模块下面的代码复制到文件DataQualityOverviewSingle.R，并存储

3) 把LogNormalizeClusteringSingle.R模块下面的代码复制到文件LogNormalizeClusteringSingle.R，并存储

4) 把SCTransformClusteringSingle.R模块下面的代码复制到文件SCTransformClusteringSingle.R，并存储

5) 可执行权限

chmod +x DataQualityOverviewSingle.R
chmod +x LogNormalizeClusteringSingle.R
chmod +x SCTransformClusteringSingle.R

6)使用

## 质量控制
Rscript ~/scripts/seurat/pipeline/DataQualityOverviewSingle.R -o /home/zhengjh/other/PinealGland/Results/Zebrafish -t 10X --datapath /home/zhengjh/other/PinealGland/Data/Zebrafish_GSE123778_RAW/GSM3511192_scSeq1 \
--project_name "zebrafish_scseq1" --mt_pattern "^mt-" --rb_pattern "^rpl|^rps" --min_cell 3 --min_gene 100

## LogNormalize
Rscript ~/scripts/seurat/pipeline/LogNormalizeClusteringSingle.R -o ~/other/PinealGland/Results/Zebrafish --seuratobject ~/other/PinealGland/Results/Zebrafish/zebrafish_scseq1_dataquality_overview_seurat.rds \
--min_gene 200 --max_gene 6500 --min_ncountRNA 200 --max_ncountRNA 75000 \
--max_mt_percent 20 --max_rb_percent 20 --file_prefix zebrafish_scseq1_logtransform \
--scale_factor 10000 --nfeatures 2000 --npc_plot 50 --npc_used 30 \
--k_parameter 20 --resolution_number 0.5 --min_pct 0.25 --logfc_threshold 0.25  \
--p_val_adj 0.05 --top_num 10 --mt_rb_pattern "^mt-|rpl|rps"

## SCTransform
Rscript ~/scripts/seurat/pipeline/SCTransformClusteringSingle.R -o ~/other/PinealGland/Results/Zebrafish --seuratobject ~/other/PinealGland/Results/Zebrafish/zebrafish_scseq1_dataquality_overview_seurat.rds --min_gene 200 --max_gene 6500 --min_ncountRNA 200 --max_ncountRNA 75000 \
--max_mt_percent 20 --max_rb_percent 20 --file_prefix zebrafish_scseq1_scttransform \
--npc_plot 50 --npc_used 30 \
--k_parameter 20 --resolution_number 0.5 --min_pct 0.25 --logfc_threshold 0.25 \
--p_val_adj 0.05 --top_num 10 --mt_rb_pattern "^mt-|rpl|rps"

7) 结果展示

结果展示

最后放个图片
祝君开心快乐健康！
也希望自己一直保持学习不郁闷呜呜u

image.png