Cibersort 算法 分析肿瘤样本免疫细胞组分

2015年斯坦福大学医学院的一个研究团队,提出了一种分析单细胞类型的新方法。这种方法类似于分析一杯奶昔,以找到什么水果加入其中。本研究描述的方法称为Cibersort,在线发表于三月三十日的《自然方法》(Nature Methods)

Cibersort 算法 分析肿瘤样本免疫细胞组分_第1张图片 提出 CIBERSORT 的文章

Newman, Aaron M et al. “Robust enumeration of cell subsets from tissue expression profiles.” Nature methods vol. 12,5 (2015): 453-7. doi:10.1038/nmeth.3337 

 在Pubmed简单搜索,就有 1300+ 的记录,可见该方法很值得学习

Cibersort 算法 分析肿瘤样本免疫细胞组分_第2张图片

CIBERSORTx 新版官网入口:CIBERSORTx 官网https://cibersortx.stanford.edu/

增加了适配大量组织(bulk tissue)样本单细胞测序方面的功能。

Cibersort 算法 分析肿瘤样本免疫细胞组分_第3张图片

CIBERSORTx is an analytical tool from the Alizadeh Lab and Newman Lab to impute gene expression profiles and provide an estimation of the abundances of member cell types in a mixed cell population, using gene expression data. (输入基因表达矩阵,输出样本中的各种细胞类型的丰度。比较有意思的是,CIBERSORT的设计之初的核心目标是“predicting fractions of multiple cell types in gene expression profiles (GEPs)”,可能例子里22个免疫细胞 signature让人太过印象深刻,几乎全互联网的教程都是基于免疫细胞的,CIBERSORT也被认为是专门用来量化免疫浸润的,其实signature是可以按需求自行选择的。)

 source.R文件:

#' CIBERSORT R script v1.03 (last updated 07-10-2015)
#' Note: Signature matrix construction is not currently available; use java version for full functionality.
#' Author: Aaron M. Newman, Stanford University ([email protected])
#' Requirements:
#'       R v3.0 or later. (dependencies below might not work properly with earlier versions)
#'       install.packages('e1071')
#'       install.pacakges('parallel')
#'       install.packages('preprocessCore')
#'       if preprocessCore is not available in the repositories you have selected, run the following:
#'           source("http://bioconductor.org/biocLite.R")
#'           biocLite("preprocessCore")
#' Windows users using the R GUI may need to Run as Administrator to install or update packages.
#' This script uses 3 parallel processes.  Since Windows does not support forking, this script will run
#' single-threaded in Windows.
#'
#' Usage:
#'       Navigate to directory containing R script
#'
#'   In R:
#'       source('CIBERSORT.R')
#'       results <- CIBERSORT('sig_matrix_file.txt','mixture_file.txt', perm, QN)
#'
#'       Options:
#'       i)  perm = No. permutations; set to >=100 to calculate p-values (default = 0)
#'       ii) QN = Quantile normalization of input mixture (default = TRUE)
#'
#' Input: signature matrix and mixture file, formatted as specified at http://cibersort.stanford.edu/tutorial.php
#' Output: matrix object containing all results and tabular data written to disk 'CIBERSORT-Results.txt'
#' License: http://cibersort.stanford.edu/CIBERSORT_License.txt
#' Core algorithm
#' @param X cell-specific gene expression
#' @param y mixed expression per sample
#' @export
CoreAlg <- function(X, y){

  #try different values of nu
  svn_itor <- 3

  res <- function(i){
    if(i==1){nus <- 0.25}
    if(i==2){nus <- 0.5}
    if(i==3){nus <- 0.75}
    model<-e1071::svm(X,y,type="nu-regression",kernel="linear",nu=nus,scale=F)
    model
  }

  if(Sys.info()['sysname'] == 'Windows') out <- parallel::mclapply(1:svn_itor, res, mc.cores=1) else
    out <- parallel::mclapply(1:svn_itor, res, mc.cores=svn_itor)

  nusvm <- rep(0,svn_itor)
  corrv <- rep(0,svn_itor)

  #do cibersort
  t <- 1
  while(t <= svn_itor) {
    weights = t(out[[t]]$coefs) %*% out[[t]]$SV
    weights[which(weights<0)]<-0
    w<-weights/sum(weights)
    u <- sweep(X,MARGIN=2,w,'*')
    k <- apply(u, 1, sum)
    nusvm[t] <- sqrt((mean((k - y)^2)))
    corrv[t] <- cor(k, y)
    t <- t + 1
  }

  #pick best model
  rmses <- nusvm
  mn <- which.min(rmses)
  model <- out[[mn]]

  #get and normalize coefficients
  q <- t(model$coefs) %*% model$SV
  q[which(q<0)]<-0
  w <- (q/sum(q))

  mix_rmse <- rmses[mn]
  mix_r <- corrv[mn]

  newList <- list("w" = w, "mix_rmse" = mix_rmse, "mix_r" = mix_r)

}

#' do permutations
#' @param perm Number of permutations
#' @param X cell-specific gene expression
#' @param y mixed expression per sample
#' @export
doPerm <- function(perm, X, Y){
  itor <- 1
  Ylist <- as.list(data.matrix(Y))
  dist <- matrix()

  while(itor <= perm){
    #print(itor)

    #random mixture
    yr <- as.numeric(Ylist[sample(length(Ylist),dim(X)[1])])

    #standardize mixture
    yr <- (yr - mean(yr)) / sd(yr)

    #run CIBERSORT core algorithm
    result <- CoreAlg(X, yr)

    mix_r <- result$mix_r

    #store correlation
    if(itor == 1) {dist <- mix_r}
    else {dist <- rbind(dist, mix_r)}

    itor <- itor + 1
  }
  newList <- list("dist" = dist)
}

#' Main functions
#' @param sig_matrix file path to gene expression from isolated cells
#' @param mixture_file heterogenous mixed expression
#' @param perm Number of permutations
#' @param QN Perform quantile normalization or not (TRUE/FALSE)
#' @export
CIBERSORT <- function(sig_matrix, mixture_file, perm=0, QN=TRUE){

  #read in data
  X <- read.table(sig_matrix,header=T,sep="\t",row.names=1,check.names=F)
  Y <- read.table(mixture_file, header=T, sep="\t", check.names=F)
  Y <- Y[!duplicated(Y[,1]),]
  rownames(Y)<-Y[,1]
  Y<-Y[,-1]
  X <- data.matrix(X)
  Y <- data.matrix(Y)

  #order
  X <- X[order(rownames(X)),]
  Y <- Y[order(rownames(Y)),]

  P <- perm #number of permutations

  #anti-log if max < 50 in mixture file
  if(max(Y) < 50) {Y <- 2^Y}

  #quantile normalization of mixture file
  if(QN == TRUE){
    tmpc <- colnames(Y)
    tmpr <- rownames(Y)
    Y <- preprocessCore::normalize.quantiles(Y)
    colnames(Y) <- tmpc
    rownames(Y) <- tmpr
  }

  #intersect genes
  Xgns <- row.names(X)
  Ygns <- row.names(Y)
  YintX <- Ygns %in% Xgns
  Y <- Y[YintX,]
  XintY <- Xgns %in% row.names(Y)
  X <- X[XintY,]

  #standardize sig matrix
  X <- (X - mean(X)) / sd(as.vector(X))

  # 矩阵X是LM22矩阵
  # 矩阵Y是待预测的矩阵
  # 然后对
  #empirical null distribution of correlation coefficients
  if(P > 0) {nulldist <- sort(doPerm(P, X, Y)$dist)}

  #print(nulldist)

  header <- c('Mixture',colnames(X),"P-value","Correlation","RMSE")
  #print(header)

  output <- matrix()
  itor <- 1
  mixtures <- dim(Y)[2]
  pval <- 9999

  #iterate through mixtures
  while(itor <= mixtures){

    y <- Y[,itor]

    #standardize mixture
    y <- (y - mean(y)) / sd(y)

    #run SVR core algorithm
    result <- CoreAlg(X, y)

    #get results
    w <- result$w
    mix_r <- result$mix_r
    mix_rmse <- result$mix_rmse

    #calculate p-value
    if(P > 0) {pval <- 1 - (which.min(abs(nulldist - mix_r)) / length(nulldist))}

    #print output
    out <- c(colnames(Y)[itor],w,pval,mix_r,mix_rmse)
    if(itor == 1) {output <- out}
    else {output <- rbind(output, out)}

    itor <- itor + 1

  }

  #save results
  write.table(rbind(header,output), file="CIBERSORT-Results.txt", sep="\t", row.names=F, col.names=F, quote=F)

  #return matrix object containing all results
  obj <- rbind(header,output)
  obj <- obj[,-1]
  obj <- obj[-1,]
  obj <- matrix(as.numeric(unlist(obj)),nrow=nrow(obj))
  rownames(obj) <- colnames(Y)
  colnames(obj) <- c(colnames(X),"P-value","Correlation","RMSE")
  obj
}

LM22.txt文件:

展示部分:完整文件可从Nature Methods文章附录文件下载,调整成如下格式即可

Gene symbol B cells naive B cells memory Plasma cells T cells CD8 T cells CD4 naive T cells CD4 memory resting T cells CD4 memory activated T cells follicular helper T cells regulatory (Tregs) T cells gamma delta NK cells resting
ABCB4 555.7134 10.74424 7.225819 4.31128 4.60586 7.406442 8.043976 6.469993 7.833082 9.312295 13.68284
ABCB9 15.60354 22.09479 653.3923 24.22372 35.67151 30.04819 38.45542 17.60479 46.07366 19.71572 21.86372
ACAP1 215.306 321.621 38.61687 1055.613 1790.097 922.1527 340.8834 1107.798 1995.483 280.0757 512.6369
ACHE 15.11795 16.64885 22.12374 13.42829 27.18773 18.44493 13.44127 14.80554 24.65271 33.65845 16.65374
ACP5 605.8974 1935.201 1120.105 306.3125 744.6566 557.8198 248.5469 711.9497 958.916 493.9691 340.8201
ADAM28 1943.743 1148.12 324.7808 22.68972 40.06171 21.23331 15.20962 24.28141 74.76634 24.37075 193.6094
ADAMDEC1 371.0336 318.4788 127.9674 44.61629 80.9645 47.43351 30.55298 68.73789 156.7077 231.8273 68.27759
ADAMTS3 146.1956 106.0523 74.33917 42.39042 79.2418 57.38591 53.41302 35.41108 76.28062 65.26799 31.43527
ADRB2 486.3438 510.0813 289.7985 899.6485 77.15395 595.2105 142.8319 97.17423 217.0628 2378.452 1957.539
AIF1 24.0743 20.32186 21.96957 742.8158 718.6148 35.17225 39.50876 19.07768 29.36704 1086.96 105.7346
AIM2 327.9581 4538.991 1429.674 294.4846 57.0808 250.2996 3311.978 1115.682 315.6459 407.5929 87.22742
ALOX15 12.23359 13.75324 9.073509 9.774972 18.55656 12.00966 5.239541 13.15225 22.38142 14.45276 30.17685
ALOX5 1799.684 2425.681 190.3315 78.68444 74.9309 132.0101 59.08736 35.00087 143.0134 441.5556 157.0365
AMPD1 263.7451 217.7617 3012.768 66.10749 68.66195 85.31837 68.46141 53.95514 79.26421 114.0495 59.14044
ANGPT4 23.12217 21.31256 230.6807 44.68466 22.06015 44.47513 99.594 101.4456 29.46508 29.25858 9.380756
ANKRD55 284.4127 218.9158 117.2004 141.6981 1853.336 436.4203 79.77497 1077.33 123.8806 176.2439 83.38528
APOBEC3A 102.4292 89.60069 99.594 64.35534 107.2355 52.73721 57.78232 71.37789 36.39007 723.1438 253.3068

第一步:准备R包与CIBERSORT算法

rm(list=ls())   #清空环境变量
options(stringsAsFactors = F)

install.packages('e1071')
install.packages('parallel')
BiocManager::install("preprocessCore")

source("source.R") 

第二步: 准备表达矩阵文件,无需进行Log转换,cibersort算法会完成转换


load("CESC_FPKM_tumor.Rda")
ciber_input <- log2(CESC_FPKM_tumor_final + 1)
write.table(ciber_input, file = "cibersort_input.txt",
            sep = "\t", row.names = T,col.names = NA,quote = F)

 第三步: CIBERSORT计算


#CIBERSORT计算
sig_matrix <- "LM22.txt"   #注释文件名
mixture_file = 'cibersort_input.txt'   #表达数据文件名
res_cibersort <- CIBERSORT(sig_matrix, mixture_file, perm=100, QN=TRUE)#用cibersort算法计算
save(res_cibersort,file = "res_cibersort.Rdata")   #保存结果

  第四步: 可视化展示


#可视化展示
rm(list=ls())
load("res_cibersort.Rdata")
res_cibersort <- res_cibersort[,1:22]   #取前22列为细胞丰度数据
ciber.res <- res_cibersort[,colSums(res_cibersort) > 0]   #去除丰度全为0的细胞

#barplot图
mycol <- ggplot2::alpha(rainbow(ncol(ciber.res)), 0.7) #创建彩虹色板(带70%透明度)
par(bty="o", mgp = c(2.5,0.3,0), mar = c(2.1,4.1,2.1,10.1),tcl=-.25,las = 1,xpd = F)
a = barplot(as.matrix(t(ciber.res)),
        border = NA, # 柱子无边框
        names.arg = rep("",nrow(ciber.res)), # 无横坐标样本名
        yaxt = "n", # 先不绘制y轴
        ylab = "Relative percentage", # 修改y轴名称
        col = mycol) # 采用彩虹色板
axis(side = 2, at = c(0,0.2,0.4,0.6,0.8,1), # 补齐y轴添加百分号
     labels = c("0%","20%","40%","60%","80%","100%"))
legend(par("usr")[2]-20, # 这里-20要根据实际出图的图例位置情况调整
       par("usr")[4], 
       legend = colnames(ciber.res), 
       xpd = T,
       fill = mycol,
       cex = 0.8, 
       border = NA, 
       y.intersp = 1,
       x.intersp = 0.2,
       bty = "n")
dev.off()   #关闭画板

#相关性热图
#install.packages('corrplot')
M <- round(cor(ciber.res),2) # 计算相关性矩阵并保留两位小数

library(corrplot)
corrplot.mixed(M,
               lower.col = "black", #左下方字体颜色为黑色
               tl.pos = "lt",  #标签出现在左侧和顶部
               number.cex = 0.5, #左下方字号为0.5
               tl.cex = 0.5) #标签字号为0.5
dev.off()   #关闭画板

Cibersort 算法 分析肿瘤样本免疫细胞组分_第4张图片

Cibersort 算法 分析肿瘤样本免疫细胞组分_第5张图片

#箱线图
dd1 <- ciber.res %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  pivot_longer(cols = 2:23,
               names_to = "CellType",
               values_to = "Composition")

plot.info <- dd1[]          

ggboxplot(
  plot.info,
  x = "CellType",
  y = "Composition",
  color = "black",
  fill = "CellType",
  xlab = "",
  ylab = "Cell composition",
  main = "TME Cell composition") +
  theme_base() +
  theme(axis.text.x = element_text(
    angle = 90,
    hjust = 1,
    vjust = 1
  ))     

Cibersort 算法 分析肿瘤样本免疫细胞组分_第6张图片

参考:

作者:又是一只小菜鸟
链接:https://www.jianshu.com/p/add97b7640a4
来源:简书
 

作者:菠萝西斯
链接:https://blog.csdn.net/u013429737/article/details/116044812
来源:CSDN

你可能感兴趣的:(生信,肿瘤,TCGA,算法,r语言)