2015年斯坦福大学医学院的一个研究团队,提出了一种分析单细胞类型的新方法。这种方法类似于分析一杯奶昔,以找到什么水果加入其中。本研究描述的方法称为Cibersort,在线发表于三月三十日的《自然方法》(Nature Methods)
Newman, Aaron M et al. “Robust enumeration of cell subsets from tissue expression profiles.” Nature methods vol. 12,5 (2015): 453-7. doi:10.1038/nmeth.3337
在Pubmed简单搜索,就有 1300+ 的记录,可见该方法很值得学习
CIBERSORTx 新版官网入口:CIBERSORTx 官网https://cibersortx.stanford.edu/
增加了适配大量组织(bulk tissue)样本单细胞测序方面的功能。
CIBERSORTx is an analytical tool from the Alizadeh Lab and Newman Lab to impute gene expression profiles and provide an estimation of the abundances of member cell types in a mixed cell population, using gene expression data. (输入基因表达矩阵,输出样本中的各种细胞类型的丰度。比较有意思的是,CIBERSORT的设计之初的核心目标是“predicting fractions of multiple cell types in gene expression profiles (GEPs)”,可能例子里22个免疫细胞 signature让人太过印象深刻,几乎全互联网的教程都是基于免疫细胞的,CIBERSORT也被认为是专门用来量化免疫浸润的,其实signature是可以按需求自行选择的。)
source.R文件:
#' CIBERSORT R script v1.03 (last updated 07-10-2015)
#' Note: Signature matrix construction is not currently available; use java version for full functionality.
#' Author: Aaron M. Newman, Stanford University ([email protected])
#' Requirements:
#' R v3.0 or later. (dependencies below might not work properly with earlier versions)
#' install.packages('e1071')
#' install.pacakges('parallel')
#' install.packages('preprocessCore')
#' if preprocessCore is not available in the repositories you have selected, run the following:
#' source("http://bioconductor.org/biocLite.R")
#' biocLite("preprocessCore")
#' Windows users using the R GUI may need to Run as Administrator to install or update packages.
#' This script uses 3 parallel processes. Since Windows does not support forking, this script will run
#' single-threaded in Windows.
#'
#' Usage:
#' Navigate to directory containing R script
#'
#' In R:
#' source('CIBERSORT.R')
#' results <- CIBERSORT('sig_matrix_file.txt','mixture_file.txt', perm, QN)
#'
#' Options:
#' i) perm = No. permutations; set to >=100 to calculate p-values (default = 0)
#' ii) QN = Quantile normalization of input mixture (default = TRUE)
#'
#' Input: signature matrix and mixture file, formatted as specified at http://cibersort.stanford.edu/tutorial.php
#' Output: matrix object containing all results and tabular data written to disk 'CIBERSORT-Results.txt'
#' License: http://cibersort.stanford.edu/CIBERSORT_License.txt
#' Core algorithm
#' @param X cell-specific gene expression
#' @param y mixed expression per sample
#' @export
CoreAlg <- function(X, y){
#try different values of nu
svn_itor <- 3
res <- function(i){
if(i==1){nus <- 0.25}
if(i==2){nus <- 0.5}
if(i==3){nus <- 0.75}
model<-e1071::svm(X,y,type="nu-regression",kernel="linear",nu=nus,scale=F)
model
}
if(Sys.info()['sysname'] == 'Windows') out <- parallel::mclapply(1:svn_itor, res, mc.cores=1) else
out <- parallel::mclapply(1:svn_itor, res, mc.cores=svn_itor)
nusvm <- rep(0,svn_itor)
corrv <- rep(0,svn_itor)
#do cibersort
t <- 1
while(t <= svn_itor) {
weights = t(out[[t]]$coefs) %*% out[[t]]$SV
weights[which(weights<0)]<-0
w<-weights/sum(weights)
u <- sweep(X,MARGIN=2,w,'*')
k <- apply(u, 1, sum)
nusvm[t] <- sqrt((mean((k - y)^2)))
corrv[t] <- cor(k, y)
t <- t + 1
}
#pick best model
rmses <- nusvm
mn <- which.min(rmses)
model <- out[[mn]]
#get and normalize coefficients
q <- t(model$coefs) %*% model$SV
q[which(q<0)]<-0
w <- (q/sum(q))
mix_rmse <- rmses[mn]
mix_r <- corrv[mn]
newList <- list("w" = w, "mix_rmse" = mix_rmse, "mix_r" = mix_r)
}
#' do permutations
#' @param perm Number of permutations
#' @param X cell-specific gene expression
#' @param y mixed expression per sample
#' @export
doPerm <- function(perm, X, Y){
itor <- 1
Ylist <- as.list(data.matrix(Y))
dist <- matrix()
while(itor <= perm){
#print(itor)
#random mixture
yr <- as.numeric(Ylist[sample(length(Ylist),dim(X)[1])])
#standardize mixture
yr <- (yr - mean(yr)) / sd(yr)
#run CIBERSORT core algorithm
result <- CoreAlg(X, yr)
mix_r <- result$mix_r
#store correlation
if(itor == 1) {dist <- mix_r}
else {dist <- rbind(dist, mix_r)}
itor <- itor + 1
}
newList <- list("dist" = dist)
}
#' Main functions
#' @param sig_matrix file path to gene expression from isolated cells
#' @param mixture_file heterogenous mixed expression
#' @param perm Number of permutations
#' @param QN Perform quantile normalization or not (TRUE/FALSE)
#' @export
CIBERSORT <- function(sig_matrix, mixture_file, perm=0, QN=TRUE){
#read in data
X <- read.table(sig_matrix,header=T,sep="\t",row.names=1,check.names=F)
Y <- read.table(mixture_file, header=T, sep="\t", check.names=F)
Y <- Y[!duplicated(Y[,1]),]
rownames(Y)<-Y[,1]
Y<-Y[,-1]
X <- data.matrix(X)
Y <- data.matrix(Y)
#order
X <- X[order(rownames(X)),]
Y <- Y[order(rownames(Y)),]
P <- perm #number of permutations
#anti-log if max < 50 in mixture file
if(max(Y) < 50) {Y <- 2^Y}
#quantile normalization of mixture file
if(QN == TRUE){
tmpc <- colnames(Y)
tmpr <- rownames(Y)
Y <- preprocessCore::normalize.quantiles(Y)
colnames(Y) <- tmpc
rownames(Y) <- tmpr
}
#intersect genes
Xgns <- row.names(X)
Ygns <- row.names(Y)
YintX <- Ygns %in% Xgns
Y <- Y[YintX,]
XintY <- Xgns %in% row.names(Y)
X <- X[XintY,]
#standardize sig matrix
X <- (X - mean(X)) / sd(as.vector(X))
# 矩阵X是LM22矩阵
# 矩阵Y是待预测的矩阵
# 然后对
#empirical null distribution of correlation coefficients
if(P > 0) {nulldist <- sort(doPerm(P, X, Y)$dist)}
#print(nulldist)
header <- c('Mixture',colnames(X),"P-value","Correlation","RMSE")
#print(header)
output <- matrix()
itor <- 1
mixtures <- dim(Y)[2]
pval <- 9999
#iterate through mixtures
while(itor <= mixtures){
y <- Y[,itor]
#standardize mixture
y <- (y - mean(y)) / sd(y)
#run SVR core algorithm
result <- CoreAlg(X, y)
#get results
w <- result$w
mix_r <- result$mix_r
mix_rmse <- result$mix_rmse
#calculate p-value
if(P > 0) {pval <- 1 - (which.min(abs(nulldist - mix_r)) / length(nulldist))}
#print output
out <- c(colnames(Y)[itor],w,pval,mix_r,mix_rmse)
if(itor == 1) {output <- out}
else {output <- rbind(output, out)}
itor <- itor + 1
}
#save results
write.table(rbind(header,output), file="CIBERSORT-Results.txt", sep="\t", row.names=F, col.names=F, quote=F)
#return matrix object containing all results
obj <- rbind(header,output)
obj <- obj[,-1]
obj <- obj[-1,]
obj <- matrix(as.numeric(unlist(obj)),nrow=nrow(obj))
rownames(obj) <- colnames(Y)
colnames(obj) <- c(colnames(X),"P-value","Correlation","RMSE")
obj
}
LM22.txt文件:
展示部分:完整文件可从Nature Methods文章附录文件下载,调整成如下格式即可
Gene symbol | B cells naive | B cells memory | Plasma cells | T cells CD8 | T cells CD4 naive | T cells CD4 memory resting | T cells CD4 memory activated | T cells follicular helper | T cells regulatory (Tregs) | T cells gamma delta | NK cells resting |
ABCB4 | 555.7134 | 10.74424 | 7.225819 | 4.31128 | 4.60586 | 7.406442 | 8.043976 | 6.469993 | 7.833082 | 9.312295 | 13.68284 |
ABCB9 | 15.60354 | 22.09479 | 653.3923 | 24.22372 | 35.67151 | 30.04819 | 38.45542 | 17.60479 | 46.07366 | 19.71572 | 21.86372 |
ACAP1 | 215.306 | 321.621 | 38.61687 | 1055.613 | 1790.097 | 922.1527 | 340.8834 | 1107.798 | 1995.483 | 280.0757 | 512.6369 |
ACHE | 15.11795 | 16.64885 | 22.12374 | 13.42829 | 27.18773 | 18.44493 | 13.44127 | 14.80554 | 24.65271 | 33.65845 | 16.65374 |
ACP5 | 605.8974 | 1935.201 | 1120.105 | 306.3125 | 744.6566 | 557.8198 | 248.5469 | 711.9497 | 958.916 | 493.9691 | 340.8201 |
ADAM28 | 1943.743 | 1148.12 | 324.7808 | 22.68972 | 40.06171 | 21.23331 | 15.20962 | 24.28141 | 74.76634 | 24.37075 | 193.6094 |
ADAMDEC1 | 371.0336 | 318.4788 | 127.9674 | 44.61629 | 80.9645 | 47.43351 | 30.55298 | 68.73789 | 156.7077 | 231.8273 | 68.27759 |
ADAMTS3 | 146.1956 | 106.0523 | 74.33917 | 42.39042 | 79.2418 | 57.38591 | 53.41302 | 35.41108 | 76.28062 | 65.26799 | 31.43527 |
ADRB2 | 486.3438 | 510.0813 | 289.7985 | 899.6485 | 77.15395 | 595.2105 | 142.8319 | 97.17423 | 217.0628 | 2378.452 | 1957.539 |
AIF1 | 24.0743 | 20.32186 | 21.96957 | 742.8158 | 718.6148 | 35.17225 | 39.50876 | 19.07768 | 29.36704 | 1086.96 | 105.7346 |
AIM2 | 327.9581 | 4538.991 | 1429.674 | 294.4846 | 57.0808 | 250.2996 | 3311.978 | 1115.682 | 315.6459 | 407.5929 | 87.22742 |
ALOX15 | 12.23359 | 13.75324 | 9.073509 | 9.774972 | 18.55656 | 12.00966 | 5.239541 | 13.15225 | 22.38142 | 14.45276 | 30.17685 |
ALOX5 | 1799.684 | 2425.681 | 190.3315 | 78.68444 | 74.9309 | 132.0101 | 59.08736 | 35.00087 | 143.0134 | 441.5556 | 157.0365 |
AMPD1 | 263.7451 | 217.7617 | 3012.768 | 66.10749 | 68.66195 | 85.31837 | 68.46141 | 53.95514 | 79.26421 | 114.0495 | 59.14044 |
ANGPT4 | 23.12217 | 21.31256 | 230.6807 | 44.68466 | 22.06015 | 44.47513 | 99.594 | 101.4456 | 29.46508 | 29.25858 | 9.380756 |
ANKRD55 | 284.4127 | 218.9158 | 117.2004 | 141.6981 | 1853.336 | 436.4203 | 79.77497 | 1077.33 | 123.8806 | 176.2439 | 83.38528 |
APOBEC3A | 102.4292 | 89.60069 | 99.594 | 64.35534 | 107.2355 | 52.73721 | 57.78232 | 71.37789 | 36.39007 | 723.1438 | 253.3068 |
rm(list=ls()) #清空环境变量
options(stringsAsFactors = F)
install.packages('e1071')
install.packages('parallel')
BiocManager::install("preprocessCore")
source("source.R")
load("CESC_FPKM_tumor.Rda")
ciber_input <- log2(CESC_FPKM_tumor_final + 1)
write.table(ciber_input, file = "cibersort_input.txt",
sep = "\t", row.names = T,col.names = NA,quote = F)
#CIBERSORT计算
sig_matrix <- "LM22.txt" #注释文件名
mixture_file = 'cibersort_input.txt' #表达数据文件名
res_cibersort <- CIBERSORT(sig_matrix, mixture_file, perm=100, QN=TRUE)#用cibersort算法计算
save(res_cibersort,file = "res_cibersort.Rdata") #保存结果
#可视化展示
rm(list=ls())
load("res_cibersort.Rdata")
res_cibersort <- res_cibersort[,1:22] #取前22列为细胞丰度数据
ciber.res <- res_cibersort[,colSums(res_cibersort) > 0] #去除丰度全为0的细胞
#barplot图
mycol <- ggplot2::alpha(rainbow(ncol(ciber.res)), 0.7) #创建彩虹色板(带70%透明度)
par(bty="o", mgp = c(2.5,0.3,0), mar = c(2.1,4.1,2.1,10.1),tcl=-.25,las = 1,xpd = F)
a = barplot(as.matrix(t(ciber.res)),
border = NA, # 柱子无边框
names.arg = rep("",nrow(ciber.res)), # 无横坐标样本名
yaxt = "n", # 先不绘制y轴
ylab = "Relative percentage", # 修改y轴名称
col = mycol) # 采用彩虹色板
axis(side = 2, at = c(0,0.2,0.4,0.6,0.8,1), # 补齐y轴添加百分号
labels = c("0%","20%","40%","60%","80%","100%"))
legend(par("usr")[2]-20, # 这里-20要根据实际出图的图例位置情况调整
par("usr")[4],
legend = colnames(ciber.res),
xpd = T,
fill = mycol,
cex = 0.8,
border = NA,
y.intersp = 1,
x.intersp = 0.2,
bty = "n")
dev.off() #关闭画板
#相关性热图
#install.packages('corrplot')
M <- round(cor(ciber.res),2) # 计算相关性矩阵并保留两位小数
library(corrplot)
corrplot.mixed(M,
lower.col = "black", #左下方字体颜色为黑色
tl.pos = "lt", #标签出现在左侧和顶部
number.cex = 0.5, #左下方字号为0.5
tl.cex = 0.5) #标签字号为0.5
dev.off() #关闭画板
#箱线图
dd1 <- ciber.res %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
pivot_longer(cols = 2:23,
names_to = "CellType",
values_to = "Composition")
plot.info <- dd1[]
ggboxplot(
plot.info,
x = "CellType",
y = "Composition",
color = "black",
fill = "CellType",
xlab = "",
ylab = "Cell composition",
main = "TME Cell composition") +
theme_base() +
theme(axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 1
))
参考:
作者:又是一只小菜鸟
链接:https://www.jianshu.com/p/add97b7640a4
来源:简书
作者:菠萝西斯
链接:https://blog.csdn.net/u013429737/article/details/116044812
来源:CSDN