SCENIC 转录因子调控分析实操示例
激活环境
# Each conda environment has its own analysis software installed and is suited
# to a different kind of analysis; activate the one set up for SCENIC.
conda activate SCENIC
创建分析目录并打开R
# Create the analysis directory (-p: do not fail if it already exists, so the
# tutorial can be re-run), enter it, and start an interactive R session there.
mkdir -p SCENIC
cd SCENIC
R
# Install dependencies
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# NOTE(review): `version = "3.8"` pins an old Bioconductor release; confirm it
# is compatible with the R version in this environment before running.
BiocManager::install(c("GENIE3", "AUCell", "RcisTarget"), version = "3.8")
install.packages("foreach")
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("aertslab/SCopeLoomR", build_vignettes = TRUE)
## Optional (but highly recommended):
# To score the network on cells (i.e. run AUCell):
# (TRUE/FALSE are spelled out: T/F are ordinary variables and can be reassigned.)
BiocManager::install(
  c("zoo", "mixtools", "rbokeh"),
  ask = FALSE,
  update = FALSE
)
# For various visualizations and to perform t-SNEs:
BiocManager::install(
  c("DT", "NMF", "ComplexHeatmap", "R2HTML", "Rtsne"),
  ask = FALSE,
  update = FALSE
)
## Install SCENIC
# The devtools check is repeated so this section can be run on its own.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("aertslab/SCENIC")
# Print the installed version to confirm the installation worked.
packageVersion("SCENIC")
开始分析
一 输入数据loom文件
1 加载分析软件包
# SCENIC analysis
library(SCENIC)
# Working with loom files
library(SCopeLoomR)
# Package that SCENIC depends on
library(foreach)
2 下载所分析物种对应的数据库(人、小鼠、果蝇;单个数据库文件通常超过 1GB,哺乳动物的区域数据库约为 100GB)
# Human
#dbFiles <- c("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-500bp-upstream-7species.mc9nr.feather","https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather")
# Mouse
#dbFiles <- c("https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-500bp-upstream-7species.mc9nr.feather","https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-tss-centered-10kb-7species.mc9nr.feather")
# Fruit fly
#dbFiles <- c("https://resources.aertslab.org/cistarget/databases/drosophila_melanogaster/dm6/flybase_r6.02/mc8nr/gene_based/dm6-5kb-upstream-full-tx-11species.mc8nr.feather")
## Create a folder to store the databases
#dir.create("/mnt/sdd/singleron_training_class/resources/SCENIC/scenic")
# Set the working directory
#setwd("/mnt/sdd/singleron_training_class/resources/SCENIC/scenic")
# Download the databases into the current working directory
# (uncomment exactly one dbFiles line above before running the loop)
#for (featherURL in dbFiles) { download.file(featherURL, destfile=basename(featherURL))}
3 准备输入数据—表达矩阵/细胞信息表
# Locate the toy mouse-brain loom file shipped with the SCENIC package
# (background on the loom format: https://www.jianshu.com/p/9eb324ca5ff9).
loomPath <- system.file("examples/mouseBrain_toy.loom", package = "SCENIC")
# Open the loom file.
loom <- open_loom(loomPath)
# Extract the expression matrix.
exprMat <- get_dgem(loom)
# Preview the top-left corner of the expression matrix.
exprMat[seq_len(4), seq_len(4)]
# Optional: fetch the cell information table.
#cellInfo <- get_cell_annotation(loom)
#head(cellInfo)
# The loom object is a link to the file on disk and must be closed after any
# write operation. Only data extraction happens here, so closing is not
# strictly required, but it is done anyway for the sake of rigor.
close_loom(loom)
4 准备数据库文件
# Initialize settings and load the scoring databases.
#   org    - species: "mgi" = mouse, "hgnc" = human, "dmel" = fly
#   dbDir  - location of the databases
#   nCores - number of threads (the course's loom test data is small, so 1)
scenicOptions <- initializeScenic(
  org = "mgi",
  dbDir = "/mnt/sdd/singleron_training_class/resources/SCENIC/scenic/cisTarget_databases/",
  nCores = 1
)
# Optional: add the cell information table to scenicOptions.
# scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
# Save the settings. saveRDS() fails if the target directory is missing, and
# nothing earlier in this script creates it, so make sure it exists first.
dir.create(
  "/mnt/sdd/singleron_training_class/resources/SCENIC/int",
  recursive = TRUE,
  showWarnings = FALSE
)
saveRDS(
  scenicOptions,
  file = "/mnt/sdd/singleron_training_class/resources/SCENIC/int/scenicOptions.Rds"
)
5 运行Scenic流程
step1 根据表达数据推断潜在的转录因子靶标
# Filter the expression matrix: the filter aims to remove genes that are most
# likely noise and to keep only genes available in the RcisTarget databases.
genesKept <- geneFiltering(exprMat, scenicOptions)
# Default parameters are used (descriptions follow the original tutorial):
#   minCountsPerGene = 3 * .01 * ncol(exprMat)
#     keep genes with at least 6 UMI reads across all samples
#   minSamples = ncol(exprMat) * .01
#     keep genes detected in at least 1% of the cells
head(genesKept)
length(genesKept)
# Restrict the matrix to the genes that passed the filter.
exprMat_filtered <- exprMat[genesKept, ]
#exprMat_filtered[1:4,1:4]
# Correlation matrix: co-expression covers both positive and negative
# regulation and GENIE3 cannot tell them apart, so the correlation is used to
# help select the genes in a co-expression module that correlate positively
# with the TF.
runCorrelation(exprMat_filtered, scenicOptions)
# Per the original tutorial, results differ little whether or not the data are
# log-transformed or TPM values are used (tested by the SCENIC authors).
exprMat_filtered_log <- log2(exprMat_filtered + 1)
exprMat_log <- log2(exprMat + 1)
# Run GENIE3 to obtain candidate TFs: it finds which input genes are TFs and
# computes a weight between each TF and every gene.
runGenie3(exprMat_filtered_log, scenicOptions)
# Toy run settings
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"]
scenicOptions@settings$nCores <- 1
scenicOptions@inputDatasetInfo$org <- "mgi"
# Turn the co-expression network into modules.
scenicOptions <- runSCENIC_1_coexNetwork2modules(scenicOptions)
step2 基于motif鉴定每个TF的潜在靶点
# TF-motif enrichment analysis with RcisTarget.
scenicOptions <- runSCENIC_2_createRegulons(
  scenicOptions,
  coexMethod = "top5perTarget"
)
step3 AUC打分
# Regulon activity scoring and visualization (the heatmap and t-SNE outputs
# are switched off through the skip* arguments).
scenicOptions <- runSCENIC_3_scoreCells(
  scenicOptions,
  exprMat_log,
  skipHeatmap = TRUE,
  skipTsne = TRUE
)
# Convert the regulon AUC matrix into a binary matrix; the original tutorial
# notes that dimensionality reduction and clustering are then redone on it
# (the plots are skipped here through the skip* arguments).
scenicOptions <- runSCENIC_4_aucell_binarize(
  scenicOptions,
  skipBoxplot = TRUE,
  skipHeatmaps = TRUE,
  skipTsne = TRUE,
  exprMat = NULL
)
# Show the result on the dimensionality-reduction plot:
#tsneAUC(scenicOptions, aucType="AUC")
# The output folder is created in whichever directory the code is run from.
6 scenicOptions
6.1 regulons信息
# Load the intermediate object registered under "aucell_regulons".
regulons <- loadInt(scenicOptions, "aucell_regulons")
# Keep only the regulon names returned by onlyNonDuplicatedExtended(). cbind()
# on a single vector yields a one-column matrix, so from here on `regulons`
# holds the names only, not the loaded object.
regulons <- cbind(onlyNonDuplicatedExtended(names(regulons)))
6.2 获取每个转录因子在细胞中的得分
# Per-cell score of each TF regulon: load the stored AUC object, keep the rows
# selected by onlyNonDuplicatedExtended(), and extract the AUC values.
regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
regulonAUC <- getAUC(
  regulonAUC[onlyNonDuplicatedExtended(rownames(regulonAUC)), ]
)
6.3 TF与靶基因权重信息
# TF-to-target table: load it, coerce it to a data.frame, and keep only the
# columns of interest, in this order.
regulonTargetsInfo <- as.data.frame(
  loadInt(scenicOptions, "regulonTargetsInfo")
)
regulonTargetsInfo <- regulonTargetsInfo[
  c("TF", "gene", "nMotifs", "bestMotif", "NES", "highConfAnnot", "CoexWeight")
]
(此处原文有一张配图,因上传失败未能显示)
二 输入文件为rds(Seurat对象)
1 读取rds
# Read the Seurat object from an .rds file. The path is left empty in this
# tutorial: fill in the path to your own file before running, since
# readRDS("") cannot succeed.
data <- readRDS("")
2 准备输入数据—表达矩阵/细胞信息表
# Build the SCENIC inputs from the Seurat object read above into `data`.
# Expression matrix, taken from the RNA assay's `data` slot.
# NOTE(review): the pipeline below applies log2(x + 1) to this matrix again;
# confirm whether the raw `counts` slot was intended instead.
exprMat <- as.matrix(data@assays$RNA@data)
# Cell information table: three metadata columns picked by position and then
# renamed.
# NOTE(review): positions 4, 2, 3 depend on the column order of this object's
# meta.data. Verify they really hold cell type, gene count and UMI count.
cellInfo <- data@meta.data[, c(4, 2, 3)]
colnames(cellInfo) <- c("CellType", "nGene", "nUMI")
# Inspect the expression matrix / cell information table (the four lines below
# only display data and do not need to be run).
#dim(exprMat)
#exprMat[1:4,1:4]
#head(cellInfo)
#table(cellInfo$CellType)
3 运行SCENIC流程(此步与使用loom文件作为输入时相同,可参考上面的代码;此步运行较慢)
# Initialize SCENIC for human data ("hgnc") with the databases under dbDir,
# using a single core.
scenicOptions <- initializeScenic(
  org = "hgnc",
  dbDir = "/home/yys/data/scenic",
  nCores = 1
)
# saveRDS() fails if "int/" is missing, so make sure it exists first
# (no-op when the directory is already there).
dir.create("int", showWarnings = FALSE)
saveRDS(scenicOptions, file = "int/scenicOptions.Rds")
# Co-expression network
genesKept <- geneFiltering(exprMat, scenicOptions)
exprMat_filtered <- exprMat[genesKept, ]
# The two lines below only inspect the data and are optional.
#exprMat_filtered[1:4,1:4]
#dim(exprMat_filtered)
runCorrelation(exprMat_filtered, scenicOptions)
exprMat_filtered_log <- log2(exprMat_filtered + 1)
runGenie3(exprMat_filtered_log, scenicOptions)
### Build and score the GRN
exprMat_log <- log2(exprMat + 1)
# Restrict the analysis to the "10kb" database entry.
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"]
scenicOptions <- runSCENIC_1_coexNetwork2modules(scenicOptions)
scenicOptions <- runSCENIC_2_createRegulons(
  scenicOptions,
  coexMethod = "top5perTarget"
)
# Attached just before the scoring steps, as in the original tutorial.
library(doParallel)
scenicOptions <- runSCENIC_3_scoreCells(scenicOptions, exprMat_log)
scenicOptions <- runSCENIC_4_aucell_binarize(scenicOptions)
tsneAUC(scenicOptions, aucType = "AUC")