scRNA-Seq | pySCENIC

组织内细胞异质性的基础是细胞转录状态的差异,转录状态的特异性又是由转录因子主导的基因调控网络(GRNs)决定并维持稳定的。因此分析单细胞的GRNs有助于深入挖掘细胞异质性背后的生物学意义,并为疾病的诊断、治疗以及发育分化的研究提供有价值的线索。

然而单细胞转录组数据具有背景噪音高、基因检出率低和表达矩阵稀疏性的特点,给传统统计学和生物信息学方法推断高质量的GRNs带来了挑战。Single-cell regulatory network inference and clustering (SCENIC)是一种专为单细胞数据开发的GRNs算法,它的创新之处在于引入了转录因子motif序列验证统计学方法推断的基因共表达网络,从而识别高可靠性的由转录因子主导的GRNs。SCENIC相关的文章2017年首先发表于nature methods,2020年又将流程整理后发表于nature protocls。

SCENIC分析流程
官方介绍的主要分析有四步:

  • GENIE3/GRNBoost:基于共表达情况鉴定每个TF的潜在靶点;
  • RcisTarget:基于DNA-motif 分析选择潜在的直接结合靶点;
  • AUCell:分析每个细胞的regulons活性;
  • 细胞聚类:基于regulons的活性鉴定稳定的细胞状态。

一、 运行 pyscenic

1. loom 文件制备

以pbmc3k为例,降维聚类,输出csv矩阵文件。

library(SeuratData) #加载seurat数据集  
#InstallData("pbmc3k") #安装pbmc3k数据
data("pbmc3k")  
sce <- pbmc3k.final   
library(Seurat)
table(Idents(sce))
p1=DimPlot(sce,label = T) 
p1

write.csv(t(as.matrix(sce@assays$RNA@counts)), file = "pbmc_3k.all.csv")

这一步会生成一个70M的pbmc_3k.all.csv文件

接下来需要在Linux操作了。写一个 Python脚本 ( csv2loom.py )把 csv格式的表达量矩阵 转为 .loom 文件。
这一步是在linux下面操作

import os, sys
os.getcwd()
os.listdir(os.getcwd()) 

import loompy as lp;
import numpy as np;
import scanpy as sc;
x=sc.read_csv("pbmc_3k.all.csv"); ## 曾老师的代码这里是x=sc.read_csv("pbmc_3k.csv");
row_attrs = {"Gene": np.array(x.var_names),};
col_attrs = {"CellID": np.array(x.obs_names)};
lp.create("pbmc_3k.loom",x.X.transpose(),row_attrs,col_attrs);

上面的脚本写了后,就可以 运行 Python脚本 ( csv2loom.py )把 csv格式的表达量矩阵 转为 .loom 文件:

#conda activate pyscenic
python csv2loom.py

这一步会生成一个6.7M的pbmc_3k.loom文件。

2. pyscenic运行

2.1 三大文件下载

但是在这之前需要提前下载好3个重要文件。

文件1:hs_hgnc_tfs.txt,https://github.com/aertslab/pySCENIC/blob/master/resources/hs_hgnc_tfs.txt

文件2: hg19-tss-centered-10kb-7species.mc9nr.feather,https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather

文件3: motifs-v9-nr.hgnc-m0.001-o0.0.tbl,https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl

第1个文件12k,第2个文件1.02G,第三个文件99M,大小一定要正确。

2.2 run_pyscenic.sh脚本编写
# 不同物种的数据库不一样,这里是人类是human 
tfs=$dir/TF/TFs_list/hs_hgnc_tfs.txt
feather=$dir/hg19-tss-centered-10kb-7species.mc9nr.feather
tbl=$dir/TF/TFs_annotation_motif/human_TFs/motifs-v9-nr.hgnc-m0.001-o0.0.tbl 
# 一定要保证上面的数据库文件完整无误哦 
input_loom=pbmc_3k.loom
ls $tfs  $feather  $tbl  

# pyscenic 的3个步骤之 grn
pyscenic grn \
--num_workers 20 \
--output adj.sample.tsv \
--method grnboost2 \
$input_loom  \
$tfs 

#pyscenic 的3个步骤之 cistarget
pyscenic ctx \
adj.sample.tsv $feather \
--annotations_fname $tbl \
--expression_mtx_fname $input_loom  \
--mode "dask_multiprocessing" \
--output reg.csv \
--num_workers 20  \
--mask_dropouts

#pyscenic 的3个步骤之 AUCell
pyscenic aucell \
$input_loom \
reg.csv \
--output out_SCENIC.loom \
--num_workers 20 

这一步会得到11M的out_SCENIC.loom文件。
最重要的三个文件如下

11M 3月  15 09:21 out_SCENIC.loom
6.7M 3月  13 20:59 pbmc_3k.loom
14M 3月  15 09:18 reg.csv
70M 3月  13 18:18 pbmc_3k.all.csv

二、初级可视化

1. 依赖包安装

## SCENIC需要一些依赖包,先安装好
BiocManager::install(c("AUCell", "RcisTarget"))
BiocManager::install(c("GENIE3"))
BiocManager::install(c("zoo", "mixtools", "rbokeh"))
BiocManager::install(c("DT", "NMF", "pheatmap", "R2HTML", "Rtsne"))
BiocManager::install(c("doMC", "doRNG"))
devtools::install_github("aertslab/SCopeLoomR", build_vignettes = TRUE)
devtools::install_github("aertslab/SCENIC")


library(SCENIC)
packageVersion("SCENIC")
#[1] ‘1.2.4’

2. 提取 out_SCENIC.loom 信息

##可视化
rm(list=ls())
library(Seurat)
library(SCopeLoomR)
library(AUCell)
library(SCENIC)
library(dplyr)
library(KernSmooth)
library(RColorBrewer)
library(plotly)
library(BiocParallel)
library(grid)
library(ComplexHeatmap)
library(data.table)
library(scRNAseq)

## 提取 out_SCENIC.loom 信息
#inputDir='./outputs/'
#scenicLoomPath=file.path(inputDir,'out_SCENIC.loom')
library(SCENIC)
loom <- open_loom('out_SCENIC.loom') 

regulons_incidMat <- get_regulons(loom, column.attr.name="Regulons")
regulons_incidMat[1:4,1:4] 
regulons <- regulonsToGeneLists(regulons_incidMat)
names(regulons)


regulonAUC <- get_regulons_AUC(loom,column.attr.name='RegulonsAUC')
rownames(regulonAUC)


regulonAucThresholds <- get_regulon_thresholds(loom)
tail(regulonAucThresholds[order(as.numeric(names(regulonAucThresholds)))])

embeddings <- get_embeddings(loom)  
close_loom(loom)

3. 加载SeuratData

然后载入前面的seurat对象,我们这里仅仅是最基础的示例数据,所以直接使用 SeuratData 包即可

library(SeuratData) #加载seurat数据集  
data("pbmc3k")  
sce <- pbmc3k.final   
table(sce$seurat_clusters)
table(Idents(sce))
sce$celltype = Idents(sce)

library(ggplot2) 
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',
                   'CD19', 'CD79A', 'MS4A1' ,
                   'IGHG1', 'MZB1', 'SDC1',
                   'CD68', 'CD163', 'CD14', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'FCGR3A',
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'KLRB1','NCR1', # NK 
                   'FGF7','MME', 'ACTA2', ## fibo 
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'MKI67' , 'TOP2A', 
                   'PECAM1', 'VWF',  ## endo 
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )

library(stringr)  
genes_to_check=str_to_upper(genes_to_check)
genes_to_check


p <- DotPlot(sce , features = unique(genes_to_check),
             assay='RNA'  )  + coord_flip() +   theme(axis.text.x=element_text(angle=45,hjust = 1))
p 
ggsave('check_last_markers.pdf',height = 11,width = 11)



DimPlot(sce,reduction = "umap",label=T ) 
sce$sub_celltype =  sce$celltype
DimPlot(sce,reduction = "umap",label=T,group.by = "sub_celltype" )
ggsave('umap-by-sub_celltype.pdf')



Idents(sce) <- sce$sub_celltype
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table([email protected]$RNA_snn_res.0.8)  
DimPlot(sce,reduction = "umap",label=T ) 
ggsave('umap-by-sub_RNA_snn_res.0.8.pdf')

这里的代码仍然是简单的检验一下自己的降维聚类分群是否合理,方便后续合并分析。

4. 四种可视化

现在我们就可以把pyscenic的转录因子分析结果去跟我们的降维聚类分群结合起来进行5种可视化展示。

合并的代码如下所示:

sub_regulonAUC <- regulonAUC[,match(colnames(sce),colnames(regulonAUC))]
dim(sub_regulonAUC)
sce 


#确认是否一致
identical(colnames(sub_regulonAUC), colnames(sce))
#[1] TRUE

cellClusters <- data.frame(row.names = colnames(sce), 
                           seurat_clusters = as.character(sce$seurat_clusters))
cellTypes <- data.frame(row.names = colnames(sce), 
                        celltype = sce$sub_celltype)
head(cellTypes)
head(cellClusters)
sub_regulonAUC[1:4,1:4] 
save(sub_regulonAUC,cellTypes,cellClusters,sce,
     file = 'for_rss_and_visual.Rdata')

这个时候,我们的pbmc3k数据集里面的两千多个细胞都有表达量矩阵,也有转录因子活性打分信息。

B细胞有两个非常出名的转录因子,TCF4(+) 以及NR2C1(+),接下来就可以对这两个进行简单的可视化。

首先,我们需要把这两个转录因子活性信息 添加到降维聚类分群后的的seurat对象里面。

#尴尬的是TCF4(+)我这个数据里面没有,换了个PAX5(+)和REL(+)
regulonsToPlot = c('PAX5(+)','REL(+)')
regulonsToPlot
[email protected] = cbind([email protected] ,t(assay(sub_regulonAUC[regulonsToPlot,])))
Idents(sce) <- sce$sub_celltype
table(Idents(sce))

DotPlot(sce, features = unique(regulonsToPlot)) + RotatedAxis()
RidgePlot(sce, features = regulonsToPlot , ncol = 1)
VlnPlot(sce, features = regulonsToPlot,pt.size = 0 ) 
FeaturePlot(sce, features = regulonsToPlot )

可以看到b细胞有两个非常出名的转录因子,TCF4(+) 以及NR2C1(+),确实是在b细胞比较独特的高。

DotPlot
山峦图

可以看到效果其实没有前面的DotPlot好

image
image

三、各个单细胞亚群特异性激活转录因子及可视化

回顾一下pyscenic的转录因子分析结果

######  step0 加载 各种R包  #####

rm(list=ls())
library(Seurat)
library(SCopeLoomR)
library(AUCell)
library(SCENIC)
library(dplyr)
library(KernSmooth)
library(RColorBrewer)
library(plotly)
library(BiocParallel)
library(grid)
library(ComplexHeatmap)
library(data.table)
library(scRNAseq)

load('for_rss_and_visual.Rdata')
head(cellTypes) 
sub_regulonAUC[1:4,1:2] 
dim(sub_regulonAUC)
#[1]  220 2638

值得一提的是这个pbmc3k数据集的两千多个细胞,其实就220个转录因子结果(曾老师教程里面是208个)。

1. TF活性均值

看看不同单细胞亚群的转录因子活性平均值

# Split the cells by cluster:
selectedResolution <- "celltype" # select resolution
cellsPerGroup <- split(rownames(cellTypes), 
                       cellTypes[,selectedResolution]) 

# 去除extened regulons
sub_regulonAUC <- sub_regulonAUC[onlyNonDuplicatedExtended(rownames(sub_regulonAUC)),] 
dim(sub_regulonAUC)
#[1]  220 2638 #似乎没啥区别

# Calculate average expression:
regulonActivity_byGroup <- sapply(cellsPerGroup,
                                  function(cells) 
                                    rowMeans(getAUC(sub_regulonAUC)[,cells]))

# Scale expression. 
# Scale函数是对列进行归一化,所以要把regulonActivity_byGroup转置成细胞为行,基因为列
# 参考:https://www.jianshu.com/p/115d07af3029
regulonActivity_byGroup_Scaled <- t(scale(t(regulonActivity_byGroup),
                                          center = T, scale=T)) 
# 同一个regulon在不同cluster的scale处理
dim(regulonActivity_byGroup_Scaled)
#[1] 220   9
regulonActivity_byGroup_Scaled=regulonActivity_byGroup_Scaled[]
regulonActivity_byGroup_Scaled=na.omit(regulonActivity_byGroup_Scaled)

2. 热图查看TF分布

pheatmap(regulonActivity_byGroup_Scaled)
image

可以看到,确实每个单细胞亚群都是有 自己的特异性的激活的转录因子。

3. rss 查看特异TF

不过,SCENIC包自己提供了一个 calcRSS函数,帮助我们来挑选各个单细胞亚群特异性的转录因子,全称是:Calculates the regulon specificity score

参考文章:The RSS was first used by Suo et al. in: Revealing the Critical Regulators of Cell Identity in the Mouse Cell Atlas. Cell Reports (2018). doi: 10.1016/j.celrep.2018.10.045
运行超级简单。

rss <- calcRSS(AUC=getAUC(sub_regulonAUC), 
               cellAnnotation=cellTypes[colnames(sub_regulonAUC), 
                                           selectedResolution]) 
rss=na.omit(rss) 
rssPlot <- plotRSS(rss)
plotly::ggplotly(rssPlot$plot)

image

PAX5(+)和REL(+)的确在B细胞里面高表达。

4. 其他查看TF方式

library(dplyr) 
rss=regulonActivity_byGroup_Scaled
head(rss)
library(dplyr) 
df = do.call(rbind,
             lapply(1:ncol(rss), function(i){
               dat= data.frame(
                 path  = rownames(rss),
                 cluster =   colnames(rss)[i],
                 sd.1 = rss[,i],
                 sd.2 = apply(rss[,-i], 1, median)  
               )
             }))
df$fc = df$sd.1 - df$sd.2
top5 <- df %>% group_by(cluster) %>% top_n(5, fc)
rowcn = data.frame(path = top5$cluster) 
n = rss[top5$path,] 
#rownames(rowcn) = rownames(n)
pheatmap(n,
         annotation_row = rowcn,
         show_rownames = T)

image

也可以按照sd计算df的sd.2

image

或者按照mean计算df的sd.2

image

在这里似乎median、sd、mean都差不多。

至此, pySCENIC的转录因子分析及数据可视化教程复现结束,

报错:

2023-04-04 00:28:00,057 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
2023-04-04 00:28:00,076 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999990463256836 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
2023-04-04 00:28:00,081 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
2023-04-04 00:28:00,092 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
Traceback (most recent call last):
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/bin/pyscenic", line 8, in 
    sys.exit(main())
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 713, in main
    args.func(args)
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 106, in find_adjacencies_command
    network = method(
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 39, in grnboost2
    return diy(expression_data=expression_data, regressor_type='GBM', regressor_kwargs=SGBM_KWARGS,
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 134, in diy
    return client \
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 3215, in compute
    result = self.gather(futures)
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2174, in gather
    return self.sync(
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 338, in sync
    return sync(
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 405, in sync
    raise exc.with_traceback(tb)
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 378, in f
    result = yield future
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/tornado/gen.py", line 762, in run
    value = future.result()
  File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2037, in _gather
    raise exception.with_traceback(traceback)
distributed.scheduler.KilledWorker: ('ndarray-99aebd9077176e83098d0511c36efe41', )

-n 设置小点
参考:
https://www.jianshu.com/p/0a29ecfaf21e
https://www.jianshu.com/p/05d1b2d0d772
https://www.jianshu.com/p/7ab2d6c8f764
https://mp.weixin.qq.com/s/SIfyGzx4fwXPtQsVvvwwMQ
https://mp.weixin.qq.com/s/py4AWdtaNNMPqLzU4loODQ
https://mp.weixin.qq.com/s/yaYSqqvBnK8OlL0ZQkR94Q

可视化举例

做热图主要分为两种,一种是把细胞分组求regulons的活性平均值/中位数,另一种是展示所有细胞regulons的活性平均值。

  • 1、细胞分组求regulons的活性平均值/中位数

    image
  • 2、所有细胞的活性值

    image
  • 3、基于Seurat的气泡图

    image
  • 4、基于Seurat的峰图

    image

    5、基于Seurat的小提琴图

    image

    6、基于Seurat的点图

    image

    7、regulon网络调控图

    image

    8、RSS得分热图和气泡图

    image
    image

https://mp.weixin.qq.com/s/zaXpaTQ0IwYGgMO3XIGVaQ

https://www.jianshu.com/p/7180828033a7
https://www.jianshu.com/p/1c9937e7996c
https://blog.csdn.net/qq_42090739/article/details/127745764
https://zhuanlan.zhihu.com/p/537069815
https://www.jianshu.com/p/c47332b880aa
https://mp.weixin.qq.com/s/pN4qWdUszuGqr8nOJstn8w

https://mp.weixin.qq.com/s/5y-72Hfy3vaxgLiT9kZ4_Q
https://mp.weixin.qq.com/s/sA5F1y4Awki9hiHl2T1DrA

你可能感兴趣的:(scRNA-Seq | pySCENIC)