这次介绍的是10X Genomics的数据分析
数据读取
#构建Seurat对象
library(Seurat)
library(dplyr)
# 读取Peripheral Blood Mononuclear Cells (PBMC)数据
pbmc.data <- Read10X(data.dir = "hg19")
# 初始化一个Seurat对象。
# 在初始化的时候,使用每个细胞表达的基因数不小于200,
# 计数基因表达在不少于3个细胞中做为初筛。
pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3,
min.genes = 200, project = "10X_PBMC")
我们还要读取另外一张表格,就是CellRanger产生的计数表
library(Seurat)
nUMI.summary.file <- "umi_expression_matrix.tsv"
counts <- list()
con <- file(nUMI.summary.file, open="r")
## 读取文件头
header <- readLines(con, n=1, warn=FALSE)
header <- strsplit(header, "\\t")[[1]]
## 读取计数
i <- 1
while(length(buf <- readLines(con, n=1e6, warn=FALSE))>0){
buf <- as.matrix(read.delim(text=buf, header=FALSE, row.names=1))
counts[[i]] <- Matrix(buf, sparse=TRUE)
i <- i + 1
}
counts <- do.call(rbind, counts)
rownames(counts)[1:5]
接下来就是构建个对象了
colnames(counts) <- header
## 初始化一个Seurat对象
d <- CreateSeuratObject(raw.data = counts, min.cells = 10,
min.genes = 200, project = "test_project")
QC
1.注入注释
#加载注释
library(rtracklayer)
gtf <- import("Homo_sapiens.GRCh37.87.chr.gtf.gz")
gtf <- gtf[gtf$gene_name!=""]
gtf <- gtf[!is.na(gtf$gene_name)]
#protein coding
protein <-
gtf$gene_name[gtf$transcript_biotype %in%
c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_LV_gene",
"IG_M_gene", "IG_V_gene", "IG_Z_gene",
"nonsense_mediated_decay", "nontranslating_CDS",
"non_stop_decay", "polymorphic_pseudogene",
"protein_coding", "TR_C_gene", "TR_D_gene", "TR_gene",
"TR_J_gene", "TR_V_gene")]
#mitochondrial genes
mito <- gtf$gene_name[as.character(seqnames(gtf)) %in% "MT"]
## long noncoding
lincRNA <-
gtf$gene_name[gtf$transcript_biotype %in%
c("3prime_overlapping_ncrna", "ambiguous_orf",
"antisense_RNA", "lincRNA", "ncrna_host", "non_coding",
"processed_transcript", "retained_intron",
"sense_intronic", "sense_overlapping")]
## short noncoding
sncRNA <-
gtf$gene_name[gtf$transcript_biotype %in%
c("miRNA", "miRNA_pseudogene", "misc_RNA",
"misc_RNA_pseudogene", "Mt_rRNA", "Mt_tRNA",
"Mt_tRNA_pseudogene", "ncRNA", "pre_miRNA",
"RNase_MRP_RNA", "RNase_P_RNA", "rRNA", "rRNA_pseudogene",
"scRNA_pseudogene", "snlRNA", "snoRNA",
"snRNA_pseudogene", "SRP_RNA", "tmRNA", "tRNA",
"tRNA_pseudogene", "ribozyme", "scaRNA", "sRNA")]
## pseudogene
pseudogene <-
gtf$gene_name[gtf$transcript_biotype %in%
c("disrupted_domain", "IG_C_pseudogene", "IG_J_pseudogene",
"IG_pseudogene", "IG_V_pseudogene", "processed_pseudogene",
"pseudogene", "transcribed_processed_pseudogene",
"transcribed_unprocessed_pseudogene",
"translated_processed_pseudogene",
"translated_unprocessed_pseudogene", "TR_J_pseudogene",
"TR_V_pseudogene", "unitary_pseudogene",
"unprocessed_pseudogene")]
#构建注释
annotations <- list(protein=unique(protein),
mito=unique(mito),
lincRNA=unique(lincRNA),
sncRNA=unique(sncRNA),
pseudogene=unique(pseudogene))
annotations <- annotations[lengths(annotations)>0]
2.cell筛选
# Seurat会计算基因数以及UMI数 (nGene and nUMI).
# Seurat会将原始数据保存在raw.data slot中,
# 每一行对应一个基因,每一列对应一个细胞.
# 在计算比例时,使用目标基因中的数值除以总的数值。
percent <- lapply(annotations, function(.ele){
Matrix::colSums(
[email protected][which(rownames(pbmc@data) %in% .ele), , drop=FALSE]) /
Matrix::colSums([email protected])
})
# AddMetaData 会在[email protected]中加一列。这些信息都会在QC中使用到。
for(i in seq_along(percent)){
pbmc <- AddMetaData(object = pbmc, metadata = percent[[i]],
col.name = paste0("percent.", names(percent)[i]))
}
# AddMetaData也可以直接通过改变[email protected]来进行.
id <- sample(c("s_A", "s_B"), nrow([email protected]), replace = TRUE)
names(id) <- rownames([email protected]) #必须有names
[email protected]$f1 <- id
最后pbmc是这样的
接下来我们画出图来,在去除极端细胞
VlnPlot(object = pbmc,
features.plot = c("nGene", "nUMI", paste0("percent.", names(percent))),
nCol = 4)
筛选
pbmc <- FilterCells(object = pbmc,
subset.names = c("nGene", "percent.protein",
"percent.mito"),
low.thresholds = c(200, .95, -Inf),
high.thresholds = c(2500, Inf, 0.05))
标准化
pbmc <- NormalizeData(object = pbmc,
normalization.method = "LogNormalize",
scale.factor = 1e4)
获取高差异基因
为什么要这么做呢,因为很多cell的基因是不差异表达的,所以我们只用关注高差异的基因即可,这样大大减少计算量
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean,
dispersion.function = LogVMR, x.low.cutoff = 0.0125,
x.high.cutoff = 3, y.cutoff = 0.5)
消除批次效应
将批次效应写入metadata
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
PCA
这里的PCA是看我们需要选取多少个主成分来进行下游分析,意思就是,我们在进行PCA细胞分类中,决定细胞分类的是基因的表达谱,那么我们看哪些基因的表达谱对细胞分类贡献最大,那么我们就重点研究那些细胞
Suerat使用了jackStraw方法来估计应该使用多少个principal components的基因来进行下游分析。
pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
JackStrawPlot(object = pbmc, PCs = 1:12)
我们可以看到,PC1-10越来越平缓,即贡献度越来越低
PCElbowPlot(object = pbmc)
选取贡献最大的PC中的基因进行后续分析
细胞分级
我们根据PCA分选的结果,记性细胞分集
pbmc <- RunTSNE(object = pbmc, dims.use = 1:10)
TSNEPlot(object = pbmc)
分完集后,要寻找marker基因进行细胞的鉴定
以cluster1为例
# find all markers of cluster 1
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
以cluster5为例
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5,
ident.2 = c(0,3), min.pct = 0.25)
head(cluster5.markers, n = 5)
以所有cluster为例
# find markers for every cluster compared to all remaining cells,
# report only the positive ones
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE,
min.pct = 0.25, thresh.use = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
#另一种算法
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 0,
thresh.use = 0.25, test.use = "roc",
only.pos = TRUE)
拿到特异性基因marker后画个小提琴图看一下
VlnPlot(object = pbmc, features.plot = c("MS4A1", "CD79A"))
我们也可以看一下marker基因在那些类里面表达
FeaturePlot(object = pbmc,
features.plot = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A",
"FCGR3A", "LYZ", "PPBP", "CD8A"),
cols.use = c("grey", "blue"), reduction.use = "tsne")
Heatmap看细胞群表达:
pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC) -> top10
# 设置slim.col.label=TRUE将避免打印每个细胞的名称,而只打印cluster的ID.
DoHeatmap(object = pbmc, genes.use = top10$gene,
slim.col.label = TRUE, remove.key = TRUE)
对分类细胞进行标记
鉴定好细胞类群以后,我们可以在图上进行标记
本例的细胞类群如下:
current.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7)
new.cluster.ids <- c("CD4 T cells", "CD14+ Monocytes", "B cells",
"CD8 T cells", "FCGR3A+ Monocytes", "NK cells",
"Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids,
to = new.cluster.ids)
TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 0.5)
亚群分析
在上述分类的基础上,可以继续对类群进行分类
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
# 下面就直接提高一下区分度,设置resolution = 0.8
pbmc <- FindClusters(object = pbmc, reduction.type = "pca",
dims.use = 1:10, resolution = 0.8,
print.output = FALSE)
# 我们可以看到当resolution提高之后,CD4 T细胞被分成了两个亚群。
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE,
no.legend = TRUE, do.label = TRUE)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE,
group.by = "ClusterNames_0.6",
no.legend = TRUE, do.label = TRUE)
plot_grid(plot1, plot2)
由图,CD4 T那一类又可以分成两类
那么这两类再做一次基因marker的寻找
# 我们再一次寻找不同分集中的marders。
tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1)
# Most of the markers tend to be expressed in C1 (i.e. S100A4).
# However, we can see that CCR7 is upregulated in
# C0, strongly indicating that we can differentiate memory from naive CD4 cells.
# cols.use demarcates the color palette from low to high expression
FeaturePlot(object = pbmc, features.plot = c("S100A4", "CCR7"),
cols.use = c("green", "blue"))
Monocle
这个包主要用于为时间分析,即细胞轨迹追踪,比方说A细胞经过发育可以发育成B细胞,这就是该细胞的发育轨迹
library(monocle)
library(DDRTree)
library(pheatmap)
library(plyr)
library(data.table)
#读入UMI计数表格
expr_matrix <- read.delim(nUMI.summary.file)
sample_info <- data.frame(sampleID=colnames(expr_matrix))
gene_annotation <- data.frame(symbol=rownames(expr_matrix))
pd <- new("AnnotatedDataFrame", data = sample_info)
fd <- new("AnnotatedDataFrame", data = gene_annotation)
d <- newCellDataSet(as.matrix(expr_matrix), phenoData = pd, featureData = fd)
#或者由Seurat对象直接转化为Monocole对象
sce <- importCDS(pbmc)
QC和过滤
sce <- estimateSizeFactors(sce)
sce <- estimateDispersions(sce)
#去除表达量低的细胞
sce <- detectGenes(sce, min_expr = 0.1)
expressed_genes <- row.names(subset(fData(sce),
num_cells_expressed >= 10))
接下来过滤死细胞
pData(sce)$Total_mRNAs <- Matrix::colSums(exprs(sce))
sce <- sce[, pData(sce)$Total_mRNAs < 1e6 ]
upper_bound <- 10^(mean(log10(pData(sce)$Total_mRNAs)) +
2*sd(log10(pData(sce)$Total_mRNAs)))
lower_bound <- 10^(mean(log10(pData(sce)$Total_mRNAs)) -
2*sd(log10(pData(sce)$Total_mRNAs)))
qplot(Total_mRNAs, data = pData(sce), color = orig.ident, geom = "density") +
geom_vline(xintercept = lower_bound) +
geom_vline(xintercept = upper_bound) + xlim(0, 6000)
#过滤
sce <- sce[,pData(sce)$Total_mRNAs > lower_bound & pData(sce)$Total_mRNAs < upper_bound]
类似于正态分布极端值情况,我们把多于(少于)这两个表达量阈值的cell进行过滤
我们看一下过滤的效果
sce <- detectGenes(sce, min_expr = 0.1)
# Log-transform each value in the expression matrix.
L <- log(exprs(sce[expressed_genes,]))
# Standardize each gene, so that they are all on the same scale,
# Then melt the data with plyr so we can plot it easily
melted_dens_df <- melt(Matrix::t(scale(Matrix::t(L))))
# Plot the distribution of the standardized gene expression values.
qplot(value, geom = "density", data = melted_dens_df) +
stat_function(fun = dnorm, size = 0.5, color = 'red') +
xlab("Standardized log(counts)") +
ylab("Density")
细胞分群
我们按照这样的思路,即平均表达量较高,方差大的基因作为研究对象,这样的基因差异比较大
disp_table <- dispersionTable(sce)
unsup_clustering_genes <- subset(disp_table,
mean_expression >= 0.1) ## 数值因实验而不同
sce <- setOrderingFilter(sce, unsup_clustering_genes$gene_id)
plot_ordering_genes(sce)
选取PC贡献高的基因为研究对象
plot_pc_variance_explained(sce, return_all = FALSE, max_components = 50)
选取前几个即可
然后选取贡献高的基因后,对细胞进行分群
sce <- reduceDimension(sce, max_components = 2, norm_method = 'log',
num_dim = 15, reduction_method = "tSNE", verbose = TRUE)
sce <- clusterCells(sce)
## 查看tSNE图
plot_cell_clusters(sce, color_by = "Cluster") + facet_wrap(~orig.ident)
轨迹追踪
diff_test_res <- differentialGeneTest(sce[expressed_genes,],
fullModelFormulaStr = "~orig.ident")
sce_ordering_genes <- row.names(subset(diff_test_res, qval < 0.1))
sce <-
setOrderingFilter(sce,
ordering_genes = sce_ordering_genes)
plot_ordering_genes(sce)
我们可以想象,理想情况下,从状态A到状态B,一组基因在相同细胞内可以保持类似的变化最好。但是我们寻找这些基因是不太可能的。所以问题就转换成为,我们如何可以尽可能地将细胞分成两类,状态A或者状态B。如果我们有实验样品X和实验样品Y,它们应该从状态A转换到状态B,因为某种原因,样品X会比样品Y到达状态B的时间短一些,那么我们就需要拿到X与Y中差异性最大的基因就好了。所以如果你有不同时间点的样品,那是最好的,只需要把下文中的fullModelFormulaStr指定成你的时间点就好了
接着可以画细胞轨迹了
#降维
sce <- reduceDimension(sce, method = 'DDRTree')
#可视化
sce <- orderCells(sce)
plot_cell_trajectory(sce, color_by = "State")
差异表达
#对于不同属性样品差异分析
diff_test_res <- differentialGeneTest(sce[expressed_genes, ],
fullModelFormulaStr = "~orig.ident")
sig_genes <- subset(diff_test_res, qval < 0.1)
sig_genes[1:2,c("gene_short_name", "pval", "qval")]
#对不同状态差异分析
```diff_test_res <- differentialGeneTest(sce[expressed_genes, ],
fullModelFormulaStr = "~State")
sig_genes <- subset(diff_test_res, qval < 0.1)
sig_genes[1:2,c("gene_short_name", "pval", "qval")]
#对不同时间点(伪时间点)
diff_test_res <- differentialGeneTest(sce[expressed_genes, ],
fullModelFormulaStr = "~sm.ns(Pseudotime)")
sig_genes <- subset(diff_test_res, qval < 0.1)
sig_genes[1:2,c("gene_short_name", "pval", "qval")]
#多因素
diff_test_res <- differentialGeneTest(sce[expressed_genes, ],
fullModelFormulaStr = "~State+orig.ident")
sig_genes <- subset(diff_test_res, qval < 0.01)
sig_genes[1:2,c("gene_short_name", "pval", "qval")]
热图:
sig_genes <- sig_genes[order(sig_genes$qval), ]
plot_pseudotime_heatmap(sce[row.names(sig_genes)[1:20],],
num_clusters = 3,
cores = 1,
show_rownames = T)
BEAM分析
BEAM分析的目的是比较分枝点与分枝末端的差异,什么意思呢?就是说在细胞轨迹追溯的过程中,比方说A细胞在分支点开始分化为B细胞和C细胞,B,C两个细胞是不同类型的细胞。
在分化末端,基因表达谱差异很大,那么BEAM就是动态的观察这种细胞发育轨迹在分支点到分支末端基因表达谱的改变
BEAM_res <- BEAM(sce[expressed_genes, ], branch_point = 1, cores = 4)
BEAM_res <- BEAM_res[order(BEAM_res$qval),]
BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")]
plot_genes_branched_heatmap(sce[row.names(subset(BEAM_res,
qval < 1e-4)),],
branch_point = 1,
num_clusters = 4,
cores = 1,
use_gene_short_name = T,
show_rownames = T)
也可以针对基因
plot_genes_branched_pseudotime(sce[rownames(subset(BEAM_res, qval < 1e-8)), ],
branch_point = 1, color_by = "orig.ident",
ncol = 3)
这幅图主要讲述了单个基因,在细胞轨迹的某几个分支(Y_2,Y_93),经过一定时间的发育,该基因的表达情况的变化