12.1
1)Install Bioconductor Packages
官网:https://bioconductor.org/install/
# Bootstrap Bioconductor: install the BiocManager helper only when it is not
# already available, then run BiocManager::install() with its defaults.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install()
2)加载包与数据
# Attach the packages used in this walkthrough (attach order kept as-is).
library(ggplot2)
library(HSMMSingleCell)
library(monocle)
library(Biobase)
library(knitr)
library(reshape2)

# Load the three HSMM example objects with a single data() call.
data(HSMM_expr_matrix, HSMM_gene_annotation, HSMM_sample_sheet)

# Preview the top-left corner of the expression matrix (10 rows x 5 columns).
HSMM_expr_matrix[seq_len(10), seq_len(5)]
T0_CT_A01 T0_CT_A03 T0_CT_A05 T0_CT_A06 T0_CT_A07
ENSG00000000003.10 21.984400 1.280040 43.461800 0.00000 39.807600
ENSG00000000005.5 0.000000 0.000000 0.000000 0.00000 0.000000
ENSG00000000419.8 40.059700 77.580800 6.496560 4.90934 1.156520
ENSG00000000457.8 0.937081 0.729195 0.000000 0.00000 0.000000
ENSG00000000460.12 0.740922 57.578500 3.935870 0.00000 0.000000
ENSG00000000938.8 0.000000 0.000000 0.000000 0.00000 0.000000
ENSG00000000971.11 3.002980 15.302400 50.804800 4.68513 0.000000
ENSG00000001036.8 128.197000 16.086700 25.320900 10.66480 63.773500
ENSG00000001084.6 7.619720 0.000000 0.000000 0.00000 0.000000
ENSG00000001167.10 13.024900 24.777600 0.681409 1.36587 0.399352
# Preview the first six rows of the gene annotation table.
head(HSMM_gene_annotation, n = 6L)
gene_short_name biotype num_cells_expressed
ENSG00000000003.10 TSPAN6 protein_coding 231
ENSG00000000005.5 TNMD protein_coding 0
ENSG00000000419.8 DPM1 protein_coding 275
ENSG00000000457.8 SCYL3 protein_coding 24
ENSG00000000460.12 C1orf112 protein_coding 78
ENSG00000000938.8 FGR protein_coding 0
use_for_ordering
ENSG00000000003.10 FALSE
ENSG00000000005.5 FALSE
ENSG00000000419.8 FALSE
ENSG00000000457.8 FALSE
ENSG00000000460.12 TRUE
ENSG00000000938.8 FALSE
# Preview the first six rows of the per-cell sample sheet.
head(HSMM_sample_sheet, n = 6L)
Library Well Hours Media Mapped.Fragments Pseudotime State
T0_CT_A01 SCC10013_A01 A01 0 GM 1958074 23.916673 1
T0_CT_A03 SCC10013_A03 A03 0 GM 1930722 9.022265 1
T0_CT_A05 SCC10013_A05 A05 0 GM 1452623 7.546608 1
T0_CT_A06 SCC10013_A06 A06 0 GM 2566325 21.463948 1
T0_CT_A07 SCC10013_A07 A07 0 GM 2383438 11.299806 1
T0_CT_A08 SCC10013_A08 A08 0 GM 1472238 67.436042 2
12.2
1)newCellDataSet()函数生成对象
(1)函数参数
表达矩阵: as.matrix(exprs)
featureData = fd 基因与基因信息
phenoData = pd 细胞与细胞信息
expressionFamily = (tobit() // negbinomial.size() // gaussianff()) 表达数据的分布族,按数据类型选择:FPKM/TPM 用 tobit(),UMI或转录本计数用 negbinomial.size(),已对数转换的 FPKM/TPM 用 gaussianff()
(2)数据集
表达量矩阵exprs: 行名基因 列名细胞编号
细胞表型信息pd: 第一列细胞编号 其他列相关信息
基因注释fd: 第一列基因编号 其他列相关信息
(3)注意
phenoData 行名要和表达矩阵列名匹配(列是细胞)
featureData 行名要与表达矩阵行名匹配(行是基因)
featureData至少有一列"gene_short_name"(基因symbol)
2)构造S4对象
# Wrap the sample sheet and the gene annotation table as AnnotatedDataFrame
# objects, the form passed to phenoData / featureData below.
pd <- new("AnnotatedDataFrame", data = HSMM_sample_sheet)
fd <- new("AnnotatedDataFrame", data = HSMM_gene_annotation)

# 1. Create the initial CellDataSet from the relative expression values.
#    Lower must be written with a decimal point (0.1): `tobit(Lower = 0,1)`
#    would pass Lower = 0 plus a stray second positional argument.
#    The value matches lowerDetectionLimit.
HSMM <- newCellDataSet(
  as.matrix(HSMM_expr_matrix),
  phenoData = pd,
  featureData = fd,
  lowerDetectionLimit = 0.1,
  expressionFamily = tobit(Lower = 0.1)
)

# 2. Estimate RNA counts from the relative expression values and preview
#    the top-left corner of the result (10 rows x 5 columns).
rpc_matrix <- relative2abs(HSMM)
rpc_matrix[1:10, 1:5]
T0_CT_A01 T0_CT_A03 T0_CT_A05 T0_CT_A06 T0_CT_A07
ENSG00000000003.10 1.60309506 0.09929705 2.93679928 0.00000000 2.18692386
ENSG00000000005.5 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
ENSG00000000419.8 2.92113986 6.01820615 0.43898533 0.34343867 0.06353614
ENSG00000000457.8 0.06833163 0.05656613 0.00000000 0.00000000 0.00000000
ENSG00000000460.12 0.05402778 4.46655980 0.26595447 0.00000000 0.00000000
ENSG00000000938.8 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
ENSG00000000971.11 0.21897629 1.18705914 3.43298023 0.32775379 0.00000000
ENSG00000001036.8 9.34808217 1.24789995 1.71098300 0.74606865 3.50354678
ENSG00000001084.6 0.55562742 0.00000000 0.00000000 0.00000000 0.00000000
ENSG00000001167.10 0.94977133 1.92208258 0.04604415 0.09555105 0.02193934
# 3. Rebuild the CellDataSet from the estimated RNA counts, converted to a
#    sparse matrix, with the same pd / fd annotation objects.
HSMM <- newCellDataSet(
  as(as.matrix(rpc_matrix), "sparseMatrix"),
  phenoData = pd,
  featureData = fd,
  lowerDetectionLimit = 0.5,
  expressionFamily = negbinomial.size()
)
3)detectGenes()统计与fData() pData()表格提取函数
(1)fData() 提取CDS对象中的基因注释表格
pData()提取CDS对象中细胞表型表格
(即行(row)为基因或细胞,列(column)为属性)
(2)detectGenes(CDS, min_expr = )
①统计每个基因在多少个细胞中的表达量达到min_expr,写入fData的num_cells_expressed列(可据此筛选保留至少在n个细胞中表达的基因)
②统计每个细胞中表达的基因数目,写入pData的num_genes_expressed列
4)基于基因的过滤(低质量细胞与未检测基因)
# Estimate size factors to normalize for mRNA differences between cells.
HSMM <- estimateSizeFactors(HSMM)

# Estimate dispersions; they are used in the later analysis steps.
# Message recorded in the notes at this point: "Removing 139 outliers"
HSMM <- estimateDispersions(HSMM)

# Count detected expression at min_expr = 0.1. The tables printed below show
# the num_cells_expressed column in fData and the num_genes_expressed column
# in pData.
HSMM <- detectGenes(HSMM, min_expr = 0.1)

# Show the first rows of the gene table and of the cell table.
print(head(fData(HSMM)))
print(head(pData(HSMM)))
gene_short_name biotype num_cells_expressed
ENSG00000000003.10 TSPAN6 protein_coding 184
ENSG00000000005.5 TNMD protein_coding 0
ENSG00000000419.8 DPM1 protein_coding 211
ENSG00000000457.8 SCYL3 protein_coding 18
ENSG00000000460.12 C1orf112 protein_coding 47
ENSG00000000938.8 FGR protein_coding 0
use_for_ordering
ENSG00000000003.10 FALSE
ENSG00000000005.5 FALSE
ENSG00000000419.8 FALSE
ENSG00000000457.8 FALSE
ENSG00000000460.12 TRUE
ENSG00000000938.8 FALSE
Library Well Hours Media Mapped.Fragments Pseudotime State
T0_CT_A01 SCC10013_A01 A01 0 GM 1958074 23.916673 1
T0_CT_A03 SCC10013_A03 A03 0 GM 1930722 9.022265 1
T0_CT_A05 SCC10013_A05 A05 0 GM 1452623 7.546608 1
T0_CT_A06 SCC10013_A06 A06 0 GM 2566325 21.463948 1
T0_CT_A07 SCC10013_A07 A07 0 GM 2383438 11.299806 1
T0_CT_A08 SCC10013_A08 A08 0 GM 1472238 67.436042 2
Size_Factor num_genes_expressed
T0_CT_A01 1.392811 6850
T0_CT_A03 1.311607 6947
T0_CT_A05 1.218922 7019
T0_CT_A06 1.013981 5560
T0_CT_A07 1.085580 5998
T0_CT_A08 1.099878 6055