Monocle2 实验学习日志

12.1
1)Install Bioconductor Packages
官网:https://bioconductor.org/install/

# Install the BiocManager helper from CRAN only when it is not yet available,
# then call BiocManager::install() with no arguments, as on the Bioconductor
# install page.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install()

2)加载包与数据

library(ggplot2)
library(HSMMSingleCell)
library(monocle)
library(Biobase)
library(knitr)
library(reshape2)
# Load the three example HSMM objects (expression matrix, gene annotation,
# per-cell sample sheet) in one call -- presumably provided by HSMMSingleCell.
data(HSMM_expr_matrix, HSMM_gene_annotation, HSMM_sample_sheet)
# Preview the first 10 rows (genes) and first 5 columns (cells).
HSMM_expr_matrix[seq_len(10), seq_len(5)]
                    T0_CT_A01 T0_CT_A03 T0_CT_A05 T0_CT_A06 T0_CT_A07
ENSG00000000003.10  21.984400  1.280040 43.461800   0.00000 39.807600
ENSG00000000005.5    0.000000  0.000000  0.000000   0.00000  0.000000
ENSG00000000419.8   40.059700 77.580800  6.496560   4.90934  1.156520
ENSG00000000457.8    0.937081  0.729195  0.000000   0.00000  0.000000
ENSG00000000460.12   0.740922 57.578500  3.935870   0.00000  0.000000
ENSG00000000938.8    0.000000  0.000000  0.000000   0.00000  0.000000
ENSG00000000971.11   3.002980 15.302400 50.804800   4.68513  0.000000
ENSG00000001036.8  128.197000 16.086700 25.320900  10.66480 63.773500
ENSG00000001084.6    7.619720  0.000000  0.000000   0.00000  0.000000
ENSG00000001167.10  13.024900 24.777600  0.681409   1.36587  0.399352
# Preview the first rows of the gene annotation table (one row per gene ID).
head(HSMM_gene_annotation)
                   gene_short_name        biotype num_cells_expressed
ENSG00000000003.10          TSPAN6 protein_coding                 231
ENSG00000000005.5             TNMD protein_coding                   0
ENSG00000000419.8             DPM1 protein_coding                 275
ENSG00000000457.8            SCYL3 protein_coding                  24
ENSG00000000460.12        C1orf112 protein_coding                  78
ENSG00000000938.8              FGR protein_coding                   0
                   use_for_ordering
ENSG00000000003.10            FALSE
ENSG00000000005.5             FALSE
ENSG00000000419.8             FALSE
ENSG00000000457.8             FALSE
ENSG00000000460.12             TRUE
ENSG00000000938.8             FALSE
# Preview the first rows of the sample sheet (one row per cell ID).
head(HSMM_sample_sheet)
               Library Well Hours Media Mapped.Fragments Pseudotime State
T0_CT_A01 SCC10013_A01  A01     0    GM          1958074  23.916673     1
T0_CT_A03 SCC10013_A03  A03     0    GM          1930722   9.022265     1
T0_CT_A05 SCC10013_A05  A05     0    GM          1452623   7.546608     1
T0_CT_A06 SCC10013_A06  A06     0    GM          2566325  21.463948     1
T0_CT_A07 SCC10013_A07  A07     0    GM          2383438  11.299806     1
T0_CT_A08 SCC10013_A08  A08     0    GM          1472238  67.436042     2

12.2
1)newCellDataSet()函数生成对象
(1)函数参数
表达矩阵: as.matrix(exprs)
featureData = fd 基因与基因信息
phenoData = pd 细胞与细胞信息
expressionFamily = (tobit() // negbinomial.size() // gaussianff()) 表达数据所服从的分布类型,需与数据类型匹配(FPKM/TPM 用 tobit();UMI或转录本计数用 negbinomial.size();已对数化的 FPKM/TPM 用 gaussianff())
(2)数据集
表达量矩阵exprs: 行名基因 列名细胞编号
细胞表型信息pd: 第一列细胞编号 其他列相关信息
基因注释fd: 第一列基因编号 其他列相关信息
(3)注意
phenoData 行名要和表达矩阵列名匹配(列是细胞)
featureData 行名要与表达矩阵行名匹配(行是基因)
featureData至少有一列"gene_short_name"(symbol)

2)构造S4对象

# Wrap the sample sheet and the gene annotation table as AnnotatedDataFrame
# objects, which are passed below as phenoData / featureData.
pd <- new("AnnotatedDataFrame", data = HSMM_sample_sheet)
fd <- new("AnnotatedDataFrame", data = HSMM_gene_annotation)
# 1. Create the CellDataSet from the relative expression values.
#    The tobit lower bound must be 0.1, matching lowerDetectionLimit.
#    The earlier `tobit(Lower = 0,1)` used a comma instead of a decimal
#    point, so 1 was passed as a second positional argument and Lower was 0.
HSMM <- newCellDataSet(as.matrix(HSMM_expr_matrix),
                       phenoData = pd,
                       featureData = fd,
                       lowerDetectionLimit = 0.1,
                       expressionFamily = tobit(Lower = 0.1))
# 2. Estimate the number of RNA molecules from the relative values.
rpc_matrix <- relative2abs(HSMM)
# Preview the first 10 genes x 5 cells of the estimated counts.
rpc_matrix[1:10, 1:5]
                    T0_CT_A01  T0_CT_A03  T0_CT_A05  T0_CT_A06  T0_CT_A07
ENSG00000000003.10 1.60309506 0.09929705 2.93679928 0.00000000 2.18692386
ENSG00000000005.5  0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
ENSG00000000419.8  2.92113986 6.01820615 0.43898533 0.34343867 0.06353614
ENSG00000000457.8  0.06833163 0.05656613 0.00000000 0.00000000 0.00000000
ENSG00000000460.12 0.05402778 4.46655980 0.26595447 0.00000000 0.00000000
ENSG00000000938.8  0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
ENSG00000000971.11 0.21897629 1.18705914 3.43298023 0.32775379 0.00000000
ENSG00000001036.8  9.34808217 1.24789995 1.71098300 0.74606865 3.50354678
ENSG00000001084.6  0.55562742 0.00000000 0.00000000 0.00000000 0.00000000
ENSG00000001167.10 0.94977133 1.92208258 0.04604415 0.09555105 0.02193934
# 3. Create a new CellDataSet from the estimated RNA counts (converted to a
#    sparse matrix), using the negative binomial expression family.
HSMM <- newCellDataSet(
  as(as.matrix(rpc_matrix), "sparseMatrix"),
  phenoData = pd,
  featureData = fd,
  lowerDetectionLimit = 0.5,
  expressionFamily = negbinomial.size()
)

3)detectGenes()统计与fData() pData()表格提取函数
(1)fData() 提取CDS对象中的基因注释表格
pData()提取CDS对象中细胞表型表格
(即row为基因或者细胞 column为属性性质)
(2)detectGenes(CDS, min_expr = )
①以min_expr为表达阈值进行统计,结果可用于后续筛选保留至少在n个细胞中表达的基因(函数本身不会删除基因)
②统计基因表达的细胞数目 → 写入fData的num_cells_expressed列
Monocle2 实验学习日志_第1张图片

③统计细胞表达的基因数目 → 写入pData的num_genes_expressed列
Monocle2 实验学习日志_第2张图片

4)基于基因的过滤(低质量细胞与未检测基因)

# Estimate per-cell size factors to normalise mRNA differences between cells.
HSMM <- estimateSizeFactors(HSMM)
# Estimate dispersions; they are used by later analysis steps.
# (Message reported by the original run: "Removing 139 outliers".)
HSMM <- estimateDispersions(HSMM)

# Count expression with a 0.1 threshold. The tables printed below show
# num_cells_expressed per gene and num_genes_expressed per cell.
HSMM <- detectGenes(HSMM, min_expr = 0.1)
print(head(fData(HSMM)))
print(head(pData(HSMM)))
                   gene_short_name        biotype num_cells_expressed
ENSG00000000003.10          TSPAN6 protein_coding                 184
ENSG00000000005.5             TNMD protein_coding                   0
ENSG00000000419.8             DPM1 protein_coding                 211
ENSG00000000457.8            SCYL3 protein_coding                  18
ENSG00000000460.12        C1orf112 protein_coding                  47
ENSG00000000938.8              FGR protein_coding                   0
                   use_for_ordering
ENSG00000000003.10            FALSE
ENSG00000000005.5             FALSE
ENSG00000000419.8             FALSE
ENSG00000000457.8             FALSE
ENSG00000000460.12             TRUE
ENSG00000000938.8             FALSE
               Library Well Hours Media Mapped.Fragments Pseudotime State
T0_CT_A01 SCC10013_A01  A01     0    GM          1958074  23.916673     1
T0_CT_A03 SCC10013_A03  A03     0    GM          1930722   9.022265     1
T0_CT_A05 SCC10013_A05  A05     0    GM          1452623   7.546608     1
T0_CT_A06 SCC10013_A06  A06     0    GM          2566325  21.463948     1
T0_CT_A07 SCC10013_A07  A07     0    GM          2383438  11.299806     1
T0_CT_A08 SCC10013_A08  A08     0    GM          1472238  67.436042     2
          Size_Factor num_genes_expressed
T0_CT_A01    1.392811                6850
T0_CT_A03    1.311607                6947
T0_CT_A05    1.218922                7019
T0_CT_A06    1.013981                5560
T0_CT_A07    1.085580                5998
T0_CT_A08    1.099878                6055

你可能感兴趣的:(数据挖掘)