DESeq2的基本使用

DEseq预热

主要就是这几个步骤了。

#准备count tables
cnts <- matrix(rnbinom(n=1000, mu=100, size=1/0.5),ncol = 10) #构建满足随机负二项分布的一组数
cond <- factor(rep(1:2,each=5))

#创建DEseqDataSet
dds2 <- DESeqDataSetFromMatrix(cnts,DataFrame(cond), ~cond)

#标准分析
dds2 <- DESeq(dds2)
res <- results(dds2)
##另一种检验,LRT,似然率检验
#ddsLRT <- DESeq(dds2,test="LRT",reduced=~1)
#resLRT <- results(ddsLRT)

#moderate lfc
resultsNames(dds2)
resLFC <- lfcShrink(dds2,coef = 2,type = "apeglm")

#提取结果
summary()、subset()

1. counts: 查看每个样本在每个基因上的counts数

library(DESeq2)
> dds <- makeExampleDESeqDataSet(m=4)
> head(counts(dds))
      sample1 sample2 sample3 sample4
gene1       0       0       2       1
gene2      14      56      50      47
gene3       5       3       3       8
gene4      70     105     116     125
gene5       5       2       2      12
gene6     153      70      70      53
> dds1 <- estimateSizeFactors(dds)
> head(counts(dds1,normalized=TRUE)) #通过估计size对counts数进行标准化
         sample1   sample2    sample3     sample4
gene1   0.000000  0.000000   1.883002   0.9839948
gene2  14.041418 53.289007  47.075041  46.2477573
gene3   5.014792  2.854768   2.824502   7.8719587
gene4  70.207091 99.916889 109.214095 122.9993545
gene5   5.014792  1.903179   1.883002  11.8079380
gene6 153.452641 66.611259  65.905057  52.1517263

2. 构建DESeqDataSet

> #from matrix:从数值矩阵得到,比如用featurecount统计得到的数值矩阵
> a <- as.matrix(read.table("6sample_count_matrix.txt",sep="\t",header = T,row.names=1))
> head(a)
           WT1  WT2  WT3 acc1.51 acc1.52 acc1.53
AT1G01010  226  183  247     514     353     383
AT1G01020  386  364  453     369     483     443
AT1G01030   77   65   87      52      62      74
AT1G01040 1322 1258 1621    1333    1526    1641
AT1G01050 1531 1518 2029    1573    1888    1987
AT1G01060  124  151  823     146     197     297
> b <- read.table("coldata.txt",header = T,row.names = 1)
> b
        condition
WT1            WT
WT2            WT
WT3            WT
acc1.51    acc1.5
acc1.52    acc1.5
acc1.53    acc1.5
> all(rownames(b) == colnames(a)) #make sure that it's true
[1] TRUE
> dds3 <- DESeqDataSetFromMatrix(countData = a,
+                                DataFrame(b),
+                                design = ~ condition)
> #from htseq-count:由单个htseq-count结果文件合并得到
> c <- read.table("sampletable_for_htseq.txt.txt",sep = "\t", header = F)
> c
       V1                V2     V3
1     WT1 SRR3286802.count2     WT
2     WT2 SRR3286803.count2     WT
3     WT3 SRR3286804.count2     WT
4 acc1.51 SRR3286805.count2 acc1.5
5 acc1.52 SRR3286806.count2 acc1.5
6 acc1.53 SRR3286807.count2 acc1.5
> colnames(c) <- c("samplename","filename","condition")
> dds4 <- DESeqDataSetFromHTSeqCount(c, directory = ".", design = ~condition)

3. run DESeq() & results()——主要分析步骤

#是否需要初步过滤一下
#keep <- rowSums(counts(dds4)) >= 20 # 对counts数进行初步的过滤
#dds4 <- dds4[keep,]
> dds4 <- DESeq(dds4) #三步走:文库大小估计;离散程度估计;统计检验
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res4 <- results(dds4) #结果呈现
> res4
log2 fold change (MLE): condition WT vs acc1.5 
Wald test p-value: condition WT vs acc1.5 
DataFrame with 37884 rows and 6 columns
                  baseMean      log2FoldChange             lfcSE               stat               pvalue                padj
                                                                      
AT1G01010  308.89691311743  -0.873161860576389 0.235593150198139  -3.70622770586514 0.000210369178701959 0.00230248659971188
AT1G01020 392.486827906655  -0.014533822556904 0.140844859602549 -0.103190294611512     0.91781194256941    0.95926533475846
AT1G01030 63.2288140763626   0.399739741186851 0.255254339106486   1.56604484212152    0.117338119589006   0.275094993904629
......

4. LFC校正

resultsNames(dds4) #显示谁和谁比较
res.shr <- lfcShrink(dds4,coef = 2) #2表示第2列,即LFC这一列
res.shr
res.shr2 <- lfcShrink(dds4,contrast = c("condition","acc1.5","WT")) #contrast和coef不能同时存在
res.shr2
#更换校正模型
res.ape <- lfcShrink(dds4,coef = 2, type = "apeglm")
res.ash <- lfcShrink(dds4, coef = 2, type = "ashr")
res.ash2 <- lfcShrink(dds4, contrast = c("condition","acc1.5","WT"), type = "ashr")

#这一步只会对LFC的值产生影响,p值是没有改变的

5. 简单作图

plotCounts: 用以检验单个基因在组间的reads数有多少
plotCounts(dds4,"AT1G01010")
plotMA: 所有样本normalized counts的平均值和LFC的关系

从整体上来看gene的上调下调情况,注意用resultsNames()看一下是谁和谁比较

res.shr2 <- lfcShrink(dds4,coef = 2,type = "apeglm") #校正LFC
plotMA(res.shr2,alpha=0.01,main="apeglm") #alpha=0.01,定义显著水平

6. results()——从DESeq分析中提取结果

> res4 <- results(dds4,contrast = c("condition","acc1.5","WT"),alpha = 0.001,tidy = F)
> # dds4经过了DESeq()分析
> # contrast: 定义谁和谁比较
> # lfcThreshold: 定义了lfc阈值,从MA图中可以看到红点减少,此外results()结果中不满足lfc阈值的gene的p值都是1
> # alpha: 定义显著水平
> # tidy: 是否将结果输出为data.frame格式
> summary(res4,alpha=0.001)

out of 27432 with nonzero total read count
adjusted p-value < 0.001
LFC > 0 (up)       : 1198, 4.4%
LFC < 0 (down)     : 554, 2%
outliers [1]       : 63, 0.23%
low counts [2]     : 6254, 23%
(mean count < 7)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

> # res4经过了results()操作
> # alpha: 校正p值的阈值,如果没有设置,会沿用results()中的alpha参数

#按照自定义的阈值提取差异基因并导出
#diff_gene_deseq2 <-subset(res, padj < 0.001 & abs(log2FoldChange) > 1)
#write.csv(diff_gene_deseq2,file= "DEG_acc1.5_vs_WT.csv")

你可能感兴趣的:(DESeq2的基本使用)