这次偶然发现了处理RNA-seq的新神器,首先附上地址:http://www.bioconductor.org/packages/devel/workflows/html/rnaseqGene.html
其中我比较喜欢的是降维这一部分和去除批次效应这一部分
1.数据准备
在说明文档里,作者采用的是salmon进行定量的
library("airway")
dir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(dir)
那么我们的目录里面包括
csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
由于salmon的output为了让数据包更小,quant.sf往往采用.gz的方式压缩,当然也可能存在quant.sf,两种形式都可以存在
se <- tximeta(coldata)
gse <- summarizeToGene(se)
2.差异表达
我们先可以采用如下命令看看数据对象长什么样
assayNames(gse)
head(assay(gse), 3)
colSums(assay(gse))
rowRanges(gse)
seqinfo(rowRanges(gse))
colData(gse)
然后设置分组
gse$cell <- gse$donor
gse$dex <- gse$condition
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")
其中
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")
可以用magrittr包的函数代替
library("magrittr")
gse$dex %<>% relevel("untrt")
接下来就是构建dds对象
library("DESeq2")
dds <- DESeqDataSet(gse, design = ~ cell + dex)
如果想看看表达矩阵,那么:
countdata <- round(assays(gse)[["counts"]])
head(countdata, 3)
3.可视化
3.1 热图
我们先画个聚类热图看看
vsd <- vst(dds, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
library("pheatmap")
library("RColorBrewer")
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
或者换种写法
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
clustering_distance_rows = poisd$dd,
clustering_distance_cols = poisd$dd,
col = colors)
3.2 降维
首先是PCA
vsd <- vst(dds, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
plotPCA(vsd, intgroup = c("dex", "cell"))
那么我们还可以提取降维后的PC1和PC2的坐标
pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
然后我们就可以随心所欲的画图了,比方说双分组:
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() +
ggtitle("PCA with VST data")
接着是广义的PCA,什么时候用广义的PCA呢?当我们的数据不满足于正态分布的时候(比方说,over-dispersed count data),我们才用广义PCA
library("glmpca")
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$dex <- dds$dex
gpca.dat$cell <- dds$cell
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = dex, shape = cell)) +
geom_point(size =3) + coord_fixed() + ggtitle("glmpca - Generalized PCA")
接着是MDS降维
mds <- as.data.frame(colData(vsd)) %>%
cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with VST data")
mdsPois <- as.data.frame(colData(dds)) %>%
cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with PoissonDistances")
4.差异表达
dds <- DESeq(dds)
res <- results(dds)
提取对应分组的差异表达
res <- results(dds, contrast=c("dex","trt","untrt"))
summary(res)
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
选取差异表达最显著的基因看各组表达量情况
topGene <- rownames(res)[which.min(res$padj)]
library("ggbeeswarm")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex","cell"),
returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
scale_y_log10() + geom_beeswarm(cex = 3)
5. 基因聚类
library("genefilter")
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)
mat <- assay(vsd)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("cell","dex")])
pheatmap(mat, annotation_col = anno)
6.注释
将基因ID注释成基因名称
library("AnnotationDbi")
library("org.Hs.eg.db")
ens.str <- substr(rownames(res), 1, 15)
res$symbol <- mapIds(org.Hs.eg.db,
keys=ens.str,
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
res$entrez <- mapIds(org.Hs.eg.db,
keys=ens.str,
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
7. 去除批次效应
去除批次效应有两个R包可以完成,sva和RUVSeq
7.1. sva
library("sva")
dat <- counts(dds, normalized = TRUE)
idx <- rowMeans(dat) > 1
dat <- dat[idx, ]
mod <- model.matrix(~ dex, colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)
ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + dex
7.2 RUV
library("RUVSeq")
set <- newSeqExpressionSet(counts(dds))
idx <- rowSums(counts(set) > 5) >= 2
set <- set[idx, ]
set <- betweenLaneNormalization(set, which="upper")
not.sig <- rownames(res)[which(res$pvalue > .1)]
empirical <- rownames(set)[ rownames(set) %in% not.sig ]
set <- RUVg(set, empirical, k=2)
ddsruv <- dds
ddsruv$W1 <- set$W_1
ddsruv$W2 <- set$W_2
design(ddsruv) <- ~ W1 + W2 + dex