参考文章:
- 【果子学生信】GEO挖掘教程
- 【生信技能树】表达芯片数据分析伴侣
转载请注明:陈熹 [email protected] (号:半为花间酒)
一、导入包
library(GEOquery)
library(tidyverse)
library(AnnoProbe)
library(idmap2)
library(limma)
library(mice)
`AnnoProbe` 和 `idmap2` 都是 Jimmy 写的包,使用非常方便:前者借助 GEO 数据库的中国区镜像,方便下载芯片数据;后者简化了芯片注释流程,并储存了很多 GPL 平台的注释信息
二、获取数据和芯片注释
# Download the series with geoChina() and keep its first element
dat <- geoChina("GSExxxxx")
gset <- dat[[1]]

# Expression matrix and per-sample annotation (used to work out the grouping)
exprSet <- exprs(gset)
pdata <- pData(gset)

# Probe annotation for the platform
ids <- get_soft_IDs("GPLxxxxx")
如果 `idmap2` 中没有该平台的注释信息,则采用原始办法下载 GPL 注释表,再利用 `dplyr` 和 `stringr` 处理
library(GEOquery)
# Fetch the platform record into the current directory and keep only its
# annotation table
GPLxxxxx <- Table(getGEO("GPLxxxxx", destdir = "."))
三、设置分组
# Six control samples followed by six test samples; "con" is the reference
# (first) level
group_list <- factor(
  rep(c("con", "test"), each = 6),
  levels = c("con", "test")
)
四、可视化数据并按需标准化
需要人工校正一下,用的方法类似于 Quantile Normalization
# Per-sample value distributions before normalization, coloured by group
boxplot(exprSet, outline = FALSE, notch = TRUE, col = group_list, las = 2)

# Between-array normalization.
# NOTE(review): limma documents that a plain matrix passed here is assumed to
# hold log-expression values, but this script only checks for / applies the
# log2 transform in the following section -- confirm the intended order.
exprSet <- normalizeBetweenArrays(exprSet)

# Same plot after normalization, to check that the arrays are now comparable
boxplot(exprSet, outline = FALSE, notch = TRUE, col = group_list, las = 2)
五、数值log2化校正
一般芯片数值需要小于20,测序数据需要小于16
# Decide from the value distribution whether the data still need a log2
# transform, and apply it if so.
ex <- exprSet
# qx = minimum, Q1, median, Q3, 99th percentile, maximum
qx <- as.numeric(quantile(ex, c(0, 0.25, 0.5, 0.75, 0.99, 1), na.rm = TRUE))
# Transform when the values look like raw intensities: a very large 99th
# percentile, a wide range with a positive first quartile, or quartiles that
# sit in the (0, 1) / (1, 2) bands.
LogC <- (qx[5] > 100) ||
  (qx[6] - qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) {
  # log2 is undefined for non-positive values; mark them as NaN first
  ex[which(ex <= 0)] <- NaN
  exprSet <- log2(ex)
  print("log2 transform finished")
} else {
  print("log2 transform not needed")
}
六、评估缺失值情况
# Show the missing-value pattern of the expression matrix
md.pattern(exprSet)
# Optionally drop the rows that contain missing values:
# exprSet <- exprSet %>%
# na.omit(.)
七、探针注释
# Map probe IDs to gene symbols and collapse the matrix to one row per gene.
# Expects `ids` to carry the columns `ID` and `symbol`, and the sample columns
# of `exprSet` to contain "GSM" in their names.
ids <- ids %>%
  rename(probeset = ID)

exprSet <- exprSet %>%
  as.data.frame() %>%
  rownames_to_column(var = "probeset") %>%
  # Probes without an annotation entry are dropped by the inner join
  inner_join(ids, by = "probeset") %>%
  select(-probeset) %>%
  select(symbol, everything()) %>%
  # Keep only the first symbol of multi-mapped probes ("A /// B" -> "A").
  # `extra`/`fill` are spelled out so probes with one symbol or with more than
  # two do not raise a warning each; the result is the same as the defaults.
  separate(
    symbol,
    into = c("symbol", "drop"),
    sep = " /// ",
    extra = "drop",
    fill = "right"
  ) %>%
  select(-drop) %>%
  # Remove unannotated probes: missing, blank, or placeholder symbols
  filter(!is.na(symbol), !symbol %in% c("NA", "---", "")) %>%
  # For genes hit by several probes keep the probe with the highest mean
  # expression across samples
  mutate(rowMean = rowMeans(.[grep("GSM", names(.))])) %>%
  arrange(desc(rowMean)) %>%
  distinct(symbol, .keep_all = TRUE) %>%
  select(-rowMean) %>%
  column_to_rownames(var = "symbol")
八、limma 包差异分析
# Step 1: design matrix with one indicator column per group (no intercept)
# Fail early with a clear message if samples and group labels do not line up
stopifnot(length(group_list) == ncol(exprSet))
design <- model.matrix(~ 0 + factor(group_list))
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(exprSet)
design

# Step 2: contrast of interest. The direction is written out explicitly:
# test minus con, so a positive logFC means higher expression in "test".
# (A contrast string pasted together from unique(group_list) was previously
# built here and immediately overwritten; that dead assignment is removed.)
contrast.matrix <- makeContrasts(test - con, levels = design)
contrast.matrix

# Step 3: fit the linear model, apply the contrast, moderate with eBayes
fit <- lmFit(exprSet, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
# All genes, FDR-adjusted p-values, sorted by the B statistic
tempOutput <- topTable(
  fit2,
  coef = 1,
  number = Inf,
  adjust.method = "fdr",
  sort.by = "B"
)
nrDEG <- na.omit(tempOutput)
head(nrDEG)

# Sanity-check the contrast direction: replace "xxx" with a gene from the
# result table and compare its raw values between groups with its logFC sign
exprSet["xxx", ]

# Write the result table to disk (row names are the gene symbols)
write.csv(nrDEG, "results.csv", row.names = TRUE)