GEO数据表达差异分析

1-1从GEO数据库下载数据

rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse, 
               destdir = '.', 
               getGPL = F)

1-2提取表达矩阵exp

exp <- exprs(eSet[[1]])
head(exp)
exp = log2(exp+1)
dim(exp)  #22个样本,54975个探针
boxplot(exp)  #对样本的总体表达水平范围有一个了解
截屏2020-07-18 19.35.13.png

1-3提取各样本的分组信息

pd <- pData(eSet[[1]])

1-4调整exp的行名顺序与pd列名完全一致

p = identical(rownames(pd),colnames(exp));p
if(!p) exp <- exp[,match(rownames(pd),colnames(exp))]

1-5提取芯片平台编号,保存

gpl <- eSet[[1]]@annotation

2-1提取分组信息

rm(list = ls())  
load(file = "step1output.Rdata")
library(stringr)
group_list=ifelse(str_detect(pd$title,"control"),"control","treat")
table(group_list)
group_list = factor(group_list,
                    levels = c("control","treat"))

2-2获取芯片的探针注释

gpl 
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL) #toTable转换为矩阵
head(ids)
save(exp,group_list,ids,file = "step2output.Rdata")

3借助PCA和热图,进行数据检查
3-1PCA

rm(list = ls())  
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
dat=as.data.frame(t(exp))  #转换为行是样本,列是基因
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra) 
# pca
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point", # show points only (nbut not "text")
                         col.ind = group_list, # color by groups
                         #palette = c("#00AFBB", "#E7B800"),  #修改颜色
                         addEllipses = TRUE, # Concentration ellipses
                         legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")
GSE4107PCA.png

3-2热图

cg=names(tail(sort(apply(exp,1,sd)),1000))  #对exp按行计算每个探针的标准差,从小到大排序,取标准差最大的前1000个探针,并提取对应的探针名
n=exp[cg,]
#绘制热图
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(pheatmap)
pheatmap(n,
         show_colnames =F,
         show_rownames = F,
         annotation_col=annotation_col,
         scale = "row")
dev.off()

4差异分析
4-1差异基因分析

rm(list = ls()) 
load(file = "step2output.Rdata")
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)

4-2加probe_id列,把行名变成一列

library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg));head(deg)

4-3加symbol列,去重复

deg <- inner_join(deg,ids,by="probe_id");head(deg)
deg <- deg[!duplicated(deg$symbol),]

4-4标记上下调基因

logFC_t=1  #变化超过2倍的视为差异基因
P.Value_t = 0.01  #P值小于等于0.01视为显著
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change);head(deg)

4-5加ENTREZID列,用于富集分析

library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol, 
            fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)#人类
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL"));head(deg)
save(group_list,deg,logFC_t,P.Value_t,file = "step4output.Rdata")

5画火山图

rm(list = ls()) 
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
library(dplyr)
library(ggplot2)
dat  = deg
for_label <- dat%>% 
  filter(symbol %in% c("TRPM3","SFRP1")) 
p <- ggplot(data = dat, 
            aes(x = logFC, 
                y = -log10(P.Value))) +
  geom_point(alpha=0.4, size=3.5, 
             aes(color=change)) +
  ylab("-log10(Pvalue)")+
  scale_color_manual(values=c("blue", "grey","red"))+
  geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
  geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
  theme_bw()
p
volcano_plot <- p +
  geom_point(size = 3, shape = 1, data = for_label) +
  ggrepel::geom_label_repel(
    aes(label = symbol),
    data = for_label,
    color="black"
  )
volcano_plot
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))
GSE4107volcano.png

6画热图

load(file = 'step2output.Rdata')
x=deg$logFC 
names(x)=deg$probe_id 
cg = deg$probe_id[deg$change !="stable"]
n=exp[cg,]
dim(n)
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
                                   show_rownames = F,
                                   scale = "row",
                                   #cluster_cols = F, 
                                   annotation_col=annotation_col)) 
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))
GSE4107heatmap.png
test.png

你可能感兴趣的:(GEO数据表达差异分析)