题目来源:https://mp.weixin.qq.com/s/UMCNnURzw5ngeA75Wb2CeQ
现在这个数据集,有24个样本,是4X3X2=24 ,其中4个细胞系和3个时间点。如果是两两对比,至少可以是8次差异分析,每个细胞系的24和72小时处理都需要与0小时进行差异分析,拿到上下调基因!
这里给大家学徒作业,做这8次差异分析,拿到上下调基因,然后上下调基因分开,绘制upsetR这个高级韦恩图,还有8个基因集合并起来绘制GO-KEGG富集分析图表,看起来工作量大, 但是这个分析很有意义,希望你能完成!
step1 - 读入数据
1.1 获取表达矩阵
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
library(AnnoProbe)
if (file.exists("GSE103115_eSet.Rdata")) {
load("GSE103115_eSet.Rdata")
}else{
gset <- geoChina("GSE103115")
}##获取数据集
checkGPL(gset[[1]]@annotation)
probe2gene <- idmap(gset[[1]]@annotation) ##获取探针基因转换字典
genes_expr <- filterEM(gset[[1]]@assayData$exprs,probe2gene)
genes_expr[1:4,1:4]
# GSM2753951 GSM2753952 GSM2753953 GSM2753954
#ZZZ3 -0.061379433 0.2470622 -0.008480072 0.08610869
#ZZEF1 -0.648055100 -0.2451930 1.059624200 -0.01988363
#ZYX 0.005674362 -0.1853190 0.274405000 -0.09063005
#ZYG11B 0.142938380 -0.1425798 0.060013056 -0.08956265
表达量有负值???被标准化过了???
进入GEO页面查询GSE103115
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103115
发现了non-normalized矩阵,下载之。读入R。
xx <- read.table("GSE103115_non-normalized.txt",sep = '\t')
xx[1:4,1:5]
# ID_REF X6898341040_A Detection.Pval X6898341040_B Detection.Pval.1
#1 ILMN_1762337 101.9 0.26104 102.6 0.50519
#2 ILMN_2055271 113.9 0.01429 129.7 0.01429
#3 ILMN_1736007 91.6 0.82987 98.4 0.71558
#4 ILMN_2383229 96.6 0.57662 102.1 0.52987
yy <- xx[,seq(from = 2,to = ncol(xx),by=2)]
rownames(yy) <- xx[,1]
dim(yy)
#[1] 47322 24
yy[1:4,1:4]
# X6898341040_A X6898341040_B X6898341040_C X6898341040_D
#ILMN_1762337 101.9 102.6 96.8 107.3
#ILMN_2055271 113.9 129.7 108.4 125.9
#ILMN_1736007 91.6 98.4 97.4 113.5
#ILMN_2383229 96.6 102.1 97.0 103.5
列名很奇怪,猜测跟样本名相关,进入GEO单独的样本页面查看。
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2753951
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2753952
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2753953
GSM2753951 对应 6898341040_A
GSM2753952 对应 6898341040_B
GSM2753953 对应 6898341040_C
以此类推(此处其实最好写个爬虫抓取对应关系)
colnames(yy) <- colnames(genes_expr)
yy <- filterEM(yy,probe2gene)
yy[1:4,1:4]
# GSM2753951 GSM2753952 GSM2753953 GSM2753954
#ZZZ3 637.4 669.6 339.3 690.0
#ZZEF1 156.8 228.3 205.5 265.8
#ZYX 438.5 1544.4 857.8 1047.7
#ZYG11B 697.2 1140.0 819.8 971.3
boxplot(yy)
yy <- log2(yy+1)
boxplot(yy)
library(limma)
yy <- normalizeBetweenArrays(yy)
boxplot(yy)
1.2 获取临床信息,从中提取分组信息
pheno <- gset[[1]]@phenoData@data ##获取临床信息
tmp <- data.frame(row.names = rownames(pheno),title = pheno[,1])
head(tmp)
# title
#GSM2753951 HCC38 0h rep2
#GSM2753952 MDA-MB-231 72h rep1
#GSM2753953 MDA-MB-157 0h rep1
#GSM2753954 BT549 72h rep1
#GSM2753955 MDA-MB-231 0h rep1
#GSM2753956 MDA-MB-157 72h rep2
library(stringr)
tmp$CellLines <- str_split(tmp$title,' ',simplify = T)[,1]
tmp$CL <- paste0(str_sub(tmp$CellLines,1,1),str_sub(str_split(tmp$CellLines,'-',simplify = T)[,3],1,1))
tmp$hours <- str_split(tmp$title,' ',simplify = T)[,2]
tmp$sample_IDs <- rownames(tmp)
tmp$groups <- paste(tmp$CL,tmp$hours,sep = '.')
head(tmp)
# title CellLines CL hours sample_IDs
#GSM2753951 HCC38 0h rep2 HCC38 H 0h GSM2753951
#GSM2753952 MDA-MB-231 72h rep1 MDA-MB-231 M2 72h GSM2753952
#GSM2753953 MDA-MB-157 0h rep1 MDA-MB-157 M1 0h GSM2753953
#GSM2753954 BT549 72h rep1 BT549 B 72h GSM2753954
#GSM2753955 MDA-MB-231 0h rep1 MDA-MB-231 M2 0h GSM2753955
#GSM2753956 MDA-MB-157 72h rep2 MDA-MB-157 M1 72h GSM2753956
grouplist <- tmp$groups
names(grouplist) <- rownames(tmp)
gl <- factor(grouplist)
save(yy,tmp,grouplist,file = 'step1-output.Rdata')
step2 - Check Data
2.1 样本相关性
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
load("step1-output.Rdata")
dat <- yy
dat[1:4,1:4]
## GSM2753951 GSM2753952 GSM2753953 GSM2753954
## ZZZ3 9.622862 9.345666 8.880464 9.501694
## ZZEF1 7.460888 7.819952 8.075162 8.148970
## ZYX 9.058763 10.571679 10.201516 10.128589
## ZYG11B 9.746743 10.115445 10.158117 10.012370
rownames(tmp)
colnames(dat) <- rownames(tmp)
M <- cor(dat)
library(pheatmap)
ac <- data.frame(CellLines = tmp$CellLines,
time = tmp$hours)
rownames(ac)=colnames(dat)
pheatmap(M,annotation_col = ac)
2.2 主成分分析(PCA)
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
load("step1-output.Rdata")
table(grouplist)
#grouplist
# B.0h B.24h B.72h H.0h H.24h H.72h M1.0h M1.24h M1.72h M2.0h M2.24h M2.72h
# 2 2 2 2 2 2 2 2 2 2 2 2
按细胞系+时间点,每组只有2个样本,不好做PCA。
2.2.1 按照细胞系做PCA
table(tmp$CellLines)
# BT549 HCC38 MDA-MB-157 MDA-MB-231
# 6 6 6 6
dat <- yy
dat[1:4,1:4]
# GSM2753951 GSM2753952 GSM2753953 GSM2753954
#ZZZ3 637.4 669.6 339.3 690.0
#ZZEF1 156.8 228.3 205.5 265.8
#ZYX 438.5 1544.4 857.8 1047.7
#ZYG11B 697.2 1140.0 819.8 971.3
dim(dat)
#[1] 20937 24
dat=t(dat)#画PCA图时要求是行名时样本名,列名时探针名,因此此时需要转换
dat=as.data.frame(dat)#将matrix转换为data.frame
dim(dat)
#[1] 24 20937
dat=cbind(dat,tmp$CellLines) #将CellLines信息追加到最后一列
colnames(dat)[ncol(dat)] <- "CellLines"
library("FactoMineR")
library("factoextra")
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#现在dat最后一列是CellLines,需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
fviz_pca_ind(dat.pca,
geom.ind = "point",
col.ind = dat$CellLines,
addEllipses = TRUE,
legend.title = "CellLines"
)
2.2.2 按照时间点做PCA
table(tmp$hours)
# 0h 24h 72h
# 8 8 8
dat <- yy
dat=t(dat)#画PCA图时要求是行名时样本名,列名时探针名,因此此时需要转换
dat=as.data.frame(dat)#将matrix转换为data.frame
dat=cbind(dat,tmp$hours) #将CellLines信息追加到最后一列
colnames(dat)[ncol(dat)] <- "hours"
library("FactoMineR")
library("factoextra")
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#现在dat最后一列是CellLines,需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
fviz_pca_ind(dat.pca,
geom.ind = "point",
col.ind = dat$hours,
addEllipses = TRUE,
legend.title = "Timepoint"
)
2.3 热图
rm(list=ls())
load("step1-output.Rdata")
dat <- yy
cg=names(tail(sort(apply(dat,1,sd)),1000))#apply按行('1'是按行取,'2'是按列取)取每一行的方差,从小到大排序,取最大的1000个
library(pheatmap)
n=t(scale(t(dat[cg,]))) # 'scale'可以对log-ratio数值进行归一化
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]
# GSM2753951 GSM2753952 GSM2753953 GSM2753954
#MRPL11 0.9091092 0.2982721 -1.7029878 -0.0322281
#TOB1 2.0000000 -0.7394836 0.1459653 -0.5471982
#LANCL1 1.6540921 -0.5890577 0.2143829 -1.1585638
#CYB5D2 -0.3480440 -0.5252493 1.5632971 -0.4775324
ac=data.frame(CellLines=tmp$CellLines,
time = tmp$hours)
rownames(ac)=colnames(n) #把ac的行名给到n的列名,即对每一个探针标记上分组信息
pheatmap(n,show_rownames = F,
annotation_col=ac)
step3 - DEG
参考:【生信星球】那些常用的limma操作
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
load("step1-output.Rdata")
gl <- factor(grouplist)
design <- model.matrix(~ 0 + gl)
rownames(design) <- rownames(tmp)
colnames(design) <- as.character(levels(gl))
fit <- lmFit(yy, design)
colnames(design)
gl_n <- colnames(design)
cont <- list()
all_up_genes <- c()
all_down_genes <- c()
for (i in 0:3) {
x <- i*3+1
cont <- c(cont,paste(gl_n[x+1],gl_n[x],sep = '-'))
cont <- c(cont,paste(gl_n[x+2],gl_n[x],sep = '-'))
}
for (i in 1:length(cont)) {
x <- cont[[i]]
print(x)
contrast.matrix <- makeContrasts(x,levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
cont[[i]]$res <- topTable(fit2,coef=1,n=Inf)
with(cont[[i]]$res,plot(logFC,-log10(P.Value)))
cont[[i]]$logFC_thred <- with(cont[[i]]$res,mean(abs(logFC))+2*sd(abs(logFC)))
print(paste0("logFC_thred: ",round(cont[[i]]$logFC_thred*1000)/1000))
cont[[i]]$res$change <- with(cont[[i]]$res,
ifelse(P.Value > 0.05,"stable",
ifelse(logFC > cont[[i]]$logFC_thred,"up",
ifelse(logFC < -cont[[i]]$logFC_thred,"down",
"stable"))))
print(table(cont[[i]]$res$change))
cont[[i]]$up_genes <- with(cont[[i]],rownames(res[res$change=='up',]))
all_up_genes <- c(all_up_genes,cont[[i]]$up_genes)
cont[[i]]$down_genes <- with(cont[[i]],rownames(res[res$change=='down',]))
all_down_genes <- c(all_down_genes,cont[[i]]$down_genes)
}
#[1] "B.24h-B.0h"
#[1] "logFC_thred: 0.308"
# down stable up
# 265 20383 289
#[1] "B.72h-B.0h"
#[1] "logFC_thred: 0.433"
# down stable up
# 378 20159 400
#[1] "H.24h-H.0h"
#[1] "logFC_thred: 0.275"
# down stable up
# 262 20464 211
#[1] "H.72h-H.0h"
#[1] "logFC_thred: 0.388"
# down stable up
# 423 20165 349
#[1] "M1.24h-M1.0h"
#[1] "logFC_thred: 0.261"
# down stable up
# 173 20554 210
#[1] "M1.72h-M1.0h"
#[1] "logFC_thred: 0.302"
# down stable up
# 283 20312 342
#[1] "M2.24h-M2.0h"
#[1] "logFC_thred: 0.366"
# down stable up
# 306 20280 351
#[1] "M2.72h-M2.0h"
#[1] "logFC_thred: 0.379"
# down stable up
# 316 20228 393
library(UpSetR)
all_up_genes <- unique(all_up_genes)
all_down_genes <- unique(all_down_genes)
library(dplyr)
for (i in 1:8) {
cont[[i]]$up_genes_list <- ifelse(all_up_genes %in% cont[[i]]$up_genes,1,0)
cont[[i]]$down_genes_list <- ifelse(all_down_genes %in% cont[[i]]$down_genes,1,0)
}
dat_down <- as.data.frame(do.call(cbind,lapply(cont,function(x){ return(x$down_genes_list)})))
dat_down <- cbind(dat_down,all_down_genes)
for (i in 1:length(cont)) colnames(dat_down)[i] <- cont[[i]][[1]]
upset(dat_down,nsets = 8)
dat_up <- as.data.frame(do.call(cbind,lapply(cont,function(x){ return(x$up_genes_list)})))
dat_up <- cbind(dat_up,all_down_genes)
for (i in 1:length(cont)) colnames(dat_up)[i] <- cont[[i]][[1]]
upset(dat_up,nsets = 8)
保存数据:
save(cont,file = '8-DEG-results.rdata')
save(all_down_genes,file = 'down_genes_list.rdata')
save(all_up_genes,file = 'up_genes_list.rdata')
step 4 - GO/KEGG
4.1 KEGG
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
load("up_genes_list.rdata")
load("down_genes_list.rdata")
S2E <- function(genelist) {
genes <- as.data.frame(genelist)
colnames(genes) <- c("SYMBOL")
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(genes$SYMBOL,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db)
return(as.character(df$ENTREZID))
}
gene_up <- S2E(all_up_genes)
gene_down <- S2E(all_down_genes)
gene_diff <- unique(c(gene_up,gene_down))
library(clusterProfiler)
kk_up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk_up)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa03030 hsa03030 DNA replication 17/773 36/7978 7.207655e-09 2.255996e-06
#hsa04110 hsa04110 Cell cycle 32/773 124/7978 1.512416e-07 2.366932e-05
#hsa03460 hsa03460 Fanconi anemia pathway 19/773 54/7978 3.064372e-07 2.434113e-05
#hsa03430 hsa03430 Mismatch repair 12/773 23/7978 3.110688e-07 2.434113e-05
#hsa05169 hsa05169 Epstein-Barr virus infection 43/773 201/7978 3.975693e-07 2.488784e-05
#hsa03440 hsa03440 Homologous recombination 15/773 41/7978 3.101859e-06 1.618136e-04
dotplot(kk_up,color = "p.adjust")
kk_down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk_down)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa04216 hsa04216 Ferroptosis 13/672 40/7978 1.355675e-05 0.002715053
#hsa04141 hsa04141 Protein processing in endoplasmic reticulum 31/672 166/7978 1.757316e-05 0.002715053
#hsa04218 hsa04218 Cellular senescence 25/672 160/7978 1.787752e-03 0.147907410
#hsa05110 hsa05110 Vibrio cholerae infection 11/672 50/7978 2.481062e-03 0.147907410
#hsa04068 hsa04068 FoxO signaling pathway 21/672 131/7978 2.945907e-03 0.147907410
#hsa00562 hsa00562 Inositol phosphate metabolism 14/672 74/7978 3.105142e-03 0.147907410
dotplot(kk_down,color = "p.adjust")
kk_diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk_diff)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa04110 hsa04110 Cell cycle 45/1381 124/7978 2.715615e-07 8.662813e-05
#hsa04218 hsa04218 Cellular senescence 51/1381 160/7978 4.147968e-06 6.616008e-04
#hsa05165 hsa05165 Human papillomavirus infection 88/1381 330/7978 9.213633e-06 9.797164e-04
#hsa03440 hsa03440 Homologous recombination 19/1381 41/7978 1.530381e-05 1.220479e-03
#hsa03030 hsa03030 DNA replication 17/1381 36/7978 3.171412e-05 1.870973e-03
#hsa05110 hsa05110 Vibrio cholerae infection 21/1381 50/7978 3.519072e-05 1.870973e-03
dotplot(kk_diff,color = "p.adjust")
kegg_plot <- function(up_kegg,down_kegg){
library(ggplot2)
dat=rbind(up_kegg,down_kegg)
colnames(dat)
dat$pvalue = -log10(dat$pvalue)
dat$pvalue=dat$pvalue*dat$group
dat=dat[order(dat$pvalue,decreasing = F),]
g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) +
geom_bar(stat="identity") +
scale_fill_gradient(low="blue",high="red",guide = FALSE) +
scale_x_discrete(name ="Pathway names") +
scale_y_continuous(name ="log10P-value") +
coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Pathway Enrichment")
}
kegg_diff_dt <- as.data.frame(kk_diff)
kegg_down_dt <- as.data.frame(kk_down)
kegg_up_dt <- as.data.frame(kk_up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
4.2 GO
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
load("up_genes_list.rdata")
load("down_genes_list.rdata")
S2E <- function(genelist) {
genes <- as.data.frame(genelist)
colnames(genes) <- c("SYMBOL")
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(genes$SYMBOL,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db)
return(as.character(df$ENTREZID))
}
gene_up <- S2E(all_up_genes)
gene_down <- S2E(all_down_genes)
gene_diff=unique(c(gene_up,gene_down)))
{
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff
if(F){
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
print(paste('Now process ',ont))
ego <- enrichGO(gene = gene,
# universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
# print( head(ego) )
return(ego)
})
})
save(go_enrich_results,file = 'go_enrich_results.Rdata')
}
load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
print( dotplot(go_enrich_results[[i]][[j]] ))
dev.off()
}
}