【生信技能树】2020-01-19作业：GSE103115

题目来源：https://mp.weixin.qq.com/s/UMCNnURzw5ngeA75Wb2CeQ

现在这个数据集，有24个样本，是4X3X2=24 ，其中4个细胞系和3个时间点。如果是两两对比，至少可以是8次差异分析，每个细胞系的24和72小时处理都需要与0小时进行差异分析，拿到上下调基因！

这里给大家学徒作业，做这8次差异分析，拿到上下调基因，然后上下调基因分开，绘制upsetR这个高级韦恩图，还有8个基因集合并起来绘制GO-KEGG富集分析图表，看起来工作量大，但是这个分析很有意义，希望你能完成！

step1 - 读入数据

1.1 获取表达矩阵

rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)

library(AnnoProbe)

if (file.exists("GSE103115_eSet.Rdata")) {
  load("GSE103115_eSet.Rdata")
}else{
  gset <- geoChina("GSE103115")
}##获取数据集
checkGPL(gset[[1]]@annotation)
probe2gene <- idmap(gset[[1]]@annotation) ##获取探针基因转换字典
genes_expr <- filterEM(gset[[1]]@assayData$exprs,probe2gene)
genes_expr[1:4,1:4]
#         GSM2753951 GSM2753952   GSM2753953  GSM2753954
#ZZZ3   -0.061379433  0.2470622 -0.008480072  0.08610869
#ZZEF1  -0.648055100 -0.2451930  1.059624200 -0.01988363
#ZYX     0.005674362 -0.1853190  0.274405000 -0.09063005
#ZYG11B  0.142938380 -0.1425798  0.060013056 -0.08956265

表达量有负值？？？被标准化过了？？？
进入GEO页面查询GSE103115
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103115

GEO截图.png

发现了non-normalized矩阵，下载之。读入R。

xx <- read.table("GSE103115_non-normalized.txt",sep = '\t')
xx[1:4,1:5]
#        ID_REF X6898341040_A Detection.Pval X6898341040_B Detection.Pval.1
#1 ILMN_1762337         101.9        0.26104         102.6          0.50519
#2 ILMN_2055271         113.9        0.01429         129.7          0.01429
#3 ILMN_1736007          91.6        0.82987          98.4          0.71558
#4 ILMN_2383229          96.6        0.57662         102.1          0.52987
yy <- xx[,seq(from = 2,to = ncol(xx),by=2)]
rownames(yy) <- xx[,1]
dim(yy)
#[1] 47322    24
yy[1:4,1:4]
#             X6898341040_A X6898341040_B X6898341040_C X6898341040_D
#ILMN_1762337         101.9         102.6          96.8         107.3
#ILMN_2055271         113.9         129.7         108.4         125.9
#ILMN_1736007          91.6          98.4          97.4         113.5
#ILMN_2383229          96.6         102.1          97.0         103.5

列名很奇怪，猜测跟样本名相关，进入GEO单独的样本页面查看。
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2753951

GSM2753951.png

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2753952

GSM2753952.png

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2753953

GSM2753953.png

GSM2753951 对应 6898341040_A
GSM2753952 对应 6898341040_B
GSM2753953 对应 6898341040_C
以此类推（此处其实最好写个爬虫抓取对应关系）

colnames(yy) <- colnames(genes_expr)
yy <- filterEM(yy,probe2gene)
yy[1:4,1:4]
#       GSM2753951 GSM2753952 GSM2753953 GSM2753954
#ZZZ3        637.4      669.6      339.3      690.0
#ZZEF1       156.8      228.3      205.5      265.8
#ZYX         438.5     1544.4      857.8     1047.7
#ZYG11B      697.2     1140.0      819.8      971.3
boxplot(yy)

before_log2.png

yy <- log2(yy+1)
boxplot(yy)

after_log2.png

library(limma)

yy <- normalizeBetweenArrays(yy)
boxplot(yy)

after_normalize.png

1.2 获取临床信息，从中提取分组信息

pheno <- gset[[1]]@phenoData@data ##获取临床信息
tmp <- data.frame(row.names = rownames(pheno),title = pheno[,1])
head(tmp)
#                         title
#GSM2753951       HCC38 0h rep2
#GSM2753952 MDA-MB-231 72h rep1
#GSM2753953  MDA-MB-157 0h rep1
#GSM2753954      BT549 72h rep1
#GSM2753955  MDA-MB-231 0h rep1
#GSM2753956 MDA-MB-157 72h rep2

library(stringr)
tmp$CellLines <- str_split(tmp$title,' ',simplify = T)[,1]
tmp$CL <- paste0(str_sub(tmp$CellLines,1,1),str_sub(str_split(tmp$CellLines,'-',simplify = T)[,3],1,1))
tmp$hours <- str_split(tmp$title,' ',simplify = T)[,2]
tmp$sample_IDs <- rownames(tmp)
tmp$groups <- paste(tmp$CL,tmp$hours,sep = '.')
head(tmp)
#                         title  CellLines CL hours sample_IDs
#GSM2753951       HCC38 0h rep2      HCC38  H    0h GSM2753951
#GSM2753952 MDA-MB-231 72h rep1 MDA-MB-231 M2   72h GSM2753952
#GSM2753953  MDA-MB-157 0h rep1 MDA-MB-157 M1    0h GSM2753953
#GSM2753954      BT549 72h rep1      BT549  B   72h GSM2753954
#GSM2753955  MDA-MB-231 0h rep1 MDA-MB-231 M2    0h GSM2753955
#GSM2753956 MDA-MB-157 72h rep2 MDA-MB-157 M1   72h GSM2753956

grouplist <- tmp$groups
names(grouplist) <- rownames(tmp)
gl <- factor(grouplist)

save(yy,tmp,grouplist,file = 'step1-output.Rdata')

step2 - Check Data

2.1 样本相关性

rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)

load("step1-output.Rdata")
dat <- yy
dat[1:4,1:4]
##        GSM2753951 GSM2753952 GSM2753953 GSM2753954
## ZZZ3     9.622862   9.345666   8.880464   9.501694
## ZZEF1    7.460888   7.819952   8.075162   8.148970
## ZYX      9.058763  10.571679  10.201516  10.128589
## ZYG11B   9.746743  10.115445  10.158117  10.012370
rownames(tmp)
colnames(dat) <- rownames(tmp)

M <- cor(dat)
library(pheatmap)
ac <- data.frame(CellLines = tmp$CellLines,
                 time = tmp$hours)
rownames(ac)=colnames(dat)
pheatmap(M,annotation_col = ac)

cor_plot.png

2.2 主成分分析（PCA）

rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)

load("step1-output.Rdata")
table(grouplist)
#grouplist
#  B.0h  B.24h  B.72h   H.0h  H.24h  H.72h  M1.0h M1.24h M1.72h  M2.0h M2.24h M2.72h 
#     2      2      2      2      2      2      2      2      2      2      2      2

按细胞系+时间点，每组只有2个样本，不好做PCA。

2.2.1 按照细胞系做PCA

table(tmp$CellLines)
#     BT549      HCC38 MDA-MB-157 MDA-MB-231 
#         6          6          6          6 
dat <- yy
dat[1:4,1:4]
#       GSM2753951 GSM2753952 GSM2753953 GSM2753954
#ZZZ3        637.4      669.6      339.3      690.0
#ZZEF1       156.8      228.3      205.5      265.8
#ZYX         438.5     1544.4      857.8     1047.7
#ZYG11B      697.2     1140.0      819.8      971.3
dim(dat)
#[1] 20937    24

dat=t(dat)#画PCA图时要求是行名时样本名，列名时探针名，因此此时需要转换
dat=as.data.frame(dat)#将matrix转换为data.frame
dim(dat)
#[1]    24 20937
dat=cbind(dat,tmp$CellLines) #将CellLines信息追加到最后一列
colnames(dat)[ncol(dat)] <- "CellLines"
library("FactoMineR")
library("factoextra") 
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#现在dat最后一列是CellLines，需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
fviz_pca_ind(dat.pca,
             geom.ind = "point", 
             col.ind = dat$CellLines, 
             addEllipses = TRUE, 
             legend.title = "CellLines"
)

PCA - CellLines.png

2.2.2 按照时间点做PCA

table(tmp$hours)
# 0h 24h 72h 
#  8   8   8 
dat <- yy
dat=t(dat)#画PCA图时要求是行名时样本名，列名时探针名，因此此时需要转换
dat=as.data.frame(dat)#将matrix转换为data.frame
dat=cbind(dat,tmp$hours) #将CellLines信息追加到最后一列
colnames(dat)[ncol(dat)] <- "hours"
library("FactoMineR")
library("factoextra") 
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#现在dat最后一列是CellLines，需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
fviz_pca_ind(dat.pca,
             geom.ind = "point", 
             col.ind = dat$hours, 
             addEllipses = TRUE, 
             legend.title = "Timepoint"
)

Timepoint-PCA.png

2.3 热图

rm(list=ls())
load("step1-output.Rdata")
dat <- yy
cg=names(tail(sort(apply(dat,1,sd)),1000))#apply按行（'1'是按行取，'2'是按列取）取每一行的方差，从小到大排序，取最大的1000个
library(pheatmap)
n=t(scale(t(dat[cg,]))) # 'scale'可以对log-ratio数值进行归一化
n[n>2]=2 
n[n< -2]= -2
n[1:4,1:4]
#       GSM2753951 GSM2753952 GSM2753953 GSM2753954
#MRPL11  0.9091092  0.2982721 -1.7029878 -0.0322281
#TOB1    2.0000000 -0.7394836  0.1459653 -0.5471982
#LANCL1  1.6540921 -0.5890577  0.2143829 -1.1585638
#CYB5D2 -0.3480440 -0.5252493  1.5632971 -0.4775324
ac=data.frame(CellLines=tmp$CellLines,
              time = tmp$hours)
rownames(ac)=colnames(n) #把ac的行名给到n的列名，即对每一个探针标记上分组信息

pheatmap(n,show_rownames = F,
         annotation_col=ac)

heatmap_top1000_sd.png

step3 - DEG

参考：【生信星球】那些常用的limma操作

rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)

load("step1-output.Rdata")
gl <- factor(grouplist)

design <- model.matrix(~ 0 + gl)
rownames(design) <- rownames(tmp)
colnames(design) <- as.character(levels(gl))
fit <- lmFit(yy, design)

colnames(design)
gl_n <- colnames(design)
cont <- list()
all_up_genes <- c()
all_down_genes <- c()

for (i in 0:3) {
  x <- i*3+1
  cont <- c(cont,paste(gl_n[x+1],gl_n[x],sep = '-'))
  cont <- c(cont,paste(gl_n[x+2],gl_n[x],sep = '-'))
}
for (i in 1:length(cont)) {
  x <- cont[[i]]
  print(x)
  contrast.matrix <- makeContrasts(x,levels = design)
  fit2 <- contrasts.fit(fit, contrast.matrix)
  fit2 <- eBayes(fit2)
  cont[[i]]$res <- topTable(fit2,coef=1,n=Inf)
  with(cont[[i]]$res,plot(logFC,-log10(P.Value)))
  cont[[i]]$logFC_thred <- with(cont[[i]]$res,mean(abs(logFC))+2*sd(abs(logFC)))
  print(paste0("logFC_thred: ",round(cont[[i]]$logFC_thred*1000)/1000))
  cont[[i]]$res$change <- with(cont[[i]]$res,
                               ifelse(P.Value > 0.05,"stable",
                                 ifelse(logFC > cont[[i]]$logFC_thred,"up",
                                   ifelse(logFC < -cont[[i]]$logFC_thred,"down",
                                          "stable"))))
  print(table(cont[[i]]$res$change))
  cont[[i]]$up_genes <- with(cont[[i]],rownames(res[res$change=='up',]))
  all_up_genes <- c(all_up_genes,cont[[i]]$up_genes)
  
  cont[[i]]$down_genes <- with(cont[[i]],rownames(res[res$change=='down',]))
  all_down_genes <- c(all_down_genes,cont[[i]]$down_genes)
}
#[1] "B.24h-B.0h"
#[1] "logFC_thred: 0.308"
#  down stable     up 
#   265  20383    289 
#[1] "B.72h-B.0h"
#[1] "logFC_thred: 0.433"
#  down stable     up 
#   378  20159    400 
#[1] "H.24h-H.0h"
#[1] "logFC_thred: 0.275"
#  down stable     up 
#   262  20464    211 
#[1] "H.72h-H.0h"
#[1] "logFC_thred: 0.388"
#  down stable     up 
#   423  20165    349 
#[1] "M1.24h-M1.0h"
#[1] "logFC_thred: 0.261"
#  down stable     up 
#   173  20554    210 
#[1] "M1.72h-M1.0h"
#[1] "logFC_thred: 0.302"
#  down stable     up 
#   283  20312    342 
#[1] "M2.24h-M2.0h"
#[1] "logFC_thred: 0.366"
#  down stable     up 
#   306  20280    351 
#[1] "M2.72h-M2.0h"
#[1] "logFC_thred: 0.379"
#  down stable     up 
#   316  20228    393 

library(UpSetR)

all_up_genes <- unique(all_up_genes)
all_down_genes <- unique(all_down_genes)

library(dplyr)
for (i in 1:8) {
  cont[[i]]$up_genes_list <- ifelse(all_up_genes %in% cont[[i]]$up_genes,1,0)
  cont[[i]]$down_genes_list <- ifelse(all_down_genes %in% cont[[i]]$down_genes,1,0)
}

dat_down <- as.data.frame(do.call(cbind,lapply(cont,function(x){ return(x$down_genes_list)})))
dat_down <- cbind(dat_down,all_down_genes)
for (i in 1:length(cont)) colnames(dat_down)[i] <- cont[[i]][[1]]
upset(dat_down,nsets = 8)

UpSetR_down_gene_plot.png

dat_up <- as.data.frame(do.call(cbind,lapply(cont,function(x){ return(x$up_genes_list)})))
dat_up <- cbind(dat_up,all_down_genes)
for (i in 1:length(cont)) colnames(dat_up)[i] <- cont[[i]][[1]]
upset(dat_up,nsets = 8)

UpSetR_up_gene_plot.png

保存数据：

save(cont,file = '8-DEG-results.rdata')
save(all_down_genes,file = 'down_genes_list.rdata')
save(all_up_genes,file = 'up_genes_list.rdata')

step 4 - GO/KEGG

4.1 KEGG

rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)

load("up_genes_list.rdata")
load("down_genes_list.rdata")

S2E <- function(genelist) {
  genes <- as.data.frame(genelist)
  colnames(genes) <- c("SYMBOL")
  library(clusterProfiler)
  library(org.Hs.eg.db)
  df <- bitr(genes$SYMBOL,
             fromType = "SYMBOL",
             toType = c("ENTREZID"),
             OrgDb = org.Hs.eg.db)
  return(as.character(df$ENTREZID))
}

gene_up <- S2E(all_up_genes)
gene_down <- S2E(all_down_genes)
gene_diff <- unique(c(gene_up,gene_down))

library(clusterProfiler)
kk_up <- enrichKEGG(gene = gene_up,
                 organism = 'hsa',
                 pvalueCutoff = 0.9,
                 qvalueCutoff = 0.9)
head(kk_up)[,1:6]
#               ID                  Description GeneRatio  BgRatio       pvalue     p.adjust
#hsa03030 hsa03030              DNA replication    17/773  36/7978 7.207655e-09 2.255996e-06
#hsa04110 hsa04110                   Cell cycle    32/773 124/7978 1.512416e-07 2.366932e-05
#hsa03460 hsa03460       Fanconi anemia pathway    19/773  54/7978 3.064372e-07 2.434113e-05
#hsa03430 hsa03430              Mismatch repair    12/773  23/7978 3.110688e-07 2.434113e-05
#hsa05169 hsa05169 Epstein-Barr virus infection    43/773 201/7978 3.975693e-07 2.488784e-05
#hsa03440 hsa03440     Homologous recombination    15/773  41/7978 3.101859e-06 1.618136e-04
dotplot(kk_up,color = "p.adjust")

up_KEGG_dotplot.png

kk_down <- enrichKEGG(gene = gene_down,
                    organism = 'hsa',
                    pvalueCutoff = 0.9,
                    qvalueCutoff = 0.9)
head(kk_down)[,1:6]
#               ID                                 Description GeneRatio  BgRatio       pvalue    p.adjust
#hsa04216 hsa04216                                 Ferroptosis    13/672  40/7978 1.355675e-05 0.002715053
#hsa04141 hsa04141 Protein processing in endoplasmic reticulum    31/672 166/7978 1.757316e-05 0.002715053
#hsa04218 hsa04218                         Cellular senescence    25/672 160/7978 1.787752e-03 0.147907410
#hsa05110 hsa05110                   Vibrio cholerae infection    11/672  50/7978 2.481062e-03 0.147907410
#hsa04068 hsa04068                      FoxO signaling pathway    21/672 131/7978 2.945907e-03 0.147907410
#hsa00562 hsa00562               Inositol phosphate metabolism    14/672  74/7978 3.105142e-03 0.147907410
dotplot(kk_down,color = "p.adjust")

down_KEGG_dotplot.png

kk_diff <- enrichKEGG(gene = gene_diff,
                      organism = 'hsa',
                      pvalueCutoff = 0.9,
                      qvalueCutoff = 0.9)
head(kk_diff)[,1:6]
#               ID                    Description GeneRatio  BgRatio       pvalue     p.adjust
#hsa04110 hsa04110                     Cell cycle   45/1381 124/7978 2.715615e-07 8.662813e-05
#hsa04218 hsa04218            Cellular senescence   51/1381 160/7978 4.147968e-06 6.616008e-04
#hsa05165 hsa05165 Human papillomavirus infection   88/1381 330/7978 9.213633e-06 9.797164e-04
#hsa03440 hsa03440       Homologous recombination   19/1381  41/7978 1.530381e-05 1.220479e-03
#hsa03030 hsa03030                DNA replication   17/1381  36/7978 3.171412e-05 1.870973e-03
#hsa05110 hsa05110      Vibrio cholerae infection   21/1381  50/7978 3.519072e-05 1.870973e-03
dotplot(kk_diff,color = "p.adjust")

diff_KEGG_dotplot.png

kegg_plot <- function(up_kegg,down_kegg){
  library(ggplot2)
  dat=rbind(up_kegg,down_kegg)
  colnames(dat)
  dat$pvalue = -log10(dat$pvalue)
  dat$pvalue=dat$pvalue*dat$group 
  
  dat=dat[order(dat$pvalue,decreasing = F),]
  
  g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
    geom_bar(stat="identity") + 
    scale_fill_gradient(low="blue",high="red",guide = FALSE) + 
    scale_x_discrete(name ="Pathway names") +
    scale_y_continuous(name ="log10P-value") +
    coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
    ggtitle("Pathway Enrichment") 
}

kegg_diff_dt <- as.data.frame(kk_diff)
kegg_down_dt <- as.data.frame(kk_down)
kegg_up_dt <- as.data.frame(kk_up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1

g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)

kegg_up_down.png

4.2 GO

rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)

load("up_genes_list.rdata")
load("down_genes_list.rdata")

S2E <- function(genelist) {
  genes <- as.data.frame(genelist)
  colnames(genes) <- c("SYMBOL")
  library(clusterProfiler)
  library(org.Hs.eg.db)
  df <- bitr(genes$SYMBOL,
             fromType = "SYMBOL",
             toType = c("ENTREZID"),
             OrgDb = org.Hs.eg.db)
  return(as.character(df$ENTREZID))
}

gene_up <- S2E(all_up_genes)
gene_down <- S2E(all_down_genes)
gene_diff=unique(c(gene_up,gene_down)))

{
  
  g_list=list(gene_up=gene_up,
              gene_down=gene_down,
              gene_diff=gene_diff
  
  if(F){
    go_enrich_results <- lapply( g_list , function(gene) {
      lapply( c('BP','MF','CC') , function(ont) {
        print(paste('Now process ',ont))
        ego <- enrichGO(gene          = gene,
#                        universe      = gene_all,
                        OrgDb         = org.Hs.eg.db,
                        ont           = ont ,
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.99,
                        qvalueCutoff  = 0.99,
                        readable      = TRUE)
        
#        print( head(ego) )
        return(ego)
      })
    })
    save(go_enrich_results,file = 'go_enrich_results.Rdata')
    
  }
  
  
  load(file = 'go_enrich_results.Rdata')
  
  n1= c('gene_up','gene_down','gene_diff')
  n2= c('BP','MF','CC') 
  for (i in 1:3){
    for (j in 1:3){
      fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
      cat(paste0(fn,'\n'))
      png(fn,res=150,width = 1080)
      print( dotplot(go_enrich_results[[i]][[j]] ))
      dev.off()
    }
  }

dotplot_gene_down_BP.png

dotplot_gene_up_CC.png

dotplot_gene_up_MF.png

dotplot_gene_up_BP.png

dotplot_gene_diff_CC.png

dotplot_gene_diff_MF.png

dotplot_gene_diff_BP.png

dotplot_gene_down_CC.png

dotplot_gene_down_MF.png

【生信技能树】2020-01-19作业：GSE103115 - task1