# UCSC Xena数据下载后的后续分析代码——转载自麦麦大人

# Starting point: the UCSC Xena files are already downloaded into the working
# directory. Each table is read once and cached as an .Rdata file so the later
# sections can start from the cache.
rm(list = ls())
options(stringsAsFactors = FALSE)

## mRNA-seq expression table. Per the original note, the downloaded file can
## be read directly without decompressing it first.
mRNA_HiSeqV2 <- read.table("HiSeqV2", header = TRUE, sep = "\t")
dim(mRNA_HiSeqV2)
mRNA_HiSeqV2[1:4, 1:4]

# Inspect what remains once rows containing NA are dropped.
na.omit(mRNA_HiSeqV2)[1:4, 1:4]
dim(na.omit(mRNA_HiSeqV2))
save(mRNA_HiSeqV2, file = "mRNA_HiSeqV2.Rdata")

## Clinical information. quote = "" disables quote handling so quote
## characters inside fields are read literally.
mRNA_clinical <- read.table("CESC_clinicalMatrix", header = TRUE,
                            sep = "\t", quote = "")
dim(mRNA_clinical)
mRNA_clinical[1:4, 1:4]
save(mRNA_clinical, file = "mRNA_clinical.Rdata")

## Survival information.
mRNA_survival <- read.table("CESC_survival.txt.gz", header = TRUE, sep = "\t")
dim(mRNA_survival)
mRNA_survival[1:4, 1:4]
save(mRNA_survival, file = "mRNA_survival.Rdata")

rm(list = ls())
options(stringsAsFactors = FALSE)
load("mRNA_HiSeqV2.Rdata")

###########
# The sample type is decoded from the TCGA sample ID.
# Background: https://mp.weixin.qq.com/s/Ph1O6V5RkxkyrKpVmB5ODA
# Characters 14-15 of the ID hold a numeric code; this script treats codes
# below 10 as tumor and every other code as normal.
expr <- mRNA_HiSeqV2
expr[1:4, 1:4]

# The first column holds the gene identifiers: move it into the row names.
rownames(expr) <- expr[, 1]
expr <- expr[, -1]
expr[1:4, 1:4]

# Classify each sample by its numeric type code. The code vector is computed
# once and no longer stored in a variable called `ls`, which shadowed
# base::ls().
sample_type_code <- substr(colnames(expr), 14, 15)
table(sample_type_code)
group_list <- ifelse(as.numeric(sample_type_code) < 10, "tumor", "normal")
table(group_list)

# Drop genes (rows) with missing values; columns are untouched, so
# group_list still lines up with the columns of exprSet.
exprSet <- na.omit(expr)
dim(exprSet)
exprSet[1:4, 1:4]

# Keep tumor samples only. which() skips samples whose code could not be
# parsed (NA) instead of producing NA columns.
exprSet <- exprSet[, which(group_list == "tumor")]
dim(exprSet)

## Per the original note, the Xena values are log-transformed counts, so the
## transform is undone (2^x - 1) before differential analysis.
exprSet <- 2^exprSet - 1
dim(exprSet)
exprSet[1:4, 1:4]

# The values are also truncated to integers for the count-based analysis.
exprSet <- floor(exprSet)
exprSet[1:4, 1:4]
save(exprSet, file = "exprSet_DEG.Rdata")

# 4. Grouping
# Samples are split by the expression level of a gene of interest; the two
# groups are then compared in the differential analysis.
load("exprSet_DEG.Rdata")

#### High/low split at the median of the gene of interest (the mean would
#### work as well).
# Fail early if the gene is absent: exprSet["MYC", ] would otherwise return
# an all-NA row and every sample would silently end up with an NA group.
stopifnot("MYC" %in% rownames(exprSet))
myc_expr <- as.numeric(exprSet["MYC", ])
group_list <- ifelse(myc_expr > median(myc_expr), "high", "low")
table(group_list)
class(group_list)

# The design formula used later needs a factor.
group_list <- factor(group_list)
class(group_list)

# 5. Differential expression analysis
# DESeq2 is the package used here for differential analysis of the counts.
library(DESeq2)

# Sample annotation: one row per column of exprSet, in the same order.
colData <- data.frame(row.names = colnames(exprSet),
                      group_list = group_list)
colData

dds <- DESeqDataSetFromMatrix(countData = exprSet,
                              colData = colData,
                              design = ~ group_list)
dds <- DESeq(dds)

# Contrast is high vs low for the group_list variable.
res <- results(dds,
               contrast = c("group_list", "high", "low"))
head(res)

## Sort by adjusted p-value.
resOrdered <- res[order(res$padj), ]
head(resOrdered)

# Convert to a plain data frame and drop rows containing NA.
DEG <- as.data.frame(resOrdered)
DESeq2_DEG <- na.omit(DEG)
head(DESeq2_DEG)

# Optional reduction to two columns, kept disabled as in the original:
# nrDEG <- DESeq2_DEG[, c(2, 6)]
# colnames(nrDEG) <- c("log2FoldChange", "pvalue")
nrDEG <- DESeq2_DEG
head(nrDEG)

# 6. Select differentially expressed genes
# The result table is filtered by fold change and p-value; adjust the
# thresholds to your own criteria.

## Filter: pvalue < 0.01 and |log2FoldChange| >= 1.
diff_gene <- subset(nrDEG,
                    pvalue < 0.01 & (log2FoldChange >= 1 | log2FoldChange <= -1))
head(diff_gene)

# Label each gene as up- or down-regulated.
resdata <- diff_gene
resdata$significant <- rep("unchange", nrow(resdata))
resdata$significant[resdata$pvalue <= 0.01 & resdata$log2FoldChange >= 1] <- "up"
resdata$significant[resdata$pvalue <= 0.01 & resdata$log2FoldChange <= -1] <- "down"

# Look at the first rows. head(, 40) replaces resdata[1:40, ], which pads the
# output with all-NA rows when fewer than 40 genes pass the filter.
head(resdata)
head(resdata, 40)
diff_gene <- resdata
table(diff_gene$significant)

# Export as a tab-separated text file.
write.table(x = diff_gene,
            file = "diff_result_CESC_MYC.txt",
            quote = FALSE,
            sep = "\t",
            row.names = TRUE,
            col.names = TRUE)

# Cache the results for the plotting and enrichment sections. The original
# reloaded this file immediately after saving it, which had no effect.
save(nrDEG, DESeq2_DEG, diff_gene, file = "DESeq2_DEG_result.Rdata")

# 7. Heatmaps
rm(list = ls())
options(stringsAsFactors = FALSE)
load("DESeq2_DEG_result.Rdata")
load("exprSet_DEG.Rdata")

#### Rebuild the high/low MYC grouping (median split; the mean would work as
#### well), since the workspace was cleared above.
stopifnot("MYC" %in% rownames(exprSet))
myc_expr <- as.numeric(exprSet["MYC", ])
group_list <- ifelse(myc_expr > median(myc_expr), "high", "low")
table(group_list)
class(group_list)
group_list <- factor(group_list)
class(group_list)

###### Heatmap of the first 50 genes of diff_gene, in the order the table was
###### saved (presumably sorted by adjusted p-value - confirm against the
###### step that created DESeq2_DEG_result.Rdata).
## heatmap
library(pheatmap)
choose_gene <- head(rownames(diff_gene), 50) ## 50 maybe better
choose_matrix <- exprSet[choose_gene, ]
boxplot(choose_matrix)

# Standardise each gene (row) across samples, then clip to [-2, 2] so a few
# extreme values do not dominate the colour scale.
n <- t(scale(t(choose_matrix)))
boxplot(n)
n[n > 2] <- 2
# Keep the space before the minus sign: `n<-2` would parse as an assignment.
n[n < -2] <- -2
n[1:4, 1:4]
boxplot(n)

# Column annotation: group membership per sample, keyed by sample name.
annotation_col <- data.frame(group = group_list)
row.names(annotation_col) <- colnames(exprSet)
annotation_col

# pheatmap(choose_matrix, filename = paste0(n, "_need_DEG_top50_heatmap.png"))
pheatmap(n, annotation_col = annotation_col,
         show_rownames = FALSE, show_colnames = FALSE)

 

###### Heatmap of the 50 genes with the largest absolute log2 fold change.
## heatmap
library(pheatmap)
diff_gene <- diff_gene[order(abs(diff_gene$log2FoldChange), decreasing = TRUE), ]
head(diff_gene)

# Sort the samples by group so each group forms a contiguous run of columns.
# group_list is reordered with the same permutation: the original reordered
# only the columns of exprSet, so the annotation built below attached the
# unsorted group labels to the sorted sample names and mislabelled samples.
sample_order <- order(group_list)
exprSet <- exprSet[, sample_order]
group_list <- group_list[sample_order]
exprSet[1:4, 1:4]

choose_gene <- head(rownames(diff_gene), 50) ## 50 maybe better
choose_matrix <- exprSet[choose_gene, ]
boxplot(choose_matrix)

# Standardise each gene (row) across samples, then clip to [-2, 2].
n <- t(scale(t(choose_matrix)))
boxplot(n)
n[n > 2] <- 2
# Keep the space before the minus sign: `n<-2` would parse as an assignment.
n[n < -2] <- -2
n[1:4, 1:4]
boxplot(n)

# Column annotation: group membership per sample, keyed by sample name.
annotation_col <- data.frame(group = group_list)
row.names(annotation_col) <- colnames(exprSet)
annotation_col

# pheatmap(choose_matrix, filename = paste0(n, "_need_DEG_top50_heatmap.png"))
pheatmap(n, annotation_col = annotation_col,
         show_rownames = FALSE, show_colnames = FALSE)
         
# 8. Volcano plot
# Data preparation for the volcano plot.
rm(list = ls())  ## clears the whole workspace
options(stringsAsFactors = FALSE)

# Load the cached DESeq2 results.
load("DESeq2_DEG_result.Rdata")
head(DESeq2_DEG)
library(ggpubr)

## MYC has an extremely small p-value and is left out to keep the plot
## readable. It is dropped by name; the original dropped row 1 and relied on
## MYC being the first row of the table.
df <- DESeq2_DEG[rownames(DESeq2_DEG) != "MYC", ]

## x axis: log2 fold change; y axis: -log10(p-value).
## Columns are referenced explicitly instead of attach()-ing the data frame,
## so later changes to df cannot diverge from stale attached copies.
plot(df$log2FoldChange, -log10(df$pvalue),
     xlab = "log2FoldChange", ylab = "-log10(pvalue)")
df$v <- -log10(df$pvalue)  # new column "v" = -log10(pvalue)
ggscatter(df, x = "log2FoldChange", y = "v", size = 0.8)

## Add up/down labels:
##   pvalue > 0.01                        -> "stable"
##   otherwise, log2FoldChange >= 1       -> "up"
##   otherwise, log2FoldChange <= -1      -> "down"
##   otherwise                            -> "stable"
df$group <- ifelse(df$pvalue > 0.01, "stable",
                   ifelse(df$log2FoldChange >= 1, "up",
                          ifelse(df$log2FoldChange <= -1, "down", "stable")))
# The original tabulated df$g, which only resolved to this column through
# partial matching of `$`.
table(df$group)
df$name <- rownames(df)
head(df)

## Method 1: volcano plot with ggpubr.
ggscatter(df, x = "log2FoldChange", y = "v", size = 0.8, color = "group")

## Label a few genes of interest.
ggscatter(df, x = "log2FoldChange", y = "v", color = "group", size = 1,
          label = "name", repel = TRUE,
          # label.select = rownames(df)[df$group != "stable"],
          label.select = head(rownames(df)),  # genes to label on the plot
          palette = c("#00AFBB", "#999999", "#FC4E07"))
ggsave("volcano.png")


### Method 2: volcano plot with ggplot2.
### Uses df with the columns log2FoldChange, v, group and name added in the
### previous step.
library(ggplot2)

# Colours are given as a named vector so each group keeps its colour even if
# one of the levels is absent from the data.
p <- ggplot(data = df,
            aes(x = log2FoldChange,
                y = v)) +
  geom_point(alpha = 0.4, size = 3.5,
             aes(color = group)) +
  scale_color_manual(values = c(down = "blue", stable = "grey", up = "red")) +
  # Dashed guide lines at the fold-change and p-value thresholds.
  geom_vline(xintercept = c(-1, 1), lty = 4, col = "black", lwd = 0.8) +
  geom_hline(yintercept = -log10(0.01), lty = 4, col = "black", lwd = 0.8) +
  theme_bw()
p

# To label gene names, build a separate data frame holding only the genes to
# label and add it as an extra layer.
library(dplyr)
head(df)

# Option A: pick genes by threshold; tune the cut-offs to control how many
# are labelled. The original filtered on a column named logFC, which does not
# exist in this table (the column is log2FoldChange).
for_label <- df %>%
  filter(abs(log2FoldChange) > 4 & v > -log10(0.01))

# Option B (the one actually plotted, as in the original): simply label the
# first rows of the table. This overrides option A.
for_label <- data.frame(head(df))
class(for_label)
head(for_label)

### Overlay the label layer on the base plot.
p +
  geom_point(size = 3, shape = 1, data = for_label) +
  ggrepel::geom_label_repel(
    aes(label = name),
    data = for_label,
    color = "black"
  )
ggsave("ggplot2_volcano.png")

# 九、GO_KEGG分析前数据处理
rm(list = ls())  ## clears the whole workspace
options(stringsAsFactors = FALSE)

# Load the cached DESeq2 results.
load("DESeq2_DEG_result.Rdata")
head(DESeq2_DEG)
df <- DESeq2_DEG

## Add up/down labels:
##   pvalue > 0.01                        -> "stable"
##   otherwise, log2FoldChange >= 1       -> "up"
##   otherwise, log2FoldChange <= -1      -> "down"
##   otherwise                            -> "stable"
df$g <- ifelse(df$pvalue > 0.01, "stable",
               ifelse(df$log2FoldChange >= 1, "up",
                      ifelse(df$log2FoldChange <= -1, "down", "stable")))
table(df$g)

# Add the gene symbol (taken from the row names) as a column.
df$symbol <- rownames(df)
head(df)

# Packages needed for the GO, KEGG and GSEA analyses.
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)

# Map gene symbols to ENTREZ IDs, which the enrichment steps below use.
deg <- bitr(unique(df$symbol), fromType = "SYMBOL",
            toType = c("ENTREZID"),
            OrgDb = org.Hs.eg.db)
head(deg)
DEG <- df
head(DEG)

# Attach the ENTREZ IDs by merging on the symbol. merge() keeps only symbols
# present in both tables, so unmapped genes are dropped here.
DEG <- merge(DEG, deg, by.y = "SYMBOL", by.x = "symbol")
head(DEG)

## Split the genes into up/down sets for the enrichment analyses.
gene_up <- DEG[DEG$g == "up", "ENTREZID"]
gene_down <- DEG[DEG$g == "down", "ENTREZID"]
gene_diff <- c(gene_up, gene_down)
gene_all <- as.character(DEG[, "ENTREZID"])

# data(geneList, package = "DOSE")
# head(geneList)
# boxplot(geneList)
boxplot(DEG$log2FoldChange)

# Ranked list (log2 fold change named by ENTREZ ID, sorted decreasingly), for
# inspection only: it is not part of the saved file below.
geneList <- DEG$log2FoldChange
head(geneList)
names(geneList) <- DEG$ENTREZID
geneList <- sort(geneList, decreasing = TRUE)
head(geneList)

save(DEG, gene_up, gene_down, gene_diff, gene_all, file = "anno_DEG_list.Rdata")

# GO分析
# 方法一,得到3种分析'BP'、'MF'、'CC'合并的图

rm(list = ls())  ## clears the whole workspace
options(stringsAsFactors = FALSE)

# Load the packages explicitly so this section also runs in a fresh session
# (rm() above does not attach or detach packages).
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)

### The preprocessing is cached; load it and start the analysis directly.
load("anno_DEG_list.Rdata")

### GO database analysis
### Over-representation test against GO; the focus is on visualising the
### result and interpreting it biologically.
### Method 1: one combined figure for the three ontologies BP, MF and CC.
ego <- enrichGO(gene          = gene_up,  ## swap in gene_down manually as needed
                # universe      = gene_all,
                OrgDb         = org.Hs.eg.db,
                ont           = "ALL",  # BP/CC/MF or "ALL"; with "ALL" add pool = TRUE
                pAdjustMethod = "BH",
                # pvalueCutoff  = 0.1,
                # qvalueCutoff  = 0.1,
                pool          = TRUE,
                readable      = TRUE)
head(ego[, 1:8])
go_data <- data.frame(ego)
head(go_data)

# Export the enrichment table as a tab-separated text file.
write.table(x = go_data,
            file = "go_data_CESC_MYC.txt",
            quote = FALSE,
            sep = "\t",
            row.names = TRUE,
            col.names = TRUE)

# barplot(ego)
# barplot(ego, showCategory = 10)
# dotplot(ego, showCategory = 10)

# One facet per ontology; `scales` is spelled out instead of relying on
# partial argument matching of `scale`.
barplot(ego, split = "ONTOLOGY", showCategory = 5) +
  facet_grid(ONTOLOGY ~ ., scales = "free")
ggsave("go_barplot_5.png")

dotplot(ego, split = "ONTOLOGY", showCategory = 10) +
  facet_grid(ONTOLOGY ~ ., scales = "free")
ggsave("go_dotplot_10.png")

# dotplot(ego, split = "ONTOLOGY") + facet_grid(ONTOLOGY ~ ., scales = "free")
# ggsave("go_dotplot.png")

### Method 2: separate figures.
## Runs the three ontologies (BP, MF, CC) on each of the three gene lists,
## giving nine enrichment results.
g_list <- list(gene_up = gene_up,
               gene_down = gene_down,
               gene_diff = gene_diff)

n1 <- c("gene_up", "gene_down", "gene_diff")
n2 <- c("BP", "MF", "CC")

go_enrich_results <- lapply(g_list, function(gene) {
  per_ontology <- lapply(n2, function(ont) {
    # The trailing newline keeps successive progress messages on separate
    # lines.
    cat(paste("Now process ", ont), "\n")
    ## the key call
    ego <- enrichGO(gene          = gene,
                    universe      = gene_all,
                    OrgDb         = org.Hs.eg.db,
                    ont           = ont,  # BP/CC/MF
                    pAdjustMethod = "BH",
                    pvalueCutoff  = 0.99,
                    qvalueCutoff  = 0.99,
                    readable      = TRUE)
    print(head(ego))
    ego
  })
  # Name the inner results by ontology; positional access ([[1]]..[[3]])
  # keeps working.
  names(per_ontology) <- n2
  per_ontology
})
save(go_enrich_results, file = "go_enrich_results.Rdata")

load(file = "go_enrich_results.Rdata")

## Draw and save one dot plot per (gene list, ontology) pair.
for (i in seq_along(n1)) {
  for (j in seq_along(n2)) {
    fn <- paste0("dotplot_", n1[i], "_", n2[j], ".png")
    cat(paste0(fn, "\n"))
    png(fn, res = 150, width = 1080)
    # Close the device even if plotting fails, so no half-written device
    # stays open; the error itself is still raised.
    tryCatch(
      print(dotplot(go_enrich_results[[i]][[j]])),
      finally = dev.off()
    )
  }
}

### Method 3: one figure per ontology (BP, CC, MF), run one at a time.

# Runs enrichGO on the up-regulated genes for a single ontology. Replace
# gene_up with gene_down manually to analyse the down-regulated genes.
#   ont: "BP", "CC" or "MF" ("ALL" is also accepted; pool = TRUE is passed
#        as the original note requires for "ALL").
# Returns the enrichGO result.
enrich_go_up <- function(ont) {
  enrichGO(gene          = gene_up,
           # universe      = gene_all,
           OrgDb         = org.Hs.eg.db,
           ont           = ont,
           pAdjustMethod = "BH",
           # pvalueCutoff  = 0.1,
           # qvalueCutoff  = 0.1,
           pool          = TRUE,
           readable      = TRUE)
}

## BP (default number of categories)
ego <- enrich_go_up("BP")
head(ego[, 1:8])
p1_BP <- dotplot(ego)
p1_BP

## CC (5 categories)
ego <- enrich_go_up("CC")
head(ego[, 1:8])
p1_CC <- dotplot(ego, showCategory = 5)
p1_CC

## MF (5 categories)
ego <- enrich_go_up("MF")
head(ego[, 1:8])
p1_MF <- dotplot(ego, showCategory = 5)
p1_MF


# KEGG分析
rm(list = ls())  ## clears the whole workspace
options(stringsAsFactors = FALSE)

# Packages needed for the GO, KEGG and GSEA analyses.
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)

### The preprocessing is cached; load it and start the analysis directly.
load("anno_DEG_list.Rdata")

## KEGG pathway analysis
### Over-representation test against KEGG; the focus is on visualising the
### result and interpreting it biologically.

### Up-regulated genes, tested against all annotated genes as background.
kk.up <- enrichKEGG(gene         = gene_up,
                    organism     = "hsa",
                    universe     = gene_all,
                    pvalueCutoff = 0.9,
                    qvalueCutoff = 0.9)
head(kk.up)[, 1:6]
class(kk.up)
dotplot(kk.up)
ggsave("kk.up.dotplot_1.png")
# dev.off()

### Down-regulated genes, same background.
kk.down <- enrichKEGG(gene         = gene_down,
                      organism     = "hsa",
                      universe     = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff = 0.9)
head(kk.down)[, 1:6]
dotplot(kk.down)
ggsave("kk.down.dotplot.png")

### Up- and down-regulated genes together (no explicit universe here).
kk.diff <- enrichKEGG(gene         = gene_diff,
                      organism     = "hsa",
                      pvalueCutoff = 0.05)
head(kk.diff)[, 1:6]
dotplot(kk.diff)
ggsave("kk.diff.dotplot.png")

kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)

# Keep pathways with raw p < 0.05 and tag the direction (-1 down, +1 up).
# rep(..., nrow()) keeps the assignment valid when no pathway passes the
# cut-off; assigning a bare scalar to a zero-row data frame is an error.
down_kegg <- kegg_down_dt[kegg_down_dt$pvalue < 0.05, ]
down_kegg$group <- rep(-1, nrow(down_kegg))
up_kegg <- kegg_up_dt[kegg_up_dt$pvalue < 0.05, ]
up_kegg$group <- rep(1, nrow(up_kegg))

  

  ##自定义了一个函数source(‘functions.R’)

  # Draws a diverging horizontal bar chart of enriched KEGG pathways.
  #
  # up_kegg, down_kegg: enrichment tables that both contain the columns
  #   Description, pvalue and group. The script sets group to 1 for the
  #   up-regulated table and -1 for the down-regulated one.
  # Returns the ggplot object. Each bar has length -log10(pvalue) * group, so
  # pathways from the two tables extend in opposite directions.
  kegg_plot <- function(up_kegg, down_kegg) {
    dat <- rbind(up_kegg, down_kegg)

    # Signed significance: -log10(p) multiplied by the direction tag.
    dat$pvalue <- -log10(dat$pvalue)
    dat$pvalue <- dat$pvalue * dat$group

    # Sort rows by the signed value; the x ordering below relies on this.
    dat <- dat[order(dat$pvalue, decreasing = FALSE), ]

    ggplot(dat, aes(x = reorder(Description, order(pvalue, decreasing = FALSE)),
                    y = pvalue, fill = group)) +
      geom_bar(stat = "identity") +
      # guide = "none" replaces the deprecated guide = FALSE.
      scale_fill_gradient(low = "blue", high = "red", guide = "none") +
      scale_x_discrete(name = "Pathway names") +
      scale_y_continuous(name = "log10P-value") +
      coord_flip() + theme_bw() + theme(plot.title = element_text(hjust = 0.5)) +
      ggtitle("Pathway Enrichment")
  }

  

  # Build the combined up/down pathway chart, show it and save it to disk.
  g_kegg <- kegg_plot(up_kegg, down_kegg)
  print(g_kegg)

  ggsave(g_kegg, filename = "kegg_up_down.png")

# 十、GSEA分析
rm(list = ls())  ## clears the whole workspace
options(stringsAsFactors = FALSE)

# Packages needed for the GO, KEGG and GSEA analyses.
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
# BiocManager::install("ReactomePA")
library(ReactomePA)
library(data.table)

### The preprocessing is cached; load it and start the analysis directly.
load("anno_DEG_list.Rdata")

# Ranked gene list: log2 fold change named by ENTREZ ID, sorted decreasingly.
geneList <- DEG$log2FoldChange
names(geneList) <- DEG$ENTREZID
geneList <- sort(geneList, decreasing = TRUE)
head(geneList)

### GSEA against KEGG
kk_gse <- gseKEGG(geneList     = geneList,
                  organism     = "hsa",
                  nPerm        = 1000,
                  minGSSize    = 120,
                  pvalueCutoff = 0.9,
                  verbose      = FALSE)
dim(kk_gse)
GSEA_data <- data.frame(kk_gse)
head(GSEA_data)

# Export the GSEA table as a tab-separated text file.
write.table(x = GSEA_data,
            file = "GSEA_data_CESC_MYC.txt",
            quote = FALSE,
            sep = "\t",
            row.names = TRUE,
            col.names = TRUE)
head(kk_gse)[, 1:6]

### Running-score plot for the first gene set in the result.
# Stop with a clear message when nothing was enriched instead of failing
# inside gseaplot() with an empty gene-set ID.
if (nrow(GSEA_data) == 0) {
  stop("gseKEGG returned no gene sets; relax minGSSize or pvalueCutoff",
       call. = FALSE)
}
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1, ]))
ggsave("gseaplot.png")

library(ReactomePA)

# Author's note (translated): the results of all three analyses can be
# written out as text. Sometimes nothing is enriched at all. According to the
# author, pvalueCutoff is the culprit: although it is named as a p-value, the
# package applies it to the FDR-adjusted value. Raw p-values may be clearly
# significant while the adjusted values all collapse to one identical number,
# sometimes above 0.05. The workaround is to set pvalueCutoff = 1 explicitly
# (leaving it unset does not help), which returns every result.

### GSEA - GO (biological process)
# OrgDb is passed by name as the annotation object; the original passed the
# string 'org.Hs.eg.db' positionally.
Go_gseresult <- gseGO(geneList, OrgDb = org.Hs.eg.db, keyType = "ENTREZID",
                      ont = "BP", nPerm = 1000, minGSSize = 10,
                      maxGSSize = 1000, pvalueCutoff = 0.05)
save(Go_gseresult, file = "Go_gseresult.Rdata")
load("Go_gseresult.Rdata")

### GSEA - KEGG
KEGG_gseresult <- gseKEGG(geneList, nPerm = 1000, minGSSize = 10,
                          maxGSSize = 1000, pvalueCutoff = 1)
save(KEGG_gseresult, file = "KEGG_gseresult.Rdata")
load("KEGG_gseresult.Rdata")

## GSEA - Reactome
Go_Reactomeresult <- gsePathway(geneList, nPerm = 1000, minGSSize = 10,
                                maxGSSize = 1000, pvalueCutoff = 1)
save(Go_Reactomeresult, file = "Go_Reactomeresult.Rdata")
load("Go_Reactomeresult.Rdata")

## Ridge plot
# The second argument is 5, so five gene sets are requested (the original
# comment said "top ten").
ridgeplot(KEGG_gseresult, 5)
ggsave("ridgeplot_1.png")

### gseaplot2 gives a more readable figure; here for the first gene set.
gseaplot2(KEGG_gseresult, 1)
ggsave("gseaplot2.png")

### gseaplot2 can also draw several gene sets at once (here sets 1-4) and
### add a table of p-values.
gseaplot2(KEGG_gseresult, 1:4, pvalue_table = TRUE)
ggsave("KEGG_gseresult_1.png")

# 你可能感兴趣的:(UCSC Xena数据下载后的后续分析代码——转载自麦麦大人)