2021-07-09:细数我在GO注释时遇到的那些报错

上回说到,我废了九牛二虎之力,终于拿到了百十来个GB的GO EVIDENCE 数据,你猜怎么着,接着往下进行的时候发现,R语言根本读不了。。。。心疼自己一秒钟

来吧,今天接着搞起:

library(GOstats)

ago <- read.delim('/vol3/agis/zhoushaoqun_group/wangyantao/GO/swiss_goev_sme',row.names = NULL)

#报错

> colnames(ago) <- c('gene_id','go_id','evi')

Error in names(x) <- value :

  'names' attribute [3] must be the same length as the vector [1]


之前我用write.csv生成的EVIDENCE注释文件,在读取时,列与列之间多了',',因此在读入时加了sep=',',而这次列与列之间是空格,同理,要加入sep=' '。

ago <- read.delim('/vol3/agis/zhoushaoqun_group/wangyantao/GO/swiss_goev_sme',row.names = NULL,sep=' ')

成功读入,进入下一步

goframeData <- na.omit(data.frame(ago$go_id,ago$evi,ago$gene_id))

goFrame <- GOFrame(goframeData)

goAllFrame <- GOAllFrame(goFrame)

library('GSEABase')

gsc <- GeneSetCollection(goAllFrame,setType = GOCollection())

universe <- as.vector(ago$gene_id)

DEGfile <- read.csv('xxxxxx.csv',header = T,row.names = NULL)

#读入差异表达基因#

genes = as.vector(DEGfile$gene_id)

params <- GSEAGOHyperGParams(name = 'custom',

                            geneSetCollection = gsc,

                            geneIds = genes,

                            universeGeneIds = universe,

                            ontology = 'CC',

                            pvalueCutoff = 0.05,

                            conditional = FALSE,

                            testDirection = 'over')

Over <- hyperGTest(params)

write.csv(summary(Over),xxxxxx_GO_CC_hyper.csv')

问题很多:

1、EVIDENCE注释文件的gene_id与DESeq2差异表达的文件中的gene_id要保持一致

sed "s/.1 / /g" swiss_goev_sme > swiss_goev_sme.1

终于成功生成GO注释文件,以下为全部脚本

> library(GOstats)

> ago <- read.delim('/vol3/agis/zhoushaoqun_group/wangyantao/GO/swiss_goev_sly.1',row.names = NULL,sep=' ')

> colnames(ago) <- c('gene_id','go_id','evi')

> goframeData <- na.omit(data.frame(ago$go_id,ago$evi,ago$gene_id))

> goFrame <- GOFrame(goframeData)

> goAllFrame <- GOAllFrame(goFrame)

There were 21 warnings (use warnings() to see them)

> library('GSEABase')

Loading required package: annotate

Loading required package: XML

Attaching package: ‘XML’

The following object is masked from ‘package:graph’:

    addNode

Warning messages:

1: package ‘GSEABase’ was built under R version 4.0.3

2: package ‘annotate’ was built under R version 4.0.3

> gsc <- GeneSetCollection(goAllFrame,setType = GOCollection())

> universe <- as.vector(ago$gene_id)

> DEGfile <- read.csv('/vol3/agis/zhoushaoqun_group/wangyantao/GO/diffgene_Sly48.48.0_deseq2.csv',header = T,row.names = NULL)

> genes = as.vector(DEGfile[,1])

> params <- GSEAGOHyperGParams(name = 'custom',

+                              geneSetCollection = gsc,

+                              geneIds = genes,

+                              universeGeneIds = universe,

+                              ontology = 'CC',

+                              pvalueCutoff = 0.05,

+                              conditional = FALSE,

+                              testDirection = 'over')

                              geneSetCollection = gsc,

                              geneIds = genes,

                              universeGeneIds = universe,

                              ontology = 'MF',

                              pvalueCutoff = 0.05,

                              conditional = FALSE,

                              testDirection = 'over')

Warning messages:

1: In makeValidParams(.Object) : removing duplicate IDs in universeGeneIds

2: In makeValidParams(.Object) : removing geneIds not in universeGeneIds

> params1 <- GSEAGOHyperGParams(name = 'custom',

+                              geneSetCollection = gsc,

+                              geneIds = genes,

+                              universeGeneIds = universe,

+                              ontology = 'BP',

+                              pvalueCutoff = 0.05,

+                              conditional = FALSE,

+                              testDirection = 'over')

Warning messages:

1: In makeValidParams(.Object) : removing duplicate IDs in universeGeneIds

2: In makeValidParams(.Object) : removing geneIds not in universeGeneIds

> params2 <- GSEAGOHyperGParams(name = 'custom',

+                              geneSetCollection = gsc,

+                              geneIds = genes,

+                              universeGeneIds = universe,

+                              ontology = 'MF',

+                              pvalueCutoff = 0.05,

+                              conditional = FALSE,

+                              testDirection = 'over')

Warning messages:

1: In makeValidParams(.Object) : removing duplicate IDs in universeGeneIds

2: In makeValidParams(.Object) : removing geneIds not in universeGeneIds

>

> Over <- hyperGTest(params)

> Over1 <- hyperGTest(params1)

> Over2 <- hyperGTest(params2)

> write.csv(summary(Over),'Sly48_GO_CC_hyper.csv')

> write.csv(summary(Over1),'Sly48_GO_BP_hyper.csv')

> write.csv(summary(Over2),'Sly48_GO_MF_hyper.csv')

> library(ggplot2)

> library(RColorBrewer)

Warning message:

package ‘RColorBrewer’ was built under R version 4.0.3

> display.brewer.all()

> col3 <- brewer.pal(4,'Set3')[c(1,3,4)]

> cc = read.csv('Sly48_GO_CC_hyper.csv',header = T,row.names = NULL)

> bp = read.csv('Sly48_GO_BP_hyper.csv',header = T,row.names = NULL)

> mf = read.csv('Sly48_GO_MF_hyper.csv',header = T,row.names = NULL)

> cc2 <- cc[cc$Pvalue<1e-5,]

> bp2 <- bp[bp$Pvalue<1e-5,]

> mf2 <- mf[mf$Pvalue<1e-5,]

> colnames(cc2)[2] <- c('GOID')

> colnames(bp2)[2] <- c('GOID')

> colnames(mf2)[2] <- c('GOID')

> data <- rbind(mf2,cc2)

> data <- rbind(data,bp2)

> data2 <- data.frame(data,type = c(rep('MF',dim(mf2)[1]),rep('CC',dim(cc2)[1]),rep('BP',dim(bp2)[1])))

> p <- ggplot(data2,aes(x = Term,y = log2(Count),fill = type,order = type))+

+  geom_bar(stat = 'identity')+coord_flip()+

+  scale_fill_manual(values = col3,limits = c('BP','CC','MF'),labels = c('BP','CC','MF'))+

+  theme_bw()+theme(legend.title=element_blank(),

+                    axis.text = element_text(size = 10),

+                    axis.title.x = element_text(size = rel(1.5)),

+                    axis.title.y = element_text(size = rel(1.5),angle = 90),

+                    legend.text = element_text(size = rel(1.5)))+

+  ylab('Number of Genes (log2)') + xlab('Terms')

> ggsave('Sly48.go_hyper.tiff',p,device = 'tiff',scale = 1,width = 20,height = 30,dpi = 300)

>

> display.brewer.all()

> col3 <- brewer.pal(4,'Set3')[c(1,3,4)]

> cc = read.csv('Sly48_GO_CC_hyper.csv',header = T,row.names = NULL)

> bp = read.csv('Sly48_GO_BP_hyper.csv',header = T,row.names = NULL)

> mf = read.csv('Sly48_GO_MF_hyper.csv',header = T,row.names = NULL)

> cc2 <- cc[cc$Pvalue<1e-2,]

> bp2 <- bp[bp$Pvalue<1e-2,]

> mf2 <- mf[mf$Pvalue<1e-2,]

> colnames(cc2)[2] <- c('GOID')

> colnames(bp2)[2] <- c('GOID')

> colnames(mf2)[2] <- c('GOID')

> data <- rbind(mf2,cc2)

> data <- rbind(data,bp2)

> data2 <- data.frame(data,type = c(rep('MF',dim(mf2)[1]),rep('CC',dim(cc2)[1]),rep('BP',dim(bp2)[1])))

> p <- ggplot(data2,aes(x = Term,y = log2(Count),fill = type,order = type))+

+  geom_bar(stat = 'identity')+coord_flip()+

+  scale_fill_manual(values = col3,limits = c('BP','CC','MF'),labels = c('BP','CC','MF'))+

+  theme_bw()+theme(legend.title=element_blank(),

+                    axis.text = element_text(size = 10),

+                    axis.title.x = element_text(size = rel(1.5)),

+                    axis.title.y = element_text(size = rel(1.5),angle = 90),

+                    legend.text = element_text(size = rel(1.5)))+

+  ylab('Number of Genes (log2)') + xlab('Terms')

> ggsave('Sly48.go_hyper.tiff',p,device = 'tiff',scale = 1,width = 20,height = 30,dpi = 300)

一开始的时候,pvalue设置为10的-5次方,结果并没有生产图片,后面我把pvalue改成10的-2次方,好歹有个结果。

经过测试,以上脚本可用,开始批量处理相关物种差异表达的GO注释

你可能感兴趣的:(2021-07-09:细数我在GO注释时遇到的那些报错)