尝试过两个方法下载探针矩阵数据:1.GEO网站直接下载txt。
# Load the series-matrix text file downloaded from the GEO web page.
# Lines starting with "!" are skipped via comment.char; fill = TRUE pads
# short rows instead of failing on them.
probeMatrix <- read.table(
  file = "GSE16011_series_matrix.txt",
  header = TRUE,
  sep = "\t",
  comment.char = "!",
  fill = TRUE
)
class(probeMatrix)
# Promote the first column to row names, then drop it so that only the
# per-sample value columns remain.
row.names(probeMatrix) <- probeMatrix[[1]]
probeMatrix <- probeMatrix[, -1]
2.GEOquery
library(GEOquery)
# Download the series; with GSEMatrix = TRUE the result is a list with one
# ExpressionSet per series-matrix file.
gset <- getGEO("GSE16011", GSEMatrix = TRUE, AnnotGPL = TRUE)
# Index by position rather than by the hard-coded file name, and use the
# accessor functions instead of reaching into the @featureData slot
# (the original `featureData[[]]` call has an empty subscript and errors).
eset <- gset[[1]]
# Print the platform (GPL) this series is actually annotated with, so the
# matching annotation table can be chosen.
annotation(eset)
# Preview the feature (probe) annotation that came with the download.
head(fData(eset))
# Probe-level expression matrix (rows = probes, columns = samples).
expr <- exprs(eset)
两个方法下载平台注释文件:1.网页下载
# Read the GPL570 annotation table downloaded from the GEO web page.
# quote = "" disables quote parsing: with the default quoting, a stray ' or "
# inside an annotation field swallows the following lines and triggers the
# "EOF within quoted string" warning, silently truncating the table.
ids1 <- read.table(
  "GPL570-55999.txt",
  sep = "\t",
  quote = "",
  fill = TRUE,
  comment.char = "#",
  header = TRUE
)
# Keep columns 1 and 11 and rename them.
# NOTE(review): presumably probe ID and gene symbol -- confirm against the
# header of the downloaded file.
ids1 <- ids1[c(1, 11)]
names(ids1) <- c("probe_id", "symbol")
# Number of distinct gene symbols (an empty symbol counts as one value).
length(unique(ids1$symbol))
# Symbols covered by the largest number of probes.
tail(sort(table(ids1$symbol)))
# Distribution: how many symbols are covered by 1, 2, 3, ... probes.
table(table(ids1$symbol))
2.使用R包
# One-time installation, if the annotation package is missing:
# BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
# Probe-to-symbol mapping from the annotation package as a data frame.
ids <- toTable(hgu133plus2SYMBOL)
# Alternative kept from an earlier attempt:
# ids=data.frame(ann[,c(1,11)])
probes_per_symbol <- table(ids$symbol)
# Number of distinct gene symbols.
length(unique(ids$symbol))
# Symbols covered by the largest number of probes.
tail(sort(probes_per_symbol))
# Distribution: how many symbols are covered by 1, 2, 3, ... probes.
table(sort(probes_per_symbol))
目前各种组合的尝试结果如下(“AxB”表示:探针矩阵用方法A获取,平台注释用方法B获取)。
1x1:从GEO网站下载探针矩阵和GPL570
> probeMatrix<-read.table("GSE16011_series_matrix.txt",sep="\t",
+ fill = T,comment.char = "!",header = T)
> class(probeMatrix)
[1] "data.frame"
> row.names(probeMatrix)=probeMatrix[,1]
> probeMatrix=probeMatrix[,-1]
> ids1<-read.table("GPL570-55999.txt",sep="\t",
+ fill = T,comment.char = "#",header = T)
Warning message:
In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
> ids1=ids1[c(01,11)]
> names(ids1)=c("probe_id","symbol")
> length(unique(ids1$symbol))#看有多少个基因
[1] 15256
> tail(sort(table(ids1$symbol)))#看有的基因涉及到几个探针
ARHGEF12 CALD1
9 9
OFCC1 LOC100506403 /// LOC101928269 /// RUNX1
10 11
HFE
13 4522
> table(sort(table(ids1$symbol)))#看涉及到多少探针的基因有多少
1 2 3 4 5 6 7 8 9 10 11 13 4522
9938 3379 1267 436 162 47 16 5 2 1 1 1 1
> exprSet=probeMatrix
> table(rownames(exprSet)%in%ids1$probe_id)
FALSE TRUE
17416 111
> table(ids1$probe_id%in%rownames(exprSet))
FALSE TRUE
27948 111
只有111个探针名字对应上
1x2:从网页下载探针矩阵文件,用R包下载注释文件
> probeMatrix<-read.table("GSE16011_series_matrix.txt",sep="\t",
+ fill = T,comment.char = "!",header = T)
> class(probeMatrix)
[1] "data.frame"
> row.names(probeMatrix)=probeMatrix[,1]
> probeMatrix=probeMatrix[,-1]
> library(hgu133plus2.db)
> ids=toTable(hgu133plus2SYMBOL)
> #ids=data.frame(ann[,c(1,11)])
> length(unique(ids$symbol))#看有多少个基因
[1] 20174
> exprSet=probeMatrix
> table(rownames(exprSet)%in%ids$probe_id)
FALSE TRUE
17368 159
> table(ids$probe_id%in%rownames(exprSet))
FALSE TRUE
41763 159
只有159个探针名字对应上
第三种方法:GEOquery获取探针信息,网页下载平台信息
> library(GEOquery)
> gset <- getGEO("GSE16011", GSEMatrix =TRUE, AnnotGPL=FALSE)
Found 1 file(s)
GSE16011_series_matrix.txt.gz
Using locally cached version: C:\Users\123\AppData\Local\Temp\RtmpuE41Xm/GSE16011_series_matrix.txt.gz
-- Column specification ---------------------------------------------------------------------------
cols(
.default = col_double(),
ID_REF = col_character()
)
i Use `spec()` for the full column specifications.
Using locally cached version of GPL8542 found here:
C:\Users\123\AppData\Local\Temp\RtmpuE41Xm/GPL8542.soft
Warning: 1 parsing failure.
row col expected actual file
10614 -- 8 columns 7 columns literal data
>
> #探针矩阵
> expr<-exprs(gset[[1]])
> dim(expr)
[1] 17527 284
> ids1<-read.table("GPL570-55999.txt",sep="\t",
+ fill = T,comment.char = "#",header = T)
Warning message:
In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
> ids1=ids1[c(01,11)]
> names(ids1)=c("probe_id","symbol")
> length(unique(ids1$symbol))#看有多少个基因
[1] 15256
> exprSet=expr
> table(rownames(exprSet)%in%ids1$probe_id)
FALSE TRUE
17429 98
> table(ids1$probe_id%in%rownames(exprSet))
FALSE TRUE
27961 98
对应上98个
第四种方法,GEOquery获取探针文件(expr),R包下载注释文件
> ids=toTable(hgu133plus2SYMBOL)
> #ids=data.frame(ann[,c(1,11)])
> length(unique(ids$symbol))#看有多少个基因
[1] 20174
> exprSet=expr
> table(rownames(exprSet)%in%ids$probe_id)
FALSE TRUE
17386 141
> table(ids$probe_id%in%rownames(exprSet))
FALSE TRUE
41781 141
用excel把探针矩阵的探针id和GPL570的探针ID放在一个文本文件下。还是只有100多个对应。
> GPL570=read.table("GPL570.txt",header = T,sep = "\t")
> View(GPL570)
> table(GPL570$ID1%in%GPL570$ID)
FALSE TRUE
54503 172
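用GSE1643数据集发现五万多个几乎完全吻合,说明读取和匹配的代码本身没有问题,问题出在GSE16011这个数据集上。注意上文getGEO的输出显示,GSE16011使用的平台是GPL8542而不是GPL570(表达矩阵只有17527行,而GPL570有54675个探针),所以它的探针ID和GPL570/hgu133plus2.db对不上;应改用GPL8542对应的注释(例如fData(gset[[1]])或GEO上的GPL8542注释表)来做ID转换。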