GEO芯片数据探针id转化

# 以数据集GSE89657为例,芯片平台是GPL6244。

1.下载表达谱数据

# GEO网站手动下载表达谱数据,解压,去注释

# Decompress the series matrix file downloaded from GEO.
gunzip GSE89657_series_matrix.txt.gz

# Remove the annotation lines, i.e. the lines that begin with "!".
# The pattern is anchored to the start of the line so that a data row which
# merely contains "!" somewhere is not discarded as well.
grep -v '^!' GSE89657_series_matrix.txt > GSE89657_series_matrix_no_anno.txt
# Load the annotation-free series matrix into a data frame.
# The separator is given explicitly: with read.table()'s default whitespace
# splitting, an empty cell (a missing value) between two tabs is collapsed
# and the remaining values of that row shift into the wrong columns.
expr_df <- read.table("GSE89657_series_matrix_no_anno.txt",
                      header = TRUE,
                      sep = "\t")
class(expr_df) # data.frame
dim(expr_df)
expr_df[1:3, ]

# Every column except the first is treated as a sample; the probe IDs are
# taken from the ID_REF column.
sample_names <- colnames(expr_df)[-1]
probe_ids <- expr_df$ID_REF

2.得到probe_id和基因symbol的对应数据框

2.1 通过GPL文件

#gpl <- getGEO("GPL6244", destdir=".") 
#确定数据下载成功!文件太大,R下载失败,可以手动下载探针注释txt文件。
#colnames(Table(gpl))
#head(Table(gpl)[,c(1,10)]) ## you need to check this , which column do you need
#ID2symbol=Table(gpl)[,c(1,10)]

去除注释信息,方便读入R

# Shell step: remove the comment lines of the platform annotation file.
# The pattern is anchored to the start of the line; an unanchored match would
# also throw away every data row that contains "#" inside one of its fields.
grep -v '^#' GPL6244-17930.txt > GPL6244-17930_no_anno.txt

# R step: read the tab-delimited annotation table.
# Quote and comment handling are switched off because read.table() would
# otherwise interpret an apostrophe or a "#" inside a free-text field as a
# quoting / comment character and mis-parse the row.
id_table <- read.table("GPL6244-17930_no_anno.txt",
                       header = TRUE,
                       sep = "\t",
                       quote = "",
                       comment.char = "",
                       stringsAsFactors = FALSE)
id_table[1:11, 1:2]
colnames(id_table)

# install.packages("data.table")
# library() stops immediately when the package is not installed, whereas
# require() only returns FALSE and lets the script fail later at the first
# data.table call.
library(data.table)

# Build the probe-to-symbol lookup table from the platform annotation.
# Each gene_assignment entry is split on "//" and the second field is used as
# the gene symbol. A probe may be assigned to several genes; only the symbol
# from the first assignment is taken here.
symbol <- trimws(
  tstrsplit(id_table$gene_assignment, "//", fixed = TRUE)[[2]],
  which = "both",
  whitespace = "[ \t\r\n]"
)

# Keep just the probe ID and the extracted symbol (gene_assignment itself is
# not carried over).
probe2symbol <- data.frame(ID = id_table$ID,
                           symbol = symbol,
                           stringsAsFactors = FALSE)

head(probe2symbol)

# Rename the ID column to probe_id.
names(probe2symbol) <- c("probe_id", "symbol")

2.2 通过bioconductor注释包


# The annotation package matching GPL6244 is hugene10sttranscriptcluster.db.
# Reference: https://blog.csdn.net/weixin_40739969/article/details/103186027

if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Install the annotation package only when it is missing, so that re-running
# the script does not go through the installation step every time.
if (!requireNamespace("hugene10sttranscriptcluster.db", quietly = TRUE)) {
  BiocManager::install("hugene10sttranscriptcluster.db")
}

library(hugene10sttranscriptcluster.db)

## Bimap interface: the SYMBOL map object provided by the annotation package.
x <- hugene10sttranscriptclusterSYMBOL

# Restrict the map to the probe identifiers that are mapped to a gene symbol
# and convert that subset to a data frame.
mapped_probes <- mappedkeys(x)
probe2symbol2 <- as.data.frame(x[mapped_probes])

# subset(probe2symbol2, probe_id == "7898916")

# Checks: size of the package-derived table, and the first 30 rows of the
# table derived from the GPL file.
dim(probe2symbol2)
probe2symbol[seq_len(30), ]

# Look up one probe in both lookup tables and in the raw annotation so the
# two sources can be compared.
subset(probe2symbol2, probe_id == "7896859")
subset(probe2symbol, probe_id == "7896859")
id_table$gene_assignment[id_table$ID %in% "7896859"]

3.合并数据框,得到基因表达矩阵。

# Attach the gene symbols to the expression table. all.x = TRUE keeps every
# row of expr_df; probes with no match in probe2symbol get NA as symbol.
merged_expr_df <- merge(x = expr_df, y = probe2symbol, by.x = "ID_REF",
                        by.y = "probe_id", all.x = TRUE)

# Variant 1: drop every row that has an NA in any column.
filt_expr_df <- merged_expr_df[complete.cases(merged_expr_df), ]
# Variant 2 (this assignment is the one that remains in effect): filter on a
# single column and drop only the rows whose gene symbol is missing. The
# column contributed by probe2symbol is named "symbol".
filt_expr_df <- merged_expr_df[!is.na(merged_expr_df$symbol), ]

# check
dim(merged_expr_df)
dim(filt_expr_df)

# Symbols are repeated because several probes can map to the same gene, so
# they cannot be used as row names yet.
table(filt_expr_df$symbol)

filt_expr_df[1:3, ]

# Drop the ID_REF column (probe id) so that only the symbol and the
# expression columns remain.
filt_expr_df <- subset(filt_expr_df, select = -ID_REF)

# Collapse all probes of a gene into one row. The aggregation is run once,
# with the per-gene maximum; pass FUN = mean instead to use the per-gene
# average as the expression value.
m_df <- aggregate(. ~ symbol, data = filt_expr_df, FUN = max)
dim(m_df)

# Inspect one gene before and after collapsing.
filt_expr_df[which(filt_expr_df$symbol == "ARHGDIA"), ]
m_df[which(m_df$symbol == "ARHGDIA"), ]

# Symbols are unique now: move them into the row names, drop the symbol
# column and convert the remaining columns to a matrix.
rownames(m_df) <- m_df$symbol
m_df <- subset(m_df, select = -symbol)
exprSet <- as.matrix(m_df)
head(exprSet)

你可能感兴趣的:(r语言,生物信息学)