HPA(Human Protein Atlas)数据库提供了丰富的组织切片和免疫组化(IHC)图片。如果手动操作,需要在网站上逐张下载保存,记录每张图片对应的信息,再重命名文件,最后逐一核对信息。
但是,这样的操作既耗时又费力,核对信息时还很容易出错。
这时用 R 批量抓取就会方便很多,下面是整个流程的代码。
下载并加载所需要的包
加载所需要的包;如果缺少其中某个包,先安装相应的包即可(BiocStyle 和 HPAanalyze 属于 Bioconductor,可用 BiocManager::install() 安装)。
library(BiocStyle)
library(HPAanalyze)
library(dplyr)
library(tibble)
library(readr)
library(tidyr)
创建存储位置并确定需要下载的基因和组织
# Create the output root for downloaded images.
# recursive = TRUE also creates ./output_data when it does not exist yet;
# showWarnings = FALSE keeps re-runs quiet when the folder is already there.
dir.create(
  "./output_data/step11_img/",
  recursive = TRUE,
  showWarnings = FALSE
)

# Gene symbol and tissue whose IHC images will be downloaded.
gene <- "IGF2BP2"
tissue <- "Kidney"
获取相关的信息
# Fetch the XML record for this gene from the HPA website.
hpa_target_gene <- hpaXmlGet(gene)

# Extract the tissue-staining information from the XML.
# NOTE(review): the result is indexed with [[1]] and [[2]] below; presumably
# there is one table per antibody -- confirm against the HPAanalyze docs.
hpa_target_gene_fig_url <- hpaXmlTissueExpr(hpa_target_gene)

hpa_target_gene_fig_url_1 <- as.data.frame(hpa_target_gene_fig_url[[1]])
# head() previews up to 6 rows and does not assume that the table has at
# least 6 rows and 18 columns (a fixed [1:6, 1:18] index errors or pads with
# NA rows when it does not).
head(hpa_target_gene_fig_url_1)

# Fail with a clear message when there is no second table to read.
if (length(hpa_target_gene_fig_url) < 2L) {
  stop(
    "Expected at least 2 tissue-expression tables for ", gene, ", got ",
    length(hpa_target_gene_fig_url), ".",
    call. = FALSE
  )
}
hpa_target_gene_fig_url_2 <- as.data.frame(hpa_target_gene_fig_url[[2]])
head(hpa_target_gene_fig_url_2)
选择需要下载的目标内容
# Keep only the rows for the tissue of interest.
# Table 2 is used, matching the "IHC-2" label in the output folder name.
# To work with table 1 instead, filter hpa_target_gene_fig_url_1 the same way
# (and adjust the "IHC-2" label accordingly).
# %in% (rather than ==) treats NA tissue descriptions as non-matches, so no
# all-NA rows end up in the selection.
hpa_target_gene_fig_url_tissue <- hpa_target_gene_fig_url_2[
  hpa_target_gene_fig_url_2$tissueDescription2 %in% tissue, ,
  drop = FALSE
]
创建下载位置并保存图片
# Dedicated folder for this gene / tissue / antibody combination.
# The name parts are joined with "_" first and only then appended to the
# root path, so the folder name does not start with a stray "_".
# The trailing "/" is kept because picDir is used as a path prefix below.
picDir <- paste0(
  "./output_data/step11_img/",
  paste(gene, tissue, "IHC-2", sep = "_"),
  "/"
)
if (!dir.exists(picDir)) {
  # recursive = TRUE so this also works when the parent folders are missing.
  dir.create(picDir, recursive = TRUE)
}

# seq_len() makes the loop a no-op when no rows matched the tissue;
# 1:nrow() would iterate over c(1, 0) and try to download an NA URL.
for (i in seq_len(nrow(hpa_target_gene_fig_url_tissue))) {
  file_url <- hpa_target_gene_fig_url_tissue$imageUrl[i]

  # File name: gene, tissue, patient id and both tissue descriptions joined
  # by "_"; the extension is appended separately so there is no "_" before it.
  file_name <- paste(
    gene,
    tissue,
    hpa_target_gene_fig_url_tissue$patientId[i],
    hpa_target_gene_fig_url_tissue$tissueDescription1[i],
    hpa_target_gene_fig_url_tissue$tissueDescription2[i],
    sep = "_"
  )
  # NOTE(review): the extension is fixed to ".tiff" regardless of the format
  # served at imageUrl -- confirm it matches the actual image type.
  file_dir <- paste0(picDir, file_name, ".tiff")

  # mode = "wb" writes the image as binary.
  download.file(url = file_url, destfile = file_dir, mode = "wb")
}
最后保存图片的所有信息
# Save the metadata of the selected images inside the image folder.
# Any trailing "/" on picDir is stripped before file.path() re-adds the
# separator, and the file name is built on its own so that it does not start
# with a stray "_".
write.csv(
  hpa_target_gene_fig_url_tissue,
  file.path(
    sub("/+$", "", picDir),
    paste(gene, "IHC-2_result_tab.csv", sep = "_")
  )
)