Implementation Code
The implementation mainly uses Selenium to crawl the pages by analyzing the DOM nodes of each web page.
The target site is: http://www.medlive.cn/pubmed/
After searching for "protein" on the site, inspecting the resulting URL shows that setting page=1~20 retrieves the URL information for the first 20 result pages. The link pattern is:
http://www.medlive.cn/pubmed/pubmed_search.do?q=protein&page=1
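As a quick sanity check of this pattern, the 20 page URLs can be printed with a short loop (a minimal sketch using only the q and page parameters shown above); the full spider below does the same thing inside getURL():

# coding=utf-8
# Generate the 20 search-result URLs for the query "protein"
base = "http://www.medlive.cn/pubmed/pubmed_search.do?q=protein&page="
for page in range(1, 21):
    print base + str(page)    # page=1 ~ page=20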
# coding=utf-8 """ Created on 2015-12-05 Ontology Spider @author Eastmount CSDN URL: http://www.meddir.cn/cate/736.htm http://www.medlive.cn/pubmed/ http://paper.medlive.cn/literature/1502224 """ import time import re import os import shutil import sys import codecs from selenium import webdriver from selenium.webdriver.common.keys import Keys import selenium.webdriver.support.ui as ui from selenium.webdriver.common.action_chains import ActionChains #Open PhantomJS driver = webdriver.Firefox() driver2 = webdriver.PhantomJS(executable_path="G:\phantomjs-1.9.1-windows\phantomjs.exe") wait = ui.WebDriverWait(driver,10) ''' Load Ontoloty 去到每个生物本体页面下载摘要信息 http://paper.medlive.cn/literature/literature_view.php?pmid=26637181 http://paper.medlive.cn/literature/1526876 ''' def getAbstract(num,title,url): try: fileName = "E:\\PubMedSpider\\" + str(num) + ".txt" #result = open(fileName,"w") #Error: 'ascii' codec can't encode character u'\u223c' result = codecs.open(fileName,'w','utf-8') result.write("[Title]\r\n") result.write(title+"\r\n\r\n") result.write("[Astract]\r\n") driver2.get(url) elem = driver2.find_element_by_xpath("//div[@class='txt']/p") #print elem.text result.write(elem.text+"\r\n") except Exception,e: print 'Error:',e finally: result.close() print 'END\n' ''' 循环获取搜索页面的URL 规律 http://www.medlive.cn/pubmed/pubmed_search.do?q=protein&page=1 ''' def getURL(): page = 1 #跳转的页面总数 count = 1 #统计所有搜索的生物本体个数 while page<=20: url_page = "http://www.medlive.cn/pubmed/pubmed_search.do?q=protein&page="+str(page) print url_page driver.get(url_page) elem_url = driver.find_elements_by_xpath("//div[@id='div_data']/div/div/h3/a") for url in elem_url: num = "%05d" % count title = url.text url_content = url.get_attribute("href") print num print title print url_content #自定义函数获取内容 getAbstract(num,title,url_content) count = count + 1 else: print "Over Page " + str(page) + "\n\n" page = page + 1 else: "Over getUrl()\n" time.sleep(5) ''' 主函数预先运行 ''' if __name__ == '__main__': path = "F:\\MedSpider\\" if os.path.isfile(path): #Delete file os.remove(path) elif os.path.isdir(path): #Delete dir shutil.rmtree(path, True) os.makedirs(path) #Create the file directory getURL() print "Download has finished."
Analyzing the HTML
1. Fetch the 20 protein-related URL links and titles on each result page. The core code in the getURL() function that collects the URLs is as follows (note that find_elements_by_xpath returns a list, so the attributes are read inside a loop):
elem_url = driver.find_elements_by_xpath("//div[@id='div_data']/div/div/h3/a")
for url in elem_url:
    title = url.text
    url_content = url.get_attribute("href")
    getAbstract(num, title, url_content)
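Note that the full script creates a WebDriverWait object (wait = ui.WebDriverWait(driver, 10)) but never calls it. If the result list loads asynchronously, the XPath lookup can be guarded with an explicit wait, for example (a sketch against the same XPath; not part of the original script):

# Block for up to 10 seconds until at least one result link is present,
# then collect the links as before
wait = ui.WebDriverWait(driver, 10)
wait.until(lambda d: d.find_elements_by_xpath("//div[@id='div_data']/div/div/h3/a"))
elem_url = driver.find_elements_by_xpath("//div[@id='div_data']/div/div/h3/a")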
Results
The run produces 400 txt files, 00001.txt through 00400.txt. Each file contains a title and an abstract, and the dataset can be used for simple biomedical tasks such as ontology learning, named entity recognition, and ontology alignment and construction.
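Since every file follows the [Title]/[Abstract] layout written by getAbstract(), the dataset can be loaded back for those downstream tasks with a few lines (a sketch; it assumes the E:\PubMedSpider\ output directory used above):

# coding=utf-8
import codecs

# Read 00001.txt ~ 00400.txt and split each file into title and abstract
for i in range(1, 401):
    fileName = "E:\\PubMedSpider\\" + ("%05d" % i) + ".txt"
    content = codecs.open(fileName, 'r', 'utf-8').read()
    parts = content.split("[Abstract]")
    title = parts[0].replace("[Title]", "").strip()
    abstract = parts[1].strip() if len(parts) > 1 else ""
    print i, title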