前面我讲述过如何通过BeautifulSoup获取维基百科的消息盒,同样也可以通过Spider获取网站内容。最近学习了Selenium+PhantomJS之后,我准备利用它们获取百度百科中旅游景点的消息盒(InfoBox),这也是毕业设计中实体对齐和属性对齐所需语料库的前期准备工作。希望文章对你有所帮助~
源代码
# coding=utf-8
"""
Collect Baidu Baike infoboxes for 5A tourist spots with Selenium + PhantomJS.

Created on 2015-09-04
@author: Eastmount
"""

import time
import re
import os
import sys
import codecs
import io

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains

# Raw string so the backslashes in the Windows path are never read as escapes.
PHANTOMJS_PATH = r"G:\phantomjs-1.9.1-windows\phantomjs.exe"
BASE_PATH_DIRECTORY = "Tourist_spots_5A"
OUTPUT_FILE_NAME = "BaiduSpider.txt"
SOURCE_FILE_NAME = "Tourist_spots_5A_BD.txt"

# Open PhantomJS (swap in webdriver.Firefox() to watch the browser while debugging).
driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
wait = ui.WebDriverWait(driver, 10)


def getInfobox(name):
    """Search Baidu Baike for *name* and append its infobox to the output file.

    The spot name is written first, then one "<attribute> <value>" line per
    infobox row in page order, then an empty line. Every line ends with
    '\\r\\n' because codecs.open does no newline translation.

    Args:
        name: Unicode name of the tourist spot; a trailing line break (as left
            by reading it from a text file) is removed before use.

    Returns:
        None. Any error is printed and swallowed so that one failing spot
        does not stop the whole crawl.
    """
    # Strip the line break once, so it is neither typed into the search box
    # nor duplicated in the output file.
    name = name.rstrip('\r\n')
    out = None
    try:
        # Create the output directory on first use.
        if not os.path.exists(BASE_PATH_DIRECTORY):
            os.makedirs(BASE_PATH_DIRECTORY)
        baidu_file = os.path.join(BASE_PATH_DIRECTORY, OUTPUT_FILE_NAME)
        # Mode 'a' creates the file when it is missing and appends otherwise.
        out = codecs.open(baidu_file, 'a', 'utf-8')

        print(name)

        # Locate the search input, type the name and submit the search.
        driver.get("http://baike.baidu.com/")
        elem_inp = driver.find_element_by_xpath("//form[@id='searchForm']/input")
        elem_inp.send_keys(name)
        elem_inp.send_keys(Keys.RETURN)
        out.write(name + '\r\n')
        time.sleep(5)  # give the result page time to load

        # Load the infobox: <dt> holds attribute names, <dd> holds the values.
        elem_name = driver.find_elements_by_xpath("//div[@class='basic-info']/dl/dt")
        elem_value = driver.find_elements_by_xpath("//div[@class='basic-info']/dl/dd")

        # Walk the pairs directly instead of through a dict, so the rows keep
        # the order in which they appear on the page.
        for key, value in zip(elem_name, elem_value):
            line = key.text + " " + value.text
            print(line)
            out.write(line + '\r\n')
        time.sleep(5)
    except Exception as e:
        # Best effort: report the problem (e.g. "'utf8' codec can't decode
        # byte") and let the caller continue with the next spot.
        print("Error:  %s" % (e,))
    finally:
        print('\n')
        # The file may never have been opened if the failure came early.
        if out is not None:
            try:
                out.write('\r\n')
            finally:
                out.close()


def main():
    """Read the spot names, one per line, and fetch the infobox of each.

    The input file is decoded as UTF-8. The browser is always shut down at
    the end, even when reading the file fails.
    """
    try:
        with io.open(SOURCE_FILE_NAME, 'r', encoding='utf-8') as source:
            for name in source:
                # Special case kept from the original script, which notes that
                # otherwise a stray '?' is added to this name.
                if u'故宫' in name:
                    name = u'北京故宫'
                getInfobox(name)
        print('End Read Files!')
    finally:
        # quit() also ends the PhantomJS process; close() only closes the window.
        driver.quit()


main()
#设置编码utf-8 import sys reload(sys) sys.setdefaultencoding('utf-8') #显示当前默认编码方式 print sys.getdefaultencoding()
import codecs #用codecs提供的open方法来指定打开的文件的语言编码,它会在读取的时候自动转换为内部unicode if not os.path.exists(baiduFile): info = codecs.open(baiduFile,'w','utf-8') else: info = codecs.open(baiduFile,'a','utf-8') #该方法不是io故换行是'\r\n' info.writelines(key.text+":"+elem_dic[key].text+'\r\n')