Selenium是一款页面自动化开源工具,多用于Web测试场景,其原理是调用浏览器实际绘制网页界面并通过其API模拟用户行为。
安装
环境ubuntu14.04 64-bit
需要安装的软件包括xvfb、firefox(也可以使用Chrome等其它浏览器)、selenium及其python驱动
apt-get install xvfb #安装xvfb
apt-get install firefox #安装firefox
(补充:我使用firefox 47.0版本时一直无法正常运行,调用webdriver.Firefox()时一直报错:“The browser appears to have exited before we could connect. If you specified a log_file in the FirefoxBinary constructor, check it for details.” 经过搜索后发现可能是firefox的版本问题,于是我改装了11.0版本,问题解决。
apt-cache search firefox #搜索可安装版本
apt-get install firefox=version #安装指定版本
)
pip install -U selenium #前提是已安装python-pip
在使用时发现xvfb偶尔会停止响应,为此这里使用PyVirtualDisplay(一个Python版的xvfb管理库,可以在爬虫启动时开启xvfb,完成后关闭xvfb)。
pip install pyvirtualdisplay #安装PyVirtualDisplay
爬虫实现
代码编写相对很简单,使用selenium的驱动操作页面即可:
# coding: UTF-8
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
import time
from pyvirtualdisplay import Display
from bs4 import BeautifulSoup
from pymongo import MongoClient
from bson import ObjectId
# 获取id
mongoid = str(sys.argv[1])
# 获取用户账号
account = str(sys.argv[2])
try:
collection = MongoClient(*****).op.inquiry
_id = ObjectId(mongoid)
username = 'username'
password = 'password'
#开启xvfb
display = Display(visible=0, size=(400, 400))
display.start()
#获取firefox本地session
# browser = webdriver.Firefox()
# 获取phantomjs
browser = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')
# 设置Waits
browser.implicitly_wait(10)
#访问登陆页面
browser.get("http://**********/index.php/Public/login.html")
# browser.get_screenshot_as_file(u"/www/screenshot.png")
# 获取返回界面name为account的输入框并填写用户名
accountname = browser.find_element_by_name('account')
accountname.send_keys(username)
# 获取返回界面name为pwd的输入框并填写密码
accountpassword = browser.find_element_by_name('pwd')
accountpassword.send_keys(password)
# # 点击登录
submit = browser.find_element_by_xpath("//div[@class='loginbox-submit']/input")
submit.click()
time.sleep(2)
# 跳转至新的页面
browser.get("http://**********/index.php/DataPayorder/index.html")
# browser.execute_script('window.location.href="http:/************/index.php/DataPayorder/index.html";')
# 键入游戏账号
account_input = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, 'userid')))
account_input.send_keys(account)
# 选择显示条数
select = Select(browser.find_element_by_name('example_length'))
# select.selct_by_index(index)
# select.select_by_visible_text("text")
select.select_by_value("100")
# 点击搜索
aSrh_btn = browser.find_element_by_id("aSrh")
aSrh_btn.click()
time.sleep(2)
# # 弃用,耗时,导致异常:Element is no longer attached to the DOM
# # 新方案:先保存html,再使用BeautifulSoup解析获取数据。
# table = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, 'example')))
# if table != None:
# trs = table.find_elements_by_xpath(".//tbody/tr")
# tmp = []
# for tr in trs:
# item = [
# str(tr.find_elements_by_xpath(".//td")[1].text),
# str(tr.find_elements_by_xpath(".//td")[2].text),
# str(tr.find_elements_by_xpath(".//td")[3].text),
# str(tr.find_elements_by_xpath(".//td")[4].text),
# str(tr.find_elements_by_xpath(".//td")[5].text),
# str(tr.find_elements_by_xpath(".//td")[6].text),
# str(tr.find_elements_by_xpath(".//td")[7].text),
# str(tr.find_elements_by_xpath(".//td")[8].text),
# str(tr.find_elements_by_xpath(".//td")[9].text),
# str(tr.find_elements_by_xpath(".//td")[10].text),
# str(tr.find_elements_by_xpath(".//td")[11].text),
# str(tr.find_elements_by_xpath(".//td")[12].text),
# str(tr.find_elements_by_xpath(".//td")[13].text),
# ]
# tmp.append(item)
# print tmp
# 调用js获取页面html
html = browser.execute_script("return document.documentElement.outerHTML")
# print str(html)
# 解析html
# http://beautifulsoup.readthedocs.io/zh_CN/latest/
soup = BeautifulSoup(html, 'html5lib')
table = soup.find('table', attrs={'id':'example'})
table_body = table.find('tbody')
tmp = []
if len(table_body.findAll('tr')[0].findAll('td')[0].contents) > 0:
collection.update({'_id':_id}, {'$set':{'result':tmp, 'status':'processed'}})
else:
for tr in table_body.findAll('tr'):
tds = tr.findAll('td')
item = [
str(tds[1].contents[0]),
str(tds[2].contents[0]),
str(tds[3].contents[0]),
str(tds[4].contents[0]),
str(tds[5].contents[0]),
str(tds[6].contents[0]),
str(tds[7].contents[0]),
str(tds[8].contents[0]),
str(tds[9].contents[0]),
str(tds[10].contents[0]),
]
tmp.append(item)
collection.update({'_id':_id}, {'$set':{'result':tmp, 'status':'processed'}})
# 截图
# browser.get_screenshot_as_file(u"/www/screenshot.png")
except Exception, e:
print e
finally:
if browser != None:
# 关闭session
browser.close()
browser.quit()
if display != None:
# 关闭xvfb
display.stop()
PhantomJS
由于Firefox占用较大的cpu,所以换成PhantomJS
安装
wget http://phantomjs.googlecode.com/files/phantomjs-1.8.2-linux-x86_64.tar.bz2
tar -xvf phantomjs-1.8.2-linux-x86_64.tar.bz2
sudo mv phantomjs-1.8.2-linux-x86_64 /usr/local/src/phantomjs
sudo ln -sf /usr/local/src/phantomjs/bin/phantomjs /usr/local/bin/phantomjs
phantomjs --version
如果遇到error:libfontconfig.so.1: cannot open shared object file: No such file or directory
可以使用下面这一条命令进行修复:
sudo apt-get install libfontconfig
使用
跟Firefox的用法差不多,简单用例:
from selenium import webdriver
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Minimal PhantomJS example: load a page, print its URL and source, then
# print part of the element with id "result".
driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')
try:
    driver.get("http://www.ip.cn")
    print(driver.current_url)
    print(driver.page_source)
    # First line of the "result" element, keeping the text after 'from:'.
    print(driver.find_element_by_id('result').text.split('\n')[0].split('from:')[1])
finally:
    # Always terminate the PhantomJS process, even if a lookup above fails.
    driver.quit()
Chrome
安装
chromedriver
apt-get install chromium-browser
wget http://chromedriver.storage.googleapis.com/2.7/chromedriver_linux64.zip
unzip chromedriver_linux64.zip
mv chromedriver /usr/bin
使用
from selenium import webdriver
from pyvirtualdisplay import Display
# Minimal Chrome example: start an invisible (visible=0) virtual display,
# load a page through chromedriver and print its title.
display = Display(visible=0, size=(800, 600))
display.start()
driver = None
try:
    driver = webdriver.Chrome(executable_path="/usr/bin/chromedriver")
    driver.get("http://www.baidu.com")
    print(driver.title)
finally:
    try:
        if driver is not None:
            driver.quit()
    finally:
        # Always stop the virtual display, even when quitting Chrome fails.
        display.stop()
display.stop()