Selenium没有提供获取HTTP状态码的API,并且似乎以后也不准备提供该功能,还好有变通的方法。这里提供Python+Selenium+PhantomJS的实现供参考:
# Python 2.7
from selenium import webdriver
import json
from collections import OrderedDict
def getResponseHeaders(browser):
    """Return the response headers of the first HAR entry, sorted by name.

    The driver's 'har' log is read, its first record's 'message' is parsed
    as JSON, and the headers of ``log.entries[0].response`` are collected.

    Args:
        browser: a WebDriver-like object whose ``get_log('har')`` returns a
            sequence of records carrying a JSON string under 'message'.

    Returns:
        An OrderedDict mapping header name to header value, ordered by
        header name. If a name occurs more than once, the last occurrence
        wins.

    Raises:
        IndexError / KeyError: propagated unchanged when the log is empty,
            has no entries, or lacks the expected fields.
    """
    har = json.loads(browser.get_log('har')[0]['message'])
    # NOTE(review): entries[0] is presumably the main document request;
    # confirm if pages with redirects matter.
    headers = har['log']['entries'][0]['response']['headers']
    pairs = ((header['name'], header['value']) for header in headers)
    # sorted() is stable, so duplicate names keep their original relative
    # order and the OrderedDict ends up holding the last one.
    return OrderedDict(sorted(pairs, key=lambda pair: pair[0]))
def getResponseStatus(browser):
    """Return the HTTP status of the first HAR entry.

    The driver's 'har' log is read, its first record's 'message' is parsed
    as JSON, and the status fields of ``log.entries[0].response`` are
    extracted.

    Args:
        browser: a WebDriver-like object whose ``get_log('har')`` returns a
            sequence of records carrying a JSON string under 'message'.

    Returns:
        A ``(status, statusText)`` tuple; ``statusText`` is passed through
        ``str()``.

    Raises:
        IndexError / KeyError: propagated unchanged when the log is empty,
            has no entries, or lacks the expected fields.
    """
    har = json.loads(browser.get_log('har')[0]['message'])
    # Look the response up once instead of repeating the whole path.
    response = har['log']['entries'][0]['response']
    return (response['status'], str(response['statusText']))
browser = webdriver.PhantomJS()
# Simple Test
print ">>>>> 404"
browser.get("http://www.questionfish.cn/notfound.html")
print "status: ", getResponseStatus(browser)
headers = getResponseHeaders(browser)
for key in headers:
print key, "=>", headers[key]
print
Python+Selenium+ChromeDriver当然也有解决方法:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json
# Work on a copy: DesiredCapabilities.CHROME is a class-level dict, so
# assigning into it directly would change it for every other user of the
# class in this process.
d = DesiredCapabilities.CHROME.copy()
# Request the 'performance' log at level ALL; the helper functions that
# read browser.get_log('performance') depend on it being enabled.
d['loggingPrefs'] = {'performance': 'ALL'}
def getHttpStatus(browser):
    """Return the HTTP status recorded for the browser's current URL.

    Walks the driver's 'performance' log and returns the status of the
    first record whose ``message.params.response.url`` equals
    ``browser.current_url``.

    Args:
        browser: a WebDriver-like object exposing ``current_url`` and
            ``get_log('performance')``; each log record carries a JSON
            string under 'message'.

    Returns:
        A ``(status, statusText)`` tuple, or None when no record matches.
        The URL comparison is an exact string match.

    NOTE(review): Selenium reportedly discards log entries once get_log()
    has returned them, so a second read for the same page load may find
    nothing -- confirm before calling this more than once per navigation.
    """
    # Read the URL once; on a real driver every access is a round trip.
    current_url = browser.current_url
    for entry in browser.get_log('performance'):
        try:
            response = json.loads(entry[u'message'])[u'message'][u'params'][u'response']
            if response[u'url'] == current_url:
                return (response[u'status'], response[u'statusText'])
        except (KeyError, TypeError, ValueError):
            # Most records are not response events (no 'response' key) or
            # are not shaped as expected; skip those, but let anything
            # else (e.g. KeyboardInterrupt) propagate.
            continue
    return None
def getHttpResponseHeader(browser):
    """Return the response headers recorded for the browser's current URL.

    Walks the driver's 'performance' log and returns the headers of the
    first record whose ``message.params.response.url`` equals
    ``browser.current_url``.

    Args:
        browser: a WebDriver-like object exposing ``current_url`` and
            ``get_log('performance')``; each log record carries a JSON
            string under 'message'.

    Returns:
        The ``headers`` value of the matching response, or None when no
        record matches. The URL comparison is an exact string match.

    NOTE(review): Selenium reportedly discards log entries once get_log()
    has returned them, so a second read for the same page load may find
    nothing -- confirm before calling this more than once per navigation.
    """
    # Read the URL once; on a real driver every access is a round trip.
    current_url = browser.current_url
    for entry in browser.get_log('performance'):
        try:
            response = json.loads(entry[u'message'])[u'message'][u'params'][u'response']
            if response[u'url'] == current_url:
                return response[u'headers']
        except (KeyError, TypeError, ValueError):
            # Most records are not response events (no 'response' key) or
            # are not shaped as expected; skip those, but let anything
            # else (e.g. KeyboardInterrupt) propagate.
            continue
    return None
browser = webdriver.Chrome(desired_capabilities=d)
url = 'http://www.questionfish.cn/notfound.html'
browser.get(url)
print getHttpStatus(browser)
# 因get_log后旧的日志将被清除,两个函数切勿同时使用
# print getHttpResponseHeader(browser)
browser.quit()
REF: How to get status code by using selenium.py (python code) - Stack Overflow