如何通过Python+Selenium+PhantomJS/Chrome获取HTTP状态和Response Headers

Selenium没有提供获取HTTP状态码的API,并且似乎以后也不准备提供该功能,还好有变通的方法:从浏览器日志中读取响应信息(PhantomJS对应的是HAR日志)。这里提供Python+Selenium+PhantomJS的实现供参考:

# Python 2.7
from selenium import webdriver  
import json
from collections import OrderedDict

def getResponseHeaders(browser):
    """Return the headers of the first HAR entry's response, sorted by name.

    Takes the first record of ``browser.get_log('har')``, decodes its
    ``message`` field as JSON and reads the header list of the first
    entry's response.

    Returns:
        An ``OrderedDict`` mapping header name to header value, with the
        keys in ascending name order.
    """
    first_record = browser.get_log('har')[0]
    har = json.loads(first_record['message'])
    response = har['log']['entries'][0]['response']

    pairs = []
    for header in response["headers"]:
        pairs.append((header["name"], header["value"]))
    # Order by header name only; the sort is stable, so headers sharing a
    # name keep their relative order from the log.
    pairs.sort(key=lambda pair: pair[0])

    ordered = OrderedDict()
    for name, value in pairs:
        ordered[name] = value
    return ordered

def getResponseStatus(browser):
    """Return ``(status, statusText)`` for the first HAR entry's response.

    Takes the first record of ``browser.get_log('har')``, decodes its
    ``message`` field as JSON and reads the first entry's response.

    Returns:
        A 2-tuple: the ``status`` value exactly as stored in the HAR data,
        and the ``statusText`` value converted with ``str()``.
    """
    raw_message = browser.get_log('har')[0]['message']
    response = json.loads(raw_message)['log']['entries'][0]['response']
    status_code = response["status"]
    reason = str(response["statusText"])
    return status_code, reason

browser = webdriver.PhantomJS()

# Simple test: request a page that does not exist and dump what came back.
try:
    print(">>>>> 404")
    browser.get("http://www.questionfish.cn/notfound.html")
    # Each print receives one pre-formatted argument, so the output is the
    # same whether print is the Python 2 statement or the Python 3 function.
    print("status:  %s" % (getResponseStatus(browser),))
    headers = getResponseHeaders(browser)
    for key in headers:
        print("%s => %s" % (key, headers[key]))
    print("")
finally:
    # Always shut the driver down, even when the request or the log parsing
    # fails, so the browser session is not left behind.
    browser.quit()

Python+Selenium+ChromeDriver当然也有解决方法:

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json

# Work on a copy: DesiredCapabilities.CHROME is a class-level dict shared by
# every user of the class, so editing it in place would leak the logging
# setting into any other session created from it.
d = DesiredCapabilities.CHROME.copy()
# Request every performance-log entry; the log is read back later through
# browser.get_log('performance').
# NOTE(review): newer ChromeDriver releases reportedly expect this setting
# under the key 'goog:loggingPrefs' - confirm against the driver in use.
d['loggingPrefs'] = { 'performance':'ALL' }

def getHttpStatus(browser):
    """Return ``(status, statusText)`` of the response for the current page.

    Scans the entries of ``browser.get_log('performance')`` and returns the
    data of the first one whose response URL equals ``browser.current_url``.

    Args:
        browser: A driver object exposing ``get_log('performance')`` and
            ``current_url``.

    Returns:
        A ``(status, statusText)`` tuple, or ``None`` when no log entry
        carries a response for the current URL.
    """
    # Loop-invariant, so read it once instead of once per log entry.
    current_url = browser.current_url
    for entry in browser.get_log('performance'):
        try:
            response = json.loads(entry['message'])['message']['params']['response']
            if response['url'] == current_url:
                return (response['status'], response['statusText'])
        except (KeyError, TypeError, ValueError):
            # The entry has no response payload (or is not valid JSON), so
            # it cannot match; skip it. Any other exception is a real
            # failure and is allowed to propagate.
            continue
    return None

def getHttpResponseHeader(browser):
    """Return the response headers recorded for the current page.

    Scans the entries of ``browser.get_log('performance')`` and returns the
    ``headers`` value of the first one whose response URL equals
    ``browser.current_url``.

    Args:
        browser: A driver object exposing ``get_log('performance')`` and
            ``current_url``.

    Returns:
        The ``headers`` value of the matching response, or ``None`` when no
        log entry carries a response for the current URL.
    """
    # Loop-invariant, so read it once instead of once per log entry.
    current_url = browser.current_url
    for entry in browser.get_log('performance'):
        try:
            response = json.loads(entry['message'])['message']['params']['response']
            if response['url'] == current_url:
                return response['headers']
        except (KeyError, TypeError, ValueError):
            # The entry has no response payload (or is not valid JSON), so
            # it cannot match; skip it. Any other exception is a real
            # failure and is allowed to propagate.
            continue
    return None

browser = webdriver.Chrome(desired_capabilities=d)
url = 'http://www.questionfish.cn/notfound.html'
try:
    browser.get(url)
    # A single parenthesised argument prints identically under Python 2 and 3.
    print(getHttpStatus(browser))
    # get_log() discards the entries it has already returned, so the two
    # helpers must not both be called for the same page load. To inspect the
    # headers instead, replace the call above with
    # getHttpResponseHeader(browser).
finally:
    # Close the browser even if the page load or the log lookup fails.
    browser.quit()

REF: How to get status code by using selenium.py (python code) - Stack Overflow

你可能感兴趣的:(Python)