[169]selenium设置proxy、headers(phantomjs、Chrome、Firefox)

phantomjs

设置代理

方法1:

# Command-line switches handed to the PhantomJS process at start-up.
service_args = [
    '--proxy=%s' % ips,             # proxy as IP:port (e.g. 192.168.0.28:808)
    '--ssl-protocol=any',           # do not restrict the SSL protocol version
    '--proxy-type=http',            # proxy type (was closed with a typographic quote, which is a syntax error)
    '--load-images=no',             # do not load images (optional)
    '--disk-cache=yes',             # enable the disk cache (optional)
    '--ignore-ssl-errors=true',     # ignore HTTPS errors (optional)
]
driver = webdriver.PhantomJS(service_args=service_args)

方法2:

# Start PhantomJS first; the proxy is applied afterwards by opening a new session.
browser = webdriver.PhantomJS(PATH_PHANTOMJS)

# Describe a manually configured HTTP proxy.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = '1.9.171.51:800'

# Write the proxy settings into webdriver.DesiredCapabilities.PHANTOMJS and
# open a new session with those capabilities (per the original author: roughly
# like clearing the browser cache and revisiting the URL through the proxy).
phantomjs_caps = webdriver.DesiredCapabilities.PHANTOMJS
proxy.add_to_capabilities(phantomjs_caps)
browser.start_session(phantomjs_caps)
browser.get('http://1212.ip138.com/ic.asp')

# Dump the session id, the page source and the cookies for inspection.
print('1: ', browser.session_id)
print('2: ', browser.page_source)
print('3: ', browser.get_cookies())

还原为系统代理

# Drop the manual proxy: build a fresh Proxy object, write it into the
# PHANTOMJS capabilities and open a new session with them.
# NOTE(review): the original comment says "restore the system proxy", but the
# code sets ProxyType.DIRECT, which presumably means "direct connection, no
# proxy"; ProxyType.SYSTEM may be what was intended — confirm against the
# Selenium Proxy documentation.
proxy=webdriver.Proxy()
proxy.proxy_type=ProxyType.DIRECT
proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
browser.get('http://1212.ip138.com/ic.asp')

设置请求头

import random,requests,json
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import ProxyType


# Fetch one proxy address from the proxy service.
def proxies():
    """Ask the proxy service for an IP and return it as ``ip:port``.

    The response body is parsed as JSON; its ``ip`` field is joined with the
    fixed port ``8907``.  The resulting address is printed and returned.

    Returns:
        str: the proxy address in ``ip:port`` form.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status returned by the service.
    """
    response = requests.get(
        "http://120.26.166.214:9840/JProxy/update/proxy/scoreproxy",
        timeout=10,  # without a timeout an unresponsive service blocks forever
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    address = response.json()['ip'] + ":" + "8907"
    print(address)
    return address
ips = proxies()
# -------------------------------------------------------------------------------------
# Approach 1: override the User-Agent through the PhantomJS capabilities.

# User-Agent string presented to the server.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)

# Work on a copy of the default capabilities so the shared dict stays untouched.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent

driver = webdriver.PhantomJS(
    desired_capabilities=dcap,
    executable_path=r"C:\soft\phantomjs-2.1.1-windows\bin\phantomjs.exe",
)

# Load a page and print its HTML source.
driver.get(url='http://www.baidu.com')
page = driver.page_source
print(page)

#-------------------------------------------------------------------------------------
# Approach 2: random User-Agent plus random proxy through DesiredCapabilities.

# Candidate pools.  random.choice() needs a sequence of candidates; passing a
# bare placeholder string (as before) returns a single character of it.
# Extend both lists with real values.
USER_AGENTS = [
    user_agent,
]
PROXY_POOL = [
    ips,  # "ip:port" obtained from proxies()
]

desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
# Pick a random User-Agent from USER_AGENTS to disguise the browser.
desired_capabilities["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENTS)
# Skip image loading; pages are fetched considerably faster.
desired_capabilities["phantomjs.page.settings.loadImages"] = False

# Add a manually configured HTTP proxy to the capabilities.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = random.choice(PROXY_POOL)
proxy.add_to_capabilities(desired_capabilities)
phantomjs_driver = r'C:\phantomjs-2.1.1-windows\bin\phantomjs.exe'
# Start PhantomJS with the configured capabilities.  The constructor already
# opens a session with them, so no extra start_session() call is made (a
# second call would open another session and orphan the first one).
driver = webdriver.PhantomJS(executable_path=phantomjs_driver, desired_capabilities=desired_capabilities)

# Timeouts are configured BEFORE the first navigation so they apply to it.
# Implicit wait of 5 seconds; tune as needed.
driver.implicitly_wait(5)
# Page-load timeout of 20 seconds, similar to the timeout option of
# requests.get(); driver.get() itself has no timeout parameter.  Without it,
# driver.get(url) can block forever without raising.
driver.set_page_load_timeout(20)
# Script timeout of 20 seconds.
driver.set_script_timeout(20)

driver.get(url='http://www.baidu.com')
page = driver.page_source
print(page)

# Scroll to the bottom of the page.
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

Firefox

设置代理

import time 
from selenium import webdriver
from selenium.webdriver.common.proxy import * 

myProxy = '202.202.90.20:8080'

# Route HTTP, FTP and SSL traffic through the same manually configured proxy.
proxy = Proxy({
    'proxyType': ProxyType.MANUAL,
    'httpProxy': myProxy,
    'ftpProxy': myProxy,
    'sslProxy': myProxy,
    'noProxy': '',
})

# The driver call below needs both names; they were previously undefined in
# this snippet.  Adjust the geckodriver path to the local installation.
options = webdriver.FirefoxOptions()
Firefox_path = './source/geckodriver/geckodriver.exe'

profile = webdriver.FirefoxProfile()
profile.set_proxy(proxy)
driver = webdriver.Firefox(firefox_options=options, executable_path=Firefox_path, firefox_profile=profile)
# Alternative (use INSTEAD of the line above; running both starts two
# browsers and only the second one would ever be quit):
# driver = webdriver.Firefox(firefox_options=options, executable_path=Firefox_path, proxy=proxy)

driver.get('https://www.baidu.com')
time.sleep(3)
driver.quit()

设置代理请求头

import time 
from selenium import webdriver
from selenium.webdriver.common.proxy import * 

myProxy = '202.202.90.20:8080'

# Helper that quickly writes proxy settings into a Firefox profile.
def get_firefox_profile_with_proxy_set(profile, proxy_host):
    """Configure ``profile`` to use the proxy given as ``"ip:port"``.

    Args:
        profile: object exposing ``set_preference(key, value)``.
        proxy_host: proxy address; split on ``':'`` into IP and port.

    Returns:
        The same ``profile`` object, after its preferences were set.

    Raises:
        IndexError: if ``proxy_host`` contains no ``':'``.
        ValueError: if the port part is not an integer.
    """
    pieces = proxy_host.split(':')
    agent_ip, agent_port = pieces[0], pieces[1]

    set_pref = profile.set_preference
    set_pref('network.proxy.type', 1)  # use the proxy
    set_pref('network.proxy.share_proxy_settings', True)  # one proxy configuration for all protocols
    for host_key, port_key in (
        ('network.proxy.http', 'network.proxy.http_port'),
        ('network.proxy.ssl', 'network.proxy.ssl_port'),
    ):
        set_pref(host_key, agent_ip)
        set_pref(port_key, int(agent_port))
    # localhost must bypass the proxy, otherwise communication with the
    # webdriver does not work (per the original author's note).
    set_pref('network.proxy.no_proxies_on', 'localhost,127.0.0.1')
    set_pref('network.http.use-cache', False)

    return profile

# User-Agent to present; set it to an empty string to keep Firefox's default.
# (Previously this name was never defined inside this snippet.)
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)

profile = webdriver.FirefoxProfile()
# Test the value that is actually passed on (myProxy); the former check used
# a name `proxy` that this snippet does not define.
if myProxy:
    profile = get_firefox_profile_with_proxy_set(profile, myProxy)
if user_agent:
    profile.set_preference("general.useragent.override", user_agent)

# The keyword is `firefox_profile`; `profile=` is not accepted by
# webdriver.Firefox and raises TypeError.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get('https://www.baidu.com')
time.sleep(3)
driver.quit()

firefox无头模式

from selenium import webdriver

# Create the options object for a new Firefox driver instance
options = webdriver.FirefoxOptions()
# Run Firefox in headless mode
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# options.add_argument('window-size=1200x600')

# Location of the geckodriver executable
executable_path='./source/geckodriver/geckodriver.exe'
# NOTE: despite its name, driver_path receives the object returned by
# webdriver.Firefox(...), not a path string.
driver_path = webdriver.Firefox(firefox_options=options,executable_path=executable_path)

Firefox设置代理(认证)

熟悉Firefox的同学都知道,Firefox在配置HTTP代理时无法设置用户名和密码。而收费的HTTP代理大多都是需要进行用户名和密码认证的(有的也支持IP白名单,但前提是你的IP需要固定不变)。这就使得使用Selenium + Firefox进行自动化操作非常不方便,因为每次启动一个新的浏览器实例就会弹出一个授权验证窗口,被要求输入用户名和密码(如下图所示),打断了自动化操作流程。

[169]selenium设置proxy、headers(phantomjs、Chrome、Firefox)_第1张图片
另外,Firefox也没有提供设置用户名密码的命令行参数(PS:phantomjs就有--proxy-auth这样的参数)。难道真的没有解决方法了?

鲲之鹏的技术人员通过研究终于找到了一个有效并且稳定的解决方案:

先介绍一个重要的角色,它的主页是https://addons.mozilla.org/en-US/firefox/addon/close-proxy-authentication/。close-proxy-authentication实现了自动完成代理用户名密码认证(Proxy Authentication)的功能,它提供了一个extensions.closeproxyauth.authtoken参数用来设置代理的用户名和密码,其值为经过base64编码后的用户名密码对(如下图所示)。close-proxy-authentication会使用该值构造出"Proxy-Authorization: Basic dGVzdDp0ZXN0"头发给代理服务器,以通过认证,这就是它的工作原理。
[169]selenium设置proxy、headers(phantomjs、Chrome、Firefox)_第2张图片
我们就是要借助这个插件在Selenium + Firefox时自动完成HTTP代理认证,流程是这样的:

(1)通过Firefox配置选项动态添加close-proxy-authentication这个插件(默认不加载任何插件);
(2)通过配置选项设置HTTP代理的IP和端口参数;
(3)设置extensions.closeproxyauth.authtoken的值为base64encode(“用户名:密码”);
(4)后续访问网站的时候close-proxy-authentication插件将自动完成代理的授权验证过程,不会再弹出认证窗口;

上述环境涉及文件打包下载地址:http://pan.webscraping.cn:8000/index.php/s/PMDjc77gbCFJzpO

需要特别注意的是:

(1)close-proxy-authentication的最新版本目前是V1.1,它并不兼容最新版的Firefox,鲲之鹏的技术人员测试发现Firefox V56.0以下版本能够兼容close-proxy-authentication V1.1。

(2)不同geckodriver(Firefox的webdriver程序)版本,支持的Firefox版本也不相同,具体支持哪些版本,在geckodriver的releases页面上有说明。

测试结果如下图所示。没有再弹出认证窗口,访问http://httpbin.org/ip直接回显了HTTP代理的IP:

[169]selenium设置proxy、headers(phantomjs、Chrome、Firefox)_第3张图片

Python + Firefox + 插件(closeproxy.xpi)

其中,closeproxy.xpi文件的下载地址,在Google、Bing上搜索一下即可找到。

完整的测试代码如下:

'''
# Python + Selenium + Firefox 设置密码时,需要使用到两个插件:
# 插件1: modify_headers-0.7.1.1-fx.xpi
# 下载地址:https://github.com/mimvp/mimvp-proxy-demo
#
# 插件2: close_proxy_authentication-1.1.xpi
# 下载地址:https://github.com/mimvp/mimvp-proxy-demo
'''
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.proxy import *
from pyvirtualdisplay import Display
from base64 import b64encode
 
 
# Proxy endpoint and credentials (placeholder values; replace with real ones).
proxy = {
    "host": "123.57.78.100",
    "port": "12345",
    "user": "username",
    "pass": "password"
}

profile = webdriver.FirefoxProfile()

# add new header (via the modify_headers extension)
profile.add_extension("modify_headers-0.7.1.1-fx.xpi")
profile.set_preference("extensions.modify_headers.currentVersion", "0.7.1.1-fx")
profile.set_preference("modifyheaders.config.active", True)
profile.set_preference("modifyheaders.headers.count", 1)
profile.set_preference("modifyheaders.headers.action0", "Add")
profile.set_preference("modifyheaders.headers.name0", "Proxy-Switch-Ip")
profile.set_preference("modifyheaders.headers.value0", "yes")
profile.set_preference("modifyheaders.headers.enabled0", True)

# add proxy
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.http', proxy['host'])
profile.set_preference('network.proxy.http_port', int(proxy['port']))
# The test URL below is HTTPS, so the proxy is configured for SSL traffic as
# well; with only the HTTP proxy set, that request would not go through it.
profile.set_preference('network.proxy.ssl', proxy['host'])
profile.set_preference('network.proxy.ssl_port', int(proxy['port']))
profile.set_preference('network.proxy.no_proxies_on', 'localhost, 127.0.0.1')
#profile.set_preference("network.proxy.username", 'aaaaa')
#profile.set_preference("network.proxy.password", 'bbbbb')

# Proxy auto login: the close-proxy-authentication extension reads the
# base64-encoded "user:pass" pair from extensions.closeproxyauth.authtoken.
profile.add_extension('closeproxy.xpi')
credentials = '{user}:{pass}'.format(**proxy)
credentials = b64encode(credentials.encode('ascii')).decode('utf-8')
profile.set_preference('extensions.closeproxyauth.authtoken', credentials)

profile.update_preferences()

driver = webdriver.Firefox(profile)
driver.get("https://proxy.mimvp.com/ip.php")
# print() call form: the former Python 2 print statement is a syntax error on
# Python 3, which the rest of this file is written for.
print(driver.page_source)

driver.quit()

firefox代理认证参考:https://cloud.tencent.com/info/4ebc3027294687320cd3a096dba6f09e.html
https://www.cnblogs.com/lgh344902118/p/6339378.html
https://cuiqingcai.com/4880.html
https://blog.csdn.net/ithomer/article/details/81051721
https://segmentfault.com/q/1010000007148702/a-1020000007157385

chrome

设置代理请求头

# !/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver

# Settings for this run.  Leave a value as None/empty to skip that option.
# (Both names were previously used without being defined in this snippet.)
proxy = None  # proxy as "ip:port", e.g. "1.2.3.4:8080"
user_agent = (
    'Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) '
    'AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20'
)

# Browser options
options = webdriver.ChromeOptions()
# Chrome headless mode
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# options.add_argument('window-size=1200x600')
# Use Chinese as the browser language
options.add_argument('lang=zh_CN.UTF-8')
# Proxy: pass the real address rather than the literal "ip:port" placeholder
if proxy:
    options.add_argument('--proxy-server=http://' + proxy)
# User-Agent: set once and without embedded double quotes. The argument is not
# parsed by a shell, so quotes placed inside the value are presumably sent as
# part of the header text.
if user_agent:
    options.add_argument('user-agent=' + user_agent)

driver= webdriver.Chrome(chrome_options=options)
# httpbin echoes the request headers, which shows whether the settings applied
url = "https://httpbin.org/get?show_env=1"
driver.get(url)
#driver.get("http://ip138.com/")
#print(driver.page_source)
driver.quit()

selenium设置chrome–cookie

# !/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver

browser = webdriver.Chrome()

url = "https://www.baidu.com/"
browser.get(url)
# JavaScript snippet that opens a new window (executed further down)
newwindow='window.open("https://www.baidu.com");'
# Remove the cookies that were set so far
browser.delete_all_cookies()
# Add the custom cookie before the new window is opened
browser.add_cookie({'name':'ABC','value':'DEF'})
# Open the new window through JavaScript
browser.execute_script(newwindow)
# Pause so the result can be inspected before the browser is closed
input("查看效果")
browser.quit()

selenium设置chrome图片不加载

from selenium import webdriver

# Chrome preferences for this example: the 'images' content setting is set to
# 2, which this section uses to stop images from loading.
prefs = {'profile.default_content_setting_values': {'images': 2}}

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options)

# browser = webdriver.Chrome()
# Open an image-heavy page, then wait for the user to check the result.
url = "http://image.baidu.com/"
browser.get(url)
input("是否有图")
browser.quit()

Chrome设置代理(认证)

默认情况下,Chrome的--proxy-server="http://ip:port"参数不支持设置用户名和密码认证。这样就使得"Selenium + Chrome Driver"无法使用HTTP Basic Authentication的HTTP代理。一种变通的方式就是采用IP地址认证,但在国内网络环境下,大多数用户都采用ADSL形式网络接入,IP是变化的,也无法采用IP地址绑定认证。因此迫切需要找到一种让Chrome自动实现HTTP代理用户名密码认证的方案。

Stackoverflow上有人分享了一种利用Chrome插件实现自动代理用户密码认证的方案非常不错,详细地址http://stackoverflow.com/questions/9888323/how-to-override-basic-authentication-in-selenium2-with-java-using-chrome-driver。

鲲之鹏的技术人员在该思路的基础上用Python实现了自动化的Chrome插件创建过程,即根据指定的代理“username:password@ip:port”自动创建一个Chrome代理插件,然后就可以在"Selenium + Chrome Driver"中通过安装该插件实现代理配置功能,具体代码如下:

# -*- coding:utf-8 -*-
# 测试"Selenium + Chrome"使用带用户名密码认证的代理
import os,re,time,zipfile
from selenium import webdriver


# Directory of the Chrome proxy template extension
# (https://github.com/RobinDev/Selenium-Chrome-HTTP-Private-Proxy).
# Built with os.path.join: the former literal contained a backslash, which is
# an invalid escape sequence ('\C') and only works as a separator on Windows.
CHROME_PROXY_HELPER_DIR = os.path.join('chrome-proxy-extensions', 'Chrome-proxy-helper')
# Directory where the generated Chrome proxy extension files are stored
CUSTOM_CHROME_PROXY_EXTENSIONS_DIR = 'chrome-proxy-extensions'

def get_chrome_proxy_extension(proxy):
    """Return the path of a Chrome proxy extension configured for ``proxy``.

    The extension is a zip file stored in CUSTOM_CHROME_PROXY_EXTENSIONS_DIR.
    It is built from the template files ``manifest.json`` and
    ``background.js`` found in CHROME_PROXY_HELPER_DIR; the placeholders
    ``%proxy_host``, ``%proxy_port``, ``%username`` and ``%password`` in
    ``background.js`` are replaced with the parsed proxy values.  An already
    existing zip for the same proxy is reused.

    Args:
        proxy: proxy with credentials, format ``username:password@ip:port``.

    Returns:
        str: path of the extension zip file.

    Raises:
        ValueError: if ``proxy`` does not match the expected format.
        FileNotFoundError: if a template file is missing.
    """
    m = re.search(r'([^:]+):([^@]+)@([\d.]+):(\d+)', proxy)
    if not m:
        raise ValueError('Invalid proxy format. Should be username:password@ip:port')
    # Split the proxy into its parts.
    username, password, ip, port = m.groups()

    os.makedirs(CUSTOM_CHROME_PROXY_EXTENSIONS_DIR, exist_ok=True)
    extension_file_path = os.path.join(
        CUSTOM_CHROME_PROXY_EXTENSIONS_DIR,
        '{}.zip'.format(proxy.replace(':', '_')),
    )
    if os.path.exists(extension_file_path):
        # The extension for this proxy was built earlier; reuse it.
        return extension_file_path

    # Check the templates BEFORE creating the zip, so a missing template can
    # never leave an incomplete archive behind that later calls would reuse.
    manifest_path = os.path.join(CHROME_PROXY_HELPER_DIR, 'manifest.json')
    background_path = os.path.join(CHROME_PROXY_HELPER_DIR, 'background.js')
    for template_path in (manifest_path, background_path):
        if not os.path.isfile(template_path):
            raise FileNotFoundError(
                'Chrome proxy template file not found: {}'.format(template_path))

    # Fill the proxy parameters into the background script template.
    with open(background_path, encoding='utf-8') as template_file:
        background_content = template_file.read()
    for placeholder, value in (
        ('%proxy_host', ip),
        ('%proxy_port', port),
        ('%username', username),
        ('%password', password),
    ):
        background_content = background_content.replace(placeholder, value)

    try:
        with zipfile.ZipFile(extension_file_path, mode='w') as zf:
            zf.write(manifest_path, 'manifest.json')
            zf.writestr('background.js', background_content)
    except Exception:
        # Remove a partially written archive so it is not mistaken for a
        # finished extension on the next call.
        if os.path.exists(extension_file_path):
            os.remove(extension_file_path)
        raise
    return extension_file_path


if __name__ == '__main__':
    # Manual test
    options = webdriver.ChromeOptions()
    # Add a custom proxy extension (configured for one proxy, including username/password authentication)
    options.add_extension(get_chrome_proxy_extension(proxy='username:password@ip:port'))
    driver = webdriver.Chrome(chrome_options=options, executable_path='./source/chromedriver_win32_2.35/chromedriver.exe')
    # Visit an IP echo site to check whether the proxy configuration took effect
    driver.get('http://httpbin.org/ip')
    # driver.get('http://ip138.com/')
    # driver.get('http://www.baidu.com/')
    # driver.get('https://www.google.com.hk/search?q=%E6%B8%A4%E6%B5%B7%E9%87%91%E6%8E%A7&safe=strict&tbs=sbd:1&tbm=nws&ei=&start=10&sa=N&biw=&bih=&dpr=1')
    # print(driver.page_source)
    # Keep the browser open for 60 seconds before closing it
    time.sleep(60)
    driver.quit()

测试结果如下所示:

{
  "origin": "192.168.8.84"
}

其它

# -*- coding: utf-8 -*-
import time,string,zipfile,os
from selenium import webdriver

def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password,
                               scheme='http', plugin_path=None):
    """Build a Chrome extension (zip) that sets a proxy and answers its auth prompt.

    The archive contains a ``manifest.json`` and a ``background.js``.  The
    background script configures a fixed proxy server through
    ``chrome.proxy.settings`` and registers a ``webRequest.onAuthRequired``
    listener that returns the given credentials.

    args:
        proxy_host (str): domain or ip address, ie proxy.domain.com
        proxy_port (int): port
        proxy_username (str): auth username
        proxy_password (str): auth password
    kwargs:
        scheme (str): proxy scheme, default http
        plugin_path (str): path of the extension zip to write; when None, a
            file named after the proxy parameters is created inside
            ``./chrome_proxy_helper``

    return str -> plugin_path
    """
    import json  # local import: only used to escape values embedded in the JS source

    if plugin_path is None:
        plugin_dir = './chrome_proxy_helper'
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(plugin_dir, exist_ok=True)
        plugin_path = plugin_dir + '/%s_%s@%s_%s.zip' % (
            proxy_username, proxy_password, proxy_host, proxy_port)

    # "<all_urls>" must appear both in the permissions and in the listener
    # filter below; it had been reduced to an empty string.
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version":"22.0.0"
    }
    """
    # String values are inserted as JSON literals (json.dumps adds the quotes
    # and escapes), so a quote or backslash in e.g. the password cannot break
    # the generated script.
    background_js = string.Template(
    """
    var config = {
            mode: "fixed_servers",
            rules: {
              singleProxy: {
                scheme: ${scheme},
                host: ${host},
                port: parseInt(${port})
              },
              bypassList: ["foobar.com"]
            }
          };

    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

    function callbackFn(details) {
        return {
            authCredentials: {
                username: ${username},
                password: ${password}
            }
        };
    }

    chrome.webRequest.onAuthRequired.addListener(
                callbackFn,
                {urls: ["<all_urls>"]},
                ['blocking']
    );
    """
    ).substitute(
        host=json.dumps(proxy_host),
        port=proxy_port,
        username=json.dumps(proxy_username),
        password=json.dumps(proxy_password),
        scheme=json.dumps(scheme),
    )
    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)

    return plugin_path


if __name__=='__main__':
    # Build the proxy-auth extension; username and password are left empty
    # here and have to be filled in for a proxy that requires authentication.
    proxyauth_plugin_path = create_proxyauth_extension(
        proxy_host="139.92.6.230",
        proxy_port=3100,
        proxy_username="",
        proxy_password="",
        scheme='http'
    )
    options = webdriver.ChromeOptions()
    # Start the browser maximized
    options.add_argument("--start-maximized")
    # Load the generated extension
    options.add_extension(proxyauth_plugin_path)
    driver = webdriver.Chrome(chrome_options=options,executable_path='../source/chromedriver_win32_2.40/chromedriver.exe')
    driver.get("http://httpbin.org/ip")
    # driver.get('http://ip138.com/')
    # driver.get('https://www.google.com.hk/search?q=%E6%B8%A4%E6%B5%B7%E9%87%91%E6%8E%A7&safe=strict&tbs=sbd:1&tbm=nws&ei=&start=10&sa=N&biw=&bih=&dpr=1')
    # print(driver.page_source)
    # Keep the browser open for 10 seconds before closing it
    time.sleep(10)
    driver.quit()

插件源代码 https://github.com/RobinDev/Selenium-Chrome-HTTP-Private-Proxy

chrome代理认证参考:https://www.cnblogs.com/rookies/p/6119786.html
https://www.cnblogs.com/roystime/p/6935543.html

参考:https://www.zhihu.com/question/35547395

你可能感兴趣的:(爬虫)