https://selenium-python.readthedocs.io/
https://sites.google.com/a/chromium.org/chromedriver/
https://chromedevtools.github.io/devtools-protocol/tot/Network/
pip3 install selenium
wget https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm
yum install ./google-chrome-stable_current_x86_64.rpm
在 http://chromedriver.storage.googleapis.com/ 这个页面找对应版本的下载地址
wget http://chromedriver.storage.googleapis.com/87.0.4280.20/chromedriver_linux64.zip
解压unzip chromedriver_linux64.zip
mv chromedriver /usr/bin即可
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType

# Headless Chrome with a spoofed user-agent.
chrome_options = webdriver.ChromeOptions()
for argument in (
    '--no-sandbox',
    '--headless',
    # Set the user-agent string.
    'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:55.0) Gecko/20100101 Firefox/55.0',
):
    chrome_options.add_argument(argument)

capabilities = DesiredCapabilities.CHROME
driver = webdriver.Chrome(options=chrome_options, desired_capabilities=capabilities)

# Inject an HTTP proxy into the capabilities, then restart the session so the
# already-running driver picks it up.
proxy_address = '222.185.28.38:32374'
manual_proxy = Proxy({
    'proxyType': ProxyType.MANUAL,
    'httpProxy': proxy_address,
})
manual_proxy.add_to_capabilities(capabilities)
driver.start_session(capabilities)

driver.get('http://httpbin.org/get')
print(driver.page_source)
# To switch proxies later, call add_to_capabilities() and start_session()
# again -- the driver itself does not need to be restarted.
#更换proxy后 重新执行proxy.add_to_capabilities(caps) 和 driver.start_session(caps)即可。不用重启driver更新代理成功。
由于是浏览器模式, headers等信息不必过多干预。换proxy和UA 是爬虫核心需求。其他基本操作详见官方文档https://python-selenium-zh.readthedocs.io/zh_CN/ 或者博客https://cloud.tencent.com/developer/article/1067129。
以前不喜欢使用webdriver还有一个原因, 一些ajax请求,还有一些通过开发者模式可以找到API返回json格式的请求,通过page_source解析效率低且字段没有API全。所以我就开始想,既然浏览器开发者模式可以对network进行分析,webdriver应该也有相应的功能。
但是方案A有以下几个弊端难以成气候。
方案A诸多问题, 果断放弃。
然后就开始调研webdriver日志性能分析方法。 经过一系列调研尝试,终于可以拿到webdriver的network日志了。但是结果有些失落。因为日志没有response.body。日志结构如下:
{
"message":{
"method":"Network.responseReceived",
"params":{
"frameId":"35489C704F5D95B9D01BD99C00BA3AFA",
"loaderId":"BA3FE8EAD2BB30562343F3940AEB8CBC",
"requestId":"BA3FE8EAD2BB30562343F3940AEB8CBC",
"response":{
"connectionId":12,
"connectionReused":false,
"encodedDataLength":230,
"fromDiskCache":false,
"fromPrefetchCache":false,
"fromServiceWorker":false,
"headers":{
"Access-Control-Allow-Credentials":"true",
"Access-Control-Allow-Origin":"*",
"Connection":"keep-alive",
"Content-Length":"571",
"Content-Type":"application/json",
"Date":"Tue, 24 Nov 2020 01:59:03 GMT",
"Server":"gunicorn/19.9.0"
},
"headersText":"HTTP/1.1 200 OK
Date: Tue, 24 Nov 2020 01:59:03 GMT
Content-Type: application/json
Content-Length: 571
Connection: keep-alive
Server: gunicorn/19.9.0
Access-Control-Allow-Origin: *
Access-Control-Allow-Credentials: true
",
"mimeType":"application/json",
"protocol":"http/1.1",
"remoteIPAddress":"3.211.1.78",
"remotePort":80,
"requestHeaders":{
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding":"gzip, deflate",
"Accept-Language":"en-US",
"Connection":"keep-alive",
"Host":"httpbin.org",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:55.0) Gecko/20100101 Firefox/55.0"
},
"requestHeadersText":"GET /get HTTP/1.1
Host: httpbin.org
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:55.0) Gecko/20100101 Firefox/55.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: en-US
",
"responseTime":1606183143651.414,
"securityState":"insecure",
"status":200,
"statusText":"OK",
"timing":{
"connectEnd":282.386,
"connectStart":47.972,
"dnsEnd":47.972,
"dnsStart":46.748,
"proxyEnd":-1,
"proxyStart":-1,
"pushEnd":0,
"pushStart":0,
"receiveHeadersEnd":519.16,
"requestTime":43961290.624483,
"sendEnd":282.616,
"sendStart":282.547,
"sslEnd":-1,
"sslStart":-1,
"workerFetchStart":-1,
"workerReady":-1,
"workerRespondWithSettled":-1,
"workerStart":-1
},
"url":"http://httpbin.org/get"
},
"timestamp":43961291.145069,
"type":"Document"
}
},
"webview":"35489C704F5D95B9D01BD99C00BA3AFA"
}
内容很丰富,但是没有我们最想要的。
接下来,上重点。我这个小白都能想到的方法果然已经有人在做了。很庆幸找到了完美的解决办法。
**Chrome DevTools** 的 Network.getResponseBody 方法,就是我们想要的。
https://chromedevtools.github.io/devtools-protocol/tot/Network/
Network.getResponseBody #
Returns content served for the given request.
parameters
requestId
RequestId
Identifier of the network request to get content for.
Return Object
body
string
Response body.
base64Encoded
boolean
True, if content was sent as base64.
import time
import json
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType

# Enable Chrome performance logging so Network.* CDP events show up in
# driver.get_log('performance').
caps = DesiredCapabilities.CHROME
caps['loggingPrefs'] = {
    'browser': 'ALL',
    'performance': 'ALL',
}
caps['perfLoggingPrefs'] = {
    'enableNetwork': True,
    'enablePage': False,
    'enableTimeline': False,
}

option = webdriver.ChromeOptions()
option.add_argument('--no-sandbox')
option.add_argument('--headless')
option.add_argument("--disable-extensions")
option.add_argument("--allow-running-insecure-content")
option.add_argument("--ignore-certificate-errors")
option.add_argument("--disable-single-click-autofill")
option.add_argument("--disable-autofill-keyboard-accessory-view[8]")
option.add_argument("--disable-full-form-autofill-ios")
option.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:55.0) Gecko/20100101 Firefox/55.0')
# Without w3c=False chromedriver refuses to hand out the logs (see note below
# in the article); keep legacy protocol mode on.
option.add_experimental_option('w3c', False)
option.add_experimental_option('perfLoggingPrefs', {
    'enableNetwork': True,
    'enablePage': False,
})

driver = webdriver.Chrome(options=option, desired_capabilities=caps)
driver.get('http://httpbin.org/get')

# BUG FIX: the original looped over *every* entry of driver.log_types and
# json.loads()'ed each message, but only 'performance' log entries carry
# JSON-encoded CDP messages -- a non-empty 'browser' log would raise an
# uncaught JSONDecodeError.  Read only the performance log.
if 'performance' in driver.log_types:
    for entry in driver.get_log('performance'):
        log = json.loads(entry['message'])['message']
        if log['method'] != 'Network.responseReceived':
            continue
        request_id = log['params']['requestId']
        try:
            # CDP call that fills the gap in the performance log: fetch the
            # actual response body for this request.
            response_body = driver.execute_cdp_cmd(
                'Network.getResponseBody', {'requestId': request_id})
            print(response_body['body'])
        except Exception:
            # Network.getResponseBody raises when the body is empty or no
            # longer cached by the renderer; narrow from the original bare
            # except so KeyboardInterrupt/SystemExit still propagate.
            print('response.body is null')
#coding=utf-8
import time, sys, os
from requestium import Session, Keys

# Requestium bundles a requests.Session with a selenium webdriver, allowing
# cookies to be transferred between the two.
s = Session(webdriver_path='/Users/lib/Documents/chromedriver',
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['disable-gpu',"--user-agent=Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20"]})
#webdriver_options={'arguments': ['headless','--proxy-server=http://user:[email protected]:40275']})
#webdriver_options={'arguments': ['headless','--proxy-server=http://116.11.254.37:80']})

# Earlier candidate URLs kept for reference; only the last assignment is used.
#url = 'https://icanhazip.com/'
url = 'https://www.che300.com/pinggu/v9c9m30614r2016-06g2.8'
url = 'https://magi.com/search?q=%E6%96%B0%E5%86%A0'
url = 'http://m.baidu.com/s?word=%E6%88%BF%E5%B1%B1%E6%96%B0%E7%9B%98'

s.driver.get(url)
html = s.driver.page_source
print(html)

#pic = response.xpath('//img/@src').extract_first()
pic = None
#if pic is not None:
if u'访问太过于频繁,请输入验证码后再次访问' in html:
    #print pic
    # BUG FIX: raw_input() is Python 2 only; the rest of the file targets
    # Python 3 (pip3, print() calls), so use input() here.
    code = input('请输入验证码:')
    print('您输入的验证码是:%s' % code)
    s.driver.find_element_by_xpath("//input[@name='code']").send_keys(code, Keys.ENTER)
    time.sleep(1)
    s.driver.ensure_element_by_xpath("//input[@type='submit']").click()
    #self.s.transfer_driver_cookies_to_session()
    #s.transfer_session_cookies_to_driver()
    time.sleep(3)
    print(s.driver.page_source)

s.driver.quit()
1、获取日志过程中如果没有 option.add_experimental_option(‘w3c’,False) 会报错。
2、Network.getResponseBody 在抓取请求的body过程中,如果body为空 程序报错
加异常处理。
3、Centos 服务器 安装好 chrome 和 chromedriver后 截图不能显示中文。 原因:系统默认是英文字符集 ,没有中文语言和字体库。解决办法如下。
查看修改服务器字符集
[root@192-168-17-194 htms-op-ui-automation-test]# echo $LANG   # 输出: en_US.UTF-8
确认系统是否支持中文字符集
locale -a |grep CN
Centos7修改系统默认字符集
vim /etc/locale.conf  # 将 LANG="en_US.UTF-8" 改为 LANG="zh_CN.UTF-8"
安装中文字符集支持
yum -y groupinstall "X Window System" && yum -y groupinstall chinese-support && yum -y groupinstall Fonts  # Centos7如果报错找不到chinese-support可忽略
QQ:739669518
爬虫技术交流群:259788518