Robots exclusion protocol
Append /robots.txt to a site's domain name to view that site's robots protocol.
e.g. https://www.baidu.com/robots.txt
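A small sketch with the standard-library urllib.robotparser, which reads that file and answers whether a given URL may be crawled:
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')  # the site's robots protocol file
rp.read()                                       # download and parse the rules
# can_fetch(user_agent, url): True if the rules allow this user agent to crawl the URL
print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=python'))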
1. Upgrade the ChromeDriver version
http://chromedriver.storage.googleapis.com/index.html
2. Install the Chrome browser driver: https://localprod.pandateacher.com/python-manuscript/crawler-html/chromedriver/ChromeDriver.html
3. Get the local Python folder path
import sys
pythonpath = sys.executable
print(pythonpath)  # print the local Python interpreter path
4. Put ChromeDriver into the Python folder
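A quick sketch to confirm the placement (this assumes the Windows filename chromedriver.exe; drop the .exe on macOS/Linux):
import os
import sys

python_dir = os.path.dirname(sys.executable)                # folder of the local Python interpreter
driver_file = os.path.join(python_dir, 'chromedriver.exe')  # where ChromeDriver should now live
print(driver_file, os.path.exists(driver_file))             # True means ChromeDriver was copied correctly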
Head (imports and browser setup)
import time
from selenium import webdriver  # import the webdriver module from the selenium library
from selenium.webdriver import ActionChains  # for mouse-hover actions
from selenium.webdriver.chrome.options import Options  # Options class for Chrome settings

# Setup: headless (silent) mode for the local Chrome browser
chrome_options = Options()  # instantiate an Options object
chrome_options.add_argument('--headless')  # run Chrome in headless mode
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])  # suppress meaningless console output
driver_path = './chromedriver.exe'  # local path of the downloaded ChromeDriver

driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)  # open Chrome in the background
# driver = webdriver.Chrome()  # open a real (visible) browser window
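A minimal usage sketch of this headless driver (baidu.com is only an example target):
driver.get('https://www.baidu.com/')  # open the target page in the background browser
time.sleep(2)                         # crude wait for the page to finish rendering
print(driver.title)                   # confirm the page actually loaded
driver.quit()                         # release the browser process when finished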
Body (request logic)
import requests  # the requests library handles the HTTP calls below

# Disguise the request headers
headers = {
    'origin': 'https://y.qq.com',
    # where the request comes from
    'referer': 'https://y.qq.com/n/yqq/song/004Z8Ihr0JIu5s.html',
    # the request source; carries more information than "origin"
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    # identifies the device and browser the request is sent from
}
params = {
    'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
    'offset': '10',
    'limit': '20',
    'sort_by': 'voteups',
}
# package the query-string parameters
# url = the target API endpoint (define it before sending the request)
res_music = requests.get(url, headers=headers, params=params)
# send the request with the headers and parameters attached
print(res_music.status_code)  # check the status code; 200 means the request succeeded
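A follow-up sketch, assuming the endpoint returns JSON (the key structure depends on the actual API):
json_music = res_music.json()  # parse the response body as JSON into a dict
print(type(json_music))        # inspect the top-level structure before digging for fields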
# First inspect the target site's form parameters, then fill them in yourself
data = {
    'log': 'spiderman',  # account name
    'pwd': 'crawler334566',  # password
    'wp-submit': '登录',  # the submit button's value expected by the form
    'redirect_to': 'https://wordpress-edu-3autumn.localprod.oc.forchange.cn',
    'testcookie': '1'
}  # package the login parameters into a dictionary and assign it to data
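A minimal sketch of submitting this form with a requests Session so the login cookies are kept for later requests; the /wp-login.php endpoint is assumed from the standard WordPress login form, not taken from these notes:
session = requests.Session()  # a Session keeps cookies across requests after logging in
login_url = 'https://wordpress-edu-3autumn.localprod.oc.forchange.cn/wp-login.php'  # assumed endpoint
res_login = session.post(login_url, data=data)  # submit the login form
print(res_login.status_code)                    # 200 generally means the form was accepted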
Element location:
The eight Selenium element-location methods - eastonliu - 博客园 (cnblogs)
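A few of those locators sketched against Baidu's homepage search box (id "kw", name "wd", class "s_ipt" at the time of writing; these attribute values are assumptions and may change), written in the same Selenium 3 style as the rest of these notes:
driver.find_element_by_id('kw')                # by the id attribute
driver.find_element_by_name('wd')              # by the name attribute
driver.find_element_by_class_name('s_ipt')     # by a CSS class name
driver.find_element_by_tag_name('input')       # by the HTML tag name
driver.find_element_by_link_text('新闻')        # by the exact link text
driver.find_element_by_css_selector('#kw')     # by a CSS selector
driver.find_element_by_xpath('//*[@id="kw"]')  # by an XPath expression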
Handling the error raised when the page loads slowly and the target element has not appeared yet:
Loop (retry until the element appears)
def wait_for_element():
    driver.get('https://www.baidu.com/')  # target URL
    while True:
        try:
            return driver.find_element_by_xpath('//html')  # target element; return it as soon as it exists
        except Exception:
            time.sleep(0.1)  # not rendered yet: wait briefly, then retry
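An alternative sketch using Selenium's built-in explicit wait (WebDriverWait with expected_conditions), which stops with a TimeoutException instead of looping forever; the XPath is again Baidu's search box as a placeholder:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver.get('https://www.baidu.com/')  # target URL
# wait up to 10 seconds for the element to be present in the DOM
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="kw"]'))
)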