1、连接失败:
selenium.common.exceptions.WebDriverException: Message: newSession
直接将geckodriver.exe拷贝到当前py脚本的路径下就可以了
2、如果是在Linux环境下,需要一个虚拟化桌面
from pyvirtualdisplay import Display
from selenium import webdriver
# A headless Linux host has no X server, so create an off-screen display
# for the browser to render into.
display = Display(visible=0, size=(1920, 1080))
# Creating the Display object is not enough: it must be started before the
# webdriver is instantiated, otherwise the browser has no display to attach to.
display.start()
3、模拟器被反爬
原因是在webdriver发送请求的时候,会有webdriver的js判断,当检测到此字段时会被作为爬虫处理,应对策略如下。
工具:使用mitmproxy做代理,把对应JS响应内容里的webdriver替换为别的字段(注意:替换的是响应内容,而不是请求)
部分代码如下:
# Only patch the JS bundle matched by this URL fragment; in its body, swap the
# "webdriver" identifier for another field name so the page-side check no
# longer finds it.
if "/_next/static/js/common_pdd" in flow.request.url:
    flow.response.text = flow.response.text.replace("webdriver", "userAgent")
4、滑动验证码验证失败(验证码这一块在后续验证码专题中专门描述)
同样的代码,使用chromedriver时验证码可以通过,而使用firefox时即使滑块滑到了正确位置也会报失败。最后发现原因是firefox拖动滑块的速度太慢,被识别为机器操作。解决方法是增大滑动速度。滑动验证的部分代码如下:
# Solve the Geetest slider captcha on the page currently loaded in self.driver.
# max_retry bounds the retry loop at the end of this function, which returns
# True once the captcha is accepted and False when the retry counter reaches
# max_retry.
def crack_geetest(self, max_retry=10):
# Short local aliases; the nested helpers below close over these two names.
driver = self.driver
l = self.logger
l.info("process handle geetest captcha...")
def get_position():
    """
    Locate the captcha canvas on the page.

    :return: tuple ``(top, bottom, left, right)`` built from the element's
        reported location and size
    """
    canvas = driver.find_element_by_xpath('//div[@class="geetest_canvas_img geetest_absolute"]')
    time.sleep(2)
    origin = canvas.location
    dims = canvas.size
    top = origin['y']
    bottom = origin['y'] + dims['height']
    left = origin['x']
    right = origin['x'] + dims['width']
    return (top, bottom, left, right)
def get_geetest_image(name):
    """
    Screenshot the page and cut the captcha area out of it.

    :param name: suffix used in the saved captcha file name
    :return: tuple ``(captcha image object, path of the saved captcha file)``
    """
    screenshot_path = './zhilian_screenshot_{}.png'.format(self.account['user_id'])
    driver.save_screenshot(filename=screenshot_path)
    page_image = Image.open(fp=screenshot_path, mode='r')

    top, bottom, left, right = get_position()
    print('验证码位置:({},{},{},{})'.format(left, top, right, bottom))

    # The element position is shifted by the current vertical scroll offset
    # before it is used as a crop box on the screenshot.
    t = driver.execute_script('var q=document.documentElement.scrollTop; return q;')
    scroll = int(t)
    crop_box = (left, top - scroll, right, bottom - scroll)
    print('验证码位置:({},{},{},{})'.format(*crop_box))
    print('p--->>>', t)

    captcha = page_image.crop(crop_box)
    captcha_file_name = './zhilian_captcha_{}_{}.png'.format(self.account['user_id'], name)
    captcha.save(captcha_file_name)
    return captcha, captcha_file_name
def get_slider():
    """
    Find the draggable slider button.

    :return: the slider element
    """
    return driver.find_element_by_xpath('//div[@class="geetest_slider_button"]')
def get_gap(captcha_file_name):
    """
    Ask the captcha-decoding service for the gap position.

    :param captcha_file_name: path of the saved captcha image
    :return: x coordinate of the first point in the service response, as int
    """
    # 6137 is passed through to the decoder unchanged; presumably a captcha
    # type code -- confirm against the dama2 client.
    res = self.dama2.decode_captcha(6137, captcha_file_name)
    print(res)
    # Sample response: ('b800b4f6-0d9a-40e2-a972-d87c91582b46', [(176, 101)])
    first_point = res[1][0]
    return int(first_point[0])
def calculate_tracks(distance):
    """
    Build a drag trajectory that overshoots the target and then slides back.

    The forward phase accelerates over the first 3/5 of the way and brakes
    over the rest, overshooting the target by a random 16-26 px; the back
    phase undoes that overshoot in 15 small steps. Steps are derived from the
    rounded cumulative position, so for an integer ``distance`` the offsets of
    both phases add up to exactly ``distance`` (rounding every step on its own
    let the total drift away from the target).

    :param distance: net horizontal displacement in pixels; expected to be a
        positive integer
    :return: dict with ``'forward_tracks'`` (non-negative integer offsets) and
        ``'back_tracks'`` (15 non-positive integer offsets)
    """

    def generate_rand(n, sum_v):
        """Return n non-positive ints that sum to -sum_v."""
        weights = [random.randint(1, 3) for _ in range(n)]
        total = sum(weights)
        parts = [int(w / total * sum_v) for w in weights]
        # Truncation can leave a shortfall; hand it out one pixel at a time.
        for _ in range(sum_v - sum(parts)):
            parts[random.randint(0, n - 1)] += 1
        return [-p for p in parts]

    back_dis = random.randint(16, 26)
    distance += back_dis  # slide past the target first, come back afterwards

    t = 0.2      # duration of one simulated step
    v = 0        # current speed
    current = 0  # exact (fractional) position reached so far
    moved = 0    # whole pixels already emitted as offsets
    mid = distance * 3 / 5
    forward_tracks = []
    while current < distance:
        a = 2 if current < mid else -3
        s = v * t + 0.5 * a * (t ** 2)
        v = v + a * t
        if s <= 0:
            # Braking stalled just short of the target: finish in this step
            # instead of drifting backwards and looping again.
            current = distance
        else:
            # Never travel past the target on the last step.
            current = min(current + s, distance)
        # Emit the change in *rounded cumulative* position so rounding errors
        # cannot accumulate across steps.
        step = round(current) - moved
        forward_tracks.append(step)
        moved += step

    # Slide back to the exact position; these offsets total -back_dis.
    back_tracks = generate_rand(15, back_dis)
    return {'forward_tracks': forward_tracks, 'back_tracks': back_tracks}
def move_to_gap(slider, tracks):
    """
    Drag the slider along the given trajectory and release it.

    :param slider: slider element to grab
    :param tracks: dict with 'forward_tracks' and 'back_tracks' offset lists
    :return: None
    """

    def shift(dx):
        # One horizontal mouse move, performed immediately.
        ActionChains(driver).move_by_offset(dx, 0).perform()

    ActionChains(driver).click_and_hold(slider).perform()

    # Forward phase.
    for dx in tracks['forward_tracks']:
        shift(dx)

    # Backward phase.
    time.sleep(0.5)
    for dx in tracks['back_tracks']:
        shift(dx)

    # Small wobble: step left by a few pixels, then the same amount right.
    jitter = random.randint(3, 8)
    shift(-jitter)
    time.sleep(0.5)
    shift(jitter)

    # Let go of the slider.
    time.sleep(0.5)
    ActionChains(driver).release().perform()
def crack(retry=0):
    """
    Run one solving attempt: capture the captcha, get the gap, drag the slider.

    :param retry: attempt number, used only in the result screenshot file name
    :return: text content of the captcha result title element
    """
    # Offset subtracted from the detected gap position before dragging.
    BORDER = 29

    print('get_geetest_image')
    _, captcha_path = get_geetest_image('2')

    gap = get_gap(captcha_path)
    gap_message = '缺口位置:{}'.format(gap)
    l.info(gap_message)
    print(gap_message)

    track = calculate_tracks(gap - BORDER)
    track_message = '滑动轨迹:{}'.format(track)
    l.info(track_message)
    print(track_message)

    move_to_gap(get_slider(), track)
    driver.save_screenshot('./zhilian_capresult_{}_{}.png'.format(self.account['user_id'], retry))

    # Wait before reading the verdict from the page.
    time.sleep(3)
    title = driver.find_element_by_xpath('//div[@class="geetest_result_title"]')
    result = title.get_attribute('textContent')
    l.info(result)
    print(result)
    return result
retry = 1
while True:
l.info(f'{retry}/{max_retry} crack geetest.')
if retry == max_retry:
l.info("max retry reached, return False")
return False
success = crack(retry)
if '秒的速度超过' in success or 'passport.lagou.com/login/login' not in driver.current_url:
l.info("crack succeeded!")
print("crack succeeded!")
return True
elif '拖动滑块将悬浮图像正确拼合' in success:
retry += 1
l.info("crack failed, retry:{}/{}".format(retry, max_retry))
driver.find_element_by_xpath('//a[@class="geetest_refresh_1"]').click()
time.sleep(5)
continue
else:
time.sleep(5)
retry += 1
l.info("crack failed, retry:{}/{}".format(retry, max_retry))
continue