爬虫进阶:Selenium框架--3、调试

1、连接失败:
selenium.common.exceptions.WebDriverException: Message: newSession

直接将geckodriver.exe拷贝到当前py脚本的路径下就可以了

2、如果是在Linux环境下,需要一个虚拟化桌面

from pyvirtualdisplay import Display
from selenium import webdriver

# Create a 1920x1080 virtual display that is not shown on screen, and start
# it before any browser is launched; a Display that is never started
# provides no display for the browser to attach to.
display = Display(visible=0, size=(1920, 1080))
display.start()

3、模拟器被反爬
原因是通过webdriver访问页面时,页面的js会检测webdriver字段,一旦检测到此字段,请求就会被当作爬虫处理,应对策略如下。

工具:mitmproxy做代理,把响应(js文件)里面的webdriver替换为别的字段

部分代码如下:

# Rewrite the detection script in flight: in the response body of the
# matching JS bundle, replace the "webdriver" field name with "userAgent".
if "/_next/static/js/common_pdd" in flow.request.url:
    flow.response.text = flow.response.text.replace("webdriver", "userAgent")

4、滑动验证码验证失败(验证码这一块在后续验证码专题中专门描述)
同样的代码,chromedriver验证码通过,firefox滑动到正确位置却报失败,最后发现原因是firefox拖动滑块的速度太慢,被识别为机器操作。解决方法:增大滑动的速度。附上滑动验证的部分代码,如下:

def crack_geetest(self, max_retry=10):
        """Try to solve the GeeTest slider captcha shown in ``self.driver``.

        Each attempt screenshots the page, crops the captcha image, asks
        ``self.dama2`` for the gap position, drags the slider along a
        generated track and reads the result title shown by the widget.

        :param max_retry: maximum number of attempts to make.
        :return: True as soon as an attempt is judged successful; False once
            ``max_retry`` attempts have failed (immediately if ``max_retry``
            is smaller than 1).
        """
        driver = self.driver
        log = self.logger
        log.info("process handle geetest captcha...")

        def get_position():
            """
            Locate the captcha image on the page.

            :return: tuple ``(top, bottom, left, right)`` built from the
                element's location and size
            """
            img = driver.find_element_by_xpath('//div[@class="geetest_canvas_img geetest_absolute"]')
            time.sleep(2)
            location = img.location
            size = img.size
            top = location['y']
            bottom = location['y'] + size['height']
            left = location['x']
            right = location['x'] + size['width']
            return (top, bottom, left, right)

        def get_geetest_image(name):
            """
            Screenshot the page and crop the captcha image out of it.

            :param name: suffix used in the saved captcha file name
            :return: tuple ``(cropped image object, path of the saved file)``
            """
            full_img_path = './zhilian_screenshot_{}.png'.format(self.account['user_id'])
            driver.save_screenshot(filename=full_img_path)
            image = Image.open(fp=full_img_path, mode='r')
            top, bottom, left, right = get_position()
            print('验证码位置:({},{},{},{})'.format(left, top, right, bottom))
            # The vertical scroll offset is subtracted from the element's
            # y coordinates before cropping the screenshot.
            t = driver.execute_script('var q=document.documentElement.scrollTop; return q;')
            print('验证码位置:({},{},{},{})'.format(left, top - int(t), right, bottom - int(t)))
            print('p--->>>', t)
            captcha = image.crop((left, top - int(t), right, bottom - int(t)))
            captcha_file_name = './zhilian_captcha_{}_{}.png'.format(self.account['user_id'], name)
            captcha.save(captcha_file_name)
            return captcha, captcha_file_name

        def get_slider():
            """
            Find the slider button element.

            :return: the slider element
            """
            slider = driver.find_element_by_xpath('//div[@class="geetest_slider_button"]')
            return slider

        def get_gap(captcha_file_name):
            """
            Ask the ``self.dama2`` decoding service for the gap position.

            :param captcha_file_name: path of the cropped captcha image
            :return: x coordinate of the first reported point, as an int
            """
            res = self.dama2.decode_captcha(6137, captcha_file_name)
            print(res)
            # Example result:
            # ('b800b4f6-0d9a-40e2-a972-d87c91582b46', [(176, 101)])
            return int(res[1][0][0])

        def calculate_tracks(distance):
            """
            Build the drag trajectory for a slide of ``distance`` pixels.

            The slider first overshoots the target by a random 16-26 pixels
            and is then pulled back by the same amount.

            :param distance: horizontal distance to the gap
            :return: dict with ``forward_tracks`` (offsets summing to the
                rounded ``distance`` plus the overshoot, when that total is
                positive) and ``back_tracks`` (15 non-positive offsets
                summing to minus the overshoot)
            """
            def generate_rand(n, sum_v):
                """Return n non-positive ints whose sum is ``-sum_v``."""
                weights = [random.randint(1, 3) for _ in range(n)]
                weight_total = sum(weights)
                parts = [int(w / weight_total * sum_v) for w in weights]
                # int() truncates, so hand the missing units out at random.
                for _ in range(sum_v - sum(parts)):
                    parts[random.randint(0, n - 1)] += 1
                return [-p for p in parts]

            back_dis = random.randint(16, 26)
            distance += back_dis  # overshoot first, slide back afterwards
            v = 0
            t = 0.2
            forward_tracks = []

            current = 0  # exact (float) position reached so far
            moved = 0  # integer pixels already emitted
            mid = distance * 3 / 5
            while current < distance:
                # Accelerate over the first 3/5 of the way, then brake.
                a = 2 if current < mid else -3

                s = v * t + 0.5 * a * (t ** 2)
                v = v + a * t
                if s <= 0 or current + s >= distance:
                    # Either braking stalled just short of the target or this
                    # step would pass it: land exactly on the target. Without
                    # this the loop could run backwards and fail to stop.
                    current = distance
                else:
                    current += s
                # Emit the difference of cumulative rounded positions so that
                # per-step rounding errors cannot accumulate.
                step = round(current) - moved
                moved += step
                forward_tracks.append(step)

            # Slide back onto the exact position; totals -back_dis.
            back_tracks = generate_rand(15, back_dis)
            return {'forward_tracks': forward_tracks, 'back_tracks': back_tracks}

        def move_to_gap(slider, tracks):
            """
            Drag the slider to the gap.

            :param slider: slider element
            :param tracks: dict produced by ``calculate_tracks``
            :return: None
            """
            ActionChains(driver).click_and_hold(slider).perform()

            # Forward pass (includes the overshoot).
            for i in tracks['forward_tracks']:
                ActionChains(driver).move_by_offset(i, 0).perform()

            # Backward pass.
            time.sleep(0.5)
            for i in tracks['back_tracks']:
                ActionChains(driver).move_by_offset(i, 0).perform()

            # Small wobble: a few pixels back, then the same amount forward.
            random_sc = random.randint(3, 8)
            ActionChains(driver).move_by_offset(0 - random_sc, 0).perform()
            time.sleep(0.5)
            ActionChains(driver).move_by_offset(random_sc, 0).perform()

            # Release the mouse button.
            time.sleep(0.5)
            ActionChains(driver).release().perform()

        def crack(retry=0):
            """
            Run one attempt and return the widget's result text.

            :param retry: attempt number, used in the result screenshot name
            :return: ``textContent`` of the result title element
            """
            print('get_geetest_image')
            captcha_obj, captcha_file_name = get_geetest_image('2')
            gap = get_gap(captcha_file_name)
            log.info('缺口位置:{}'.format(gap))
            print('缺口位置:{}'.format(gap))
            # Subtract the initial offset of the gap.
            BORDER = 29
            gap -= BORDER
            # Build the trajectory.
            track = calculate_tracks(gap)
            log.info('滑动轨迹:{}'.format(track))
            print('滑动轨迹:{}'.format(track))
            # Drag the slider.
            slider = get_slider()
            move_to_gap(slider, track)
            driver.save_screenshot('./zhilian_capresult_{}_{}.png'.format(self.account['user_id'], retry))
            time.sleep(3)
            result = driver.find_element_by_xpath('//div[@class="geetest_result_title"]').get_attribute('textContent')
            log.info(result)
            print(result)
            return result

        retry = 1
        # Make exactly max_retry attempts; a max_retry below 1 makes none.
        while retry <= max_retry:
            log.info(f'{retry}/{max_retry} crack geetest.')
            # A missing result text counts as an ordinary failed attempt.
            success = crack(retry) or ''
            if '秒的速度超过' in success or 'passport.lagou.com/login/login' not in driver.current_url:
                log.info("crack succeeded!")
                print("crack succeeded!")
                return True

            log.info("crack failed, retry:{}/{}".format(retry, max_retry))
            retry += 1
            if '拖动滑块将悬浮图像正确拼合' in success:
                # The widget asks to try again: request a fresh captcha.
                driver.find_element_by_xpath('//a[@class="geetest_refresh_1"]').click()
            time.sleep(5)

        log.info("max retry reached, return False")
        return False

你可能感兴趣的:(爬虫)