selenium补充

selenium补充

  • 等待元素加载
    • time.sleep()
    • web.implicitly_wait()
    • WebDriverWait
  • 实例1
  • 跳过滑块检测

等待元素加载

selenium中等待方法

time.sleep()

  • 强制等待:不论元素是否已经加载出来,都必须等满指定的时间

web.implicitly_wait()

  • 设置后全局使用,后面元素默认遵守等待。
  • 如果元素还没加载出来,最多等待设定的时长;超时仍未找到则抛出 NoSuchElementException。
  • 如果元素加载出来了,就不用等

WebDriverWait

  • 局部使用,单独等一个元素
  • 如果出现了,就不用等待
  • 如果在等待时间内一直没有出现,超时后会抛出 TimeoutException。
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# `webdriver` is a module and cannot be called; instantiate a concrete
# browser driver instead (Chrome, as in the rest of this article).
web = webdriver.Chrome()
# Wait at most 10 seconds, re-checking the condition every 0.5 seconds.
ele = WebDriverWait(web, 10, 0.5).until(
    # Wait for the element at this XPath to be present; the wait ends as
    # soon as it appears.
    EC.presence_of_element_located((By.XPATH, "/html/body/div[5]/div[2]/div[1]/div/div"))
)

实例1

import base64 # 图片转成字节码串,在网上传输
import json
import time
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
import requests
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium import webdriver
# Client for the ttshitu (图鉴) captcha-recognition service.
class VerifyCode():
    """Send captcha images to the ttshitu ``/predict`` endpoint.

    Every ``verify_*`` method reads the image file(s) from disk, base64-encodes
    them and posts them as JSON together with the account credentials.

    Each method returns ``result["data"]["result"]`` when the service reports
    success, and otherwise the service's ``"message"`` text. Callers must
    therefore be prepared to receive an error message instead of an answer.

    Args:
        username: Account name for the recognition service.
        password: Password for the recognition service.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error, so a stalled service cannot hang the
            caller indefinitely.
    """

    # NOTE(review): plain HTTP sends the credentials unencrypted; switch to
    # HTTPS if the service supports it - confirm before changing.
    API_URL = "http://api.ttshitu.com/predict"

    def __init__(self, username="用户名", password='密码', timeout=60):
        self.username = username
        self.password = password
        self.timeout = timeout

    @staticmethod
    def _encode_image(path):
        """Return the content of the file at ``path`` as a base64 text string."""
        with open(path, 'rb') as f:
            return base64.b64encode(f.read()).decode()

    def _predict(self, typeid, image, imageback=None):
        """Post one recognition request and return its result or error message.

        Args:
            typeid: Recognition type code understood by the service.
            image: Path of the main image file.
            imageback: Optional path of the background image; only sent when
                given.
        """
        data = {
            "username": self.username,
            "password": self.password,
            "typeid": typeid,
            "image": self._encode_image(image),
        }
        if imageback is not None:
            data["imageback"] = self._encode_image(imageback)
        response = requests.post(self.API_URL, json=data, timeout=self.timeout)
        result = json.loads(response.text)
        if result['success']:
            return result["data"]["result"]
        return result["message"]

    # Type 18: gap recognition (needs two images: the target image and the
    # image containing the gap).
    def verify_缺口(self, img, img_back):
        """Recognise a slider gap from a target image and a background image."""
        return self._predict(18, img, img_back)

    # Type 33: single-gap recognition (returns the X coordinate; needs only
    # one image).
    def verify_单缺口(self, img):
        """Recognise a slider gap from a single image."""
        return self._predict(33, img)

    # Type 27: click captcha, 1 to 4 coordinates.
    def verify_点击(self, img):
        """Recognise the click positions in a click-captcha image."""
        return self._predict(27, img)
# Log in to the site, then fetch and parse the job-list pages.
def _parse_click_points(result):
    """Turn a ``"x1,y1|x2,y2"`` captcha answer into a list of ``(x, y)`` ints.

    Raises:
        ValueError: if ``result`` is not in that format, e.g. because the
            recognition service returned an error message instead of
            coordinates.
    """
    points = []
    for point in result.split("|"):
        parts = point.split(",")
        try:
            points.append((int(parts[0]), int(parts[1])))
        except (IndexError, ValueError):
            raise ValueError(f"unexpected captcha result: {result!r}") from None
    return points


def login(username="xxxx", password="xxxx", driver=None):
    """Log in to zhipin.com, solving the click captcha via ``VerifyCode``.

    Args:
        username: Account name typed into the login form.
        password: Password typed into the login form.
        driver: Selenium driver to log in with. When omitted, a new Chrome
            instance is started.

    Returns:
        The driver that performed the login, so the caller can keep using the
        logged-in session.

    Raises:
        ValueError: if the captcha service did not return click coordinates.
    """
    # 1. Complete the login form.
    login_url = "https://login.zhipin.com/?ka=header-login"
    web = driver if driver is not None else webdriver.Chrome()
    web.get(login_url)
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which no
    # longer exists in Selenium 4.
    web.find_element(By.XPATH, '//*[@id="wrap"]/div[2]/div[1]/div[2]/div[1]/form/div[3]/span[2]/input').send_keys(username)
    web.find_element(By.XPATH, '//*[@id="wrap"]/div[2]/div[1]/div[2]/div[1]/form/div[4]/span/input').send_keys(password)
    web.find_element(By.XPATH, '//*[@id="pwdVerrifyCode"]/div').click()
    ele = WebDriverWait(web, 10, 0.5).until(
        EC.presence_of_element_located((By.XPATH, "/html/body/div[5]/div[2]/div[1]/div/div"))
    )
    # Screenshot the captcha element and send it for click recognition.
    # VerifyCode() is created with its default (placeholder) credentials.
    ele.screenshot("图片验证.png")
    result = VerifyCode().verify_点击("图片验证.png")
    # Parse everything first so an error message fails before any click.
    for x, y in _parse_click_points(result):
        # NOTE(review): newer Selenium 4 releases measure this offset from the
        # element's centre rather than its top-left corner - confirm against
        # the installed version and the coordinates the service returns.
        ActionChains(web).move_to_element_with_offset(ele, x, y).click().perform()
        time.sleep(1)
    web.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div/div[3]/a/div').click()
    time.sleep(2)
    web.find_element(By.XPATH, '//*[@id="wrap"]/div[2]/div[1]/div[2]/div[1]/form/div[6]/button').click()
    time.sleep(3)
    return web


# Fetch the page source of one result page.
def get_page_source(url, need_login=True):
    """Load ``url`` in the module-level ``web`` driver and return its HTML.

    The page source is also printed and written to ``ab.html``.

    Args:
        url: Page to load.
        need_login: When true, log in first using the same driver, so the
            session that loads ``url`` is the one that is logged in.

    Returns:
        The page source as a string.
    """
    if need_login:
        login(driver=web)
    web.get(url)
    # Explicit wait of up to 20 seconds for a job link to appear. The locator
    # must select an element: an XPath ending in /text() selects a text node,
    # which Selenium cannot return.
    WebDriverWait(web, 20).until(
        EC.presence_of_element_located((By.XPATH, "//ul[@class='job-list-box']//span[@class='job-name']/a"))
    )
    # Read the source once so the printed, saved and returned HTML agree.
    page_source = web.page_source
    print(page_source)
    with open('ab.html', 'w', encoding='utf-8') as f:
        f.write(page_source)
    return page_source


if __name__ == '__main__':
    web = webdriver.Chrome()
    try:
        for i in range(1, 10):
            print(f"正在抓取第{i}页")
            url = f"https://www.zhipin.com/c101010100/?query=python&page={i}&ka=page-{i}"
            print(url)
            # Log in only once; later pages reuse the logged-in session.
            content = get_page_source(url, need_login=(i == 1))
            tree = etree.HTML(content)
            job_names = tree.xpath("//ul[@class='job-list-box']//span[@class='job-name']/a/text()")
            print(job_names)
    finally:
        # Always close the browser, even if a page fails.
        web.quit()

跳过滑块检测

验证码之滑块检测,跳过浏览器检测设置

# Usable from version 88 onwards (presumably the Chrome version - confirm).
# Chrome and Options are imported here because nothing else in this snippet
# brings them into scope.
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

option = Options()
# Alternative setting, left disabled:
# option.add_experimental_option('excludeSwitches', ['enable-automation'])
# Turn off the Blink "AutomationControlled" feature, which is intended to stop
# pages from detecting that the browser is driven by automation.
option.add_argument('--disable-blink-features=AutomationControlled')
web = Chrome(options=option)

你可能感兴趣的:(spider,selenium,python,selenium补充)