Python Selenium 工具类

#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
@author: JHC
@file: temp_selenium.py
@time: 2023/6/23 20:39
@desc:
"""
import time
# selenium 不支持捕获接口
# from selenium import webdriver
# seleniumwire 支持捕获接口
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import sys


# Vertical distance in pixels passed to window.scrollBy() for each scroll
# step performed by BaseSelenium.rolldown.
SLIDING_DISTANCE = 500

class BaseSelenium:
    """
    Convenience wrapper around a Chrome WebDriver created through selenium-wire.

    The driver is created lazily by ``get_driver`` and reused afterwards.
    Because the driver comes from ``seleniumwire``, the requests issued by the
    browser can be inspected through ``hock_data_by_urls``.
    """

    def __init__(self, headless=False, headers=None, proxy=None,
                 cookies=None, dev=False, incognito=False, debug_port=None, timeout=100):
        """
        Store the configuration and build the Chrome options.

        :param headless: run Chrome in headless mode
        :param headers: value handed to Chrome's ``user-agent`` switch
        :param proxy: proxy server as ``host:port``, e.g. ``127.0.0.1:8080``
        :param cookies: mapping of cookie name -> cookie value used by ``login``
        :param dev: open DevTools automatically for every tab
        :param incognito: start Chrome in incognito mode
        :param debug_port: remote-debugging port of an already running Chrome
            on 127.0.0.1; when set, the driver attaches to that browser
            instead of configuring a new one
        :param timeout: seconds ``wait_load_finish`` waits for an element
        """
        super().__init__()
        self.chrome_options = None
        self.driver = None
        # Last scroll offset observed by rolldown().
        self.temp_height = 0
        self.timeout = timeout
        # Cached lxml tree of the current page; reset by load_url().
        self.tree = None
        # Scroll offset -> how many consecutive checks rolldown() found the
        # page stuck at that offset.
        self.history = {}
        self.headless = headless
        self.headers = headers
        self.cookies = cookies
        self.proxy = proxy
        self.dev = dev
        self.incognito = incognito
        # Takeover mode: start Chrome yourself, for example
        #   chrome.exe --proxy-server=127.0.0.1:7890
        #       --remote-debugging-port=9527 --user-data-dir=<profile dir>
        # and pass debug_port=9527. The contents of Chrome's default
        # "User Data" directory can be copied into that profile directory.
        self.debug_port = debug_port
        self.load_options()

    def _init_tree(self):
        """
        Parse the current page source with lxml and cache the result.

        :return: the cached ``etree`` element for the page
        """
        if not self.tree:
            self.tree = etree.HTML(self.get_page_source())
        return self.tree

    def load_options(self):
        """
        Build the Chrome options from the stored configuration.

        In takeover mode (``debug_port`` set) only the debugger address is
        configured, plus ``--incognito`` when requested. Otherwise the
        launch switches for a fresh browser are added.

        :return: None; the result is stored in ``self.chrome_options``
        """
        options = Options()
        if self.incognito:
            options.add_argument('--incognito')
        if self.debug_port:
            options.add_experimental_option(
                "debuggerAddress", "127.0.0.1:{}".format(self.debug_port))
        else:
            if self.headless:
                # Run without a visible browser window.
                options.add_argument('--headless')
            if self.proxy:
                options.add_argument('--proxy-server={}'.format(self.proxy))
            if self.headers:
                options.add_argument('user-agent={}'.format(self.headers))
            if self.dev:
                options.add_argument("--auto-open-devtools-for-tabs")
            # Keep the browser open after the driver process goes away.
            options.add_experimental_option('detach', True)
            options.add_experimental_option('useAutomationExtension', False)
            options.add_experimental_option("excludeSwitches", ['enable-automation'])
            # Each switch is added exactly once.
            for switch in (
                '--start-maximized',
                '--disable-gpu',
                '--ignore-certificate-errors',
                '--disable-blink-features=AutomationControlled',
                '--ignore-urlfetcher-cert-requests',
            ):
                options.add_argument(switch)

        self.chrome_options = options

    def get_driver(self):
        """
        Return the shared driver, creating it on first use.

        The chromedriver binary is resolved through ``ChromeDriverManager``.

        :return: the ``webdriver.Chrome`` instance
        """
        if not self.chrome_options:
            self.load_options()

        if not self.driver:
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(
                service=service, options=self.chrome_options)
        return self.driver

    def rolldown(self, sleep=1, repeat=3):
        """
        Scroll down step by step until the scroll offset stops changing.

        Each step scrolls by ``SLIDING_DISTANCE`` pixels. The loop ends once
        the offset has stayed the same for more than ``repeat`` consecutive
        checks. It is iterative, so the page length cannot exhaust the
        recursion limit, and it starts from a clean comparison value on every
        call so that an earlier run cannot end this one prematurely.

        :param sleep: seconds to pause between two scroll steps
        :param repeat: how many unchanged checks are tolerated before stopping
        :return: None
        """
        driver = self.get_driver()
        previous = None
        stalled = 0
        while True:
            driver.execute_script("window.scrollBy(0,{})".format(SLIDING_DISTANCE))
            # Short pause so the scroll has been applied before reading it.
            time.sleep(0.1)
            current = driver.execute_script(
                "return document.documentElement.scrollTop "
                "|| window.pageYOffset "
                "|| document.body.scrollTop;"
            )
            self.temp_height = current
            if current == previous:
                stalled += 1
            else:
                previous = current
                stalled = 0
            self.history[current] = stalled
            if stalled > repeat:
                break
            time.sleep(sleep)

    def get_page_source(self):
        """
        Return the HTML source of the page currently loaded in the driver.

        :return: page source string
        """
        return self.get_driver().page_source

    def close_chrome(self):
        """
        Close the browser (if one was started) and exit the process.

        ``quit()`` is attempted even when ``close()`` fails. No browser is
        launched merely to be closed. The method always ends with
        ``sys.exit(0)``, so it never returns to the caller.

        :return: does not return
        """
        driver = self.driver
        try:
            if driver is not None:
                try:
                    driver.close()
                finally:
                    driver.quit()
        finally:
            self.driver = None
            sys.exit(0)

    def wait_load_finish(self, label):
        """
        Wait until an element matching the XPath is present in the page.

        :param label: XPath expression of the element to wait for
        :return: True when the element appeared within ``self.timeout``
            seconds, False when the wait raised an exception
        """
        driver = self.get_driver()
        try:
            WebDriverWait(driver, self.timeout).until(
                EC.presence_of_element_located((By.XPATH, label))
            )
        except Exception:
            # Keep the boolean contract for ordinary failures, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("Wait Timeout {}".format(self.timeout))
            return False
        return True

    def login(self):
        """
        Replace the browser cookies with ``self.cookies`` and refresh the page.

        ``self.cookies`` is a mapping of cookie name -> cookie value, e.g.
        ``{'ABC': 'DEF'}``. A missing mapping is treated as empty.

        :return: None
        """
        cookies = self.cookies or {}
        driver = self.get_driver()
        driver.delete_all_cookies()
        for name, value in cookies.items():
            driver.add_cookie({"name": name, "value": value})
        driver.refresh()

    def load_url(self, url):
        """
        Navigate to ``url`` and apply the configured cookies, if any.

        :param url: address to open
        :return: None
        """
        driver = self.get_driver()
        driver.get(url)
        # The cached tree belongs to the previous page.
        self.tree = None
        if self.cookies:
            self.login()
            print("Login Success !!!")

    def hock_data_by_urls(self, url_list):
        """
        Yield captured requests whose URL contains one of the given fragments.

        Requests without a response are skipped. A request is yielded once
        for every fragment it matches.

        :param url_list: iterable of URL fragments that identify requests
        :return: generator of dicts with the keys ``args`` (matched fragment),
            ``url`` and ``response`` (response body)
        """
        driver = self.get_driver()
        for request in driver.requests:
            response = request.response
            if response is None:
                continue
            url = request.url
            for fragment in url_list:
                if fragment in url:
                    yield {
                        "args": fragment,
                        "url": url,
                        "response": response.body,
                    }

    def main(self, url, url_keys=None):
        """
        Open ``url``, scroll to the bottom and print the matching request URLs.

        :param url: address to open
        :param url_keys: URL fragments handed to ``hock_data_by_urls``;
            defaults to ``["article/details/131357346"]``
        :return: None
        """
        if url_keys is None:
            url_keys = ["article/details/131357346"]
        self.history.clear()
        self.load_url(url)
        self.rolldown()
        for result in self.hock_data_by_urls(url_keys):
            print(result["url"])


if __name__ == '__main__':
    # Demo entry point: process the same article URL over and over.
    url = 'https://blog.csdn.net/CXY00000/article/details/131357346'
    # Endless loop: every iteration constructs a brand-new BaseSelenium and
    # calls main() on it.
    # NOTE(review): nothing in this block closes the browser or ends the
    # loop - confirm that this is intended.
    while True:
        BaseSelenium().main(url)


你可能感兴趣的:(python,selenium,开发语言)