cefpython3 调用浏览器示例(qbit)

引子

  • Chromium Embedded Framework(CEF)是个基于 Google Chromium 项目的开源 Web browser 控件,支持Windows,Linux,Mac平台。
  • CEF Python 为 CEF 提供 Python 绑定。
  • cefpython3 并未全部实现 CEF 所有接口,CefSharp 紧跟 CEF 实现,如果对 Python 和 C# 都熟悉,CefSharp 是更好的选择。

示例

访问并保存网页

本节代码主要参考: https://github.com/righthandabacus/stealweb/blob/master/fakechrome.py

# encoding: utf-8
# author: qbit
# date: 2020-02-02
# summary: 使用 cefpython 访问 qbit 专栏首页,并把网页保存到 source.html

import os
import sys
import threading
import time
from loguru import logger
from cefpython3 import cefpython as cef

# 在发生异常时关闭 CEF 进程
sys.excepthook = cef.ExceptHook 
class ClientHandler(object):
    r""" 自定义客户端 Handler """
    def __init__(self, chromeObject):
        self.chrome = chromeObject

    def GetViewRect(self, rect_out, **kwargs):        
        r""" 渲染接口 """
        # [x, y, width, height]
        rect_out.extend([0, 0, self.chrome.width, self.chrome.height]) 
        return True

    def OnConsoleMessage(self, browser, message, **kwargs):
        r""" 浏览器控制台接口 """
        self.chrome.console.append(message)

    def OnLoadError(self, browser, frame, error_code, failed_url, **_):
        self.chrome.ready = error_code
        self.chrome._getReadyLock.acquire()
        self.chrome._getReadyLock.notify()
        self.chrome._getReadyLock.release()

    def OnLoadingStateChange(self, browser, is_loading, **kwargs):
        r""" 加载接口,当浏览器加载状态变化时调用 """
        if is_loading:
            # 加载中
            self.chrome.ready = False
            logger.info('Loading ...')            
        else:
            # 加载完成
            self.chrome.ready = True
            self.chrome._getReadyLock.acquire()
            self.chrome._getReadyLock.notify()
            self.chrome._getReadyLock.release()
            logger.info('Loaded.')
class Client(object):    
    def __init__(self, width=1920, height=1080, headless=False):
        self.width = width
        self.height = height
        self.headless = headless
        
        self.console = []
        self.browser = None
        self.source = None
        self.domArray = None
        self.windowParams = None
        self.ready = True
        self._getSourceLock = threading.Condition()
        self._getDOMLock = threading.Condition()
        self._getReadyLock = threading.Condition()
        self._handler = ClientHandler(self)

        settings = {
            'user_agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) ' \
                         'AppleWebKit/537.36 (KHTML, like Gecko) ' \
                         'Chrome/64.0.3282.140 Safari/537.36',
            # "debug": True,  # 调试模式
            # "log_severity": cef.LOGSEVERITY_INFO, # 日志的输出级别
            # "log_file": "debug.log",  # 设置日志文件
            # "user_agent": "from stonejianbu 0.0.1", 
        }
        switches = {
            # 取消获取媒体流(如音频、视频数据),必须以空字符串代表否!~~~~
            # "enable-media-stream": "",  
            # "proxy-server": "socks5://127.0.0.1:10808",  # 设置代理
            # "disable-gpu": "",  # 设置渲染方式CPU or GPU
        }
        if self.headless:
            settings['windowless_rendering_enabled'] = True
        cef.Initialize(settings=settings, switches=switches)

    def __getattr__(self, name):
        r""" 将所有未知的属性和方法传递给 CEF 浏览器 """
        return getattr(self.browser, name)

    def getBrowser(self):
        if self.browser:
            return self.browser
        # 创建浏览器实例
        if self.headless:
            parent_handle = 0
            wininfo = cef.WindowInfo()
            wininfo.SetAsOffscreen(parent_handle)
            self.browser = cef.CreateBrowserSync(window_info=wininfo)
        else:
            self.browser = cef.CreateBrowserSync()
            
        self.browser.SetClientHandler(self._handler)
        self.browser.SendFocusEvent(True)
        self.browser.WasResized() # 在 headless 模式下应至少调用一次这个方法
        
        return self

    def LoadUrl(self, url, synchronous=False):
        r""" 将 URL 传递给浏览器 """
        logger.info('LoadUrl %s ...' % url)
        self.ready = False 
        self.browser.LoadUrl(url)
        if synchronous: # 同步方式
            self._getReadyLock.acquire()
            if not self.ready:
                self._getReadyLock.wait() 
            self._getReadyLock.release()

    def getSource(self, synchronous=False):
        r""" 返回 main frame 的 html 源码,  """
        self.source = None
        self.browser.GetMainFrame().GetSource(self)

        if synchronous:
            self._getSourceLock.acquire()
            if not self.source:
                # 等待 Visit 函数准备好 source 的通知
                self._getSourceLock.wait()
            self._getSourceLock.release()
        return self.source

    def Visit(self, value):
        r"StringVisitor 接口"
        self.source = value
        self._getSourceLock.acquire()
        self._getSourceLock.notify()
        self._getSourceLock.release()
def BrowserThread(browser):
    r""" 线程入口函数 """
    browser.ready = False
    browser.LoadUrl(url, True) # True 为同步调用
    logger.info('Write source to source.html ...')
    with open('source.html', mode='w', encoding='utf8') as srcfp:
        source = browser.getSource(True) # 同步获取
        assert(source)
        srcfp.write(source)
    browser.CloseBrowser()

if __name__ == '__main__':
    url = r'http://sf.gg/blog/qbit'
    browser = Client(width=640, height=480).getBrowser()
    browserThread = threading.Thread(target=BrowserThread, args=(browser,))
    browserThread.start()
    cef.MessageLoop() 
    browserThread.join()
    browser = None
    cef.Shutdown()

引入 jQuery

本节代码主要参考: Cefsharp winforms: Inject jquery into page

检索

本节代码主要参考: Python GUI: cefpython3的简单分析和应用

禁用图片

本节代码主要参考: How to disable image loading in CEF/JCEF?

保存图片

本节代码主要参考: C#(csharp)用CefSharp开发实现一个浏览器,抓取网站任意资源

本文出自 qbit snap

你可能感兴趣的:(cef,chrome,python,网页爬虫)