引子
- Chromium Embedded Framework(CEF)是个基于 Google Chromium 项目的开源 Web browser 控件,支持Windows,Linux,Mac平台。
- CEF Python 为 CEF 提供 Python 绑定。
- cefpython3 并未全部实现 CEF 所有接口,CefSharp 紧跟 CEF 实现,如果对 Python 和 C# 都熟悉,CefSharp 是更好的选择。
示例
访问并保存网页
本节代码主要参考: https://github.com/righthandabacus/stealweb/blob/master/fakechrome.py
# encoding: utf-8
# author: qbit
# date: 2020-02-02
# summary: 使用 cefpython 访问 qbit 专栏首页,并把网页保存到 source.html
import os
import sys
import threading
import time
from loguru import logger
from cefpython3 import cefpython as cef
# 在发生异常时关闭 CEF 进程
sys.excepthook = cef.ExceptHook
class ClientHandler(object):
r""" 自定义客户端 Handler """
def __init__(self, chromeObject):
self.chrome = chromeObject
def GetViewRect(self, rect_out, **kwargs):
r""" 渲染接口 """
# [x, y, width, height]
rect_out.extend([0, 0, self.chrome.width, self.chrome.height])
return True
def OnConsoleMessage(self, browser, message, **kwargs):
r""" 浏览器控制台接口 """
self.chrome.console.append(message)
def OnLoadError(self, browser, frame, error_code, failed_url, **_):
self.chrome.ready = error_code
self.chrome._getReadyLock.acquire()
self.chrome._getReadyLock.notify()
self.chrome._getReadyLock.release()
def OnLoadingStateChange(self, browser, is_loading, **kwargs):
r""" 加载接口,当浏览器加载状态变化时调用 """
if is_loading:
# 加载中
self.chrome.ready = False
logger.info('Loading ...')
else:
# 加载完成
self.chrome.ready = True
self.chrome._getReadyLock.acquire()
self.chrome._getReadyLock.notify()
self.chrome._getReadyLock.release()
logger.info('Loaded.')
class Client(object):
def __init__(self, width=1920, height=1080, headless=False):
self.width = width
self.height = height
self.headless = headless
self.console = []
self.browser = None
self.source = None
self.domArray = None
self.windowParams = None
self.ready = True
self._getSourceLock = threading.Condition()
self._getDOMLock = threading.Condition()
self._getReadyLock = threading.Condition()
self._handler = ClientHandler(self)
settings = {
'user_agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) ' \
'AppleWebKit/537.36 (KHTML, like Gecko) ' \
'Chrome/64.0.3282.140 Safari/537.36',
# "debug": True, # 调试模式
# "log_severity": cef.LOGSEVERITY_INFO, # 日志的输出级别
# "log_file": "debug.log", # 设置日志文件
# "user_agent": "from stonejianbu 0.0.1",
}
switches = {
# 取消获取媒体流(如音频、视频数据),必须以空字符串代表否!~~~~
# "enable-media-stream": "",
# "proxy-server": "socks5://127.0.0.1:10808", # 设置代理
# "disable-gpu": "", # 设置渲染方式CPU or GPU
}
if self.headless:
settings['windowless_rendering_enabled'] = True
cef.Initialize(settings=settings, switches=switches)
def __getattr__(self, name):
r""" 将所有未知的属性和方法传递给 CEF 浏览器 """
return getattr(self.browser, name)
def getBrowser(self):
if self.browser:
return self.browser
# 创建浏览器实例
if self.headless:
parent_handle = 0
wininfo = cef.WindowInfo()
wininfo.SetAsOffscreen(parent_handle)
self.browser = cef.CreateBrowserSync(window_info=wininfo)
else:
self.browser = cef.CreateBrowserSync()
self.browser.SetClientHandler(self._handler)
self.browser.SendFocusEvent(True)
self.browser.WasResized() # 在 headless 模式下应至少调用一次这个方法
return self
def LoadUrl(self, url, synchronous=False):
r""" 将 URL 传递给浏览器 """
logger.info('LoadUrl %s ...' % url)
self.ready = False
self.browser.LoadUrl(url)
if synchronous: # 同步方式
self._getReadyLock.acquire()
if not self.ready:
self._getReadyLock.wait()
self._getReadyLock.release()
def getSource(self, synchronous=False):
r""" 返回 main frame 的 html 源码, """
self.source = None
self.browser.GetMainFrame().GetSource(self)
if synchronous:
self._getSourceLock.acquire()
if not self.source:
# 等待 Visit 函数准备好 source 的通知
self._getSourceLock.wait()
self._getSourceLock.release()
return self.source
def Visit(self, value):
r"StringVisitor 接口"
self.source = value
self._getSourceLock.acquire()
self._getSourceLock.notify()
self._getSourceLock.release()
def BrowserThread(browser):
r""" 线程入口函数 """
browser.ready = False
browser.LoadUrl(url, True) # True 为同步调用
logger.info('Write source to source.html ...')
with open('source.html', mode='w', encoding='utf8') as srcfp:
source = browser.getSource(True) # 同步获取
assert(source)
srcfp.write(source)
browser.CloseBrowser()
if __name__ == '__main__':
url = r'http://sf.gg/blog/qbit'
browser = Client(width=640, height=480).getBrowser()
browserThread = threading.Thread(target=BrowserThread, args=(browser,))
browserThread.start()
cef.MessageLoop()
browserThread.join()
browser = None
cef.Shutdown()
引入 jQuery
本节代码主要参考: Cefsharp winforms: Inject jquery into page
检索
本节代码主要参考: Python GUI: cefpython3的简单分析和应用
禁用图片
本节代码主要参考: How to disable image loading in CEF/JCEF?
保存图片
本节代码主要参考: C#(csharp)用CefSharp开发实现一个浏览器,抓取网站任意资源
本文出自 qbit snap