大家知道,用 Python 做爬虫时,如果遇到由 JavaScript 动态渲染的页面,就需要借助 PyQt 之类的框架。然而 PyQt5 相比 PyQt4 做了很大的改动,似乎难用了许多:没有了 findAllElements 方法,无法直接查找元素;用 lxml 解析页面得到的元素树又与浏览器中的原文档失去了关联,修改它并不会影响实际页面。那么该如何动态修改页面元素呢?下面给出我的改进用法,希望对大家有所帮助:
import csv
import time
try:
from PySide2.QtWidgets import QApplication
from PySide2.QtWebEngineWidgets import *
from PySide2.QtCore import *
except ImportError:
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QEventLoop, QTimer
from PyQt5.QtWebEngineWidgets import QWebEngineView,QWebEnginePage
import lxml.html
class BrowserRender(QWebEngineView):
    """Web view that loads a page and exposes a small scraping helper API.

    The rendered HTML is fetched asynchronously with ``page().toHtml`` and
    parsed into an lxml tree (``self.tree``) used for CSS-selector lookups.
    That tree is only a snapshot: changing it does not change the page shown
    in the browser.  Changes to the live page are made by running JavaScript
    in the page (see ``setSearchItem`` and ``click``).

    Asynchronous Qt callbacks are turned into blocking calls by entering the
    application event loop (``self.app.exec_()``) and having the callback
    call ``self.app.quit()``.
    """

    def __init__(self, display=True):
        """Create the view, and show the browser window if *display* is true."""
        # A QApplication must exist before the widget is constructed.  Reuse
        # an existing one so that a second BrowserRender can be created.
        self.app = QApplication.instance() or QApplication([])
        QWebEngineView.__init__(self)
        self.html = ''  # HTML of the most recent snapshot
        self.tree: lxml.html.etree._Element = None  # parsed snapshot, None until loaded
        if display:
            self.show()  # show the browser

    def open(self, url, timeout=60):
        """Load *url* and block until it finishes loading or *timeout* seconds pass.

        On success ``self.html`` and ``self.tree`` are refreshed from the
        loaded page.  On timeout a message is printed and the previous
        snapshot (if any) is left untouched.  Returns None in both cases.
        """
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        # Whichever fires first (page loaded or timer expired) ends the wait.
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        timer.start(int(timeout * 1000))  # QTimer takes integer milliseconds
        loop.exec_()  # delay here until download finished
        if timer.isActive():
            # The timer has not fired yet, so the load finished first.
            timer.stop()
            self._refresh_dom()
        else:
            # Timed out: no toHtml callback is pending, so entering the
            # application event loop here would never return.
            print('Request timed out:' + url)

    def _refresh_dom(self):
        """Request the page's current HTML and block until it has been parsed."""
        self.page().toHtml(self.callable)
        self.app.exec_()  # returns once callable() calls app.quit()

    def callable(self, data):
        """toHtml callback: store the HTML, parse it, and end the blocking wait."""
        self.html = data
        self.tree = lxml.html.fromstring(self.html)
        self.app.quit()

    def get_html(self):
        """Shortcut to return the current HTML"""
        return self.html

    def find(self, pattern):
        """Return the elements of the parsed snapshot matching CSS selector *pattern*.

        Returns an empty list when no page has been parsed yet.
        """
        if self.tree is None:
            return []
        return self.tree.cssselect(pattern)

    def attr(self, pattern, name, value):
        """Set attribute *name* to *value* on matching elements of the snapshot.

        Only the local lxml tree is modified, not the live page.
        """
        for element in self.find(pattern):
            element.set(name, value)

    def text(self, pattern, value):
        """Set the text of matching elements of the snapshot to *value*.

        Only the local lxml tree is modified, not the live page.
        """
        for element in self.find(pattern):
            element.text = value

    @staticmethod
    def _js_literal(value):
        """Return *value* as a safely quoted JavaScript string literal."""
        import json  # local import: the module is not imported at file level

        # JSON string syntax is valid JavaScript; quotes, backslashes and
        # control characters are escaped instead of breaking the script.
        return json.dumps(str(value))

    def _run_js(self, script):
        """Run *script* in the page and block until its result callback fires."""
        self.page().runJavaScript(script, self.js_callback)
        self.app.exec_()  # returns once js_callback() calls app.quit()

    def setSearchItem(self, pattern, search_value):
        """Set the value of matching inputs in the live page to *search_value*.

        Matching elements are looked up in the snapshot and addressed in the
        page by their ``id`` attribute (KeyError if an element has none).
        The script also selects the second option of the ``page_size``
        element and changes that option's text to 1000.
        """
        js_template = '''
        function myFunction(id, value)
        {{
            document.getElementById(id).value = value;
            document.getElementById('page_size').children[1].selected = true
            document.getElementById('page_size').children[1].innerText = 1000
            return document.getElementById(id).value;
        }}
        myFunction({id}, {value});
        '''
        for element in self.find(pattern):
            # One blocking wait per script, so that no wait is started when
            # nothing matched and no callback is left pending afterwards.
            self._run_js(js_template.format(
                id=self._js_literal(element.attrib['id']),
                value=self._js_literal(search_value),
            ))

    def click(self, pattern):
        """Click matching elements in the live page.

        Matching elements are looked up in the snapshot and addressed in the
        page by their ``id`` attribute (KeyError if an element has none).
        """
        js_template = '''
        function myFunction(id)
        {{
            document.getElementById(id).click();
            return id
        }}
        myFunction({id});
        '''
        for element in self.find(pattern):
            self._run_js(js_template.format(
                id=self._js_literal(element.attrib['id']),
            ))

    def js_callback(self, result):
        """runJavaScript callback: print the script result and end the wait."""
        print(result)
        self.app.quit()

    def wait_load(self, pattern, timeout=60):
        """Wait for this pattern to be found in webpage and return matches

        The snapshot is refreshed from the live page until *pattern* matches
        or *timeout* seconds have passed.  On timeout a message is printed
        and None is returned.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            self.app.processEvents()
            matches = self.find(pattern)
            if matches:
                return matches
            # Nothing yet: take a fresh snapshot of the page and look again.
            self._refresh_dom()
        print('Wait load timed out')
        return None
def main():
    """Search the example site for every entry and save the names to a CSV file.

    Fills the search box with '.', submits the search, waits for the result
    links to appear and writes one name per row to
    ``countries_or_districts.csv``.  Nothing is written if the results never
    appear.
    """
    br = BrowserRender()
    br.open('http://example.python-scraping.com/search')
    br.attr('#search_term', 'value', '.')
    br.setSearchItem('#search_term', '.')
    br.text('#page_size option:checked', '1000')
    br.click('#search')
    elements = br.wait_load('#results a')
    if not elements:
        # wait_load returned None (timed out): leave any existing file alone.
        return
    # newline='' is what the csv module expects of files it writes to; an
    # explicit encoding keeps non-ASCII names from depending on the locale.
    with open('countries_or_districts.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows([e.text_content().strip()] for e in elements)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
如有疑问,可以留言咨询。