https://blog.csdn.net/mouday/article/details/89574718
https://blog.csdn.net/ywdhzxf/article/details/94649327
https://www.jianshu.com/p/fd9eb385a70e scrapy integration
https://github.com/Python3WebSpider/ScrapyPyppeteer scrapy integration
https://www.cnblogs.com/dyfblog/p/10170959.html notes
# -*- coding: utf-8 -*-
import asyncio
from pyppeteer import launch
from pyquery import PyQuery as pq
# Example 1: render a page
async def crawl_page():
    browser = await launch({
        "headless": False,             # headless mode; defaults to True (no visible window)
        'userDataDir': r'C:\DataDir',  # cache / user-data directory
        'args': [
            '--no-sandbox',
            '--disable-infobars',      # hide the "browser is being controlled" info bar
        ],
        'dumpio': True                 # pipe the browser's stdout/stderr to avoid blocking
    })
    # Open a new page
    page = await browser.newPage()
    # Set the viewport size
    await page.setViewport(viewport={'width': 1280, 'height': 800})
    # Enable/disable JavaScript; with enabled=False the page is not rendered
    await page.setJavaScriptEnabled(enabled=True)
    print("Default UA", await browser.userAgent())
    # Set the User-Agent for the current page
    await page.setUserAgent(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")
    #####################
    # Type into an input box; if you can't write the selector by hand,
    # right-click the element in DevTools and copy its JS path
    ## await page.type("#kw.s_ipt", "百度")
    # Click the search button
    ## await page.click("input#su")
    # Wait 0.5 s
    # await asyncio.sleep(0.5)
    # Busy-wait for an element in a while loop; both querySelector and xpath work
    # while not await page.querySelector("#content_left"):
    #     pass
    ## while not await page.xpath("//div[@id='content_left']"):
    ##     pass
    # waitFor did not work here; per the official API it decides between XPath and a
    # CSS selector by checking for a leading "//", and it also takes a timeout parameter
    # while not await page.waitFor("#content_left"):
    #     pass
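    # A more robust alternative to the busy-wait loops above (a sketch, not from the
    # original post; "#content_left" is just the illustrative selector used above):
    # waitForSelector/waitForXPath block until the element appears or the timeout
    # (in milliseconds) expires, raising a TimeoutError instead of spinning the CPU.
    # await page.waitForSelector("#content_left", {'timeout': 5000})
    # await page.waitForXPath("//div[@id='content_left']", {'timeout': 5000})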
    ## print("Page cookies", await page.cookies())
    ## print("Page title", await page.title())
    ## print("Page content", await page.content())
    # Take a screenshot
    await page.screenshot({'path': 'test.png'})
    # Scroll down by one viewport height
    await page.evaluate('window.scrollBy(0, window.innerHeight)')
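    # The single scrollBy above only moves one screen; to actually reach the bottom of a
    # long or lazy-loading page you can scroll in a loop until the document height stops
    # growing (a sketch, not from the original post; the 0.5 s pause is an arbitrary
    # choice to give lazy content time to load):
    # prev_height = 0
    # while True:
    #     cur_height = await page.evaluate('document.body.scrollHeight')
    #     if cur_height == prev_height:
    #         break
    #     await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
    #     prev_height = cur_height
    #     await asyncio.sleep(0.5)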
    # await browser.close()  # don't close here; the page is still used below
    #####################
    """
    -----------------------
    Clicking a link and waiting for the resulting navigation:
    title = await page.xpath('//*[@class="info-title"]')
    if title:
        # await title[0].click()
        await asyncio.wait([
            title[0].click(),
            page.waitForNavigation()
        ])
        # reload() re-loads the page we navigated to
        await page.reload()
        response = await page.content()
    -----------------------
    """
    # Navigate to a URL (like typing it into the address bar and pressing Enter)
    await page.goto('http://quotes.toscrape.com/js/')
    # Grab the rendered HTML and parse it
    doc = pq(await page.content())
    print('Quotes:', doc('.quote').length)
    # Close the browser
    await browser.close()
# Example 2: screenshot, save as PDF, run JavaScript
async def save_pdf():
    # launch() with no arguments uses pyppeteer's bundled Chromium
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://quotes.toscrape.com/js/')
    # Save a screenshot of the page
    await page.screenshot(path='example.png')
    # Export the page as a PDF
    await page.pdf(path='example.pdf')
    # Execute JavaScript in the page and get the result back
    dimensions = await page.evaluate('''() => {
        return {
            width: document.documentElement.clientWidth,
            height: document.documentElement.clientHeight,
            deviceScaleFactor: window.devicePixelRatio,
        }
    }''')
    print(dimensions)
    await browser.close()
if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(crawl_page())
    # asyncio.get_event_loop().run_until_complete(save_pdf())
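Both examples assume pyppeteer can find a Chromium binary. By default launch() downloads and uses its own bundled Chromium; if you want to pin the download to a specific revision or point at a locally installed Chrome instead, the sketch below shows the two knobs involved (the Chrome path is a placeholder, not from the original post; the revision number is the one mentioned in the middleware section):

# -*- coding: utf-8 -*-
import os
# Must be set before pyppeteer is imported, otherwise the default revision is used
os.environ['PYPPETEER_CHROMIUM_REVISION'] = '588429'

import asyncio
from pyppeteer import launch

async def open_with_local_chrome():
    # executablePath overrides the bundled Chromium entirely
    browser = await launch(
        executablePath=r'C:\Program Files\Google\Chrome\Application\chrome.exe')
    page = await browser.newPage()
    await page.goto('http://quotes.toscrape.com/js/')
    print(await page.title())
    await browser.close()

asyncio.get_event_loop().run_until_complete(open_with_local_chrome())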
Integration with scrapy
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random
import logging
import pyppeteer
import asyncio
import os
from scrapy.http import HtmlResponse

# Quiet down console logging from pyppeteer and its websocket transport
pyppeteer_level = logging.WARNING
logging.getLogger('websockets').setLevel(pyppeteer_level)
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)
class FundscrapyDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        print("Init downloader middleware using pyppeteer.")
        # Pin the Chromium revision if needed
        # os.environ['PYPPETEER_CHROMIUM_REVISION'] = '588429'
        # pyppeteer.DEBUG = False
        print(os.environ.get('PYPPETEER_CHROMIUM_REVISION'))
        # Launch the browser once, synchronously, by driving the event loop here
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.getbrowser())
        loop.run_until_complete(task)
        # self.browser = task.result()
        print(self.browser)
        print(self.page)
        # self.page = await browser.newPage()

    async def getbrowser(self):
        self.browser = await pyppeteer.launch()
        self.page = await self.browser.newPage()
        # return await pyppeteer.launch()

    async def getnewpage(self):
        return await self.browser.newPage()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Render the page with pyppeteer and hand Scrapy the final HTML
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.usePypuppeteer(request))
        loop.run_until_complete(task)
        # return task.result()
        return HtmlResponse(url=request.url, body=task.result(), encoding="utf-8", request=request)

    async def usePypuppeteer(self, request):
        print(request.url)
        # page = await self.browser.newPage()
        await self.page.goto(request.url)
        content = await self.page.content()
        return content
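    # Note (not in the original post): goto() returns once the 'load' event fires, which
    # can be before JS-rendered content exists. pyppeteer's goto() accepts a waitUntil
    # option such as 'networkidle2', and you can follow it with waitForSelector for a
    # specific element. A hedged variant of the method above, with an illustrative
    # selector for quotes.toscrape.com:
    #
    # async def usePypuppeteer(self, request):
    #     await self.page.goto(request.url, {'waitUntil': 'networkidle2'})
    #     # await self.page.waitForSelector('.quote', {'timeout': 10000})
    #     return await self.page.content()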
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
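To actually route requests through this middleware, it has to be enabled in the project's settings.py. The module path below assumes a project named "fundscrapy" with the class living in middlewares.py (adjust it to your own project layout); the priority 543 is Scrapy's usual example value, not a requirement:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'fundscrapy.middlewares.FundscrapyDownloaderMiddleware': 543,
}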