Crawling dynamically loaded websites with pyppeteer

https://blog.csdn.net/mouday/article/details/89574718

https://blog.csdn.net/ywdhzxf/article/details/94649327

https://www.jianshu.com/p/fd9eb385a70e  scrapy integration

https://github.com/Python3WebSpider/ScrapyPyppeteer  scrapy integration

https://www.cnblogs.com/dyfblog/p/10170959.html  notes

# -*- coding: utf-8 -*-

import asyncio
from pyppeteer import launch
from pyquery import PyQuery as pq



# Example 1: render a page
async def crawl_page():
    browser = await launch({
        "headless": False,  # set the mode; defaults to headless
        'userDataDir': r'C:\DataDir',  # user data / cache directory
        'args': [
            '--no-sandbox',
            '--disable-infobars',  # hide the "browser is being controlled" info bar
        ],
        'dumpio': True,  # pipe browser process output to prevent blocking
    })
    # open a new page
    page = await browser.newPage()

    # set the viewport size
    await page.setViewport(viewport={'width': 1280, 'height': 800})

    # enable JavaScript; with enabled=False the page is not rendered
    await page.setJavaScriptEnabled(enabled=True)

    print("default UA", await browser.userAgent())
    # set the UA for the current page
    await page.setUserAgent(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")

    # enter the URL and navigate
    await page.goto('http://quotes.toscrape.com/js/')

    #####################
    # type into an input; if you can't write the selector, copy the JS path from devtools
    # await page.type("#kw.s_ipt", "百度")
    # click the search button
    # await page.click("input#su")
    # wait 0.5 s
    # await asyncio.sleep(0.5)
    # busy-wait for an element in a while loop; querySelector and xpath both work
    # while not await page.querySelector("#content_left"):
    #     pass
    # while not await page.xpath("//div[@id='content_left']"):
    #     pass

    # waitFor did not work here; per the official API this method tells xpath from a
    # selector by a leading "//", and it also takes a timeout parameter
    # while not await page.waitFor("#content_left"):
    #     pass

    # print("page cookies", await page.cookies())
    # print("page title", await page.title())
    # print("page content", await page.content())

    # take a screenshot
    await page.screenshot({'path': 'test.png'})

    # scroll to the bottom of the page
    await page.evaluate('window.scrollBy(0, window.innerHeight)')

    #####################
    # -----------------------
    # click a result and wait for the navigation:
    # title = await page.xpath('//*[@class="info-title"]')
    # if title:
    #     # await title[0].click()
    #     await asyncio.wait([
    #         title[0].click(),
    #         page.waitForNavigation()
    #     ])
    #     # reload the page that was navigated to
    #     await page.reload()
    #     response = await page.content()
    # -----------------------

    # grab the rendered content and parse it
    doc = pq(await page.content())
    print('Quotes:', doc('.quote').length)

    # close the browser
    await browser.close()
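As an alternative to handing the rendered HTML to pyquery, the extraction can also run inside the browser context via page.evaluate. A minimal sketch against the same quotes page (the extract_quote_texts helper is illustrative, not part of the original code):

# Sketch: collect the quote texts in the browser instead of parsing HTML in Python.
# Assumes `page` has already loaded http://quotes.toscrape.com/js/.
async def extract_quote_texts(page):
    return await page.evaluate('''() => {
        // gather the text of every ".quote .text" node into a plain list
        return Array.from(document.querySelectorAll('.quote .text'))
                    .map(node => node.textContent.trim());
    }''')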


# Example 2: screenshot, save as PDF, run JavaScript
async def save_pdf():
    # pass executablePath='...' to launch() to use a locally installed Chrome/Chromium
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://quotes.toscrape.com/js/')

    # save a screenshot of the page
    await page.screenshot(path='example.png')

    # export the page as a PDF
    await page.pdf(path='example.pdf')

    # run JavaScript
    dimensions = await page.evaluate('''() => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }''')

    print(dimensions)

    await browser.close()
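Both screenshot and pdf accept option dicts that mirror the upstream puppeteer API; a small sketch of the common options (option names taken from the puppeteer docs, verify against your pyppeteer version):

# Sketch: full-page screenshot and an A4 PDF with backgrounds.
# Assumes `page` has already navigated to the target URL.
async def save_artifacts(page):
    await page.screenshot({'path': 'full.png', 'fullPage': True})
    await page.pdf({'path': 'a4.pdf', 'format': 'A4', 'printBackground': True})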


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(crawl_page())
    # asyncio.get_event_loop().run_until_complete(save_pdf())
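A note on the busy-wait loops commented out in example 1: pyppeteer also exposes waitForSelector and waitForXPath, which block until the element appears (or a timeout expires) and are usually cleaner than spinning in a while loop. A minimal sketch, reusing the Baidu selector from the comments above:

from pyppeteer.errors import TimeoutError as PyppeteerTimeoutError

async def wait_for_results(page):
    # wait at most 5 seconds for the results container to appear
    try:
        await page.waitForSelector("#content_left", timeout=5000)
        # the xpath variant behaves the same way:
        # await page.waitForXPath("//div[@id='content_left']", timeout=5000)
    except PyppeteerTimeoutError:
        print("element did not appear within 5 s")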

Integrating with scrapy

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import logging
import random
import pyppeteer
import asyncio
import os
from scrapy.http import HtmlResponse

# quiet the console log output from pyppeteer and websockets
pyppeteer_level = logging.WARNING
logging.getLogger('websockets').setLevel(pyppeteer_level)
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)

class FundscrapyDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def __init__(self):
        print("Init downloaderMiddleware use pyppeteer.")
        # pin the Chromium revision that pyppeteer downloads and uses
        # os.environ['PYPPETEER_CHROMIUM_REVISION'] = '588429'
        # pyppeteer.DEBUG = False
        print(os.environ.get('PYPPETEER_CHROMIUM_REVISION'))
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.getbrowser())
        loop.run_until_complete(task)

        # self.browser = task.result()
        print(self.browser)
        print(self.page)
        # self.page = await browser.newPage()
    async def getbrowser(self):
        self.browser = await pyppeteer.launch()
        self.page = await self.browser.newPage()
        # return await pyppeteer.launch()
    async def getnewpage(self):
        return await self.browser.newPage()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.usePypuppeteer(request))
        loop.run_until_complete(task)
        # return task.result()
        return HtmlResponse(url=request.url, body=task.result(), encoding="utf-8", request=request)

    async def usePypuppeteer(self, request):
        print(request.url)
        # page = await self.browser.newPage()
        await self.page.goto(request.url)
        content = await self.page.content()
        return content 

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
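For Scrapy to actually route requests through this middleware, it has to be enabled in the project's settings.py. A minimal sketch (the module path fundscrapy.middlewares and the priority 543 are assumptions; adjust them to your project layout):

# settings.py (sketch): enable the pyppeteer downloader middleware
DOWNLOADER_MIDDLEWARES = {
    # module path and priority are project-specific
    'fundscrapy.middlewares.FundscrapyDownloaderMiddleware': 543,
}

# the middleware renders everything through one shared page object,
# so keeping concurrency low avoids requests stepping on each other
CONCURRENT_REQUESTS = 1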

 

 
