pyppeteer: 基于asyncio的异步网页渲染爬虫库

简介

与 Selenium 类似,pyppeteer 也能驱动浏览器渲染网页;不同的是,它基于 asyncio,所有操作都是异步的。

使用方法

安装方法

pip install pyppeteer

# python 3.7.5

import asyncio

from pyppeteer import launch
from pyquery import PyQuery as pq


async def main():
    """Load the JS-rendered quotes page in a pyppeteer browser and print the quote count.

    The browser is closed in a ``finally`` clause so that it is not left
    running when navigation or parsing raises.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto("http://quotes.toscrape.com/js/")
        # Parse the HTML reported by the page with PyQuery and count
        # the elements carrying the "quote" class.
        doc = pq(await page.content())
        print("Quotes:", doc(".quote").length)
    finally:
        await browser.close()


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())

复杂案例:屏蔽 CSS、图片、字体等资源,并发抓取多个页面

import asyncio

from pyppeteer import launch
from pyquery import PyQuery as pq


class Global:
    """Namespace for state shared by the coroutines in this script."""

    # Browser instance used by fetch(); assigned in main() before any
    # fetch() coroutine runs.
    browser = None


async def intercept_request(req):
    """Abort requests for the blocked resource types; let all others proceed."""
    blocked_types = (
        "image",
        "media",
        "eventsource",
        "websocket",
        "stylesheet",
        "font",
    )
    # Guard clause: anything not on the block list continues untouched.
    if req.resourceType not in blocked_types:
        await req.continue_()
        return
    await req.abort()


async def fetch():
    """Open a page in the shared browser, load the Juejin timeline and print its link count.

    Every request is routed through ``intercept_request``, which aborts
    images, media, stylesheets, fonts, event sources and websockets.
    The page is closed in a ``finally`` clause so that a failed navigation
    or parse does not leave a page open in the shared browser.
    """
    page = await Global.browser.newPage()
    try:
        await page.setUserAgent(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"
        )  # custom User-Agent
        await page.setViewport({"width": 1080, "height": 960})

        # Enable interception, then register the filter for each request.
        await page.setRequestInterception(True)
        page.on("request", intercept_request)

        await page.goto("https://juejin.im/timeline")
        # Fixed pause before reading the DOM -- presumably to give
        # client-side rendering time to finish.
        await asyncio.sleep(3)
        doc = pq(await page.content())
        # Output label kept as in the original; the value printed is the
        # number of <a> elements on the page.
        print("Quotes:", doc("a").length)
    finally:
        await page.close()


async def main():
    """Launch one shared browser, run ten fetch() coroutines concurrently, then close it.

    The browser is closed in a ``finally`` clause so that it is shut down
    even when one of the fetches raises.
    """
    Global.browser = await launch()
    try:
        # Run all ten fetches concurrently on the same browser.
        await asyncio.gather(*(fetch() for _ in range(10)))
    finally:
        await Global.browser.close()


# asyncio.run() creates a fresh event loop and closes it afterwards; it
# replaces the legacy get_event_loop().run_until_complete() pattern.
# The guard keeps the example from running on import.
if __name__ == "__main__":
    asyncio.run(main())

你可能感兴趣的:(python)