python asyncio协程异步爬虫

实践一下python的asyncio异步协程相关的库,爬取豆瓣电影top250,自己边查边试,写出个最基本的用法吧。

import time
import asyncio
from functools import wraps

import requests
import aiohttp
from lxml import etree

# URL template for one listing page; the placeholder receives the ``start`` offset.
base_url = 'https://movie.douban.com/top250?start={}&filter='
# Request headers sent with each download: a desktop-browser User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.139 Safari/537.36"
    ),
}

#计算函数运行时间的装饰器
def count_time(func):
    """Decorate ``func`` so that every call prints how long it took.

    After the wrapped call returns, the wrapper prints
    ``"<function name>:<elapsed seconds>"`` and hands back the wrapped
    function's return value unchanged. If ``func`` raises, the exception
    propagates and nothing is printed.

    The wrapper is a plain synchronous function: applied directly to an
    ``async def`` it would only time the creation of the coroutine object,
    not its execution.
    """
    @wraps(func)
    def wrapper(*arg, **kwargs):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way
        # time.time() can.
        started = time.perf_counter()
        res = func(*arg, **kwargs)
        elapsed = time.perf_counter() - started
        print('%s:%s' % (func.__name__, elapsed))
        return res
    return wrapper


def download_one(url):
    """Fetch one listing page with a blocking request and print its titles.

    Args:
        url: Address of the page to download.

    Returns:
        The list of strings matched by the title XPath query (the same list
        that is printed).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    # requests applies no timeout by default; without one a stalled
    # connection would block the whole script indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(response.text)
    # Text of the first <span> inside each entry's "hd" link.
    title_list = html.xpath('//ol[@class="grid_view"]//div[@class="hd"]/a/span[position()=1]/text()')
    print(title_list)
    return title_list


@count_time
def douban_synch():
    """Download the ten listing pages one after another with blocking requests."""
    # Offsets 0, 25, ..., 225 — one per page.
    for offset in range(0, 250, 25):
        download_one(base_url.format(offset))


async def download_one_synch(url, session):
    """Fetch one listing page through ``session`` and print its titles.

    Despite the ``_synch`` suffix this is the coroutine counterpart of
    ``download_one``.

    Args:
        url: Address of the page to download.
        session: Session object whose ``get`` method issues the request.

    Returns:
        The list of strings matched by the title XPath query (the same list
        that is printed).
    """
    # Send the same headers as the blocking download_one so both code paths
    # issue equivalent requests.
    async with session.get(url, headers=headers) as response:
        page_source = await response.text()
        html = etree.HTML(page_source)
        # Text of the first <span> inside each entry's "hd" link.
        title_list = html.xpath('//ol[@class="grid_view"]//div[@class="hd"]/a/span[position()=1]/text()')
        print(title_list)
        return title_list

#@count_time 装饰器直接加在这里不生效:它的wrapper是普通的同步函数,调用async函数只会立刻返回一个协程对象,统计到的只是创建协程的时间,而不是协程实际执行的时间
async def download_all():
    """Download the ten listing pages concurrently over one shared session."""
    # The aiohttp docs advise against creating a new session per request, so
    # a single session is opened here and passed to every download.
    async with aiohttp.ClientSession() as session:
        # Offsets advance in steps of 25 (0, 25, ..., 225), the same pages
        # douban_synch requests; formatting with the bare index would ask
        # for start=0..9 instead.
        tasks = [
            download_one_synch(base_url.format(i * 25), session)
            for i in range(10)
        ]
        # gather() runs the coroutines concurrently, as in the asyncio docs.
        await asyncio.gather(*tasks)


@count_time
def douban_asynch():
    """Run the concurrent download to completion from synchronous code."""
    # asyncio.run() is the entry point used in the official examples.
    all_pages = download_all()
    asyncio.run(all_pages)


if __name__ == '__main__':
    # Blocking version first, then the asyncio version; each prints its own
    # timing through count_time.
    for crawl in (douban_synch, douban_asynch):
        crawl()

运行结果:
douban_synch:3.096079111099243
douban_asynch:0.32878708839416504 

同步的3秒,异步的0.3秒,速度提升10倍,所以说效果还是很不错的。

参考:

https://docs.python.org/3/library/asyncio-task.html#running-tasks-concurrently

https://docs.aiohttp.org/en/stable/client_quickstart.html

https://blog.csdn.net/SL_World/article/details/86633611

https://morvanzhou.github.io/tutorials/data-manipulation/scraping/4-02-asyncio/

你可能感兴趣的:(python)