asyncio和aiohttp的简单应用

import os
import aiohttp
import asyncio

# Maximum number of coroutines allowed to run at the same time.
# NOTE(review): created at module level, before asyncio.run() starts a loop —
# fine on Python 3.10+, but on older versions the primitive binds to the
# current loop at creation and may raise "attached to a different loop".
semaphore = asyncio.Semaphore(2)  # 同时允许的最大协程数量


async def fetch(page, url):
    """Download *url* and save the body as ``<page>.html``.

    At most two downloads run concurrently (module-level ``semaphore``);
    each slot is held for an extra 5 s to pace the requests.

    :param page: integer index used to name the output file
    :param url: page to fetch
    """
    save_html = str(page) + ".html"
    # Check the cache BEFORE acquiring the semaphore: in the original the
    # check sat inside the semaphore, so an already-downloaded page still
    # occupied one of the two concurrency slots while doing no work.
    if os.path.exists(save_html):
        return
    async with semaphore:  # limit the number of coroutines running at once
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                response_text = await response.text()
                with open(save_html, "w", encoding="utf-8") as f:
                    f.write(response_text)
                # Report the response for inspection.
                print(f'Response from {url}:')
                print(response_text)
                print('---')
        # Sleep while still holding the semaphore so each slot is released
        # at most once every 5 seconds (simple rate limiting).
        await asyncio.sleep(5)


async def main():
    """Kick off ten concurrent downloads of the same page and wait for all."""
    target = 'https://www.baidu.com/'
    jobs = (fetch(index, target) for index in range(10))
    await asyncio.gather(*jobs)


if __name__ == '__main__':
    # Entry point: start a fresh event loop and run the whole crawl.
    asyncio.run(main())


使用令牌桶算法在限制请求速率的同时尽量提升吞吐量

场景:使用代理的并发量只支持1秒2并发

import os
import aiohttp
import asyncio
from asyncio_throttle import Throttler

async def fetch(throttler, page, url):
    """Download *url* and save the body as ``<page>.html``, rate-limited
    by *throttler* (a token bucket, e.g. 2 requests per second).

    :param throttler: asyncio_throttle.Throttler controlling request rate
    :param page: integer index used to name the output file
    :param url: page to fetch
    """
    save_html = str(page) + ".html"
    # Skip already-downloaded pages BEFORE issuing the request: the original
    # checked only after the body had been fetched, wasting a full request.
    if os.path.exists(save_html):
        return
    async with aiohttp.ClientSession() as session:
        # BUG FIX: acquire a token BEFORE sending the request. The original
        # wrapped only the response-body read in the throttler — by then
        # session.get() had already hit the server, so the "2 req/s" proxy
        # limit was never actually enforced.
        async with throttler:
            async with session.get(url) as response:
                response_text = await response.text()
        with open(save_html, "w", encoding="utf-8") as f:
            f.write(response_text)
        # Report the response for inspection.
        print(f'Response from {url}:')
        print(response_text)
        print('---')

async def main():
    """Fetch the same page ten times, throttled to 2 requests per second."""
    target = 'https://www.baidu.com/'

    requests_per_second = 2  # token-bucket refill rate
    limiter = Throttler(requests_per_second)

    await asyncio.gather(
        *(fetch(limiter, index, target) for index in range(10))
    )

if __name__ == '__main__':
    # Entry point: start a fresh event loop and run the whole crawl.
    asyncio.run(main())

你可能感兴趣的:(python,爬虫)