import os
import aiohttp
import asyncio
semaphore = asyncio.Semaphore(2)  # maximum number of coroutines allowed to run at the same time
async def fetch(page, url):
    """Download *url* and save the response body to "<page>.html".

    The module-level semaphore caps how many downloads run concurrently;
    the 5-second sleep at the end is deliberately kept inside the
    semaphore context so each slot stays occupied, throttling throughput.
    """
    async with semaphore:  # limit the number of coroutines running at once
        target = str(page) + ".html"
        if os.path.exists(target):
            # Already downloaded — skip the network request entirely.
            return
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                body = await response.text()
                with open(target, "w", encoding="utf-8") as fh:
                    fh.write(body)
                # Report the response content.
                print(f'Response from {url}:')
                print(body)
                print('---')
        await asyncio.sleep(5)  # sleep inside the semaphore context to throttle
async def main():
    """Fan out ten concurrent downloads of the same page."""
    urls = ['https://www.baidu.com/'] * 10
    jobs = (fetch(idx, addr) for idx, addr in enumerate(urls))
    await asyncio.gather(*jobs)
# Script entry point: run the async download pipeline.
if __name__ == '__main__':
    asyncio.run(main())
# Alternative approach: use a token-bucket algorithm to raise throughput while capping the request rate.
# Scenario: the proxy in use only supports 2 concurrent requests per second.
import os
import aiohttp
import asyncio
from asyncio_throttle import Throttler
async def fetch(throttler, page, url):
    """Download *url* at a throttled rate and save the body to "<page>.html".

    Args:
        throttler: asyncio_throttle.Throttler capping requests per second
            (the proxy in this scenario only allows 2 requests/second).
        page: integer index used to name the output file.
        url: address to fetch.

    Fixes vs. the original:
    - The "already downloaded" check now runs BEFORE the HTTP request, so a
      cached page no longer costs a wasted network round trip (this matches
      the semaphore-based variant of this script).
    - The throttler now wraps session.get() itself. The request is dispatched
      when session.get() runs, so throttling only response.text() (as the
      original did) never actually limited the outgoing request rate.
    """
    save_html = str(page) + ".html"
    if os.path.exists(save_html):
        return  # already saved — skip the network entirely
    async with aiohttp.ClientSession() as session:
        # Token-bucket gate: acquire a slot BEFORE the request is sent.
        async with throttler:
            async with session.get(url) as response:
                response_text = await response.text()
        with open(save_html, "w", encoding="utf-8") as f:
            f.write(response_text)
        # Report the response content.
        print(f'Response from {url}:')
        print(response_text)
        print('---')
async def main():
    """Launch ten throttled downloads sharing one token-bucket limiter."""
    urls = ['https://www.baidu.com/'] * 10
    per_second = 2  # requests allowed each second
    limiter = Throttler(per_second)
    jobs = [fetch(limiter, idx, addr) for idx, addr in enumerate(urls)]
    await asyncio.gather(*jobs)
# Script entry point: run the throttled async download pipeline.
if __name__ == '__main__':
    asyncio.run(main())