使用 asyncio 和 aiohttp 库实现异步爬虫:

import asyncio
from datetime import datetime
import aiohttp
import logging
from bs4 import BeautifulSoup
# Configure the root logger once for the whole script: INFO level, each
# record rendered as "<time> <level> <message>" via asctime/levelname/message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
def parse(response):
    """Extract and print the movie title from a Douban detail page.

    Args:
        response: Raw HTML text of a Douban movie detail page.
    """
    soup = BeautifulSoup(response, 'lxml')
    # The first <span> inside the page's <h1> carries the movie title.
    matches = soup.select("#content > h1 > span:nth-child(1)")
    if not matches:
        # Anti-scraping interstitials or a layout change yield no match;
        # warn instead of crashing with an IndexError on [0].
        logging.warning('No title found in response; page layout may have changed.')
        return
    print(matches[0].get_text())
async def fetch(url, session):
    """Asynchronously GET *url* and return the decoded response body.

    Args:
        url: Absolute URL to request.
        session: An open aiohttp.ClientSession shared across requests.

    Returns:
        The response body as a string.
    """
    # Minimal browser-like headers to get past naive anti-scraping checks;
    # adjust per target site as needed.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299",
        "Referer": "https://www.google.com/",
    }
    async with session.get(url, headers=headers) as response:
        # basicConfig already prefixes every record with asctime, so the
        # original f-string's extra datetime.now() duplicated the timestamp.
        # Lazy %-style args also skip formatting when INFO is disabled.
        logging.info('Crawled %s, Response code: %s', url, response.status)
        return await response.text()
async def crawl(urls):
    """Concurrently fetch every URL in *urls* and return the response bodies.

    Args:
        urls: Iterable of URL strings to fetch.

    Returns:
        List of response body strings, in the same order as *urls*;
        an empty list when *urls* is empty.
    """
    urls = list(urls)
    # Bail out before opening an HTTP session for nothing (the original
    # created the ClientSession even when there was no work to do).
    if not urls:
        logging.warning('No links to crawl.')
        return []
    # One ClientSession shared by all requests enables connection pooling.
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in urls:
            logging.info('Start crawling %s', url)
            # create_task is the modern spelling of ensure_future for coroutines.
            tasks.append(asyncio.create_task(fetch(url, session)))
        # gather waits for all tasks and preserves the input order.
        return await asyncio.gather(*tasks)
if __name__ == '__main__':
    # Douban movie detail pages to crawl.
    urls = [
        "https://movie.douban.com/subject/1292052/",
        "https://movie.douban.com/subject/1291546/",
    ]
    # asyncio.run() creates, runs and closes the event loop for us — the
    # preferred entry point since Python 3.7, replacing the manual
    # new_event_loop()/set_event_loop()/run_until_complete() dance.
    responses = asyncio.run(crawl(urls))
    # Extract and print the title of each fetched page.
    for res in responses:
        parse(res)
fetch() 函数使用 aiohttp 库向给定的 URL 发送 HTTP GET 请求,并返回响应文本。crawl() 函数使用 aiohttp 库建立异步 HTTP 会话,然后使用 asyncio.gather() 方法并发发送 HTTP 请求,最终将所有响应文本集中返回。主程序通过 asyncio.new_event_loop() 方法获取一个异步事件循环对象,再用 loop.run_until_complete() 方法运行异步协程 crawl();crawl() 函数将异步执行网络请求,并在完成后返回响应文本。

运行输出:

2023-05-01 15:49:47,705 INFO Start crawling https://movie.douban.com/subject/1292052/
2023-05-01 15:49:47,706 INFO Start crawling https://movie.douban.com/subject/1291546/
2023-05-01 15:49:47,706 INFO Start crawling https://movie.douban.com/subject/1292720/
2023-05-01 15:49:48,915 INFO [2023-05-01 15:49:48.915036] Crawled https://movie.douban.com/subject/1292052/, Response code: 200
2023-05-01 15:49:49,011 INFO [2023-05-01 15:49:49.011986] Crawled https://movie.douban.com/subject/1291546/, Response code: 200
2023-05-01 15:49:49,379 INFO [2023-05-01 15:49:49.379012] Crawled https://movie.douban.com/subject/1292720/, Response code: 200
肖申克的救赎 The Shawshank Redemption
霸王别姬
阿甘正传 Forrest Gump
进程已结束,退出代码0
总的来说,这个异步爬虫示例代码简单易懂,使用了 asyncio 和 aiohttp 库,使得异步爬取数据更加高效。