Goal: use asynchrony in the crawler to achieve high-performance data fetching.
Approaches to asynchronous crawling. As a baseline for comparison, the snippet below first fetches the target URLs serially with the blocking requests library:
import requests

def get_content(url):
    print("Crawling:", url)
    # requests.get() is a blocking call
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        return response.content

def parse_content(content):
    print("Length of the response data:", len(content))

if __name__ == '__main__':
    # UA spoofing: wrap the request headers in a dict
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    urls = [
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16476.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16473.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202112/jianli16531.rar'
    ]
    for url in urls:
        content = get_content(url)
        if content is not None:
            parse_content(content)
The code below compares the two modes: executed in single-threaded serial fashion it takes about 8 seconds, while the thread-pool version takes about 2 seconds, greatly reducing the run time.
import time
from multiprocessing.dummy import Pool  # thread pool class from the thread-backed multiprocessing.dummy module

def get_page(name):
    print("Downloading:", name)
    time.sleep(2)
    print("Download finished:", name)

if __name__ == '__main__':
    name_list = ['xiaozi', 'aa', 'bb', 'cc']
    start_time = time.time()

    # 1. Single-threaded serial execution (about 8 seconds)
    # for i in range(len(name_list)):
    #     get_page(name_list[i])

    # 2. Thread-pool execution (about 2 seconds)
    # Instantiate a thread pool with 4 worker threads
    pool = Pool(4)
    # Map every element of the list onto get_page
    pool.map(get_page, name_list)

    end_time = time.time()
    print('%d seconds' % (end_time - start_time))
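For a more realistic comparison, here is a minimal sketch that applies the same thread pool to the actual crawler from the first snippet (get_content, parse_content, the UA header and the chinaz.net URLs are reused from above; the pool size of 3 simply matches the number of URLs):

import requests
from multiprocessing.dummy import Pool  # thread-backed pool

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

def get_content(url):
    print("Crawling:", url)
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        return response.content

def parse_content(content):
    if content is not None:
        print("Length of the response data:", len(content))

def crawl(url):
    # one worker thread handles the full fetch + parse for a single URL
    parse_content(get_content(url))

if __name__ == '__main__':
    urls = [
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16476.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16473.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202112/jianli16531.rar'
    ]
    pool = Pool(3)            # arbitrary choice: one thread per URL
    pool.map(crawl, urls)     # blocking requests run concurrently in the pool
    pool.close()
    pool.join()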
import asyncio

async def requests(url):
    print("Requesting url:", url)
    print("Request finished,", url)
    return url

def callback_func(task):
    """Callback: task.result() is the return value of the coroutine wrapped in the task object"""
    print(task.result())

if __name__ == '__main__':
    # Calling a function defined with async def returns a coroutine object (the body does not run yet)
    c = requests('www.baidu.com')

    # Running the coroutine directly:
    # loop = asyncio.get_event_loop()   # create an event loop
    # loop.run_until_complete(c)        # register the coroutine with the loop and start it

    # Using a task:
    # loop = asyncio.get_event_loop()
    # task = loop.create_task(c)        # create a task object from the loop
    # print(task)
    # loop.run_until_complete(task)
    # print(task)

    # Using a future:
    # loop = asyncio.get_event_loop()
    # task = asyncio.ensure_future(c)
    # print(task)
    # loop.run_until_complete(task)
    # print(task)

    # Binding a callback:
    loop = asyncio.get_event_loop()
    task = asyncio.ensure_future(c)
    # Bind the callback to the task object; it runs once the task completes
    task.add_done_callback(callback_func)
    loop.run_until_complete(task)
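Newer Python releases are moving away from manual asyncio.get_event_loop() management. A minimal sketch of the same callback example built on asyncio.run() and asyncio.create_task() (both available since Python 3.7):

import asyncio

async def requests(url):
    print("Requesting url:", url)
    print("Request finished,", url)
    return url

def callback_func(task):
    # task.result() is the coroutine's return value
    print(task.result())

async def main():
    # create_task() wraps the coroutine in a Task on the already-running loop
    task = asyncio.create_task(requests('www.baidu.com'))
    task.add_done_callback(callback_func)
    await task

if __name__ == '__main__':
    asyncio.run(main())  # creates the event loop, runs main(), then closes the loop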
import asyncio
import time

async def request(url):
    print("Requesting url:", url)
    # If synchronous, blocking code (e.g. time.sleep) runs inside a coroutine,
    # the event loop cannot switch tasks and the code is no longer asynchronous
    # time.sleep(2)
    # Blocking operations must be replaced by their awaitable, asyncio-aware counterparts
    await asyncio.sleep(2)  # non-blocking sleep
    print("Request finished,", url)
    return url

if __name__ == '__main__':
    start_time = time.time()
    urls = [
        'www.baidu.com',
        'www.sogou.com',
        'www.so.com'
    ]
    # Task list: holds the task objects
    tasks = []
    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c)
        tasks.append(task)

    loop = asyncio.get_event_loop()
    # The task list must be wrapped in asyncio.wait()
    loop.run_until_complete(asyncio.wait(tasks))

    end_time = time.time()
    print('total time is %d seconds' % (end_time - start_time))
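The task list plus asyncio.wait() pattern can also be written with asyncio.gather(), which additionally collects the return values in order. A minimal sketch assuming Python 3.7+:

import asyncio
import time

async def request(url):
    print("Requesting url:", url)
    await asyncio.sleep(2)  # simulate 2 seconds of I/O without blocking the loop
    print("Request finished,", url)
    return url

async def main(urls):
    # gather() schedules all coroutines concurrently and returns their results in order
    return await asyncio.gather(*(request(url) for url in urls))

if __name__ == '__main__':
    start_time = time.time()
    urls = ['www.baidu.com', 'www.sogou.com', 'www.so.com']
    results = asyncio.run(main(urls))
    print(results)
    print('total time is %d seconds' % (time.time() - start_time))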
Environment setup: pip install aiohttp
Workflow:
Set up a Flask test server and run it (each route sleeps for 2 seconds to simulate a slow response):
from flask import Flask
import time

app = Flask(__name__)

@app.route('/lucy')
def index_lucy():
    time.sleep(2)
    return 'Hello lucy'

@app.route('/jay')
def index_jay():
    time.sleep(2)
    return 'Hello jay'

@app.route('/tom')
def index_tom():
    time.sleep(2)
    return 'Hello tom'

if __name__ == '__main__':
    app.run(threaded=True)
Then send the network requests with ClientSession() from the aiohttp module:
import asyncio
import time
import aiohttp

async def get_page(url):
    # print("Requesting url:", url)
    # aiohttp: an HTTP client built for asynchronous network requests
    async with aiohttp.ClientSession() as session:
        # Headers, params/data and proxies are passed as keyword arguments, e.g.:
        # session.get(url, headers=headers, params=params, proxy='http://ip:port')
        # session.post(url, headers=headers, data=data, proxy='http://ip:port')
        async with session.get(url) as response:
            # text()  returns the response body as a string
            # read()  returns the response body as bytes
            # json()  returns the parsed JSON object
            # Note: these are coroutines, so they must be awaited before the data is available
            page_text = await response.text()
            print("Response:", page_text)

if __name__ == '__main__':
    start_time = time.time()
    urls = [
        'http://127.0.0.1:5000/lucy',
        'http://127.0.0.1:5000/jay',
        'http://127.0.0.1:5000/tom'
    ]
    tasks = []
    for url in urls:
        c = get_page(url)
        task = asyncio.ensure_future(c)
        tasks.append(task)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))

    end_time = time.time()
    print('total time is %d seconds' % (end_time - start_time))
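To tie this back to the opening example, here is a sketch that downloads the three chinaz.net archives concurrently with aiohttp, using read() for the binary response body (the UA header and URL list are reused from the first snippet; the local file names derived from the URLs are only for illustration):

import asyncio
import aiohttp

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

async def download(session, url):
    async with session.get(url, headers=headers) as response:
        data = await response.read()       # read() -> binary response body
        filename = url.rsplit('/', 1)[-1]  # derive a local file name from the URL (illustrative)
        with open(filename, 'wb') as fp:   # plain blocking write; acceptable for small files
            fp.write(data)
        print("Saved", filename, len(data), "bytes")

async def main(urls):
    # one shared ClientSession for all downloads
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(download(session, url) for url in urls))

if __name__ == '__main__':
    urls = [
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16476.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202111/jianli16473.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202112/jianli16531.rar'
    ]
    asyncio.run(main(urls))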