Event loop
asyncio is Python's all-in-one solution for asynchronous I/O programming.
Run 10 time-consuming requests concurrently:
import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    tasks = [get_html("http://www.imooc.com") for i in range(10)]
    loop.run_until_complete(asyncio.wait(tasks))
    print(time.time() - start_time)
    loop.close()
The output is as follows:
Getting a coroutine's return value
1. Create a task with loop.create_task().
2. Call task.result() to get the coroutine's return value.
import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    return "zhangbiao"

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_html("http://www.imooc.com"))
    loop.run_until_complete(task)
    print(task.result())
The output is as follows:
Running a callback when the task finishes
You can register a callback with task.add_done_callback(). The callback receives only the finished task/future as its single argument, so if you want to pass extra arguments use functools.partial.
# get the coroutine's return value and add a done callback
import asyncio
import time
from functools import partial

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    return "zhangbiao"

def callback(url, future):
    print(url)
    print("send email to bobby")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_html("http://www.imooc.com"))
    task.add_done_callback(partial(callback, "http://www.imooc.com"))
    loop.run_until_complete(task)
    print(task.result())
The output is as follows:
Difference between wait and gather
Both can submit multiple tasks to the event loop.
gather is more high-level than wait:
1. Tasks can be grouped.
2. A whole group of tasks can be cancelled (see the sketch after the example below).
import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    # difference between gather and wait; the wait version would be:
    # tasks = [get_html("http://www.imooc.com") for i in range(10)]
    # loop.run_until_complete(asyncio.wait(tasks))

    # the gather version, with the coroutines split into two groups:
    group1 = [get_html("http://projectsedu.com") for i in range(2)]
    group2 = [get_html("http://www.imooc.com") for i in range(2)]
    group1 = asyncio.gather(*group1)
    group2 = asyncio.gather(*group2)
    loop.run_until_complete(asyncio.gather(group1, group2))
    print(time.time() - start_time)
The output is as follows:
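To illustrate the second point, here is a minimal sketch of my own (not from the original course code) of cancelling a whole gather group. Passing return_exceptions=True to the outer gather turns the cancelled group's CancelledError into a result instead of raising it, so the other group still runs to completion:

import asyncio

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    group1 = asyncio.gather(*[get_html("http://projectsedu.com") for i in range(2)])
    group2 = asyncio.gather(*[get_html("http://www.imooc.com") for i in range(2)])
    group2.cancel()  # cancel the second group as a whole before it ever runs
    # group1 finishes normally; group2 shows up in the results as a CancelledError
    results = loop.run_until_complete(asyncio.gather(group1, group2, return_exceptions=True))
    print(results)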
Task cancellation and how sub-coroutines are driven
While the program is running, press Ctrl+C to interrupt it, then call task.cancel() on each pending task to cancel it.
import asyncio
import time

async def get_html(sleep_times):
    print("waiting")
    await asyncio.sleep(sleep_times)
    print("done after {}s".format(sleep_times))

if __name__ == "__main__":
    task1 = get_html(2)
    task2 = get_html(3)
    task3 = get_html(3)
    tasks = [task1, task2, task3]

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(asyncio.wait(tasks))
    except KeyboardInterrupt as e:
        # on Python 3.9+ use asyncio.all_tasks(loop) instead of asyncio.Task.all_tasks()
        all_tasks = asyncio.Task.all_tasks()
        for task in all_tasks:
            print("cancel task")
            print(task.cancel())
        loop.stop()
        loop.run_forever()  # the loop has to run again so the cancellations are actually processed
    finally:
        loop.close()
In the terminal:
python ceshi.py
# after the script starts, press Ctrl+C to cancel the tasks
The output is as follows:
How a sub-coroutine call works (sequence diagram)
An official example is as follows.
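The snippet itself is not reproduced in the text; the example the diagram refers to is, as far as I can tell, the chain-coroutines example from the asyncio documentation, roughly as follows (print_sum plays the role of the delegating coroutine and compute the sub-coroutine):

import asyncio

async def compute(x, y):
    # the sub-coroutine: suspends for one second, then returns a result
    print("Compute %s + %s ..." % (x, y))
    await asyncio.sleep(1.0)
    return x + y

async def print_sum(x, y):
    # the delegating coroutine: awaits the sub-coroutine and prints its result
    result = await compute(x, y)
    print("%s + %s = %s" % (x, y, result))

loop = asyncio.get_event_loop()
loop.run_until_complete(print_sum(1, 2))
loop.close()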
From the sequence diagram we can see:
1. While the event loop is running, the Task is in the pending state and hands control to the delegating coroutine print_sum.
2. The delegating coroutine print_sum sets up a two-way channel between the Task and the sub-coroutine, calls the sub-coroutine compute, and passes the value through to it.
3. Through that channel the sub-coroutine compute reports its current state, suspended, back to the Task, and the Task tells the loop that the work is not finished yet.
4. The loop keeps polling the Task, and the Task checks through the channel whether the sub-coroutine has finished.
5. When the sub-coroutine finishes, it raises an exception (StopIteration) carrying the computed value up to the delegating coroutine, and the generator is closed.
6. The delegating coroutine re-raises the exception to the Task, which marks the task as finished.
7. The loop stops.
call_soon、call_at、call_later、call_soon_threadsafe
call_soon: schedule a callback to run as soon as the loop starts its next iteration.
call_at: schedule a callback to run at an absolute loop time (in seconds of loop.time()).
call_later: schedule a callback to run after a given delay, e.g. 10 s from now.
call_soon_threadsafe: a thread-safe version of call_soon, for scheduling a callback from another thread.
import asyncio
import time

def callback(msg, loop):
    print("success time {}".format(msg))

def stoploop(seconds, loop):
    time.sleep(seconds)
    loop.stop()

# call_soon, call_at, call_later
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.call_soon(callback, 'run as soon as the loop starts checking', loop)
    now = loop.time()  # the loop's internal clock
    loop.call_at(now + 2, callback, 2, loop)
    loop.call_at(now + 1, callback, 1, loop)
    loop.call_at(now + 3, callback, 3, loop)
    loop.call_later(6, callback, "run 6s later", loop)
    # loop.call_soon_threadsafe(stoploop, 6, loop)
    loop.run_forever()
The output is as follows:
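The example above leaves call_soon_threadsafe commented out. As a minimal sketch of my own (the threading.Timer usage is not from the original), this is how a callback is scheduled safely from another thread; a plain call_soon from a foreign thread is not safe:

import asyncio
import threading

def stoploop(loop):
    loop.stop()

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    # after 2 seconds a different thread asks the loop to stop itself
    threading.Timer(2, lambda: loop.call_soon_threadsafe(stoploop, loop)).start()
    loop.call_soon(print, "loop started")
    loop.run_forever()
    print("loop was stopped from another thread")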
ThreadPoolExecutor and asyncio for blocking I/O requests
Integrating a thread pool into asyncio to handle time-consuming, blocking I/O.
Writing blocking, synchronous calls inside a coroutine defeats the purpose, but sometimes you have no choice because the interface you depend on is synchronous and slow.
In that case you can hand the blocking call to a thread pool through the asyncio loop:
tasks = []
executor = ThreadPoolExecutor(3)
for url in range(20):
    url = "http://shop.projectsedu.com/goods/{}/".format(url)
    task = loop.run_in_executor(executor, get_url, url)
    tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
The full code is as follows:
# using a thread pool: integrating blocking I/O into coroutines
import asyncio
from concurrent.futures import ThreadPoolExecutor
import socket
from urllib.parse import urlparse

def get_url(url):
    # request the html through a raw socket
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"
    # open the socket connection
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # client.setblocking(False)
    client.connect((host, 80))  # blocking, but does not burn CPU
    # with a non-blocking socket we would have to poll the connection state in a while loop,
    # doing computation or issuing other requests in the meantime
    client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    data = b""
    while True:
        d = client.recv(1024)
        if d:
            data += d
        else:
            break
    data = data.decode("utf8")
    html_data = data.split("\r\n\r\n")[1]
    print(html_data)
    client.close()

if __name__ == "__main__":
    import time
    start_time = time.time()
    loop = asyncio.get_event_loop()
    executor = ThreadPoolExecutor(3)
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        task = loop.run_in_executor(executor, get_url, url)
        tasks.append(task)
    loop.run_until_complete(asyncio.wait(tasks))
    print("last time:{}".format(time.time() - start_time))
The output is as follows:
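One extra note, not from the original: if you pass None as the executor, run_in_executor falls back to the loop's default ThreadPoolExecutor, so the explicit pool is optional:

# None selects the loop's default ThreadPoolExecutor
task = loop.run_in_executor(None, get_url, url)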
You can also skip the thread pool and just put async in front of the function so the synchronous code is written in the coroutine style (note that the socket calls inside are still blocking):
# coroutine-style version without the thread pool (the socket calls are still blocking)
import asyncio
import socket
import time
from urllib.parse import urlparse

async def get_html(url):
    # request the html through a raw socket
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"
    # open the socket connection
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # client.setblocking(False)
    client.connect((host, 80))  # blocking, but does not burn CPU
    client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    data = b""
    while True:
        d = client.recv(1024)
        if d:
            data += d
        else:
            break
    data = data.decode("utf8")
    html_data = data.split("\r\n\r\n")[1]
    print(html_data)
    client.close()

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    tasks = [get_html("http://shop.projectsedu.com/goods/2/") for i in range(10)]
    loop.run_until_complete(asyncio.wait(tasks))
    print(time.time() - start_time)
The output is as follows:
Simulating an HTTP request with asyncio
# asyncio itself does not provide an HTTP-level API; for real work use aiohttp
import asyncio
import socket
from urllib.parse import urlparse

async def get_url(url):
    # request the html over a socket, but through asyncio's stream API
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"
    # open the connection without blocking the event loop
    reader, writer = await asyncio.open_connection(host, 80)
    writer.write("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    all_lines = []
    async for raw_line in reader:
        data = raw_line.decode("utf8")
        all_lines.append(data)
    html = "\n".join(all_lines)
    return html

async def main():
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        tasks.append(asyncio.ensure_future(get_url(url)))
    for task in asyncio.as_completed(tasks):  # yields the tasks as they finish
        result = await task  # get the finished task's result
        print(result)

if __name__ == "__main__":
    import time
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print('last time:{}'.format(time.time() - start_time))
The output is as follows:
future and task
A future is a container for a result; once the result is set, the callbacks registered on it are invoked.
A task is a subclass of future that is used to drive (activate) a coroutine.
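A minimal sketch of my own (not from the original) showing the two roles: a bare Future is only a result container with callbacks, while a Task created from a coroutine drives that coroutine and fills the container in:

import asyncio

async def set_after(fut, value):
    await asyncio.sleep(1)
    fut.set_result(value)  # filling the container fires its done callbacks

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    fut = loop.create_future()  # plain Future: just holds a result
    fut.add_done_callback(lambda f: print("callback got:", f.result()))
    task = loop.create_task(set_after(fut, "zhangbiao"))  # Task: activates the coroutine
    loop.run_until_complete(fut)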
asyncio synchronization and communication
In multithreaded code you have to think about safety and add locks; for plain operations like the counter below, coroutines do not need a lock, because the single-threaded event loop only switches between coroutines at await points.
import asyncio

total = 0

async def add():
    global total
    for _ in range(1000000):
        total += 1

async def desc():
    global total
    for _ in range(1000000):
        total -= 1

if __name__ == '__main__':
    tasks = [add(), desc()]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    print(total)
The output is as follows:
In some situations we still need a lock-like mechanism between coroutines.
parse_stuff and use_stuff share the same code path: both of them call get_stuff.
If use_stuff fires a request while the request made on behalf of parse_stuff is still in flight, the duplicated requests could trigger the site's anti-scraping protection.
That is why the code below takes a lock: we want only the first call to get_stuff to hit the network, and the second call to be served from the cache.
import asyncio
import aiohttp
from asyncio import Lock

cache = {}
lock = Lock()

async def get_stuff(url):
    async with lock:  # equivalent to the older "with await lock:"; there is also "async for ..." in the same spirit
        # "async with" works here because Lock implements __aenter__/__aexit__ (older versions also allowed "with await lock" via __await__)
        # as with threads, you could instead call "await lock.acquire()" and "lock.release()" at the end
        if url in cache:
            return cache[url]
        print("first real request")
        stuff = aiohttp.request('GET', url)
        cache[url] = stuff
        return stuff

async def parse_stuff(url):
    stuff = await get_stuff(url)
    print('parse_stuff', stuff)
    # do some parsing

async def use_stuff(url):
    stuff = await get_stuff(url)
    print('use_stuff', stuff)
    # use stuff to do something interesting

if __name__ == '__main__':
    tasks = [parse_stuff('baidu'), use_stuff('baidu')]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
The output is as follows:
asyncio communication: Queue
Coroutines run in a single thread, so a plain global list used as a queue is enough for them to communicate; but if you want a queue that holds at most a fixed number of items, you need asyncio.Queue.
Both put and get must be awaited:
from asyncio import Queue

queue = Queue(maxsize=3)
item = await queue.get()
await queue.put(item)
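A small producer/consumer sketch (the names produce and consume are my own, not from the original) showing the bounded queue in action; put suspends once three items are waiting, until get frees a slot:

import asyncio
from asyncio import Queue

async def produce(queue):
    for i in range(5):
        await queue.put(i)  # suspends while the queue already holds maxsize items
        print("put", i)

async def consume(queue):
    for _ in range(5):
        item = await queue.get()  # suspends until an item is available
        print("got", item)
        await asyncio.sleep(1)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    queue = Queue(maxsize=3)
    loop.run_until_complete(asyncio.gather(produce(queue), consume(queue)))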
A high-concurrency crawler with aiohttp
# asyncio crawler: crawl, de-duplicate, write to the database
import asyncio
import re

import aiohttp
import aiomysql
from pyquery import PyQuery

stopping = False
start_url = 'http://www.jobbole.com'
waitting_urls = []
seen_urls = set()  # in a real crawler there are too many urls for a set; use a Bloom filter instead

async def fetch(url, session):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url) as resp:
                print('url status: {}'.format(resp.status))
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception as e:
            print(e)

def extract_urls(html):
    # extract all urls from the html
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls

async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)

async def article_handler(url, session, pool):
    # fetch the article detail page, parse it and write it to the database
    html = await fetch(url, session)
    extract_urls(html)
    pq = PyQuery(html)
    title = pq('title').text()  # to keep things simple only the title is extracted
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            insert_sql = "insert into article_test(title) values('{}')".format(title)
            await cur.execute(insert_sql)  # insert into the database

async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if len(waitting_urls) == 0:  # with asyncio.Queue none of this bookkeeping would be needed
                await asyncio.sleep(0.5)
                continue
            url = waitting_urls.pop()
            print('start get url: {}'.format(url))
            if re.match(r'http://.*?jobbole.com/\d+/', url):
                if url not in seen_urls:  # only handle urls that have not been processed yet
                    asyncio.ensure_future(article_handler(url, session, pool))
            else:
                if url not in seen_urls:
                    asyncio.ensure_future(init_urls(url, session))

async def main(loop):
    # wait until the mysql connection pool is ready
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root',
                                      password='', db='aiomysql_test', loop=loop,
                                      charset='utf8', autocommit=True)
    # charset and autocommit must be set, otherwise Chinese text cannot be written to the database
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        extract_urls(html)
    asyncio.ensure_future(consumer(pool))

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()