Python asyncio Internals (12): Concurrent Programming with asyncio

The event loop

asyncio is Python's all-in-one solution for asynchronous I/O programming.

Running 10 time-consuming requests concurrently:

import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    tasks = [get_html("http://www.imooc.com") for i in range(10)]
    loop.run_until_complete(asyncio.wait(tasks))
    print(time.time() - start_time)
    loop.close()

The output: ten "start get url" lines appear immediately, ten "end get url" lines follow after about 2 seconds, and the elapsed time printed is roughly 2 seconds rather than 20, because the coroutines run concurrently.

Getting a coroutine's return value

1. Create a task from the coroutine.

2. Call task.result() to read the coroutine's return value once it has finished.

import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    return "zhangbiao"

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_html("http://www.imooc.com"))
    loop.run_until_complete(task)
    print(task.result())

The output: "start get url", then after about 2 seconds the return value "zhangbiao".

Running a callback when the task finishes

A callback can be registered with task.add_done_callback(callback). The callback receives only the finished task/future itself, so you cannot pass extra arguments directly; to pass extra arguments, wrap the callback with functools.partial.

# Getting the coroutine's return value and running a callback on completion
import asyncio
import time
from functools import partial

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    return "zhangbiao"

def callback(url, future):
    print(url)
    print("send email to bobby")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_html("http://www.imooc.com"))
    task.add_done_callback(partial(callback, "http://www.imooc.com"))
    loop.run_until_complete(task)
    print(task.result())

The output: "start get url", then after about 2 seconds the callback prints the URL and "send email to bobby", and finally the return value "zhangbiao".

Differences between wait and gather

Both can submit multiple tasks to the event loop. wait gives you back (done, pending) sets of tasks, while gather returns the results in order; gather is the more high-level of the two:

1. It can group tasks, as the example below shows.

2. It can cancel a whole group of tasks at once (see the sketch after the example).

import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()

    # the wait version, for comparison:
    # tasks = [get_html("http://www.imooc.com") for i in range(10)]
    # loop.run_until_complete(asyncio.wait(tasks))

    group1 = [get_html("http://projectsedu.com") for i in range(2)]
    group2 = [get_html("http://www.imooc.com") for i in range(2)]
    group1 = asyncio.gather(*group1)
    group2 = asyncio.gather(*group2)
    loop.run_until_complete(asyncio.gather(group1, group2))
    print(time.time() - start_time)

The output: four "start get url" lines, four "end get url" lines after about 2 seconds, and a total elapsed time of roughly 2 seconds.
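To illustrate the cancellation point above: the object returned by asyncio.gather can be cancelled as a unit, which cancels every coroutine gathered into it. A minimal sketch, assuming a deliberately long sleep so there is time to cancel (the helper cancel_group is illustrative, not part of the original example):

import asyncio

async def get_html(url):
    print("start get url")
    await asyncio.sleep(10)
    print("end get url")

async def cancel_group(group):
    await asyncio.sleep(1)
    group.cancel()  # cancels every coroutine gathered into the group

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    group = asyncio.gather(*[get_html("http://www.imooc.com") for i in range(2)])
    try:
        loop.run_until_complete(asyncio.gather(group, cancel_group(group)))
    except asyncio.CancelledError:
        print("group was cancelled")

Only the two "start get url" lines are printed; after one second the whole group is cancelled and "end get url" never appears.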

Task cancellation and how sub-coroutine calls work

While the program is running, press Ctrl+C to interrupt it; the still-pending tasks can then be cancelled by calling task.cancel().

import asyncio
import time

async def get_html(sleep_times):
    print("waiting")
    await asyncio.sleep(sleep_times)
    print("done after {}s".format(sleep_times))

if __name__ == "__main__":
    task1 = get_html(2)
    task2 = get_html(3)
    task3 = get_html(3)
    tasks = [task1, task2, task3]

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(asyncio.wait(tasks))
    except KeyboardInterrupt as e:
        all_tasks = asyncio.Task.all_tasks()  # in Python 3.7+ use asyncio.all_tasks(); Task.all_tasks() was removed in 3.9
        for task in all_tasks:
            print("cancel task")
            print(task.cancel())  # True if the task was still pending and is now cancelled
        loop.stop()
        loop.run_forever()  # the loop must run briefly so the cancellations are actually processed
    finally:
        loop.close()

In a terminal:

python ceshi.py

# once it is running, press Ctrl+C to cancel the tasks

The output: three "waiting" lines; after Ctrl+C, "cancel task" and True are printed for each task that was still pending, and the loop then shuts down.
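Inside the coroutine, cancellation shows up as asyncio.CancelledError raised at the await point, so the coroutine gets a chance to clean up before it dies. A minimal sketch, assuming the same get_html coroutine and a timer that cancels the task after one second (the timer is illustrative, not from the original example):

import asyncio

async def get_html(sleep_times):
    try:
        await asyncio.sleep(sleep_times)
    except asyncio.CancelledError:
        print("cancelled while sleeping")  # raised at the await point when task.cancel() is called
        raise                              # re-raise so the task really ends up cancelled

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_html(10))
    loop.call_later(1, task.cancel)        # cancel the task after 1 second
    try:
        loop.run_until_complete(task)
    except asyncio.CancelledError:
        print("task.cancelled():", task.cancelled())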

How a sub-coroutine call works

The official documentation contains an example of this; it is reproduced below.
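The example being referred to is the chained-coroutines example from the asyncio documentation of that era, written in the old generator style that the terms "delegating generator" and "sub-generator" come from (note that @asyncio.coroutine has since been removed in Python 3.11; this is roughly the original):

import asyncio

@asyncio.coroutine
def compute(x, y):                      # the sub-generator
    print("Compute %s + %s ..." % (x, y))
    yield from asyncio.sleep(1.0)
    return x + y

@asyncio.coroutine
def print_sum(x, y):                    # the delegating generator
    result = yield from compute(x, y)
    print("%s + %s = %s" % (x, y, result))

loop = asyncio.get_event_loop()
loop.run_until_complete(print_sum(1, 2))
loop.close()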

Following the call-flow diagram for this example (not reproduced here), the sequence is:

1. While the event loop is running, the Task is in the pending state and hands control over to the delegating generator print_sum.

2. The delegating generator print_sum establishes a two-way channel between the Task and the sub-generator: it calls the sub-generator compute and passes values straight through to it.

3. Through that channel the sub-generator compute reports its current state, suspended, back to the Task, and the Task tells the loop that the work is not finished yet.

4. The loop keeps polling the Task, and the Task uses the channel to check whether the sub-generator has finished.

5. When the sub-generator finishes, it raises StopIteration carrying the computed value up to the delegating generator, and the sub-generator is closed.

6. The delegating generator propagates that StopIteration (with the result) to the Task, and the Task completes.

7. The loop stops.

call_soon, call_at, call_later and call_soon_threadsafe

call_soon: schedule a callback to run on the next iteration of the event loop, as soon as possible.

call_at: schedule a callback at an absolute time on the loop's internal clock (loop.time()).

call_later: schedule a callback to run after a given delay in seconds.

call_soon_threadsafe: the thread-safe variant of call_soon, used to schedule a callback into the loop from another thread.

import asyncio
import time

def callback(msg, loop):
    print("success time {}".format(msg))

def stoploop(delay, loop):
    time.sleep(delay)
    loop.stop()

# call_soon, call_at, call_later
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.call_soon(callback, 'runs as soon as the loop starts', loop)
    now = loop.time()  # the loop's internal monotonic clock
    loop.call_at(now + 2, callback, 2, loop)
    loop.call_at(now + 1, callback, 1, loop)
    loop.call_at(now + 3, callback, 3, loop)
    loop.call_later(6, callback, "runs 6s after the loop starts", loop)
    # loop.call_soon_threadsafe(stoploop, 6, loop)
    loop.run_forever()

The output: the call_soon callback fires as soon as the loop starts, the three call_at callbacks fire after 1, 2 and 3 seconds (in time order, regardless of registration order), the call_later callback fires after 6 seconds, and the loop then keeps running until interrupted.
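call_soon_threadsafe is the variant to use when the callback is scheduled from another thread. A minimal sketch, assuming a plain worker thread that stops the loop after three seconds (the worker and shutdown helpers are illustrative):

import asyncio
import threading
import time

def shutdown(loop):
    print("stopping loop from worker thread")
    loop.stop()

def worker(loop):
    time.sleep(3)                              # simulate work in an ordinary thread
    loop.call_soon_threadsafe(shutdown, loop)  # the safe way to reach into the loop's thread

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    threading.Thread(target=worker, args=(loop,)).start()
    loop.call_later(1, print, "still running...")
    loop.run_forever()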

ThreadPoolExecutor and asyncio for blocking I/O requests

Integrating a thread pool into asyncio to handle time-consuming, blocking I/O.

Writing synchronous, blocking code inside a coroutine is normally something to avoid, but sometimes you have no choice: the only interface available is a slow, synchronous one.

In that case the blocking call can be handed to a thread pool through loop.run_in_executor:

tasks = []
executor = ThreadPoolExecutor(3)
for url in range(20):
    url = "http://shop.projectsedu.com/goods/{}/".format(url)
    task = loop.run_in_executor(executor, get_url, url)
    tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))

The complete code:

# Using a thread pool: running blocking I/O from coroutine-based code
import asyncio
from concurrent.futures import ThreadPoolExecutor
import socket
from urllib.parse import urlparse

def get_url(url):
    # fetch the html over a plain (blocking) socket
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"

    # establish the socket connection
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # client.setblocking(False)
    client.connect((host, 80))  # blocks, but does not burn CPU
    # with setblocking(False) you would instead have to poll the connection state in a loop,
    # using that time for computation or for starting other connections

    client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    data = b""
    while True:
        d = client.recv(1024)
        if d:
            data += d
        else:
            break
    data = data.decode("utf8")
    html_data = data.split("\r\n\r\n")[1]
    print(html_data)
    client.close()

if __name__ == "__main__":
    import time
    start_time = time.time()
    loop = asyncio.get_event_loop()
    executor = ThreadPoolExecutor(3)
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        task = loop.run_in_executor(executor, get_url, url)
        tasks.append(task)
    loop.run_until_complete(asyncio.wait(tasks))
    print("last time:{}".format(time.time() - start_time))

The output prints the HTML of the 20 product pages followed by the elapsed time; with a pool of 3 threads, up to three requests run at once.

This also runs without the thread pool if the function itself is declared with async so that it can be awaited; note, however, that the blocking socket calls inside it still block the event loop, so written this way the requests effectively run one after another.

# Blocking I/O written directly inside a coroutine
import asyncio
import socket
from urllib.parse import urlparse
import time

async def get_html(url):
    # fetch the html over a plain (still blocking) socket
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"

    # establish the socket connection
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # client.setblocking(False)
    client.connect((host, 80))  # blocks, but does not burn CPU

    client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    data = b""
    while True:
        d = client.recv(1024)
        if d:
            data += d
        else:
            break
    data = data.decode("utf8")
    html_data = data.split("\r\n\r\n")[1]
    print(html_data)
    client.close()

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    tasks = [get_html("http://shop.projectsedu.com/goods/2/") for i in range(10)]
    loop.run_until_complete(asyncio.wait(tasks))
    print(time.time() - start_time)


The output prints the page HTML for each request followed by the total elapsed time.

Simulating an HTTP request with asyncio

asyncio itself does not provide an HTTP-protocol API (that is what aiohttp is for); below, the request is written by hand on top of asyncio's stream API.

import asyncio
import socket
from urllib.parse import urlparse

async def get_url(url):
    # fetch the html using asyncio streams
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"

    # open the connection (non-blocking)
    reader, writer = await asyncio.open_connection(host, 80)
    writer.write("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    all_lines = []
    async for raw_line in reader:
        data = raw_line.decode("utf8")
        all_lines.append(data)
    html = "\n".join(all_lines)
    return html

async def main():
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        tasks.append(asyncio.ensure_future(get_url(url)))
    for task in asyncio.as_completed(tasks):  # iterate over the tasks as they finish
        result = await task  # get the finished task's result
        print(result)

if __name__ == "__main__":
    import time
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print('last time:{}'.format(time.time() - start_time))

The output prints each page's response as its task completes, followed by the total elapsed time; with non-blocking streams the 20 requests genuinely overlap.

Future and Task

A Future is a container for a result; once the result is set, the callbacks registered on it are invoked.

Task is a subclass of Future that wraps a coroutine and drives (schedules) it on the event loop.
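A minimal sketch of the distinction, in the same old-style loop API used throughout this post (the coroutine and callback names are illustrative): a bare Future only holds a result that something else fills in, while create_task wraps a coroutine in a Task that the loop drives to completion.

import asyncio

async def compute():
    await asyncio.sleep(1)
    return 42

async def main():
    loop = asyncio.get_event_loop()

    fut = loop.create_future()                     # Future: just a container for a result
    fut.add_done_callback(lambda f: print("future done:", f.result()))
    loop.call_later(0.5, fut.set_result, "hello")  # someone else sets the result later
    print(await fut)                               # awaiting suspends until set_result is called

    task = loop.create_task(compute())             # Task: wraps and drives a coroutine
    print(await task)                              # the Task runs compute() to completion

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(main())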

Synchronization and communication in asyncio

With multiple threads, shared state must be protected with locks; coroutines all run in a single thread, so a simple counter like the one below needs no lock.

import asyncio

total = 0

async def add():
    global total
    for _ in range(1000000):
        total += 1

async def desc():
    global total
    for _ in range(1000000):
        total -= 1

if __name__ == '__main__':
    tasks = [add(), desc()]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    print(total)

The output is 0: even without a lock the counter is not corrupted, because both coroutines run in the same thread.

In some situations we still want a lock-like mechanism between coroutines.

In the example below, parse_stuff and use_stuff share a code path: both call get_stuff for the same URL.

If both of them fired the HTTP request at the same time, the site would see two identical requests and might trigger its anti-scraping protection.

We want get_stuff to perform the request only once and serve the second caller from a cache, so the request-plus-cache section is protected with an asyncio Lock, just like the lock we would use between threads.

import asyncio
import aiohttp
from asyncio import Lock

cache = {}
lock = Lock()

async def get_stuff(url):
    async with lock:  # equivalent to the older `with await lock:`; asyncio also has `async for`, etc.
        # `async with` works because Lock implements the __aenter__/__aexit__ magic methods
        # (older versions also had __await__ for the `with await lock` form);
        # as with threads, you could also call `await lock.acquire()` and `lock.release()` yourself
        if url in cache:
            return cache[url]
        print("first request")
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                stuff = await resp.text()
        cache[url] = stuff
        return stuff

async def parse_stuff(url):
    stuff = await get_stuff(url)
    print('parse_stuff', stuff)
    # do some parsing

async def use_stuff(url):
    stuff = await get_stuff(url)
    print('use_stuff', stuff)
    # use stuff to do something interesting

if __name__ == '__main__':
    tasks = [parse_stuff('http://www.baidu.com'), use_stuff('http://www.baidu.com')]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))

The output shows "first request" printed only once; the second coroutine is served from the cache, and both parse_stuff and use_stuff print the same response body.

Communication between coroutines: Queue

Because coroutines run in a single thread, a plain global list (or deque) is enough for them to pass data to each other. But if you want a bounded queue with a maximum size, and the backpressure that comes with it, use asyncio.Queue.

put and get must both be awaited:

from asyncio import Queue

queue = Queue(maxsize=3)
await queue.get()
await queue.put(item)
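A minimal producer/consumer sketch with asyncio.Queue, in the same old-style loop API used throughout this post (the item counts and sleep time are illustrative):

import asyncio
from asyncio import Queue

queue = Queue(maxsize=3)

async def producer():
    for i in range(6):
        await queue.put(i)            # suspends once the queue already holds 3 items
        print("produced", i)

async def consumer():
    for _ in range(6):
        item = await queue.get()      # suspends while the queue is empty
        print("consumed", item)
        await asyncio.sleep(0.1)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.gather(producer(), consumer()))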

A high-concurrency crawler with aiohttp

# An asyncio crawler: crawl, de-duplicate, write to the database
import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery

stopping = False
start_url = 'http://www.jobbole.com'
waitting_urls = []
seen_urls = set()  # a real crawler has far too many urls for a set; use a Bloom filter instead


async def fetch(url, session):
    try:
        async with session.get(url) as resp:
            print('url status: {}'.format(resp.status))
            if resp.status in [200, 201]:
                data = await resp.text()
                return data
    except Exception as e:
        print(e)


def extract_urls(html):
    # extract every url from the html
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls


async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)


async def article_handler(url, session, pool):
    # fetch the article detail page, parse it and save it to the database
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)
    pq = PyQuery(html)
    title = pq('title').text()  # to keep the example simple, only the title is stored
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            insert_sql = "insert into article_test(title) values('{}')".format(title)
            await cur.execute(insert_sql)  # write the row into the database


async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if len(waitting_urls) == 0:  # with asyncio.Queue none of this polling logic would be needed
                await asyncio.sleep(0.5)
                continue
            url = waitting_urls.pop()
            print('start get url: {}'.format(url))
            if re.match(r'http://.*?jobbole.com/\d+/', url):
                if url not in seen_urls:  # only handle urls that have not been processed yet
                    asyncio.ensure_future(article_handler(url, session, pool))
            else:
                if url not in seen_urls:
                    asyncio.ensure_future(init_urls(url, session))


async def main(loop):
    # wait until the mysql connection pool is ready
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root',
                                      password='', db='aiomysql_test', loop=loop,
                                      charset='utf8', autocommit=True)
    # charset and autocommit must be set, otherwise Chinese text cannot be written to the database
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        extract_urls(html)
    asyncio.ensure_future(consumer(pool))


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
