线程池的使用+requests模块+回调函数
"""
可以实现并发
但是,请求发送出去后和返回之前,中间时期线程空闲
编写方式:
- 直接返回处理
- 通过回调函数处理
"""
########### 编写方式一 ###########
"""
from concurrent.futures import ThreadPoolExecutor
import requests
import time
def task(url):
response = requests.get(url)
print(url,response)
# 写正则表达式
pool = ThreadPoolExecutor(7)
url_list = [
'http://www.cnblogs.com/wupeiqi',
'http://huaban.com/favorite/beauty/',
'http://www.bing.com',
'http://www.zhihu.com',
'http://www.sina.com',
'http://www.baidu.com',
'http://www.autohome.com.cn',
]
for url in url_list:
pool.submit(task,url)
pool.shutdown(wait=True)
"""
########### 编写方式二 ###########
from concurrent.futures import ThreadPoolExecutor
import requests
import time
def task(url, timeout=10):
    """
    Download a page.

    :param url: address to fetch with an HTTP GET.
    :param timeout: seconds to wait for the server before giving up; it is
        passed straight to ``requests.get``. Without a timeout a server
        that never answers would keep a pool worker busy forever.
    :return: the response object returned by ``requests.get``.
    """
    response = requests.get(url, timeout=timeout)
    return response
def done(future, *args, **kwargs):
    """
    Done-callback: print the status code and body of a finished download.

    :param future: the completed future whose result is a response object
        exposing ``status_code`` and ``content``.
    :param args: ignored; accepted so extra positional arguments do not break the callback.
    :param kwargs: ignored; accepted so extra keyword arguments do not break the callback.
    :return: None
    """
    # A cancelled future has neither a result nor an exception to read.
    if future.cancelled():
        print('request cancelled')
        return
    # Calling result() on a failed future would re-raise inside the callback;
    # report the failure explicitly instead.
    error = future.exception()
    if error is not None:
        print('request failed:', error)
        return
    response = future.result()
    print(response.status_code, response.content)
url_list = [
    'http://www.cnblogs.com/wupeiqi',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]
# Leaving the ``with`` block calls pool.shutdown(wait=True), and it does so
# even when submitting a job raises, so worker threads are never left behind.
with ThreadPoolExecutor(7) as pool:
    for url in url_list:
        v = pool.submit(task, url)
        v.add_done_callback(done)  # register the callback run when the download finishes
进程池的使用+requests模块+回调函数
"""
可以实现并发
但是,请求发送出去后和返回之前,中间时期进程空闲
编写方式:
- 直接返回处理
- 通过回调函数处理
"""
########### 编写方式一 ###########
"""
from concurrent.futures import ProcessPoolExecutor
import requests
import time
def task(url):
response = requests.get(url)
print(url,response)
# 写正则表达式
pool = ProcessPoolExecutor(7)
url_list = [
'http://www.cnblogs.com/wupeiqi',
'http://huaban.com/favorite/beauty/',
'http://www.bing.com',
'http://www.zhihu.com',
'http://www.sina.com',
'http://www.baidu.com',
'http://www.autohome.com.cn',
]
for url in url_list:
pool.submit(task,url)
pool.shutdown(wait=True)
"""
########### 编写方式二 ###########
from concurrent.futures import ProcessPoolExecutor
import requests
import time
def task(url, timeout=10):
    """
    Download a page.

    :param url: address to fetch with an HTTP GET.
    :param timeout: seconds to wait for the server before giving up; it is
        passed straight to ``requests.get``. Without a timeout a server
        that never answers would keep a pool worker busy forever.
    :return: the response object returned by ``requests.get``.
    """
    response = requests.get(url, timeout=timeout)
    return response
def done(future, *args, **kwargs):
    """
    Done-callback: print the status code and body of a finished download.

    :param future: the completed future whose result is a response object
        exposing ``status_code`` and ``content``.
    :param args: ignored; accepted so extra positional arguments do not break the callback.
    :param kwargs: ignored; accepted so extra keyword arguments do not break the callback.
    :return: None
    """
    # A cancelled future has neither a result nor an exception to read.
    if future.cancelled():
        print('request cancelled')
        return
    # Calling result() on a failed future would re-raise inside the callback;
    # report the failure explicitly instead.
    error = future.exception()
    if error is not None:
        print('request failed:', error)
        return
    response = future.result()
    print(response.status_code, response.content)
url_list = [
    'http://www.cnblogs.com/wupeiqi',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]
# Worker processes may re-import this module (spawn start method); the guard
# keeps them from creating their own pools recursively.
if __name__ == '__main__':
    # Leaving the ``with`` block calls pool.shutdown(wait=True), and it does
    # so even when submitting a job raises.
    with ProcessPoolExecutor(7) as pool:
        for url in url_list:
            v = pool.submit(task, url)
            v.add_done_callback(done)  # register the callback run when the download finishes
asyncio + aiohttp
import asyncio
"""
@asyncio.coroutine
def task():
print('before...task......')
yield from asyncio.sleep(5) # 发送Http请求,支持TCP获取结果..
print('end...task......')
tasks = [task(), task()]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
"""
"""
import asyncio
@asyncio.coroutine
def task(host, url='/'):
print('start',host,url)
reader, writer = yield from asyncio.open_connection(host, 80)
request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host,)
request_header_content = bytes(request_header_content, encoding='utf-8')
writer.write(request_header_content)
yield from writer.drain()
text = yield from reader.read()
print('end',host, url, text)
writer.close()
tasks = [
task('www.cnblogs.com', '/wupeiqi/'),
task('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
]
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
"""
"""
import aiohttp
import asyncio
@asyncio.coroutine
def fetch_async(url):
print(url)
response = yield from aiohttp.request('GET', url)
print(url, response)
response.close()
tasks = [fetch_async('http://www.baidu.com/'), fetch_async('http://www.chouti.com/')]
event_loop = asyncio.get_event_loop()
results = event_loop.run_until_complete(asyncio.gather(*tasks))
event_loop.close()
"""
import asyncio
import requests
async def task(func, *args):
    """
    Run a blocking callable in the event loop's default executor and print its result.

    Written as a native coroutine: the ``@asyncio.coroutine`` decorator used
    by older examples no longer exists in Python 3.11+.

    :param func: blocking callable to run, e.g. ``requests.get``.
    :param args: positional arguments forwarded to ``func``.
    :return: None; the result's ``url`` and ``content`` attributes are printed.
    """
    print(func, args)
    loop = asyncio.get_event_loop()
    # None selects the loop's default executor,
    # e.g. this ends up calling requests.get('http://www.cnblogs.com/wupeiqi/')
    future = loop.run_in_executor(None, func, *args)
    response = await future
    print(response.url, response.content)
tasks = [
    task(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    task(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091'),
]


async def _gather_all(coros):
    """Await every coroutine in ``coros`` concurrently and return their results in order."""
    return await asyncio.gather(*coros)


# asyncio.run creates a fresh event loop, runs the coroutine to completion and
# always closes the loop afterwards, replacing the manual
# get_event_loop() / run_until_complete() / close() sequence.
results = asyncio.run(_gather_all(tasks))
不管是哪种异步IO框架,内部的源码都是类似的:把socket设置为非阻塞IO,再循环检测socket对象的状态,就形成了异步IO
IO多路复用,就是用select同时监听多个socket对象
异步IO,就是非阻塞IO加IO多路复用
而我们自己也可以编写出一个自定义的异步IO框架
import socket
import select
# ########################## HTTP请求本质,阻塞 ##########################
"""
sk = socket.socket()
# 1.连接
sk.connect(('www.baidu.com',80,)) # IO阻塞
print('连接成功了...')
# 2. 连接成功发送消息
sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
# sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')
# 3. 等待着服务端响应
data = sk.recv(8096) # IO阻塞
print(data)
# 关闭连接
sk.close()
"""
# ########################## HTTP请求本质,非阻塞 ##########################
"""
sk = socket.socket()
sk.setblocking(False)
# 1.连接
try:
sk.connect(('www.baidu.com',80,)) # IO阻塞
print('连接成功了...')
except BlockingIOError as e:
print(e)
# 2. 连接成功发送消息
sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
# sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')
# 3. 等待着服务端响应
data = sk.recv(8096) # IO阻塞
print(data)
# 关闭连接
sk.close()
"""
class HttpRequest:
    """Bundle a socket with the host it talks to and the callback for its response."""

    def __init__(self, sk, host, callback):
        """
        :param sk: socket object used for this request.
        :param host: host name the request is addressed to.
        :param callback: callable stored for later use by whoever drives the request.
        """
        self.socket, self.host, self.callback = sk, host, callback

    def fileno(self):
        """Return the wrapped socket's file descriptor.

        Exposing ``fileno`` lets the request object itself be handed to
        ``select.select`` in place of the raw socket.
        """
        descriptor = self.socket.fileno()
        return descriptor
class HttpResponse:
    """Parsed view of a raw HTTP response.

    After construction:
    - ``recv_data`` holds the raw bytes exactly as passed in,
    - ``header_dict`` maps header names to values (both ``str``),
    - ``body`` holds the bytes after the first blank line (``b''`` when
      the data contains no blank line).
    """

    def __init__(self, recv_data):
        """
        :param recv_data: raw response bytes as read from the socket.
        """
        self.recv_data = recv_data
        self.header_dict = {}
        self.body = None
        self.initialize()

    def initialize(self):
        """Split ``recv_data`` into headers and body and fill ``header_dict``."""
        # partition() never raises: data without the blank-line separator
        # (for example an empty response) is treated as headers only.
        head, _, body = self.recv_data.partition(b'\r\n\r\n')
        self.body = body
        for raw_line in head.split(b'\r\n'):
            # errors='replace' keeps a stray non-UTF-8 byte from aborting the parse.
            line = str(raw_line, encoding='utf-8', errors='replace')
            name, colon, value = line.partition(':')
            # Lines without a colon (such as the status line) are not headers.
            if colon:
                # Whitespace around a field value is not part of the value.
                self.header_dict[name] = value.strip()
class AsyncRequest:
    """Drive several plain-HTTP GET requests concurrently with ``select``.

    Every request gets its own non-blocking socket. ``run`` multiplexes all
    of them: it sends the GET once a socket's connect has completed,
    collects response bytes as they arrive, and hands the complete response
    to the request's callback when the server closes the connection (the
    requests are sent as HTTP/1.0, where the end of the response is marked
    by the server closing the connection).
    """

    def __init__(self):
        # Requests still waiting for their response.
        self.conn = []
        # Requests whose connect has not been seen to complete yet.
        self.connection = []
        # Response chunks received so far, keyed by request object.
        self._buffers = {}

    def add_request(self, host, callback):
        """Start connecting to ``host`` on port 80 and queue the request.

        :param host: server name; also sent as the ``Host`` header.
        :param callback: called with an ``HttpResponse`` once the response is complete.
        :raises OSError: if the connect fails immediately; the socket is
            closed before the error propagates.
        """
        sk = socket.socket()
        sk.setblocking(False)
        try:
            sk.connect((host, 80,))
        except BlockingIOError:
            # Expected: a non-blocking connect returns before the handshake
            # has finished; completion is detected later in run().
            pass
        except OSError:
            sk.close()
            raise
        request = HttpRequest(sk, host, callback)
        self.conn.append(request)
        self.connection.append(request)

    def run(self):
        """Process the queued requests until each one has completed or failed."""
        # Checking before select() avoids calling it with nothing to watch.
        while self.conn:
            rlist, wlist, elist = select.select(self.conn, self.connection, self.conn, 0.05)
            for w in wlist:
                self._on_writable(w)
            for r in rlist:
                # r is an HttpRequest; it may already have been dropped above.
                if r in self.conn and self._receive(r):
                    self._complete(r)
            for e in elist:
                if e in self.conn:
                    self._fail(e, 'socket error')

    def _on_writable(self, request):
        """Send the GET once the pending connect has resolved."""
        self.connection.remove(request)
        # Writable only means the connect attempt is over; SO_ERROR tells
        # whether it actually succeeded.
        error = request.socket.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
        if error:
            self._fail(request, 'connect failed (errno %d)' % error)
            return
        print(request.host, '连接成功...')
        tpl = "GET / HTTP/1.0\r\nHost:%s\r\n\r\n" % (request.host,)
        try:
            request.socket.send(bytes(tpl, encoding='utf-8'))
        except OSError as exc:
            self._fail(request, exc)

    def _receive(self, request):
        """Read what is available now; return True once no more data can follow."""
        chunks = self._buffers.setdefault(request, [])
        while True:
            try:
                chunk = request.socket.recv(8096)
            except BlockingIOError:
                # Nothing more for now; wait for the next readable event.
                return False
            except OSError:
                # Connection broke: whatever was received is all there will be.
                return True
            if not chunk:
                # Empty read means the server closed the connection.
                return True
            chunks.append(chunk)

    def _complete(self, request):
        """Hand the collected response to the request's callback."""
        data = b''.join(self._buffers.get(request, ()))
        if not data:
            self._fail(request, 'no data received')
            return
        self._drop(request)
        request.callback(HttpResponse(data))

    def _fail(self, request, reason):
        """Report a failed request and stop tracking it."""
        print(request.host, 'request failed:', reason)
        self._drop(request)

    def _drop(self, request):
        """Close the request's socket and remove it from all bookkeeping."""
        request.socket.close()
        self.conn.remove(request)
        if request in self.connection:
            self.connection.remove(request)
        self._buffers.pop(request, None)
def f1(response):
    """Callback: print the response's header dict under a "save to file" label.

    Nothing is written to disk; the function only prints.
    """
    headers = response.header_dict
    print('保存到文件', headers)
def f2(response):
    """Callback: print the response's header dict under a "save to database" label.

    No database is involved; the function only prints.
    """
    headers = response.header_dict
    print('保存到数据库', headers)
# Each entry pairs a host with the callback that receives its response.
url_list = [
    dict(host='www.baidu.com', callback=f1),
    dict(host='cn.bing.com', callback=f2),
    dict(host='www.cnblogs.com', callback=f2),
]

req = AsyncRequest()
for entry in url_list:
    host, callback = entry['host'], entry['callback']
    req.add_request(host, callback)
# Blocks until run() returns.
req.run()
note
一个东西你只会用,不会懂得原理的时候,心里没有底
很多人都停留在只会用的阶段
多线程和多进程的区别:
python(CPython)中由于GIL的存在,同一个时刻只能有一个线程被CPU执行
所以IO密集型用线程,计算密集型用进程
爬虫性能相关
- 单线程阻塞
requests.get('...') 发出请求后一直等待响应,期间线程被阻塞
事件循环
IO 多路复用:r,w,e ==> 监听多个socket对象,利用其特性可以开发很多异步模块
异步IO = 非阻塞的socket + IO多路复用
- 非阻塞的socket
- select 监听的列表中可以放自定义对象(需实现 fileno 方法),配合 w、r 列表使用
参考博客:https://www.cnblogs.com/wupeiqi/articles/6229292.html