Below we use concurrent.futures: an introduction to concurrency.
The ThreadPoolExecutor in these examples can be swapped for ProcessPoolExecutor; try it, the two share the same interface.
Also note that ProcessPoolExecutor's max_workers parameter defaults to os.cpu_count().
For the parameters and return values of map, submit, and as_completed, consult the documentation.
The downloads are implemented two ways: first with map, then with submit plus as_completed.
Note: as_completed only yields futures that have completed or failed; futures still running are not yielded until they finish.
Try changing the max_workers parameter and observe the effect.
First, the map method. If you modify my code to iterate over map's return value, be aware that the iteration calls future.result() on each item and therefore blocks (a sketch of that variant follows the first example below).
In my code the blocking happens instead because everything runs inside a with block: on exit it calls executor.shutdown(), which waits for all the threads to complete or fail.
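To see as_completed's completion-order behavior in isolation, here is a minimal sketch of mine (the work function and its sleep times are made up for illustration; they are not part of the downloader):

from concurrent import futures
import time

def work(delay):
    time.sleep(delay)  # simulate I/O of varying duration
    return delay

with futures.ThreadPoolExecutor(max_workers=3) as ex:
    fs = [ex.submit(work, d) for d in (3, 1, 2)]
    for f in futures.as_completed(fs):
        print('finished:', f.result())  # prints 1, 2, 3: completion order, not submission order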
Here is the full code:
from concurrent import futures
import requests, time, sys, os

# constants
FLAGS = ('CN IN US ID BR PK NG BD RU JP '
         'MX PH VN ET EG DE IR TR CD FR').split()
BASE_URL = 'http://flupy.org/data/flags'  # download URL
DEST_DIR = 'downloads/'                   # destination directory
CHUNK_SIZE = 8192                         # chunk size in bytes
MAX_THREAD = 20                           # at most 20 concurrent threads
"""
使用requests 来请求下载;
对于下载小文件不需要使用iter_content,
直接使用requests.get(url).content即可.
这个例子适用了下载大文件的情况.
requests库的使用可以参考:
http://docs.python-requests.org/zh_CN/latest/user/quickstart.html
"""
# download worker
def begin_download(flag):
    path = os.path.join(DEST_DIR, flag.lower() + '.gif')
    print('start downloading: %s, saving to: %s' % (flag, path))
    sys.stdout.flush()
    url = '{}/{flag}/{flag}.gif'.format(BASE_URL, flag=flag.lower())
    # for a quick test, the block below can be replaced with:
    # open(path, 'wb').write(requests.get(url).content)
    with requests.get(url, stream=True) as resp:  # stream=True: streaming download
        with open(path, 'wb') as fd:  # open the target file
            for chunk in resp.iter_content(CHUNK_SIZE):  # read the response in chunks
                fd.write(chunk)  # write each chunk
    print('%s done!' % flag)
    return flag
# time the downloads
def t_download():
    start_time = time.time()
    # multi-threaded download: one task per URL, capped at MAX_THREAD workers
    with futures.ThreadPoolExecutor(max_workers=min(len(FLAGS), MAX_THREAD)) as ex:
        iter_res = ex.map(begin_download, FLAGS)
    elapsed = time.time() - start_time
    print('all done: {}s'.format(elapsed))

if __name__ == '__main__':
    os.makedirs(DEST_DIR, exist_ok=True)
    t_download()
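As mentioned above, here is a sketch of the variant that iterates over map's return value; t_download_iter is a hypothetical name of mine. The comment shows the ProcessPoolExecutor swap, which works here because begin_download sits at module level and can be pickled:

def t_download_iter():
    start_time = time.time()
    # futures.ProcessPoolExecutor() works here too; its max_workers defaults to os.cpu_count()
    with futures.ThreadPoolExecutor(max_workers=min(len(FLAGS), MAX_THREAD)) as ex:
        for res in ex.map(begin_download, FLAGS):  # each step blocks until the next result, in submission order
            print('finished:', res)
    print('all done: {}s'.format(time.time() - start_time))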
Next, submit (which returns a Future) and as_completed (which takes an iterable of futures and returns a generator that yields each future as soon as it finishes, in completion order). Only the t_download above needs changing:
def t1_download():
    start_time = time.time()
    future_tasks = []
    results = []
    with futures.ThreadPoolExecutor(max_workers=min(len(FLAGS), MAX_THREAD)) as ex:
        print('submitting tasks..')
        for flag in FLAGS:
            future_tasks.append(ex.submit(begin_download, flag))
        print('all tasks submitted..')
        for f in futures.as_completed(future_tasks):
            try:
                res = f.result()
            except Exception as e:
                # note: res is unset when result() raises, so report only the exception
                print('a download failed: %s' % e)
            else:
                results.append(res)
    elapsed = time.time() - start_time
    print('completed ->%d<- tasks' % len(results))
    print('all done: {}s'.format(elapsed))
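On the as_completed parameters mentioned earlier: it also accepts a timeout. If not all futures finish within it, the iterator raises concurrent.futures.TimeoutError. A minimal sketch, with an arbitrary 30-second value:

try:
    for f in futures.as_completed(future_tasks, timeout=30):
        results.append(f.result())
except futures.TimeoutError:
    print('timed out with tasks still pending')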
Next, a progress bar using tqdm (import tqdm; installable via pip). The code is essentially unchanged:
import tqdm

def t1_download():
    start_time = time.time()
    future_tasks = []
    results = []
    with futures.ThreadPoolExecutor(max_workers=min(len(FLAGS), MAX_THREAD)) as ex:
        for flag in FLAGS:
            future_tasks.append(ex.submit(begin_download, flag))
        done_iter = futures.as_completed(future_tasks)
        # wrap the iterator; total tells tqdm how many items to expect
        done_iter = tqdm.tqdm(done_iter, total=len(future_tasks))
        for future in done_iter:
            try:
                res = future.result()
            except Exception as e:
                print('%s' % e)
            else:
                results.append(res)
    elapsed = time.time() - start_time
    print('completed ->%d<- tasks' % len(results))
    print('all done: {}s'.format(elapsed))
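For comparison, tqdm can also wrap the map iterator directly, but since map yields results in submission order, a slow early task stalls the bar; a sketch:

with futures.ThreadPoolExecutor(max_workers=min(len(FLAGS), MAX_THREAD)) as ex:
    results = list(tqdm.tqdm(ex.map(begin_download, FLAGS), total=len(FLAGS)))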
Finally, an asynchronous version using asyncio and aiohttp:
import os, sys, time, asyncio, aiohttp

FLAGS = ('CN IN US ID BR PK NG BD RU JP '
         'MX PH VN ET EG DE IR TR CD FR').split()
BASE_URL = 'http://flupy.org/data/flags'  # download URL
DEST_DIR = 'downloads/'                   # destination directory

async def fetch(session: aiohttp.ClientSession, url: str, path: str, flag: str):
    print(flag, 'download started')
    async with session.get(url) as resp:
        with open(path, 'wb') as fd:
            while 1:
                chunk = await resp.content.read(8192)  # read the body in chunks
                if not chunk:
                    break
                fd.write(chunk)
    return flag

async def download():
    tasks = []
    async with aiohttp.ClientSession() as session:
        for flag in FLAGS:
            path = os.path.join(DEST_DIR, flag.lower() + '.gif')
            url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc=flag.lower())
            tasks.append(asyncio.ensure_future(fetch(session, url, path, flag)))
        await asyncio.wait(tasks)
        # alternative: consume results in completion order, as with futures.as_completed
        # for coroutine in asyncio.as_completed(tasks):
        #     res = await coroutine
        #     print('%s downloaded' % res)

os.makedirs(DEST_DIR, exist_ok=True)
lp = asyncio.get_event_loop()
start = time.time()
lp.run_until_complete(download())
end = time.time()
lp.close()
print('elapsed:', end - start)
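On Python 3.7 and later, the event-loop boilerplate can be shortened with asyncio.run, which creates and closes the loop itself; a sketch of the same entry point:

os.makedirs(DEST_DIR, exist_ok=True)
start = time.time()
asyncio.run(download())  # Python 3.7+: creates, runs, and closes the loop
print('elapsed:', time.time() - start)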