Models under test: resnet50 and resnet101
GPU: 2080 Ti
Standalone single-model latency: resnet50 24.4 ms, resnet101 48.22 ms
Seed initialization and warm-up: to keep runs comparable, the random seed is fixed so that every test sees the same input data; and to get accurate timings, each model first runs 100 warm-up iterations before the timed section starts.
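The standalone latencies above are per-model numbers; the scripts below time 100 repetitions after warm-up. A minimal sketch of how such a per-call latency can be measured (not the exact script used for those numbers; it assumes a CUDA device and adds torch.cuda.synchronize() so the timer stops only after the GPU has actually finished):
import torch
from time import time
from torchvision.models.resnet import resnet50

model = resnet50(True).cuda().eval()       # pretrained weights, eval mode
x = torch.randn(1, 3, 224, 224).cuda()     # fixed fake input

with torch.no_grad():
    for _ in range(100):                   # warm-up: cuDNN autotune, allocator caches
        model(x)
    torch.cuda.synchronize()               # drain queued GPU work before starting the timer

    s = time()
    for _ in range(100):
        model(x)
    torch.cuda.synchronize()               # make sure the timed work has really finished
    e = time()

print("avg latency: %.2f ms" % ((e - s) / 100 * 1000))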
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 16:20
# @Author : wangjianrong
# @File : 1.模型串行.py
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time


def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs)
    # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def inference(model, x, name):
    y = model(x)
    return name


def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + to cuda:", e - s)

    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()

    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    e = time()
    print("warm up res50:", e - s)

    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    e = time()
    print("warm up res101:", e - s)

    s = time()
    for i in range(repeat):
        y = inference(model1, fake_input, 1)
        y = inference(model2, fake_input, 1)
    e = time()
    print("serial inference time:", e - s)


if __name__ == '__main__':
    main()
Results:
warm up res50: 2.331266403198242
warm up res101: 4.534073352813721
serial inference time: 6.889774560928345
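Per iteration this is 6.89 s / 100 ≈ 68.9 ms, roughly the sum of the two standalone latencies (24.4 ms + 48.22 ms ≈ 72.6 ms), which is what serial execution predicts.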
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 16:26
# @Author : wangjianrong
# @File : 2.多线程.py
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, ALL_COMPLETED


def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs)
    # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def inference(model, x, name):
    y = model(x)
    return name


def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + to cuda:", e - s)

    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()

    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    e = time()
    print("warm up res50:", e - s)

    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    e = time()
    print("warm up res101:", e - s)

    pool = ThreadPoolExecutor(max_workers=2)

    s = time()
    for i in range(repeat):
        # Method 1: map
        # res is the return value of inference
        for res in pool.map(inference, [model1, model2], [fake_input, fake_input], ["resnet50", "res101"]):
            # print(res)
            pass
    e = time()
    print("multithreaded map:", e - s)

    s = time()
    for i in range(repeat):
        # Method 2: submit + wait
        # submit returns a Future object
        f1 = pool.submit(inference, model1, fake_input, 'res50')
        f2 = pool.submit(inference, model2, fake_input, 'res101')
        res = wait([f1, f2], return_when=ALL_COMPLETED)
        for r in res.done:
            # print(r.result())
            pass
    e = time()
    print("multithreaded wait:", e - s)


if __name__ == '__main__':
    main()
Results: the two approaches take roughly the same time. The total is about equal to the standalone time of the slower model and far less than the serial total, presumably because PyTorch releases the GIL while the forward pass runs in its C++/CUDA backend, so the two models can execute concurrently.
warm up res50: 2.4041590690612793
warm up res101: 4.691877365112305
multithreaded map: 4.694884538650513
multithreaded wait: 4.744607210159302
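The script imports as_completed but never uses it; the same submit pattern can also consume results in whatever order the models finish. A small sketch, assuming model1, model2, fake_input and inference are set up as in the script above:
from concurrent.futures import ThreadPoolExecutor, as_completed

pool = ThreadPoolExecutor(max_workers=2)

# submit both forward passes, then handle whichever finishes first
futures = [
    pool.submit(inference, model1, fake_input, 'res50'),
    pool.submit(inference, model2, fake_input, 'res101'),
]
for fut in as_completed(futures):
    name = fut.result()    # inference() returns the name it was given
    # print(name)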
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 16:49
# @Author : wangjianrong
# @File : 3.协程.py
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
import asyncio
from concurrent.futures import ThreadPoolExecutor


def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs)
    # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def inference(model, x, name):
    y = model(x)
    return name


async def ainference(model, x, name):
    # no await inside: the body is purely synchronous
    y = model(x)
    return name


async def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + to cuda:", e - s)

    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()

    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    e = time()
    print("warm up res50:", e - s)

    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    e = time()
    print("warm up res101:", e - s)

    loop = asyncio.get_running_loop()

    # Method 1: call the coroutine functions directly
    # note: Python 3.8+ deprecates passing bare coroutines to asyncio.wait;
    # 3.11+ requires wrapping them with asyncio.create_task first
    s = time()
    for i in range(repeat):
        tasks = [ainference(model1, fake_input, 'res50'), ainference(model2, fake_input, 'res101')]
        done, pending = await asyncio.wait(tasks)
    e = time()
    print("direct coroutine calls:", e - s)

    # Method 2: wrap the plain function as awaitable futures (default executor)
    s = time()
    for i in range(repeat):
        f1 = loop.run_in_executor(None, inference, model1, fake_input, "res50")
        f2 = loop.run_in_executor(None, inference, model2, fake_input, "res101")
        done, pending = await asyncio.wait([f1, f2])
    e = time()
    print("plain function via default executor:", e - s)

    # Method 3: via an explicit thread pool
    pool = ThreadPoolExecutor(max_workers=2)
    s = time()
    for i in range(repeat):
        f1 = loop.run_in_executor(pool, inference, model1, fake_input, "res50")
        f2 = loop.run_in_executor(pool, inference, model2, fake_input, "res101")
        done, pending = await asyncio.wait([f1, f2])
    e = time()
    print("via thread pool:", e - s)


if __name__ == '__main__':
    asyncio.run(main())
Results:
Calling the coroutine functions directly takes about as long as the serial version: ainference has no await point, so each coroutine runs its forward pass to completion before the event loop can start the other one, and nothing actually overlaps.
The gap between the latter two is probably due to the default executor's thread count: with max_workers=2 the time is lowest, with 1 thread it is roughly the serial time, and with more than 2 threads the time increases again.
warm up res50: 2.3446831703186035
warm up res101: 4.573963165283203
direct coroutine calls: 6.90220308303833
plain function via default executor: 5.143332481384277
via thread pool: 4.59693717956543
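To keep coroutine-style call sites and still get real overlap, the blocking forward pass has to leave the event loop thread. On Python 3.9+, asyncio.to_thread is a convenient shorthand for run_in_executor; a small sketch, assuming model1, model2, fake_input and inference are defined as in the script above:
import asyncio

async def ainference_threaded(model, x, name):
    # run the blocking forward pass in a worker thread so the event loop stays free
    return await asyncio.to_thread(inference, model, x, name)

async def run_pair():
    # gather schedules both coroutines; the two forward passes can overlap on the GPU
    return await asyncio.gather(
        ainference_threaded(model1, fake_input, 'res50'),
        ainference_threaded(model2, fake_input, 'res101'),
    )

# inside an async context: results = await run_pair()
Note that asyncio.to_thread uses the loop's default executor, so the thread-count caveat from the observations above still applies; an explicit ThreadPoolExecutor(max_workers=2) with run_in_executor keeps the fastest configuration.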