PyTorch multi-model asynchronous inference

Models tested: resnet50 and resnet101
Test GPU: 2080 Ti
Standalone latency: resnet50 24.4 ms, resnet101 48.22 ms per forward pass
Seeding and warm-up: to make every run comparable, the random seeds are fixed so that each test sees identical input data. To get stable timings, each model first runs 100 warm-up iterations before measurement starts.
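The scripts below wrap the Python calls with time.time(). Since CUDA kernels launch asynchronously, a stricter measurement would synchronize the device or use CUDA events. A minimal sketch of the event-based variant (timed_inference is a hypothetical helper, not part of the scripts below):

import torch

def timed_inference(model, x, iters=100):
    # CUDA events timestamp on the GPU itself, so the measurement is
    # not skewed by asynchronous kernel launches
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()  # drain any pending work first
    start.record()
    with torch.no_grad():  # inference only, skip autograd bookkeeping
        for _ in range(iters):
            model(x)
    end.record()
    torch.cuda.synchronize()  # wait until `end` has actually been recorded
    return start.elapsed_time(end) / iters  # average milliseconds per forward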

Case 1: run the two models serially

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/5/8 16:20
# @Author  : wangjianrong
# @File    : 1.模型串行.py
 
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
 
 
def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
 
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
 
 
def inference(model, x, name):
    # run one forward pass; the output tensor is discarded and only the
    # tag is returned, so callers can tell which model finished
    y = model(x)
    return name
 
def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + to cuda:", e - s)
    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()
    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    e = time()
    print("warm up res50:", e - s)
    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    e = time()
    print("warm up re101:", e - s)
 
    s = time()
 
    for i in range(repeat):
        y = inference(model1, fake_input, 'res50')
        y = inference(model2, fake_input, 'res101')
 
    e = time()
    print("模型串行耗时:",e - s)
 
 
if __name__ == '__main__':
    main()

Results:

warm up res50: 2.331266403198242
warm up res101: 4.534073352813721
serial total: 6.889774560928345
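This is consistent with the standalone numbers: 100 × (24.4 ms + 48.22 ms) ≈ 7.3 s of pure model time, close to the measured 6.89 s.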

Case 2: run both models concurrently in a thread pool

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/5/8 16:26
# @Author  : wangjianrong
# @File    : 2.多线程.py
 
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
from concurrent.futures import ThreadPoolExecutor,as_completed,wait,ALL_COMPLETED
 
 
def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
 
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
 
 
def inference(model, x,name):
    y = model(x)
    return name
 
def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + to cuda:", e - s)
    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()
    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    e = time()
    print("warm up res50:", e - s)
    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    e = time()
    print("warm up re101:", e - s)
 
    pool = ThreadPoolExecutor(max_workers=2)
 
    s = time()
    for i in range(repeat):
        # Method 1: map
        # `res` is the value returned by inference
        for res in pool.map(inference, [model1, model2], [fake_input, fake_input], ["res50", "res101"]):
            # print(res)
            pass
    e = time()
    print("多线程map:", e - s)
 
    s = time()
    for i in range(repeat):
        # Method 2: submit + wait
        # submit returns Future objects
        f1 = pool.submit(inference,model1,fake_input,'res50')
        f2 = pool.submit(inference,model2,fake_input,'res101')
        res = wait([f1,f2],return_when=ALL_COMPLETED)
        for r in res.done:
            # print(r.result())
            pass
    e = time()
    print("多线程wait:", e - s)
 
 
if __name__ == '__main__':
    main()

Results: the two approaches take about the same time. The total is roughly the latency of the slower model alone (100 × 48.22 ms ≈ 4.8 s), far less than the serial run. The overlap is possible even under the GIL because PyTorch releases it inside its C++/CUDA operators, so the second thread can launch work while the first is still computing.

warm up res50: 2.4041590690612793
warm up res101: 4.691877365112305
threadpool map: 4.694884538650513
threadpool wait: 4.744607210159302
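The script imports as_completed but never uses it. As a third, equivalent pattern (a minimal sketch reusing pool, the models, and fake_input from the script above), it yields each future as soon as it finishes, which is convenient when results should be consumed in completion order:

    for i in range(repeat):
        futures = [pool.submit(inference, model1, fake_input, 'res50'),
                   pool.submit(inference, model2, fake_input, 'res101')]
        for f in as_completed(futures):
            # f.result() is the tag returned by inference, delivered in
            # whatever order the two forward passes actually complete
            res = f.result()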

Case 3: using coroutines (asyncio)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/5/8 16:49
# @Author  : wangjianrong
# @File    : 3.协程.py
 
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, ALL_COMPLETED
 
 
def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
 
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
 
 
def inference(model, x, name):
    y = model(x)
    return name
 
 
async def ainference(model, x, name):
    y = model(x)
    return name
 
 
async def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + to cuda:", e - s)
    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()
    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    e = time()
    print("warm up res50:", e - s)
    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    e = time()
    print("warm up re101:", e - s)
 
    loop = asyncio.get_running_loop()
    # Method 1: call the coroutine functions directly
    s = time()
    for i in range(repeat):
        # asyncio.wait() wants tasks (passing bare coroutines has been
        # deprecated since Python 3.8), so wrap them explicitly
        tasks = [asyncio.create_task(ainference(model1, fake_input, 'res50')),
                 asyncio.create_task(ainference(model2, fake_input, 'res101'))]
        done, pending = await asyncio.wait(tasks)
    e = time()
    print("coroutines directly:", e - s)
 
    # Method 2: run the plain function on the loop's default executor
    s = time()
    for i in range(repeat):
        f1 = loop.run_in_executor(None, inference, model1, fake_input, "res50")
        f2 = loop.run_in_executor(None, inference, model2, fake_input, "res101")
        done, pending = await asyncio.wait([f1, f2])
    e = time()
    print("将非协程函数转成协程:",e-s)
 
    # Method 3: run the plain function on an explicit thread pool
    pool = ThreadPoolExecutor(max_workers=2)
    s = time()
    for i in range(repeat):
        f1 = loop.run_in_executor(pool, inference, model1, fake_input, "res50")
        f2 = loop.run_in_executor(pool, inference, model2, fake_input, "res101")
        done,pending = await asyncio.wait([f1,f2])
    e = time()
    print("通过线程池:",e-s)
 
 
 
if __name__ == '__main__':
    asyncio.run(main())

Results:

Calling the coroutine functions directly takes about as long as the serial run (6.90 s vs 6.89 s). This is expected: ainference contains no await point, so awaiting it runs the whole forward pass to completion before the event loop can switch tasks; coroutines by themselves add no parallelism to compute-bound code.

The gap between the last two methods likely comes from the executor's thread count: run_in_executor(None, ...) uses the loop's default ThreadPoolExecutor, whose default size is min(32, os.cpu_count() + 4) in Python 3.8+, while the explicit pool pins it to exactly 2. Empirically, max_workers=2 is fastest; with 1 worker the time matches the serial run, and with more than 2 it increases somewhat. The default executor can be pinned too, as sketched after the results below.

warm up res50: 2.3446831703186035
warm up res101: 4.573963165283203
coroutines directly: 6.90220308303833
run_in_executor, default pool: 5.143332481384277
run_in_executor, pool of 2: 4.59693717956543
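To rule out the default executor's size, it can be replaced so that method 2 behaves exactly like method 3. A minimal sketch (to be placed inside the async main() above; loop.set_default_executor is standard asyncio):

    loop = asyncio.get_running_loop()
    # make run_in_executor(None, ...) use exactly two worker threads,
    # matching the explicit-pool variant above
    loop.set_default_executor(ThreadPoolExecutor(max_workers=2))
    f1 = loop.run_in_executor(None, inference, model1, fake_input, "res50")
    f2 = loop.run_in_executor(None, inference, model2, fake_input, "res101")
    done, pending = await asyncio.wait([f1, f2])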
