目录
1.指标定义
2.环境准备
3.参数读取
4.Params和Flops计算
计算步骤
python代码如下
输出结果
5.FPS计算
计算步骤
python代码
输出结果
6.完整python代码
整理不易,欢迎一键三连!!!
送你们一条美丽的--分割线--
FPS:Frames Per Second的缩写,指刷新帧率,通常在深度学习中表示每秒推理多少张图,是衡量算法推理速度的一个指标。
Params:parameters的缩写,指网络参数量,通常用来计算可学习参数量,是衡量网络复杂度的一个指标。
FLOPS:注意全大写,是floating point operations per second的缩写,意指每秒浮点运算次数,理解为计算速度。是一个衡量硬件性能的指标。
FLOPs:注意s小写,是floating point operations的缩写(s表复数),意指浮点运算数,理解为计算量。可以用来衡量算法/模型的复杂度。本文计算的是FLOPs(s小写)。
FLOPs的单位通常是GFLOPs
1 GFLOPs = 10^9 FLOPs(即:10亿次浮点运算)
计算Params(参数量)和FLOPs,可以直接使用mmseg工程自带的tools/get_flops.py;计算FPS,可以直接使用mmseg工程自带的tools/benchmark.py。若要在同一个脚本中同时计算Params、FLOPs和FPS,需要的环境准备如下:
python代码如下:
#tools/get_flops.py
import argparse
from mmcv import Config
from mmcv.cnn import get_model_complexity_info
from mmseg.models import build_segmentor
#tools/benchmark.py
import mmcv
import numpy as np
import torch
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmseg.datasets import build_dataloader, build_dataset
from mmseg.models import build_segmentor
主要用到的还是mmseg和mmcv,基本上只要按照官网安装过openmmlab的环境就都可以。另外,下文的get_fps还用到了Python标准库中的time和os.path(即import time、import os.path as osp),无需额外安装。
python代码如下:
def parse_args(argv=None):
    """Parse the command-line options shared by the FLOPs and FPS tools.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads ``sys.argv[1:]``, which is the
            behavior of the original zero-argument call.

    Returns:
        argparse.Namespace with the attributes ``config``, ``checkpoint``,
        ``shape``, ``log_interval``, ``work_dir`` and ``repeat_times``.
    """
    parser = argparse.ArgumentParser(
        description='Get the FLOPs and fps of a segmentor')

    # Positional arguments: both are required on the command line.
    parser.add_argument('config', help='train config file path')
    parser.add_argument('checkpoint', help='checkpoint file')

    # One or more integers; the caller decides how many values are valid.
    parser.add_argument(
        '--shape',
        type=int,
        nargs='+',
        default=[512, 512],
        help='input image size')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--work-dir',
        help=('if specified, the results will be dumped '
              'into the directory as json'))
    parser.add_argument('--repeat-times', type=int, default=1)

    return parser.parse_args(argv)
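计算Params和FLOPs需要2个输入:config(配置文件路径)和--shape(输入图像尺寸,默认为512 512)。注意:checkpoint在parse_args中是必填的位置参数,因此即使只计算Params和FLOPs,命令行中也需要传入它。
计算FPS需要5个输入:config(配置文件路径)、checkpoint(权重文件路径)、--log-interval(日志打印间隔,默认为50)、--work-dir(结果json文件的保存目录)和--repeat-times(重复测试次数,默认为1)。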
def get_params_flops():
    """Print the FLOPs and parameter count of the segmentor from the config.

    Command-line options are read through ``parse_args()``; this function
    uses ``config`` and ``--shape`` only. The model is built from the config,
    moved to the GPU with ``.cuda()``, switched to eval mode, and its
    ``forward`` is replaced by ``forward_dummy`` before being handed to
    ``get_model_complexity_info``.

    Raises:
        ValueError: If ``--shape`` was given more than two values.
        NotImplementedError: If the built model has no ``forward_dummy``.
    """
    args = parse_args()

    # One value means a square input; two values are used as the two spatial
    # dimensions. The leading 3 is presumably the RGB channel count.
    if len(args.shape) == 1:
        input_shape = (3, args.shape[0], args.shape[0])
    elif len(args.shape) == 2:
        input_shape = (3, ) + tuple(args.shape)
    else:
        raise ValueError('invalid input shape')

    cfg = Config.fromfile(args.config)
    # NOTE(review): presumably cleared so that no pretrained weights are
    # fetched just to count operations -- confirm against mmseg.
    cfg.model.pretrained = None
    model = build_segmentor(
        cfg.model,
        train_cfg=cfg.get('train_cfg'),
        test_cfg=cfg.get('test_cfg')).cuda()
    model.eval()

    # The complexity counter calls model.forward with a plain tensor, so the
    # model must expose a tensor-only forward_dummy entry point.
    if not hasattr(model, 'forward_dummy'):
        raise NotImplementedError(
            'FLOPs counter is currently not supported with {}'.format(
                model.__class__.__name__))
    model.forward = model.forward_dummy

    flops, params = get_model_complexity_info(model, input_shape)
    split_line = '=' * 30
    print('{0}\nInput shape: {1}\nFlops: {2}\nParams: {3}\n{0}'.format(
        split_line, input_shape, flops, params))
    print('!!!Please be cautious if you use the results in papers. '
          'You may need to check if all ops are supported and verify that the '
          'flops computation is correct.')
EncoderDecoder(
121.934 M, 99.940% Params, 291.368 GFLOPs, 100.000% FLOPs,
(backbone): ConvNeXt(
87.508 M, 71.724% Params, 80.244 GFLOPs, 27.540% FLOPs,
(downsample_layers): ModuleList(
2.761 M, 2.263% Params, 1.715 GFLOPs, 0.589% FLOPs,
(0): Sequential(
0.006 M, 0.005% Params, 0.103 GFLOPs, 0.035% FLOPs,
(0): Conv2d(0.006 M, 0.005% Params, 0.103 GFLOPs, 0.035% FLOPs, 3, 128, kernel_size=(4, 4), stride=(4, 4))
(1): LayerNorm2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, (128,), eps=1e-06, elementwise_affine=True)
)
(1): Sequential(
...
...
init_cfg={'type': 'Normal', 'std': 0.01, 'override': {'name': 'conv_seg'}}
(auxiliary_head): FCNHead(
1.186 M, 0.972% Params, 0.0 GFLOPs, 0.000% FLOPs, input_transform=None, ignore_index=255, align_corners=False
(loss_decode): CrossEntropyLoss(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, avg_non_ignore=False)
(conv_seg): Conv2d(0.006 M, 0.005% Params, 0.0 GFLOPs, 0.000% FLOPs, 256, 25, kernel_size=(1, 1), stride=(1, 1))
(dropout): Dropout2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, p=0.1, inplace=False)
(convs): Sequential(
1.18 M, 0.967% Params, 0.0 GFLOPs, 0.000% FLOPs,
(0): ConvModule(
1.18 M, 0.967% Params, 0.0 GFLOPs, 0.000% FLOPs,
(conv): Conv2d(1.18 M, 0.967% Params, 0.0 GFLOPs, 0.000% FLOPs, 512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn): SyncBatchNorm(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, 256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(activate): ReLU(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, inplace=True)
)
)
)
init_cfg={'type': 'Normal', 'std': 0.01, 'override': {'name': 'conv_seg'}}
)
==============================
Input shape: (3, 512, 512)
Flops: 291.37 GFLOPs
Params: 122.01 M
==============================
def get_fps():
    """Benchmark the inference speed of the segmentor and dump it as JSON.

    Command-line options are read through ``parse_args()``. For each of
    ``--repeat-times`` runs the test dataset and model are rebuilt, up to
    200 single-image batches are pushed through the model, the first 5
    iterations are excluded as warm-up, and the resulting images-per-second
    figure is recorded. The per-run figures plus their mean and variance are
    written to ``fps_<timestamp>.json`` inside ``--work-dir`` (or
    ``./work_dirs/<config name>`` when it is not given).

    Raises:
        RuntimeError: If the test dataset yields no iterations beyond the
            warm-up ones, so no FPS can be measured.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    if args.work_dir is not None:
        work_dir = args.work_dir
    else:
        # use the config file name (without extension) as default work_dir
        work_dir = osp.join('./work_dirs',
                            osp.splitext(osp.basename(args.config))[0])
    mmcv.mkdir_or_exist(osp.abspath(work_dir))
    json_file = osp.join(work_dir, f'fps_{timestamp}.json')

    repeat_times = args.repeat_times
    # set cudnn_benchmark
    torch.backends.cudnn.benchmark = False
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    benchmark_dict = dict(config=args.config, unit='img / s')
    overall_fps_list = []
    for time_index in range(repeat_times):
        print(f'Run {time_index + 1}:')
        # build the dataloader
        # TODO: support multiple images per gpu (only minor changes are needed)
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)

        # build the model and load checkpoint
        cfg.model.train_cfg = None
        model = build_segmentor(cfg.model, test_cfg=cfg.get('test_cfg'))
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            wrap_fp16_model(model)
        # A missing checkpoint file is tolerated: the model is then
        # benchmarked with whatever weights build_segmentor produced.
        if 'checkpoint' in args and osp.exists(args.checkpoint):
            load_checkpoint(model, args.checkpoint, map_location='cpu')

        model = MMDataParallel(model, device_ids=[0])
        model.eval()

        # the first several iterations may be very slow so skip them
        num_warmup = 5
        pure_inf_time = 0
        total_iters = 200
        # number of timed (post warm-up) iterations completed so far
        measured_iters = 0

        # benchmark with up to 200 images and take the average
        for i, data in enumerate(data_loader):
            # Synchronize before and after the forward pass so the timer
            # brackets the GPU work of this iteration only.
            torch.cuda.synchronize()
            start_time = time.perf_counter()

            with torch.no_grad():
                model(return_loss=False, rescale=True, **data)

            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start_time

            if i >= num_warmup:
                pure_inf_time += elapsed
                measured_iters = i + 1 - num_warmup
                if (i + 1) % args.log_interval == 0:
                    fps = measured_iters / pure_inf_time
                    print(f'Done image [{i + 1:<3}/ {total_iters}], '
                          f'fps: {fps:.2f} img / s')

            if (i + 1) == total_iters:
                break

        # The loader may run out before total_iters; report the FPS over the
        # iterations that were actually timed instead of silently recording
        # nothing (which would make the mean/variance below NaN).
        if measured_iters == 0:
            raise RuntimeError(
                f'The test dataset must provide more than {num_warmup} '
                'images to measure fps.')
        fps = measured_iters / pure_inf_time
        print(f'Overall fps: {fps:.2f} img / s\n')
        benchmark_dict[f'overall_fps_{time_index + 1}'] = round(fps, 2)
        overall_fps_list.append(fps)

    benchmark_dict['average_fps'] = round(np.mean(overall_fps_list), 2)
    benchmark_dict['fps_variance'] = round(np.var(overall_fps_list), 4)
    print(f'Average fps of {repeat_times} evaluations: '
          f'{benchmark_dict["average_fps"]}')
    print(f'The variance of {repeat_times} evaluations: '
          f'{benchmark_dict["fps_variance"]}')
    mmcv.dump(benchmark_dict, json_file, indent=4)
Run 1:
2024-01-26 11:13:50,106 - mmseg - INFO - Loaded 48600 images
load checkpoint from local path: configs/xx/epoch_1.pth
Done image [50 / 200], fps: 9.53 img / s
Done image [100/ 200], fps: 9.53 img / s
Done image [150/ 200], fps: 9.53 img / s
Done image [200/ 200], fps: 9.53 img / s
Overall fps: 9.53 img / s
同时计算Params(参数量)、FLOPs和FPS的完整python代码如下:
# Imports merged from tools/get_flops.py and tools/benchmark.py.
import argparse
import os.path as osp
import time

import mmcv
import numpy as np
import torch
from mmcv import Config
from mmcv.cnn import get_model_complexity_info
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmseg.datasets import build_dataloader, build_dataset
from mmseg.models import build_segmentor
def parse_args(argv=None):
    """Parse the command-line options shared by the FLOPs and FPS tools.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads ``sys.argv[1:]``, which is the
            behavior of the original zero-argument call.

    Returns:
        argparse.Namespace with the attributes ``config``, ``checkpoint``,
        ``shape``, ``log_interval``, ``work_dir`` and ``repeat_times``.
    """
    parser = argparse.ArgumentParser(
        description='Get the FLOPs and fps of a segmentor')

    # Positional arguments: both are required on the command line.
    parser.add_argument('config', help='train config file path')
    parser.add_argument('checkpoint', help='checkpoint file')

    # One or more integers; the caller decides how many values are valid.
    parser.add_argument(
        '--shape',
        type=int,
        nargs='+',
        default=[512, 512],
        help='input image size')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--work-dir',
        help=('if specified, the results will be dumped '
              'into the directory as json'))
    parser.add_argument('--repeat-times', type=int, default=1)

    return parser.parse_args(argv)
def get_params_flops():
    """Print the FLOPs and parameter count of the segmentor from the config.

    Command-line options are read through ``parse_args()``; this function
    uses ``config`` and ``--shape`` only. The model is built from the config,
    moved to the GPU with ``.cuda()``, switched to eval mode, and its
    ``forward`` is replaced by ``forward_dummy`` before being handed to
    ``get_model_complexity_info``.

    Raises:
        ValueError: If ``--shape`` was given more than two values.
        NotImplementedError: If the built model has no ``forward_dummy``.
    """
    args = parse_args()

    # One value means a square input; two values are used as the two spatial
    # dimensions. The leading 3 is presumably the RGB channel count.
    if len(args.shape) == 1:
        input_shape = (3, args.shape[0], args.shape[0])
    elif len(args.shape) == 2:
        input_shape = (3, ) + tuple(args.shape)
    else:
        raise ValueError('invalid input shape')

    cfg = Config.fromfile(args.config)
    # NOTE(review): presumably cleared so that no pretrained weights are
    # fetched just to count operations -- confirm against mmseg.
    cfg.model.pretrained = None
    model = build_segmentor(
        cfg.model,
        train_cfg=cfg.get('train_cfg'),
        test_cfg=cfg.get('test_cfg')).cuda()
    model.eval()

    # The complexity counter calls model.forward with a plain tensor, so the
    # model must expose a tensor-only forward_dummy entry point.
    if not hasattr(model, 'forward_dummy'):
        raise NotImplementedError(
            'FLOPs counter is currently not supported with {}'.format(
                model.__class__.__name__))
    model.forward = model.forward_dummy

    flops, params = get_model_complexity_info(model, input_shape)
    split_line = '=' * 30
    print('{0}\nInput shape: {1}\nFlops: {2}\nParams: {3}\n{0}'.format(
        split_line, input_shape, flops, params))
    print('!!!Please be cautious if you use the results in papers. '
          'You may need to check if all ops are supported and verify that the '
          'flops computation is correct.')
def get_fps():
    """Benchmark the inference speed of the segmentor and dump it as JSON.

    Command-line options are read through ``parse_args()``. For each of
    ``--repeat-times`` runs the test dataset and model are rebuilt, up to
    200 single-image batches are pushed through the model, the first 5
    iterations are excluded as warm-up, and the resulting images-per-second
    figure is recorded. The per-run figures plus their mean and variance are
    written to ``fps_<timestamp>.json`` inside ``--work-dir`` (or
    ``./work_dirs/<config name>`` when it is not given).

    Raises:
        RuntimeError: If the test dataset yields no iterations beyond the
            warm-up ones, so no FPS can be measured.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    if args.work_dir is not None:
        work_dir = args.work_dir
    else:
        # use the config file name (without extension) as default work_dir
        work_dir = osp.join('./work_dirs',
                            osp.splitext(osp.basename(args.config))[0])
    mmcv.mkdir_or_exist(osp.abspath(work_dir))
    json_file = osp.join(work_dir, f'fps_{timestamp}.json')

    repeat_times = args.repeat_times
    # set cudnn_benchmark
    torch.backends.cudnn.benchmark = False
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    benchmark_dict = dict(config=args.config, unit='img / s')
    overall_fps_list = []
    for time_index in range(repeat_times):
        print(f'Run {time_index + 1}:')
        # build the dataloader
        # TODO: support multiple images per gpu (only minor changes are needed)
        dataset = build_dataset(cfg.data.test)
        data_loader = build_dataloader(
            dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=False,
            shuffle=False)

        # build the model and load checkpoint
        cfg.model.train_cfg = None
        model = build_segmentor(cfg.model, test_cfg=cfg.get('test_cfg'))
        fp16_cfg = cfg.get('fp16', None)
        if fp16_cfg is not None:
            wrap_fp16_model(model)
        # A missing checkpoint file is tolerated: the model is then
        # benchmarked with whatever weights build_segmentor produced.
        if 'checkpoint' in args and osp.exists(args.checkpoint):
            load_checkpoint(model, args.checkpoint, map_location='cpu')

        model = MMDataParallel(model, device_ids=[0])
        model.eval()

        # the first several iterations may be very slow so skip them
        num_warmup = 5
        pure_inf_time = 0
        total_iters = 200
        # number of timed (post warm-up) iterations completed so far
        measured_iters = 0

        # benchmark with up to 200 images and take the average
        for i, data in enumerate(data_loader):
            # Synchronize before and after the forward pass so the timer
            # brackets the GPU work of this iteration only.
            torch.cuda.synchronize()
            start_time = time.perf_counter()

            with torch.no_grad():
                model(return_loss=False, rescale=True, **data)

            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start_time

            if i >= num_warmup:
                pure_inf_time += elapsed
                measured_iters = i + 1 - num_warmup
                if (i + 1) % args.log_interval == 0:
                    fps = measured_iters / pure_inf_time
                    print(f'Done image [{i + 1:<3}/ {total_iters}], '
                          f'fps: {fps:.2f} img / s')

            if (i + 1) == total_iters:
                break

        # The loader may run out before total_iters; report the FPS over the
        # iterations that were actually timed instead of silently recording
        # nothing (which would make the mean/variance below NaN).
        if measured_iters == 0:
            raise RuntimeError(
                f'The test dataset must provide more than {num_warmup} '
                'images to measure fps.')
        fps = measured_iters / pure_inf_time
        print(f'Overall fps: {fps:.2f} img / s\n')
        benchmark_dict[f'overall_fps_{time_index + 1}'] = round(fps, 2)
        overall_fps_list.append(fps)

    benchmark_dict['average_fps'] = round(np.mean(overall_fps_list), 2)
    benchmark_dict['fps_variance'] = round(np.var(overall_fps_list), 4)
    print(f'Average fps of {repeat_times} evaluations: '
          f'{benchmark_dict["average_fps"]}')
    print(f'The variance of {repeat_times} evaluations: '
          f'{benchmark_dict["fps_variance"]}')
    mmcv.dump(benchmark_dict, json_file, indent=4)
# Script entry point: report Params/FLOPs first, then benchmark FPS.
# Each function parses the command line on its own via parse_args().
if __name__ == '__main__':
    get_params_flops()
    get_fps()
⛵⛵⭐⭐