pytorch 静态量化
YoloV5_lite 量化
fuse_modules 模块提取代码
silu 不支持量化
add 操作不支持量化操作
yolov detect 回归报错
本文针对 YOLOv5-Lite 网络,使用 PyTorch 进行训练后静态量化(post-training static quantization),主要介绍量化的流程,并记录过程中遇到的各种问题。
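X 表示浮点值,Q 表示量化值。量化时一般先把浮点值截断到区间 [T2, T1](T2 为下界,T1 为上界),再线性映射到整数:
$$Q = \mathrm{round}\!\left(\frac{X}{S}\right) + Q_0, \qquad S = \frac{T_1 - T_2}{Q_{\max} - Q_{\min}}, \qquad X \approx S\,(Q - Q_0)$$
其中 Q0 表示量化的零点(zero point),S 表示量化的缩放系数(scale)。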
当 Q0 = 0 且 T1 = -T2 时就是对称量化,此时神经网络中两个量化值相乘的结果如下:$X_1 X_2 \approx S_1 S_2\, Q_1 Q_2$
当 Q0 != 0 且 T1 != -T2 时是非对称量化,量化损失更小,但是计算量更大,如下:$X_1 X_2 \approx S_1 S_2\,(Q_1 - Q_{0,1})(Q_2 - Q_{0,2})$
import tqdm

# --- 1. Configure, fuse and insert observers -------------------------------
backend = "fbgemm"  # x86 backend; pick the backend that matches the target platform

# Post-training static quantization works on an inference-mode model:
# fuse_modules() will not fold BatchNorm into Conv while the model is training.
model.eval()
model.qconfig = torch.quantization.get_default_qconfig(backend)
# Fusing is optional; skip this line to quantize the layers unfused.
model = torch.quantization.fuse_modules(model, fuse_ops)
model_fp32_prepared = torch.quantization.prepare(model)

stride = int(model.stride.max())
dataset = LoadImages(opt.source, img_size=opt.img_size, stride=stride)

tx = None  # last preprocessed input tensor
index = 0  # number of images fed through the model so far

# --- 2. Calibrate: run representative images through the observed model ----
# Only activation statistics are needed, so autograd is disabled.
with torch.no_grad():
    for path, img, im0s, vid_cap in tqdm.tqdm(dataset):
        img = torch.from_numpy(img).to(device)
        img = img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
        tx = img
        out = model_fp32_prepared(img, augment=opt.augment)
        index = index + 1

# --- 3. Convert to int8 -----------------------------------------------------
# The quantized kernels run on CPU and the loop below feeds CPU tensors, so
# the observed model is moved to CPU first (a no-op when `device` is the CPU).
model_fp32_prepared = model_fp32_prepared.to('cpu')
model_int8 = torch.quantization.convert(model_fp32_prepared)

# --- 4. Run the quantized model over the same data --------------------------
with torch.no_grad():
    for path, img, im0s, vid_cap in tqdm.tqdm(dataset):
        img = torch.from_numpy(img).to("cpu")
        img = img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
        tx = img
        out = model_int8(img, augment=opt.augment)
        index = index + 1

torch.save(model_int8.state_dict(), "int_8_model.pt")
调用 fuse_modules 时需要传入待融合模块名称的 list。fuse_modules 只支持少数几种固定的融合模式,本文只用到 conv+bn 和 conv+bn+relu 这两种;下面提供了自动查找可融合模块名称 list 的代码:
def is_fused_itm(module):
    """Return True if ``module`` is a candidate for ``fuse_modules``.

    Candidates are recognised by the package that defines the module's class:
    the convolution, batch-norm and activation packages of ``torch.nn``.
    """
    return module.__module__ in (
        'torch.nn.modules.conv',
        'torch.nn.modules.batchnorm',
        'torch.nn.modules.activation',
    )


def get_fuse_module(model):
    """Collect the module-name groups to pass to ``fuse_modules``.

    Fusable children are grouped by their direct parent module and scanned in
    the order ``model.named_modules()`` yields them. Within one parent the
    following runs are emitted: conv + bn, conv + bn + ReLU, conv + ReLU.
    A run never spans two parents.

    NOTE(review): this assumes that registration order inside a parent matches
    execution order in ``forward`` -- confirm for custom blocks.

    Args:
        model: object exposing ``named_modules()`` (e.g. an ``nn.Module``).

    Returns:
        A list of lists of fully-qualified module names, one inner list per
        fusion group.
    """
    conv_pkg = 'torch.nn.modules.conv'
    bn_pkg = 'torch.nn.modules.batchnorm'

    # Step 1: bucket every fusable module under the name of its parent.
    siblings_by_parent = {}
    for name, module in model.named_modules():
        if not is_fused_itm(module):
            continue
        print(name, module)
        parent_name = name.rpartition('.')[0]
        siblings_by_parent.setdefault(parent_name, []).append((name, module))

    # Step 2: walk each bucket and cut it into fusable runs.
    fuse_ops = []
    for siblings in siblings_by_parent.values():
        pending = []  # names of the run currently being assembled
        for child_name, child in siblings:
            kind = child.__module__
            if kind == bn_pkg and len(pending) == 1:
                # BatchNorm directly after the conv: extend, a ReLU may follow.
                pending.append(child_name)
                continue
            if kind not in (conv_pkg, bn_pkg) and pending and child._get_name() == 'ReLU':
                # ReLU closes the run (conv+relu or conv+bn+relu).
                pending.append(child_name)
            # Anything reaching this point ends the current run; a run is only
            # worth fusing when it holds more than the conv alone.
            if len(pending) > 1:
                fuse_ops.append(pending)
            pending = [child_name] if kind == conv_pkg else []
        # Flush whatever is left when the parent has no more children.
        if len(pending) > 1:
            fuse_ops.append(pending)
    return fuse_ops
self.quant = torch.quantization.QuantStub()
self.dequant = torch.quantization.DeQuantStub()
class QuantModel(Model):
    """``Model`` variant whose input and outputs pass through quantization stubs.

    The input is routed through a ``QuantStub`` before the base network runs,
    and every element of the base network's output through a ``DeQuantStub``.
    """

    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):
        """Build the underlying ``Model`` with the same arguments."""
        super().__init__(cfg, ch, nc, anchors)

    def set_quant(self):
        """Create the quant / dequant stubs used by ``forward``.

        NOTE(review): expected to be invoked from ``Model.__init__`` -- confirm
        there, since ``forward`` relies on both attributes existing.
        """
        logger.info("quant model need to set")
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x, augment=False, profile=False):
        """Quantize ``x``, run the base forward pass, dequantize each output.

        Returns:
            A list with one dequantized entry per element of the base output.
        """
        quantized_input = self.quant(x)
        base_outputs = super().forward(quantized_input, augment, profile)
        return [self.dequant(item) for item in base_outputs]
这里的 Model 表示的是原来的网络。set_quant 在原有的 Model 中定义为空操作(只打印一条日志),并且在构造函数的开始处调用;QuantModel 重写该方法来创建 QuantStub 和 DeQuantStub。原有 Model 中的定义如下所示:
class Model(nn.Module):
def set_quant(self):
logger.info("org model not need to set quant")
def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None): # model, input channels, number of classes
super(Model, self).__init__()
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 315, in forward
x = super(QuantModel, self).forward(x, augment, profile)
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 166, in forward
return self.forward_once(x, profile) # single-scale inference, train
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 182, in forward_once
x = m(x) # run
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kylin/YOLOv5-Lite/models/common.py", line 171, in forward
return self.act(self.bn(self.conv(x)))
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/activation.py", line 395, in forward
return F.silu(input, inplace=self.inplace)
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/functional.py", line 2059, in silu
return torch._C._nn.silu(input)
NotImplementedError: Could not run 'aten::silu.out' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::silu.out' is only available for these backends: [CPU, CUDA, Meta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMeta, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradNestedTensor, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PythonDispatcher].
PyTorch 的量化算子表中不支持 SiLU 这种激活函数。目前的做法是直接把它换成 Identity 后重新训练,在自己的数据集上精度差别不大。修改的位置在 common.py 的 Conv 类中,代码如下:
class Conv(nn.Module):
# Standard convolution
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
super(Conv, self).__init__()
self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
self.bn = nn.BatchNorm2d(c2)
self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())
将构造函数中的默认参数 act=True 改成 act=False,这样默认就采用 Identity 操作了。注意修改完之后需要重新训练模型。
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 315, in forward
x = super(QuantModel, self).forward(x, augment, profile)
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 166, in forward
return self.forward_once(x, profile) # single-scale inference, train
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 182, in forward_once
x = m(x) # run
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kylin/YOLOv5-Lite/models/common.py", line 826, in forward
return torch.add(x1, x2, alpha=self.a)
NotImplementedError: Could not run 'aten::add.out' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::add.out' is only available for these backends: [CPU, CUDA, Meta, MkldnnCPU, SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMeta, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradNestedTensor, Tracer, AutocastCPU, AutocastCUDA, FuncTorchBatched, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PythonDispatcher].
add、cat 这类多输入操作的各个输入 scale 可能不一致,直接对 int 值相加会出问题,所以不能在量化张量上直接调用 torch.add。PyTorch 提供了 torch.nn.quantized.FloatFunctional 来包装这类运算:在校准阶段它按浮点计算并统计输出范围,convert 之后会被替换成对应的量化实现。修改是在 common.py 的 ADD 模块中进行:
class ADD(nn.Module):
    """Weighted shortcut: add two tensors as ``x[0] + alpha * x[1]``.

    The arithmetic goes through ``FloatFunctional`` instead of ``torch.add``
    so the module can be statically quantized: ``aten::add`` has no
    QuantizedCPU kernel, whereas ``FloatFunctional`` is swapped for its
    quantized counterpart by ``torch.quantization.convert``.
    """

    def __init__(self, alpha=0.5):
        """
        Args:
            alpha: scalar weight applied to the second input.
        """
        super(ADD, self).__init__()
        self.a = alpha
        self.ff = torch.nn.quantized.FloatFunctional()

    def forward(self, x):
        """Return ``x[0] + self.a * x[1]``.

        Args:
            x: sequence whose first two items are the tensors to combine.
        """
        x1, x2 = x[0], x[1]
        # Equivalent to torch.add(x1, x2, alpha=self.a) on float tensors.
        return self.ff.add(x1, self.ff.mul_scalar(x2, self.a))
这里用 torch.nn.quantized.FloatFunctional() 包装了 add 和 mul_scalar,代替原来直接调用的 torch.add。可以参考 torch.ao.nn.quantized.modules.functional_modules — PyTorch 1.13 documentation
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 315, in forward
x = super(QuantModel, self).forward(x, augment, profile)
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 166, in forward
return self.forward_once(x, profile) # single-scale inference, train
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 182, in forward_once
x = m(x) # run
File "/home/kylin/anaconda3/envs/torch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/kylin/YOLOv5-Lite/models/yolo.py", line 82, in forward
return x if self.training else (torch.cat(z, 1), torch.cat(logits_, 1), x)
RuntimeError: Tensors must have same number of dimensions: got 5 and 3
目前这个问题的解决方案是把 Detect 中框回归(解码)的部分从网络中抽离出来:网络只输出各检测层卷积的结果,等量化计算完成后,再在网络外部进行相关的框的计算。修改是在 yolo.py 的 Detect 模块中:
def forward(self, x):
    """Apply the per-level detection convolutions and return the raw maps.

    Box decoding has been removed from this method so the whole forward pass
    stays quantizable; the caller must decode the returned maps itself. The
    removed steps, per level ``i``, were:

    * reshape ``(bs, na * no, ny, nx)`` to ``(bs, na, ny, nx, no)`` via
      ``view(bs, na, no, ny, nx).permute(0, 1, 3, 4, 2)``;
    * ``y = sigmoid(x[i])``;
    * ``xy = (y[..., 0:2] * 2 - 0.5 + grid[i]) * stride[i]``;
    * ``wh = (y[..., 2:4] * 2) ** 2 * anchor_grid[i]``;
    * flatten each level to ``(bs, -1, no)`` and concatenate along dim 1.

    Args:
        x: list of ``self.nl`` inputs, one per detection level. The list is
            modified in place.

    Returns:
        The same list ``x``, each entry replaced by ``self.m[i](x[i])``.
    """
    self.training |= self.export
    for i in range(self.nl):
        x[i] = self.m[i](x[i])  # conv
    return x
目前可以将yolov5 的pytorch 的静态量化跑通, 性能和精度后续再补齐。
torch — PyTorch 1.13 documentation