import numpy as np
import copy
def KL(P, Q):
    # KL divergence D(P || Q); accumulate only where P > 0 so that 0 * log(0) does not produce NaN
    mask = P > 0
    return np.sum(P[mask] * np.log(P[mask] / Q[mask]))
def maxq(value):
dynamic_range = np.abs(value).max()
scale = dynamic_range / 127.0
return scale
def histogramq(value):
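    # Trim the 100-bin histogram from whichever end preserves the most coverage,
    # stopping once any further trim would drop coverage below 99% of the samples;
    # the surviving range then defines the dynamic range.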
hist, bins = np.histogram(value, 100)
total = len(value)
left, right = 0, len(hist)
limit = 0.99
while True:
nleft = left + 1
nright = right - 1
left_cover = hist[nleft:right].sum() / total
right_cover = hist[left:nright].sum() / total
if left_cover < limit and right_cover < limit:
break
if left_cover > right_cover:
left += 1
else:
right -= 1
low = bins[left]
high = bins[right - 1]
dynamic_range = max(abs(low), abs(high))
scale = dynamic_range / 127.0
return scale
def entropy(value, target_bin=128):
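    # Entropy (KL) calibration in the spirit of TensorRT: build a 2048-bin histogram of the
    # absolute values, then search for the truncation threshold whose quantized distribution
    # (merged into target_bin bins) has the smallest KL divergence from the original one.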
    # maximum absolute value of the tensor
amax = np.abs(value).max()
    # histogram of the absolute values over 2048 bins in [0, amax]
    distribution, _ = np.histogram(np.abs(value), bins=2048, range=(0, amax))
    # drop bin 0 (values closest to zero) and keep bins [1:2048]
distribution = distribution[1:]
length = distribution.size
    # one KL divergence per candidate threshold
kl_divergence = np.zeros(length - target_bin)
    # sweep candidate thresholds in [target_bin, length)
for threshold in range(target_bin, length):
        # reference distribution: the first `threshold` bins
sliced_nd_hist = copy.deepcopy(distribution[:threshold])
        # copy the sliced histogram as p
p = sliced_nd_hist.copy()
threshold_sum = sum(distribution[threshold:])
        # outliers beyond the threshold are folded into the last bin instead of being discarded
p[threshold-1] += threshold_sum
is_nonzeros = (p != 0).astype(np.int64)
        # merge the bins into target_bin groups; group size num_merged_bins = sliced_nd_hist.size // target_bin
quantized_bins = np.zeros(target_bin, dtype=np.int64)
num_merged_bins = sliced_nd_hist.size // target_bin
for j in range(target_bin):
start = j * num_merged_bins
stop = start + num_merged_bins
quantized_bins[j] = sliced_nd_hist[start:stop].sum()
quantized_bins[-1] += sliced_nd_hist[target_bin * num_merged_bins:].sum()
        # candidate distribution q must have the same size as p (i.e. as sliced_nd_hist)
q = np.zeros(sliced_nd_hist.size, dtype=np.float64)
        # expand the merged bins back out into q
for j in range(target_bin):
start = j * num_merged_bins
stop = -1 if j == target_bin - 1 else start + num_merged_bins
norm = is_nonzeros[start:stop].sum()
q[start:stop] = float(quantized_bins[j]) / float(norm) if norm != 0 else q[start:stop]
        # normalize both distributions
p = p / sum(p)
q = q / sum(q)
        # KL divergence between p and q for this threshold
kl_divergence[threshold - target_bin] = KL(p, q)
    # index of the smallest KL divergence
min_kl_divergence = np.argmin(kl_divergence)
    # bin index (threshold) corresponding to the smallest KL divergence
threshold_value = min_kl_divergence + target_bin
    # convert the bin index back to a dynamic range
dynamic_range = (threshold_value + 0.5) * (amax / 2048)
scale = dynamic_range / 127.0
return scale
# int8 saturation; note that -128 is dropped so the range stays symmetric at [-127, 127]
def saturate(x):
return np.clip(np.round(x), -127, +127)
class Quant:
def __init__(self, value):
        # symmetric quantization; the dynamic range can be chosen in several ways: max / histogram / entropy, etc.
self.scale = maxq(value)
# self.scale = histogramq(value)
# self.scale = entropy(value)
def __call__(self, f):
        # quantize: scale, round, and saturate to int8
return saturate(f / self.scale)
def Quant_Conv(x, w, b, iq, wq, oq=None):
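    # x ≈ iq(x) * iq.scale and w ≈ wq(w) * wq.scale, so x * w ≈ (iq(x) * wq(w)) * (iq.scale * wq.scale);
    # alpha is that combined rescale factor applied to the int32 accumulator.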
alpha = iq.scale * wq.scale
out_int32 = iq(x) * wq(w)
if oq is None:
# float32 output
return out_int32 * alpha + b
else:
# int8 quant output
return saturate((out_int32 * alpha + b) / oq.scale)
if __name__ == '__main__':
# x -> Q1 -> conv1 -> Q2 -> conv2 -> y
np.random.seed(31)
nelem = 1000
    # generate random input, weight, and bias vectors
x = np.random.randn(nelem)
weight1 = np.random.randn(nelem)
bias1 = np.random.randn(nelem)
    # fp32 output of the first "conv" layer (an element-wise multiply stands in for a convolution)
t = x * weight1 + bias1
weight2 = np.random.randn(nelem)
bias2 = np.random.randn(nelem)
    # fp32 output of the second "conv" layer
y = t * weight2 + bias2
    # calibrate the input, the weights, and the intermediate output (which is also the next layer's input)
xQ = Quant(x)
w1Q = Quant(weight1)
tQ = Quant(t)
w2Q = Quant(weight2)
qt = Quant_Conv(x, weight1, bias1, xQ, w1Q, tQ)
    # output of the int8 pipeline
y2 = Quant_Conv(qt, weight2, bias2, tQ, w2Q)
    # mean squared error between the fp32 and the quantized pipelines
y_diff = (np.abs(y - y2) ** 2).mean()
print(f"ydiff mse error is: {y_diff}")
'''
max mse error : 35.1663
histogramq mse error : 8.6907
entropy mse error : 1.8590
'''
This code implements a basic quantization scheme for a (simulated) convolutional network. Each layer's input, weights, and intermediate output are quantized with a symmetric scheme, the layer is computed with the quantized input and weights, and the output is optionally quantized again before being returned.
The Quant class takes an input array and computes the scale factor needed for quantization. Quantization itself is saturate(f / self.scale): the input is divided by the scale, rounded to the nearest integer, and clipped to the range [-127, 127].
The Quant_Conv function takes the input, weights, bias, and the quantization objects for the input and weights. It quantizes the input and weights, performs the "convolution", rescales the int32 result by the product of the two scales, and adds the bias. If an output quantization object is supplied, the output is quantized once more before being returned.
Three methods are provided for computing the scale factor: maxq, histogramq, and entropy. maxq derives the scale from the maximum absolute value of the tensor. histogramq trims the histogram until roughly 99% of the values remain and uses the surviving range. entropy searches for the truncation threshold that minimizes the KL divergence between the original and the quantized distribution.
The script also reports the mean squared error between the quantized pipeline and the floating-point pipeline as a measure of how much accuracy quantization costs.
Overall, this is a basic quantization implementation that can serve as a starting point for more sophisticated techniques. A sketch below shows how the three calibration methods can be compared on the same toy data.
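A minimal comparison loop (a sketch, not part of the original script) that reuses the functions and the x / weight1 / bias1 / t / weight2 / bias2 / y tensors defined in the __main__ block above; the exact MSE values depend on the random seed:
# Compare the three calibration methods on the same toy two-layer pipeline.
for name, calibrate in [("max", maxq), ("histogram", histogramq), ("entropy", entropy)]:
    xQ, w1Q, tQ, w2Q = Quant(x), Quant(weight1), Quant(t), Quant(weight2)
    for q, v in zip((xQ, w1Q, tQ, w2Q), (x, weight1, t, weight2)):
        q.scale = calibrate(v)  # swap in the calibrator under test
    qt = Quant_Conv(x, weight1, bias1, xQ, w1Q, tQ)
    y_int8 = Quant_Conv(qt, weight2, bias2, tQ, w2Q)
    print(f"{name:10s} mse: {((y - y_int8) ** 2).mean():.4f}")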
import argparse
import os
import sys
from pathlib import Path
import warnings
import yaml
import torch
from tqdm import tqdm
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0] # YOLOv5 root directory
if str(ROOT) not in sys.path:
sys.path.append(str(ROOT)) # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
from models.yolo import Model
from utils.dataloaders import create_dataloader
from utils.general import (check_img_size, check_yaml, file_size, colorstr, check_dataset)
from utils.torch_utils import select_device
import py_quant_utils as quant
import val
def collect_stats(model, data_loader, num_batches, device):
"""Feed data to the network and collect statistics"""
# Enable calibrators
model.eval()
for name, module in model.named_modules():
if isinstance(module, quant.quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.disable_quant()
module.enable_calib()
else:
module.disable()
# Feed data to the network for collecting stats
for i, (image, targets, paths, shapes) in tqdm(enumerate(data_loader), total=num_batches):
image = image.to(device, non_blocking=True)
image = image.float() # uint8 to fp16/32
image /= 255.0 # 0 - 255 to 0.0 - 1.0
model(image)
if i >= num_batches:
break
# Disable calibrators
for name, module in model.named_modules():
if isinstance(module, quant.quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.enable_quant()
module.disable_calib()
else:
module.enable()
def compute_amax(model, **kwargs):
# Load calib result
for name, module in model.named_modules():
if isinstance(module, quant.quant_nn.TensorQuantizer):
if module._calibrator is not None:
if isinstance(module._calibrator, quant.calib.MaxCalibrator):
module.load_calib_amax()
else:
module.load_calib_amax(**kwargs)
print(F"{name:40}: {module}")
def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator, hist_percentile, out_dir, device):
"""
Feed data to the network and calibrate.
Arguments:
model: detection model
model_name: name to use when creating state files
data_loader: calibration data set
        num_calib_batch: number of calibration batches to run
calibrator: type of calibration to use (max/histogram)
        hist_percentile: percentiles to be used for histogram calibration
out_dir: dir to save state files in
"""
if num_calib_batch > 0:
print("Calibrating model")
with torch.no_grad():
collect_stats(model, data_loader, num_calib_batch, device)
if not calibrator == "histogram":
compute_amax(model, method="max")
calib_output = os.path.join(out_dir, F"{model_name}-max-{num_calib_batch * data_loader.batch_size}.pth")
torch.save(model.state_dict(), calib_output)
else:
for percentile in hist_percentile:
print(F"{percentile} percentile calibration")
                compute_amax(model, method="percentile", percentile=percentile)
calib_output = os.path.join(out_dir, F"{model_name}-percentile-{percentile}-{num_calib_batch * data_loader.batch_size}.pth")
torch.save(model.state_dict(), calib_output)
for method in ["mse", "entropy"]:
print(F"{method} calibration")
compute_amax(model, method=method)
calib_output = os.path.join(out_dir, F"{model_name}-{method}-{num_calib_batch * data_loader.batch_size}.pth")
torch.save(model.state_dict(), calib_output)
def load_model(weight, device) -> Model:
model = torch.load(weight, map_location=device)["model"]
model.float()
model.eval()
with torch.no_grad():
model.fuse()
return model
def prepare_model(calibrator, hyp, opt, device):
    """
    1. Load the FP32 model
    2. Insert Q/DQ nodes into the model
    3. Configure the quantization scheme (per_tensor / per_channel)
    4. Build the calibration dataloader
    """
with open(opt.data, encoding='utf-8') as f:
data_dict = yaml.load(f, Loader=yaml.SafeLoader)
data_dict = check_dataset(data_dict)
calib_path = data_dict['val']
    # Initialize the calibration method
quant.initialize_calib_method(per_channel_quantization=True, calib_method=calibrator)
    # Load the FP32 PyTorch model
model = load_model(opt.weights, device)
    # Insert Q/DQ nodes into the FP32 PyTorch model (skipping the sensitive layers)
quant.replace_to_quantization_module(model, ignore_policy=opt.sensitive_layer)
model.eval()
model.cuda()
gs = max(int(model.stride.max()), 32) # grid size (max stride)
imgsz, _ = [check_img_size(x, gs) for x in [opt.imgsz, opt.imgsz]] # verify imgsz are gs-multiples
# Calib dataloader
calib_loader = create_dataloader(calib_path,
imgsz,
opt.batch_size,
gs,
hyp=hyp,
cache=opt.cache,
rect=True,
rank=-1,
workers=opt.workers * 2,
pad=0.5,
prefix=colorstr('calib: '))[0]
return model, calib_loader
def export_onnx(model, onnx_filename, batch_onnx, dynamic_shape, simplify, imgsz=672, prefix=colorstr('calib: ')):
from models.yolo import Detect
model.eval()
for k, m in model.named_modules():
if isinstance(m, Detect):
m.inplace = False
m.export = True
# We have to shift to pytorch's fake quant ops before exporting the model to ONNX
quant.quant_nn.TensorQuantizer.use_fb_fake_quant = True
# Export ONNX for multiple batch sizes
print("Creating ONNX file: " + onnx_filename)
dummy_input = torch.randn(batch_onnx, 3, imgsz, imgsz)
try:
import onnx
with torch.no_grad():
torch.onnx.export(model.cpu(),
dummy_input.cpu(),
onnx_filename,
verbose=False,
opset_version=13,
input_names=['images'],
output_names=['output'],
dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'}} if dynamic_shape else None,
enable_onnx_checker=False,
do_constant_folding=True)
print('ONNX export success, saved as %s' % onnx_filename)
except ValueError:
warnings.warn(UserWarning("Per-channel quantization is not yet supported in Pytorch/ONNX RT (requires ONNX opset 13)"))
print("Failed to export to ONNX")
return False
except Exception as e:
print(f'{prefix} export failure: {e}')
# Checks
model_onnx = onnx.load(onnx_filename) # load onnx model
onnx.checker.check_model(model_onnx) # check onnx model
# Simplify
if simplify:
try:
import onnxsim
print(f'{prefix} simplifying with onnx-simplifier {onnxsim.__version__}...')
model_onnx, check = onnxsim.simplify(
model_onnx,
dynamic_input_shape=dynamic_shape,
input_shapes={'images': list(dummy_input.shape)} if dynamic_shape else None)
assert check, 'assert check failed'
onnx.save(model_onnx, onnx_filename)
except Exception as e:
print(f'{prefix} simplifier failure: {e}')
print(f'{prefix} export success, saved as {onnx_filename} ({file_size(onnx_filename):.1f} MB)')
print(f"{prefix} Run ONNX model inference with: 'python detect.py --weights {onnx_filename}'")
    # Switch back from PyTorch's fake-quant ops to pytorch-quantization/TensorRT's fake-quant mechanism
quant.quant_nn.TensorQuantizer.use_fb_fake_quant = False
# Restore the model to train/test mode, use Detect() layer grid
model.export = False
return True
def parse_opt():
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path')
parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'weights/yolov5n.pt', help='model.pt path(s)')
parser.add_argument('--model-name', '-m', default='yolov5n', help='model name: default yolov5s')
parser.add_argument('--batch-size', type=int, default=32, help='batch size')
parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='inference size (pixels)')
parser.add_argument('--device', default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
parser.add_argument('--workers', type=int, default=0, help='max dataloader workers (per RANK in DDP mode)')
parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
# setting for calibration
parser.add_argument('--hyp', type=str, default='data/hyp.qat.yaml', help='hyperparameters path')
    parser.add_argument('--calib-batch-size', type=int, default=32, help='calib batch size: default 32')
parser.add_argument('--sensitive-layer', default=['model.24.m.0',
'model.24.m.1',
'model.24.m.2'], help='skip sensitive layer: default detect head')
    parser.add_argument('--num-calib-batch', default=32, type=int,
                        help='Number of batches for calibration. 0 will disable calibration. (default: 32)')
parser.add_argument('--calibrator', type=str, choices=["max", "histogram"], default="max")
parser.add_argument('--percentile', nargs='+', type=float, default=[99.9, 99.99, 99.999, 99.9999])
parser.add_argument('--dynamic', default=False, help='dynamic ONNX axes')
parser.add_argument('--simplify', default=True, help='simplify ONNX file')
    parser.add_argument('--out-dir', '-o', default=ROOT / 'runs/', help='output folder: default ./runs/')
parser.add_argument('--batch-size-onnx', type=int, default=1, help='batch size for onnx: default 1')
opt = parser.parse_args()
opt.data = check_yaml(opt.data) # check YAML
# print_args(vars(opt))
return opt
def evaluate_accuracy(model, opt, testloader):
with open(opt.data, encoding='utf-8') as f:
data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict
results, _, _ = val.run(
data_dict,
batch_size=opt.batch_size,
imgsz=opt.imgsz,
model=model,
iou_thres=0.65, # best pycocotools results at 0.65
single_cls=False,
dataloader=testloader,
save_dir=Path(opt.out_dir),
save_json=False,
verbose=True,
plots=True) # val best model with plots
map50 = list(results)[3]
map = list(results)[2]
return map50, map
if __name__ == "__main__":
    # CLI options / hyperparameters
opt = parse_opt()
    # Select the device
device = select_device(opt.device, batch_size=opt.batch_size)
    # Prepare the model and the calibration dataloader
model, data_loader = prepare_model(calibrator=opt.calibrator, hyp=opt.hyp, opt=opt, device=device)
    # Calibrate the model (PTQ)
with torch.no_grad():
calibrate_model(
model=model,
model_name=opt.model_name,
data_loader=data_loader,
num_calib_batch=opt.num_calib_batch,
calibrator=opt.calibrator,
hist_percentile=opt.percentile,
out_dir=opt.out_dir,
device=device)
    # # Evaluate the accuracy of the PTQ-calibrated model
# with torch.no_grad():
# map50_calibrated, map_calibrated = evaluate_accuracy(model, opt, data_loader)
# print('Calibration evaluation:', "mAP@IoU=0.50:{:.5f}, mAP@IoU=0.50:0.95:{:.5f}".format(map50_calibrated, map_calibrated))
    # # Evaluate the accuracy of the original FP32 model
# with torch.no_grad():
# with quant.disable_quantization(model):
# map50_Orgin, map_Orgin = evaluate_accuracy(model, opt, data_loader)
# print('Orgin evaluation:', "mAP@IoU=0.50:{:.5f}, mAP@IoU=0.50:0.95:{:.5f}".format(map50_Orgin, map_Orgin ))
    # Export to ONNX
onnx_filename = './weights/yolov5n_ptq_detect.onnx'
model.export = True
export_onnx(model, onnx_filename, opt.batch_size_onnx, opt.dynamic, opt.simplify)
Post-training quantization (PTQ) is the process of converting a full-precision (FP32) neural network into a lower-precision format such as INT8. This reduces model size and memory usage and enables more efficient execution on hardware with INT8 support. The short sketch below illustrates the idea on a single tensor; the script that follows then applies calibration and per-layer sensitivity analysis to a full YOLOv5 model.
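A minimal, self-contained sketch (not part of the original scripts) of symmetric per-tensor INT8 quantization of a single weight tensor in PyTorch, mirroring the maxq scale used earlier; the tensor shape and names are illustrative only:
import torch

w = torch.randn(64, 3, 3, 3)                      # a hypothetical FP32 conv weight
scale = w.abs().max() / 127.0                     # symmetric dynamic range -> scale
w_int8 = torch.clamp(torch.round(w / scale), -127, 127).to(torch.int8)
w_dequant = w_int8.float() * scale                # the value the INT8 kernel effectively represents
print(f"fp32: {w.nelement() * w.element_size()} bytes, "
      f"int8: {w_int8.nelement() * w_int8.element_size()} bytes")
print("max abs error:", (w - w_dequant).abs().max().item())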
import argparse
import os
import sys
from pathlib import Path
import warnings
import yaml
import torch
from tqdm import tqdm
FILE = Path(__file__).resolve()
ROOT = FILE.parents[0] # YOLOv5 root directory
if str(ROOT) not in sys.path:
sys.path.append(str(ROOT)) # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
from models.yolo import Model
from utils.dataloaders import create_dataloader
from utils.general import (check_img_size, check_yaml, file_size, colorstr, check_dataset)
from utils.torch_utils import select_device
import py_quant_utils as quant
import val
def collect_stats(model, data_loader, num_batches, device):
"""Feed data to the network and collect statistics"""
# Enable calibrators
model.eval()
for name, module in model.named_modules():
if isinstance(module, quant.quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.disable_quant()
module.enable_calib()
else:
module.disable()
# Feed data to the network for collecting stats
for i, (image, targets, paths, shapes) in tqdm(enumerate(data_loader), total=num_batches):
image = image.to(device, non_blocking=True)
image = image.float() # uint8 to fp16/32
image /= 255.0 # 0 - 255 to 0.0 - 1.0
model(image)
if i >= num_batches:
break
# Disable calibrators
for name, module in model.named_modules():
if isinstance(module, quant.quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.enable_quant()
module.disable_calib()
else:
module.enable()
def compute_amax(model, **kwargs):
# Load calib result
for name, module in model.named_modules():
if isinstance(module, quant.quant_nn.TensorQuantizer):
if module._calibrator is not None:
if isinstance(module._calibrator, quant.calib.MaxCalibrator):
module.load_calib_amax()
else:
module.load_calib_amax(**kwargs)
print(F"{name:40}: {module}")
def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator, hist_percentile, out_dir, device):
"""
Feed data to the network and calibrate.
Arguments:
model: detection model
model_name: name to use when creating state files
data_loader: calibration data set
        num_calib_batch: number of calibration batches to run
calibrator: type of calibration to use (max/histogram)
        hist_percentile: percentiles to be used for histogram calibration
out_dir: dir to save state files in
"""
if num_calib_batch > 0:
print("Calibrating model")
with torch.no_grad():
collect_stats(model, data_loader, num_calib_batch, device)
if not calibrator == "histogram":
compute_amax(model, method="max")
calib_output = os.path.join(out_dir, F"{model_name}-max-{num_calib_batch * data_loader.batch_size}.pth")
torch.save(model.state_dict(), calib_output)
else:
for percentile in hist_percentile:
print(F"{percentile} percentile calibration")
                compute_amax(model, method="percentile", percentile=percentile)
calib_output = os.path.join(out_dir, F"{model_name}-percentile-{percentile}-{num_calib_batch * data_loader.batch_size}.pth")
torch.save(model.state_dict(), calib_output)
for method in ["mse", "entropy"]:
print(F"{method} calibration")
compute_amax(model, method=method)
calib_output = os.path.join(out_dir, F"{model_name}-{method}-{num_calib_batch * data_loader.batch_size}.pth")
torch.save(model.state_dict(), calib_output)
def load_model(weight, device) -> Model:
model = torch.load(weight, map_location=device)["model"]
model.float()
model.eval()
with torch.no_grad():
model.fuse()
return model
def prepare_model(calibrator, opt, device):
    """
    1. Load the FP32 model
    2. Insert Q/DQ nodes into the model
    3. Configure the quantization scheme (per_tensor / per_channel)
    4. Build the calibration dataloader
    """
with open(opt.data, encoding='utf-8') as f:
data_dict = yaml.load(f, Loader=yaml.SafeLoader)
data_dict = check_dataset(data_dict)
calib_path = data_dict['val']
    # Initialize the calibration method
quant.initialize_calib_method(per_channel_quantization=True, calib_method=calibrator)
    # Load the FP32 PyTorch model
model = load_model(opt.weights, device)
    # Insert Q/DQ nodes into the FP32 PyTorch model (skipping the sensitive layers)
quant.replace_to_quantization_module(model, ignore_policy=opt.sensitive_layer)
model.eval()
model.cuda()
gs = max(int(model.stride.max()), 32) # grid size (max stride)
imgsz, _ = [check_img_size(x, gs) for x in [opt.imgsz, opt.imgsz]] # verify imgsz are gs-multiples
# Calib dataloader
calib_loader = create_dataloader(calib_path,
imgsz,
opt.batch_size,
gs,
hyp=None,
cache=opt.cache,
rect=True,
rank=-1,
workers=opt.workers * 2,
pad=0.5,
prefix=colorstr('calib: '))[0]
return model, calib_loader
def export_onnx(model, onnx_filename, batch_onnx, dynamic_shape, simplify, imgsz=672, prefix=colorstr('calib: ')):
from models.yolo import Detect
model.eval()
for k, m in model.named_modules():
if isinstance(m, Detect):
m.inplace = False
m.export = True
# We have to shift to pytorch's fake quant ops before exporting the model to ONNX
quant.quant_nn.TensorQuantizer.use_fb_fake_quant = True
# Export ONNX for multiple batch sizes
print("Creating ONNX file: " + onnx_filename)
im = torch.randn(1, 3, imgsz, imgsz)
try:
import onnx
torch.onnx.export(
model.cpu() if dynamic_shape else model, # --dynamic only compatible with cpu
im.cpu() if dynamic_shape else im,
onnx_filename,
verbose=False,
opset_version=13,
training= torch.onnx.TrainingMode.EVAL,
do_constant_folding=True,
input_names=['images'],
output_names=['output'],
dynamic_axes={
'images': {0: 'batch'},
'output': {0: 'batch'} # shape(1,25200,85)
} if dynamic_shape else None)
print('ONNX export success, saved as %s' % onnx_filename)
except ValueError:
warnings.warn(UserWarning("Per-channel quantization is not yet supported in Pytorch/ONNX RT (requires ONNX opset 13)"))
print("Failed to export to ONNX")
return False
except Exception as e:
print(f'{prefix} export failure: {e}')
# Checks
model_onnx = onnx.load(onnx_filename) # load onnx model
onnx.checker.check_model(model_onnx) # check onnx model
# Simplify
if simplify:
try:
import onnxsim
print(f'{prefix} simplifying with onnx-simplifier {onnxsim.__version__}...')
model_onnx, check = onnxsim.simplify(model_onnx)
assert check, 'assert check failed'
onnx.save(model_onnx, onnx_filename)
except Exception as e:
print(f'{prefix} simplifier failure: {e}')
print(f'{prefix} export success, saved as {onnx_filename} ({file_size(onnx_filename):.1f} MB)')
print(f"{prefix} Run ONNX model inference with: 'python detect.py --weights {onnx_filename}'")
def parse_opt():
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path')
parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'weights/yolov5n.pt', help='model.pt path(s)')
parser.add_argument('--model-name', '-m', default='yolov5n', help='model name: default yolov5s')
parser.add_argument('--batch-size', type=int, default=32, help='batch size')
parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='inference size (pixels)')
parser.add_argument('--device', default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
parser.add_argument('--workers', type=int, default=0, help='max dataloader workers (per RANK in DDP mode)')
parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
# setting for calibration
parser.add_argument('--calib-batch-size', type=int, default=64, help='calib batch size: default 64')
# parser.add_argument('--sensitive-layer', default=['model.24.m.0',
# 'model.24.m.1',
# 'model.24.m.2'], help='skip sensitive layer: default detect head')
parser.add_argument('--sensitive-layer', default=[], help='skip sensitive layer: default detect head')
    parser.add_argument('--num-calib-batch', default=64, type=int,
                        help='Number of batches for calibration. 0 will disable calibration. (default: 64)')
parser.add_argument('--calibrator', type=str, choices=["max", "histogram"], default="max")
parser.add_argument('--percentile', nargs='+', type=float, default=[99.9, 99.99, 99.999, 99.9999])
parser.add_argument('--dynamic', default=False, help='dynamic ONNX axes')
parser.add_argument('--simplify', default=True, help='simplify ONNX file')
    parser.add_argument('--out-dir', '-o', default=ROOT / 'weights/', help='output folder: default ./weights/')
parser.add_argument('--batch-size-onnx', type=int, default=1, help='batch size for onnx: default 1')
opt = parser.parse_args()
opt.data = check_yaml(opt.data) # check YAML
return opt
def evaluate_accuracy(model, opt, testloader):
with open(opt.data, encoding='utf-8') as f:
data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict
results, _, _ = val.run(
data_dict,
batch_size=opt.batch_size,
imgsz=opt.imgsz,
model=model,
iou_thres=0.65, # best pycocotools results at 0.65
single_cls=False,
dataloader=testloader,
save_dir=Path(opt.out_dir),
save_json=False,
verbose=True,
plots=True) # val best model with plots
map50 = list(results)[3]
map = list(results)[2]
return map50, map
def sensitive_analysis(model, opt, data_loader, summary_file='./summary_sensitive_analysis.json'):
summary = quant.SummaryTool(summary_file)
    # Evaluate the accuracy of the PTQ-calibrated model
map50_calibrated, map_calibrated = evaluate_accuracy(model, opt, data_loader)
summary.append([map50_calibrated, map_calibrated, "PTQ"])
print("Sensitive Analysis by each layer...")
for i in range(0, len(model.model)):
layer = model.model[i]
if quant.have_quantizer(layer):
print(f"Quantization disable model.{i}")
quant.disable_quantization(layer).apply()
map50_calibrated, map_calibrated = evaluate_accuracy(model, opt, data_loader)
summary.append([map50_calibrated, map_calibrated, f"model.{i}"])
quant.enable_quantization(layer).apply()
else:
print(f"ignore model.{i} because it is {type(layer)}")
summary = sorted(summary.data, key=lambda x:x[0], reverse=True)
print("Sensitive summary:")
    for n, (map50_calibrated, map_calibrated, name) in enumerate(summary[:10]):
print(f"Top{n}: Using fp16 {name}, map_calibrated = {map_calibrated:.5f}")
if __name__ == "__main__":
    # CLI options / hyperparameters
opt = parse_opt()
    # Select the device
device = select_device(opt.device, batch_size=opt.batch_size)
    # Prepare the model and the calibration dataloader
model, data_loader = prepare_model(calibrator=opt.calibrator, opt=opt, device=device)
    # Calibrate the model (PTQ)
with torch.no_grad():
calibrate_model(
model=model,
model_name=opt.model_name,
data_loader=data_loader,
num_calib_batch=opt.num_calib_batch,
calibrator=opt.calibrator,
hist_percentile=opt.percentile,
out_dir=opt.out_dir,
device=device)
sensitive_analysis(model, opt, data_loader)