nan – not a number: the training has become numerically unstable, producing values that are too large or too small.
Operations such as log or division easily produce infinite results. Some samples yield intermediate values too close to 0 or 1, so the loss computation returns inf; such samples are referred to as "dirty" data.
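A quick illustration in PyTorch of how such values arise:

import torch

print(torch.log(torch.tensor(0.0)))           # log(0) -> -inf
print(torch.tensor(0.0) / torch.tensor(0.0))  # 0/0   -> nan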
Set batch_size=1 and shuffle=False, then step through the samples one by one to locate the dirty ones.
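A minimal sketch of this per-sample check (dataset, model, and criterion are placeholder names for your own objects):

import torch
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=1, shuffle=False)
for idx, (image, target) in enumerate(loader):
    loss = criterion(model(image), target)
    if not torch.isfinite(loss):  # flags both inf and nan
        print(f'non-finite loss at sample {idx}')
        break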
When one data column is very small and another very large, nan arises easily; it is recommended to normalize the data first.
import numpy as np

class Normalize(object):
    """Zero-mean / unit-variance normalization of a sample's image."""
    def __call__(self, sample):
        image, box = sample['image'], sample['box']
        mean = np.mean(image.astype(np.float32), axis=0)
        std = np.std(image.astype(np.float32), axis=0)
        np.seterr(divide='ignore', invalid='ignore')  # silence divide-by-zero warnings
        # add a small epsilon so a constant column (std == 0) cannot yield nan/inf
        image_nor = (image.astype(np.float32) - mean) / (std + 1e-8)
        return {'image': image_nor, 'box': box}
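Hypothetical usage, assuming each sample is a dict with 'image' and 'box' entries as above:

transform = Normalize()
sample = transform({'image': image, 'box': box})  # both numpy arrays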
Gradients must be cleared at the start of every batch:
optimizer.zero_grad()
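For context, its usual position in a training loop (a minimal sketch; model, loader, and criterion are placeholder names):

for image, target in loader:
    optimizer.zero_grad()  # clear gradients left over from the previous batch
    loss = criterion(model(image), target)
    loss.backward()
    optimizer.step()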
Unstable BatchNorm statistics can also produce nan; one remedy is to freeze the BN layers. Two approaches:

# method 1: freeze the affine parameters of every BatchNorm layer
# (their requires_grad defaults to True, and model.train() does not change that)
for m in self.modules():
    if isinstance(m, nn.BatchNorm2d):
        m.weight.requires_grad = False
        m.bias.requires_grad = False
# method 2: build the layer without running statistics in the first place
self.bn = nn.BatchNorm2d(out_channels, eps=1e-5, momentum=0.0, track_running_stats=False)
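Note that requires_grad = False only freezes the affine parameters; in train mode BatchNorm still updates its running statistics. A common companion step (a sketch, for the case where BN should be frozen completely) is to switch those layers back to eval mode after every model.train() call:

def freeze_bn(model):
    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d):
            m.eval()  # stop updating running_mean / running_var

model.train()
freeze_bn(model)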
Common CNN initialization strategies include Xavier and Kaiming (also called He or MSRA) initialization. PyTorch provides the following initializers:
torch.nn.init.uniform_(tensor, a=0, b=1)  # uniform distribution
torch.nn.init.normal_(tensor, mean=0, std=1)  # normal distribution
torch.nn.init.constant_(tensor, val)  # constant value
torch.nn.init.xavier_uniform_(tensor, gain=1)  # Xavier uniform
torch.nn.init.xavier_normal_(tensor, gain=1)  # Xavier normal
torch.nn.init.kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')  # Kaiming uniform
torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')  # Kaiming normal
Example network-initialization code:
for m in self.modules():
    if isinstance(m, nn.Conv2d):
        # Kaiming initialization: std = sqrt(2 / fan_out)
        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        m.weight.data.normal_(0, math.sqrt(2. / n))
    elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()
        # freeze bn
        m.weight.requires_grad = False
        m.bias.requires_grad = False
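Equivalently, the initialization can be factored into a standalone function and applied with Module.apply, using the torch.nn.init helpers listed above (a sketch):

def init_weights(m):
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)

model.apply(init_weights)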
For object-detection networks, badly chosen anchors can prevent gt_box and pred_box from being matched during training, which causes abnormal gradient behavior and, indirectly, nan.
One fix is to run k-means clustering over the ground-truth boxes of the training set and take the cluster centers as the anchor sizes. Reference code:
import glob
import xml.etree.ElementTree as ET

import numpy as np


def cas_iou(box, cluster):
    # IoU between one (w, h) box and every cluster center,
    # assuming all boxes share the same top-left corner
    x = np.minimum(cluster[:, 0], box[0])
    y = np.minimum(cluster[:, 1], box[1])
    intersection = x * y
    area1 = box[0] * box[1]
    area2 = cluster[:, 0] * cluster[:, 1]
    iou = intersection / (area1 + area2 - intersection)
    return iou


def avg_iou(box, cluster):
    # mean best-IoU of every box against its nearest cluster center
    return np.mean([np.max(cas_iou(box[i], cluster)) for i in range(box.shape[0])])


def kmeans(box, k):
    row = box.shape[0]
    distance = np.empty((row, k))
    last_clu = np.zeros((row,))
    np.random.seed()
    # initialize the k cluster centers with k randomly chosen boxes
    cluster = box[np.random.choice(row, k, replace=False)]
    while True:
        # distance metric: 1 - IoU
        for i in range(row):
            distance[i] = 1 - cas_iou(box[i], cluster)
        near = np.argmin(distance, axis=1)
        if (last_clu == near).all():
            break
        # move each center to the median of its assigned boxes
        # (if a cluster ends up empty, np.median returns nan; rerun in that case)
        for j in range(k):
            cluster[j] = np.median(box[near == j], axis=0)
        last_clu = near
    return cluster


def load_data(path):
    # collect the (w, h) of every ground-truth box, normalized by image size
    data = []
    for xml_file in glob.glob(f'{path}/*.xml'):
        print(xml_file)
        tree = ET.parse(xml_file)
        height = int(tree.findtext('./size/height'))
        width = int(tree.findtext('./size/width'))
        for obj in tree.iter('object'):
            xmin = np.float64(int(float(obj.findtext('bndbox/xmin'))) / width)
            ymin = np.float64(int(float(obj.findtext('bndbox/ymin'))) / height)
            xmax = np.float64(int(float(obj.findtext('bndbox/xmax'))) / width)
            ymax = np.float64(int(float(obj.findtext('bndbox/ymax'))) / height)
            data.append([xmax - xmin, ymax - ymin])
    return np.array(data)


if __name__ == '__main__':
    SIZE = 192                     # network input size
    anchors_num = 9
    path = 'your_annotation_path'  # directory of VOC-style XML annotations
    data = load_data(path)
    out = kmeans(data, anchors_num)
    out = out[np.argsort(out[:, 0])]  # sort anchors by width
    print('acc:{:.2f}%'.format(avg_iou(data, out) * 100))
    print(out * SIZE)
Overfitting while training with a frozen backbone can lead to abnormal gradients once the backbone is unfrozen, and the loss may become nan. One mitigation is to enlarge the eps term of the Adam optimizer, which keeps the denominator of its parameter update away from zero:
optimizer = optim.Adam(model.parameters(), lr=1e-3, eps=1e-4)