一.疑问
这几天一直纠结于一个问题:
同样的代码,为什么在keras的0.3.3版本中,拟合得比较好,也没有过拟合,验证集准确率一直高于训练准确率. 但是在换到keras的1.2.0版本中的时候,就过拟合了,验证误差一直高于训练误差
二.答案
今天终于发现原因了:这两个版本的keras中optimizer的实现不一样,但是它们的默认参数是一样的.因为我的代码中用的是Adam优化器,下面就以optimizers.py中的Adam为例来说明:
1.下面是keras==0.3.3时,optimizers.py中Adam的实现:
class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    This listing is the implementation as shipped with keras==0.3.3.
    It has no learning-rate decay argument.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        # Only **kwargs is forwarded to the base class; extra positional
        # arguments (*args) are accepted but not passed on.
        super(Adam, self).__init__(**kwargs)
        # Copies every local name onto the instance. This is the only place
        # `self.epsilon` gets set, so the statement must stay ahead of any
        # code that reads it and must see the constructor arguments unchanged.
        self.__dict__.update(locals())
        # Step counter, stored as a backend variable starting at 0.
        self.iterations = K.variable(0)
        # Replace the plain-number attributes set above with backend variables.
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        '''Build and return the list of `(variable, new_value)` update pairs.

        `params`, the gradients and `constraints` are zipped together, so
        each parameter is paired with the constraint at the same position,
        and that constraint is called on the parameter's updated value.
        '''
        grads = self.get_gradients(loss, params)
        # First update: advance the step counter by one.
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        # Learning rate rescaled by sqrt(1 - beta_2^t) / (1 - beta_1^t).
        lr_t = self.lr * K.sqrt(1 - K.pow(self.beta_2, t)) / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # The accumulators are created here, on every call, as zero
            # arrays shaped like the parameter's current value.
            # zero init of moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of velocity
            v = K.variable(np.zeros(K.get_value(p).shape))

            # Running averages of the gradient and of the squared gradient.
            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1 - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((v, v_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        '''Return the class name and current hyper-parameters as a plain dict.'''
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}
2.下面是keras==1.2.0时,optimizers.py中Adam的实现:
class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    This listing is the implementation as shipped with keras==1.2.0.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        decay: float. When > 0, the learning rate is scaled by
            1 / (1 + decay * iterations) in `get_updates`.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(Adam, self).__init__(**kwargs)
        # Copies every local name onto the instance. This is the only place
        # `self.epsilon` gets set, so the statement must stay ahead of any
        # code that reads it and must see the constructor arguments unchanged.
        self.__dict__.update(locals())
        # Step counter, stored as a backend variable starting at 0.
        self.iterations = K.variable(0)
        # Replace the plain-number attributes set above with backend variables.
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        # Plain Python copy of `decay` (attribute name spelled "inital" as in
        # this listing); `get_updates` compares it with 0 in Python.
        self.inital_decay = decay

    def get_updates(self, params, constraints, loss):
        '''Build and return the list of update operations.

        `constraints` is used as a mapping keyed by parameter: a parameter
        that appears in it has the mapped callable applied to its updated
        value; other parameters are updated unconstrained.
        '''
        grads = self.get_gradients(loss, params)
        # First update: advance the step counter by one.
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        # The decay factor is only applied when a positive decay was passed
        # to the constructor; with the default decay=0. this is skipped.
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        # Learning rate rescaled by sqrt(1 - beta_2^t) / (1 - beta_1^t).
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

        # One zero-initialised accumulator pair per parameter, built from the
        # parameter shapes.
        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        # Optimizer state exposed as `weights`: counter first, then all m, then all v.
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            # Running averages of the gradient and of the squared gradient.
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        '''Return the base-class config merged with this optimizer's hyper-parameters.

        Entries from this class come last in the merged item list, so they
        win if a key also appears in the base config.
        '''
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'epsilon': self.epsilon}
        base_config = super(Adam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
读代码对比,可发现这两者的实现方式有不同:例如1.2.0版本新增了decay(学习率衰减)参数,m/v变量的创建方式不同,约束(constraints)也从与params一一对应的列表变成了按参数查找的字典.而我的代码中一直使用的是adam的默认参数,所以才会结果不一样.
三.解决
要避免这一问题可用以下方法:
1.在自己的代码中,要对优化器的参数给定,不要用默认参数.
adam = optimizers.Adam(lr=1e-4)
但是,在keras官方文档中,明确有说明,在用这些优化器的时候,最好使用默认参数,所以也可采用第2种方法.
2.显式指定学习率的调度方式(schedule):也就是在训练的时候,通过fit函数的callbacks参数传入一个学习率调度回调.
比如:
# Learning-rate schedule callback: with these arguments Step sets the
# optimizer's lr to 1e-4 for epochs below 20 and to 1e-6 from epoch 20 on.
schedule = Step([20], [1e-4, 1e-6])

# Checkpoint callback: saves the model to `filepath` after each epoch
# (configured here with save_best_only=True, monitoring val_loss).
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    mode='auto')

# Optional, currently disabled: early stopping. When the monitored value stops
# improving (e.g. the loss did not drop compared with the previous epoch),
# training is stopped after `patience` more epochs. To enable, add
#   keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
# to the callbacks list below.
history = model.fit(
    X_train, Y_train,
    batch_size=batch_size,
    nb_epoch=nb_epoch,
    validation_data=(X_test, Y_test),
    callbacks=[schedule, checkpoint],
    verbose=2,
    shuffle=True)
其中Step类(一个自定义的Callback)定义如下:
class Step(Callback):
    """Piecewise-constant learning-rate schedule, applied at each epoch start.

    At the beginning of every epoch the optimizer's learning rate is set to
    ``learning_rates[i]``, where ``i`` is the index of the first entry in
    ``steps`` that is greater than the epoch number. Once the epoch number
    has reached every entry in ``steps``, ``learning_rates[len(steps)]`` is
    used.

    # Arguments
        steps: list of epoch boundaries. The first boundary greater than the
            current epoch is selected, so the list is expected to be in
            ascending order.
        learning_rates: list of learning rates. It is indexed up to
            ``len(steps)``, so it needs one more entry than ``steps``.
        verbose: int. When equal to 1, the learning rate is printed every
            time it is set.
    """

    def __init__(self, steps, learning_rates, verbose=0):
        super(Step, self).__init__()
        self.steps = steps
        self.lr = learning_rates
        self.verbose = verbose

    def change_lr(self, new_lr):
        """Write `new_lr` into the attached model's optimizer learning rate."""
        K.set_value(self.model.optimizer.lr, new_lr)
        if self.verbose == 1:
            print('Learning rate is %g' % new_lr)

    def on_epoch_begin(self, epoch, logs=None):
        """Select and apply the learning rate for `epoch`.

        `logs` is accepted for compatibility with the callback interface and
        is not used.
        """
        for i, step in enumerate(self.steps):
            if epoch < step:
                self.change_lr(self.lr[i])
                return
        # Past the last boundary. Indexing with len(self.steps) rather than
        # the leftover loop variable also works when `steps` is empty.
        self.change_lr(self.lr[len(self.steps)])

    def get_config(self):
        """Return the constructor arguments and class name as a plain dict."""
        config = {'class': type(self).__name__,
                  'steps': self.steps,
                  'learning_rates': self.lr,
                  'verbose': self.verbose}
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild a `Step` from a config dict.

        If the dict carries an 'epoch_offset' entry, it is subtracted from
        every step; `get_config` itself does not emit that key.
        """
        offset = config.get('epoch_offset', 0)
        steps = [step - offset for step in config['steps']]
        return cls(steps, config['learning_rates'],
                   verbose=config.get('verbose', 0))