When using gradient descent to minimize $f(x)$, starting from the current value $x_i$, the next value $x_{i+1}$ is computed as

$$x_{i+1} = x_i - lr \cdot \frac{\partial f(x_i)}{\partial x_i}$$
where lr is the learning rate. If the learning rate is too large, the optimization may fail to converge properly; if it is too small, convergence becomes very slow.
For example, if the optimization objective is $f(x) = x^2$, the gradient of $f(x)$ is

$$\frac{\partial f(x)}{\partial x} = 2x$$
If the learning rate is too large, say $lr = 1$, and the initial value is 5:
| step | value before update | value after update |
| --- | --- | --- |
| 1 | 5 | 5 - 1 * 2 * 5 = -5 |
| 2 | -5 | -5 - 1 * 2 * (-5) = 5 |
| 3 | 5 | 5 - 1 * 2 * 5 = -5 |
As the table shows, the iterate simply oscillates between 5 and -5; after every two updates it is back where it started, so the optimization never converges.
If the learning rate is too small, say 0.001, each update only shrinks the value by a factor of $1 - 2 \times 0.001 = 0.998$, so it takes roughly 2300 steps to get from the initial value 5 down to 0.05, which makes convergence far too slow.
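Both failure modes are easy to reproduce with a few lines of plain Python. The sketch below only illustrates the update rule above; the helper `gradient_descent` is an assumption for this illustration and is not part of the TensorFlow example that follows:

```python
def gradient_descent(x, lr, target=0.05, max_steps=5000):
    """Minimize f(x) = x**2 with a fixed learning rate; return the trajectory."""
    history = [x]
    for _ in range(max_steps):
        x = x - lr * 2 * x           # x_{i+1} = x_i - lr * df/dx, where df/dx = 2x
        history.append(x)
        if abs(x) <= target:         # stop once the value is close enough to 0
            break
    return history

# lr = 1: the iterate oscillates between 5 and -5 and never converges.
print(gradient_descent(5, lr=1, max_steps=5))        # [5, -5, 5, -5, 5, -5]

# lr = 0.001: it converges, but needs roughly 2300 steps to reach 0.05.
print(len(gradient_descent(5, lr=0.001)) - 1)        # ~2300
```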
TensorFlow provides a way to decay the learning rate during training, via the following function:
tf.train.exponential_decay(start_learning_rate,
                           global_step,
                           num_decay_step,
                           decay_rate,
                           staircase=True)
The decayed learning rate is therefore

$$lr = start\_learning\_rate \times decay\_rate^{\frac{global\_step}{num\_decay\_step}}$$
In other words, every num_decay_step steps the learning rate is multiplied by a factor smaller than 1. staircase=True means the exponent is truncated to an integer, so the learning rate decays in discrete steps (see Figure 1). num_decay_step is usually set to the number of training steps per epoch, which guarantees that all batches within a single epoch use the same learning rate.
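The schedule itself is easy to check outside of TensorFlow. The snippet below is a minimal sketch of the formula above, not TensorFlow's implementation; the function name `exponential_decay` here and its arguments simply mirror the ones listed earlier:

```python
import math

def exponential_decay(start_learning_rate, global_step,
                      num_decay_step, decay_rate, staircase=False):
    """Return the decayed learning rate at a given global_step (illustrative only)."""
    exponent = global_step / num_decay_step
    if staircase:
        exponent = math.floor(exponent)   # integer exponent -> step-shaped schedule
    return start_learning_rate * decay_rate ** exponent

# With staircase=True the rate only changes every num_decay_step steps;
# without it the rate decays smoothly at every step.
for step in (0, 50, 100, 150, 200):
    print(step,
          round(exponential_decay(0.1, step, 100, 0.96, staircase=True), 6),
          round(exponential_decay(0.1, step, 100, 0.96, staircase=False), 6))
```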
The full code is as follows:
import tensorflow as tf
from numpy.random import RandomState
from absl import flags, app
import matplotlib.pyplot as plt
import numpy as np
FLAGS = flags.FLAGS
flags.DEFINE_integer('num_batch_size', 20, 'batch size')
flags.DEFINE_integer('num_data', 1000, 'num data size')
flags.DEFINE_integer('num_train_step', 10000, 'num train step')
flags.DEFINE_boolean('stair_case', False, 'stair case or not')
flags.DEFINE_integer('num_decay_step', 100, 'num decay step')
SEED = 1
NUM_FEATURE = 2
class Data():
    '''
    Generates a toy regression dataset y = 2 * x1 + x2 and splits it
    into a training half and a test half.
    '''
    def __init__(self, num_data):
        self.num_data = num_data
        self._generate_data()

    def _generate_data(self):
        rs = RandomState(SEED)
        all_data = rs.rand(self.num_data, NUM_FEATURE)
        y = [[2 * x1 + x2] for x1, x2 in all_data]
        self.num_train_data = int(self.num_data / 2)
        self.num_test_data = self.num_data - self.num_train_data
        self.train_data_x = all_data[0: self.num_train_data, :]
        self.train_data_y = y[0: self.num_train_data]
        self.test_data_x = all_data[self.num_train_data: self.num_data, :]
        self.test_data_y = y[self.num_train_data: self.num_data]

    def get_all_test_data(self):
        return self.test_data_x, self.test_data_y

    def get_batch_data(self, step, batch_size):
        # Cycle through the training data; the modulo keeps start_index in
        # [0, num_train_data), so batches wrap around at the end of an epoch.
        start_index = batch_size * step % self.num_train_data
        end_index = min(start_index + batch_size, self.num_train_data)
        return self.train_data_x[start_index: end_index, :], \
               self.train_data_y[start_index: end_index]
# Model definition: a single-hidden-layer network.
def _sigmoid(x):
    return 1 / (1 + tf.math.exp(-x))

def inference(input_tensor, hidden_layer):
    with tf.variable_scope("weight1", reuse=tf.AUTO_REUSE):
        w1 = tf.get_variable("w1",
                             shape=[NUM_FEATURE, hidden_layer],
                             initializer=tf.initializers.orthogonal())
        w2 = tf.get_variable("w2",
                             shape=[hidden_layer, 1],
                             initializer=tf.initializers.orthogonal())
        b1 = tf.get_variable("b1",
                             shape=[1, hidden_layer],
                             initializer=tf.initializers.orthogonal())
        # hidden layer with sigmoid activation, followed by a linear output layer
        hidden_output = _sigmoid(tf.matmul(input_tensor, w1) + b1)
        y_predict = tf.matmul(hidden_output, w2)
        return y_predict
def main(argv):
    input = tf.placeholder(dtype=tf.float32, shape=[None, 2], name="inputs")
    y_true = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="y_true")
    # inference result
    y_predict = inference(input, 10)
    print(y_predict)
    print(y_true)
    # mean squared error loss
    loss = tf.math.reduce_mean(tf.math.square(y_predict - y_true))
    # global_step is incremented by the optimizer and drives the decay schedule
    global_step = tf.Variable(0, trainable=False)
    learning_rate = \
        tf.train.exponential_decay(0.1, global_step, FLAGS.num_decay_step, 0.96,
                                   staircase=FLAGS.stair_case)
    # define optimizer
    op = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_step = op.minimize(loss, global_step=global_step)
    init = tf.global_variables_initializer()
    step_list = []
    lr_list = []
    train_loss_list = []
    test_loss_list = []
    with tf.Session() as sess:
        all_data = Data(FLAGS.num_data)
        sess.run(init)
        for i in range(FLAGS.num_train_step):
            train_x, train_y = all_data.get_batch_data(i, FLAGS.num_batch_size)
            _, train_loss = sess.run([train_step, loss],
                                     feed_dict={input: train_x, y_true: train_y})
            # get test error
            test_x, test_y = all_data.get_all_test_data()
            test_loss = sess.run(loss, feed_dict={input: test_x, y_true: test_y})
            step_list.append(sess.run(global_step))
            lr_list.append(sess.run(learning_rate))
            train_loss_list.append(train_loss)
            test_loss_list.append(test_loss)
            if i % 100 == 0:
                print('Step: {}, training loss: {}, test_loss: {}'.format(i + 1, train_loss, test_loss))
    # plot step and learning rate
    fig, (axs1, axs2) = plt.subplots(1, 2)
    axs1.plot(step_list, lr_list)
    axs1.set_title('the learning rate over step')
    axs1.set_xlabel('step')
    axs1.set_ylabel('learning rate')
    # plot train and test loss
    axs2.plot(step_list, train_loss_list, '-', step_list, test_loss_list, '*')
    axs2.set_title('training and test loss over step')
    axs2.set_xlabel('step')
    axs2.set_ylabel('Loss')
    plt.show()

if __name__ == "__main__":
    app.run(main)
python3 chapter_3_4_5.py --num_train_step 1000 --stair_case --num_decay_step 200
Figure 1: with staircase learning rate decay; the left panel shows the learning rate over training steps, the right panel shows the training and test loss.
python3 chapter_3_4_5.py --num_train_step 1000 --num_decay_step 200