# Given:
lr = 1e-2                         # learning rate
X = {x^1, x^2, ..., x^N}          # N inputs
Y = {y^1, y^2, ..., y^N}          # N labels
W = init_op().run()               # initialize the weights
compute_loss(X, W, Y)             # loss function
compute_grads(cost, W)            # gradient function: d(cost)/dW
while True:
    # sample a random batch of size m,
    # e.g. X_batch = {x^2, x^4, x^6, ...}, Y_batch = {y^2, y^4, y^6, ...}
    X_batch, Y_batch = random_choice(X, Y, size=m)
    # compute the cost
    cost = mean(sum(compute_loss(X_batch, W, Y_batch), axis=[1,2,3]))
    # compute the gradients
    grads = compute_grads(cost, W)
    # apply the gradients
    W = W - lr * grads
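As a sanity check, here is a minimal runnable NumPy sketch of the SGD loop above on an assumed toy least-squares problem; `random_choice` and the gradient computation are hypothetical stand-ins for the pseudocode's helpers, not the original training code.

```python
# Minimal runnable SGD sketch (assumed toy setup).
import numpy as np

rng = np.random.default_rng(0)
N, d, m, lr = 1000, 5, 32, 1e-2
X = rng.normal(size=(N, d))
true_W = rng.normal(size=d)
Y = X @ true_W + 0.01 * rng.normal(size=N)

W = np.zeros(d)  # plays the role of init_op().run()

def random_choice(X, Y, size):
    # stand-in for the pseudocode's batch sampler
    idx = rng.integers(0, len(X), size)
    return X[idx], Y[idx]

for _ in range(2000):
    X_batch, Y_batch = random_choice(X, Y, size=m)
    # gradient of the mean squared error 0.5 * mean((X_batch @ W - Y_batch)**2)
    grads = X_batch.T @ (X_batch @ W - Y_batch) / m
    W = W - lr * grads

print(np.linalg.norm(W - true_W))  # should be small after training
```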
- Momentum
  - Momentum's step size is roughly that of SGD with a 10x learning rate (gamma = 0.9): with a constant gradient g, the accumulator converges to g * (1 + gamma + gamma^2 + ...) = g / (1 - gamma) = 10g (verified numerically after the code below).
# Given (in addition to the SGD setup above):
gamma = 0.9
sum_grads = zeros_like(W)         # exponentially weighted sum of past gradients
while True:
    # sample a random batch of size m
    X_batch, Y_batch = random_choice(X, Y, size=m)
    # compute the cost
    cost = mean(sum(compute_loss(X_batch, W, Y_batch), axis=[1,2,3]))
    # compute the gradients
    grads = compute_grads(cost, W)
    # apply the gradients: accumulate with decay, then step
    sum_grads = gamma * sum_grads + grads
    W = W - lr * sum_grads
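A quick numeric check of the 10x claim, sketched under the assumption of a constant scalar gradient g = 1:

```python
# With a constant gradient g and gamma = 0.9, sum_grads converges to
# g / (1 - gamma) = 10 * g, i.e. a 10x effective step size.
gamma, g, sum_grads = 0.9, 1.0, 0.0
for _ in range(100):
    sum_grads = gamma * sum_grads + g
print(sum_grads)  # -> ~10.0
```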
- AdaGrad
# Given (in addition to the SGD setup above):
epsilon = 1e-10                   # avoids division by zero
sum_square_grads = zeros_like(W)  # per-parameter sum of squared gradients
while True:
    # sample a random batch of size m
    X_batch, Y_batch = random_choice(X, Y, size=m)
    # compute the cost
    cost = mean(sum(compute_loss(X_batch, W, Y_batch), axis=[1,2,3]))
    # compute the gradients
    grads = compute_grads(cost, W)
    # apply the gradients: scale each parameter's step by its gradient history
    sum_square_grads += square(grads)
    W = W - lr * grads / (sqrt(sum_square_grads) + epsilon)
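To see the decay concretely, a small sketch assuming a constant scalar gradient: after t steps the accumulator is t * g**2, so the effective step shrinks like lr / sqrt(t).

```python
# AdaGrad's effective step with a constant gradient g = 1:
# lr, lr/sqrt(2), lr/sqrt(3), ... -> 0 as training proceeds.
import numpy as np
lr, epsilon, g, acc = 1e-2, 1e-10, 1.0, 0.0
for t in range(1, 6):
    acc += g**2
    print(t, lr * g / (np.sqrt(acc) + epsilon))
```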
- RMSProp
  - RMSProp is roughly AdaGrad plus momentum-style averaging: AdaGrad's sum_square_grads keeps adding new terms as training proceeds, which amounts to a rapidly decaying learning rate, so RMSProp replaces the raw sum with an exponential moving average (see the check after the code below).
# Given (in addition to the SGD setup above):
epsilon = 1e-10
gamma = 0.9
sum_square_grads = zeros_like(W)  # moving average of squared gradients
while True:
    # sample a random batch of size m
    X_batch, Y_batch = random_choice(X, Y, size=m)
    # compute the cost
    cost = mean(sum(compute_loss(X_batch, W, Y_batch), axis=[1,2,3]))
    # compute the gradients
    grads = compute_grads(cost, W)
    # apply the gradients: exponential moving average instead of a raw sum
    sum_square_grads = gamma * sum_square_grads + (1 - gamma) * square(grads)
    W = W - lr * grads / (sqrt(sum_square_grads) + epsilon)
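In contrast to AdaGrad, the same constant-gradient assumption shows the moving average converging to g**2 instead of growing without bound, so the effective step settles near lr:

```python
# RMSProp's effective step with a constant gradient g = 1 stabilizes at ~lr.
import numpy as np
lr, epsilon, gamma, g, ema = 1e-2, 1e-10, 0.9, 1.0, 0.0
for _ in range(100):
    ema = gamma * ema + (1 - gamma) * g**2
print(lr * g / (np.sqrt(ema) + epsilon))  # -> ~1e-2
```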
- Adam
# Given (in addition to the SGD setup above):
epsilon = 1e-10
momentum = 0.9                    # first-moment decay rate (beta1)
delta = 0.999                     # second-moment decay rate (beta2)
sum_grads = zeros_like(W)         # moving average of gradients
sum_square_grads = zeros_like(W)  # moving average of squared gradients
step = 0
while True:
    step += 1
    # sample a random batch of size m
    X_batch, Y_batch = random_choice(X, Y, size=m)
    # compute the cost
    cost = mean(sum(compute_loss(X_batch, W, Y_batch), axis=[1,2,3]))
    # compute the gradients
    grads = compute_grads(cost, W)
    # apply the gradients: update both moment estimates
    sum_grads = momentum * sum_grads + (1 - momentum) * grads
    sum_square_grads = delta * sum_square_grads + (1 - delta) * square(grads)
    # bias correction: use temporaries so the running averages are not overwritten
    sum_grads_hat = sum_grads / (1 - momentum**step)
    sum_square_grads_hat = sum_square_grads / (1 - delta**step)
    W = W - lr * sum_grads_hat / (sqrt(sum_square_grads_hat) + epsilon)
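Putting it together, a self-contained runnable Adam sketch on the same assumed toy least-squares setup as the SGD example above; note the bias-corrected moments are temporaries, matching the fix in the pseudocode.

```python
# Runnable Adam sketch (toy least-squares problem, assumed setup).
import numpy as np

rng = np.random.default_rng(0)
N, d, batch = 1000, 5, 32
lr, beta1, beta2, epsilon = 1e-2, 0.9, 0.999, 1e-10
X = rng.normal(size=(N, d))
true_W = rng.normal(size=d)
Y = X @ true_W + 0.01 * rng.normal(size=N)

W = np.zeros(d)
m = np.zeros(d)  # first moment: moving average of gradients
v = np.zeros(d)  # second moment: moving average of squared gradients

for step in range(1, 2001):
    idx = rng.integers(0, N, batch)
    X_b, Y_b = X[idx], Y[idx]
    grads = X_b.T @ (X_b @ W - Y_b) / batch  # MSE gradient
    m = beta1 * m + (1 - beta1) * grads
    v = beta2 * v + (1 - beta2) * grads**2
    m_hat = m / (1 - beta1**step)   # bias correction as temporaries
    v_hat = v / (1 - beta2**step)
    W = W - lr * m_hat / (np.sqrt(v_hat) + epsilon)

print(np.linalg.norm(W - true_W))  # should be small after training
```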