Gradient descent is a method for minimizing an objective function J(θ) parameterized by θ: the parameters are updated in the direction opposite to the gradient of the objective with respect to the parameters. The learning rate η determines the size of the steps taken towards a (local) minimum.
The gradient descent variants differ in how much data is used to compute the gradient of the objective function; depending on the amount of data, we trade off the accuracy of each parameter update against the time it takes to perform it.
Batch gradient descent computes the gradient of the objective over the entire training set and then performs a single update:

$$\theta = \theta - \eta \cdot \nabla_{\theta} J(\theta)$$
```python
for i in range(nb_epochs):
    params_grad = evaluate_gradient(loss_function, data, params)
    params = params - learning_rate * params_grad
```
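To see how the learning rate scales a single update, here is a toy step (illustrative, not from the original) on $J(\theta) = \theta^2$, whose gradient is $2\theta$, with an assumed $\eta = 0.1$:

```python
theta = 1.0                  # current parameter value (toy example)
eta = 0.1                    # assumed learning rate
grad = 2 * theta             # gradient of J(theta) = theta**2
theta = theta - eta * grad   # new value: 1.0 - 0.1 * 2.0 = 0.8
```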
Stochastic gradient descent, in contrast, performs a parameter update for each training example $(x^{(i)}, y^{(i)})$:

$$\theta = \theta - \eta \cdot \nabla_{\theta} J\left(\theta ; x^{(i)} ; y^{(i)}\right)$$
```python
for i in range(nb_epochs):
    np.random.shuffle(data)
    for example in data:
        params_grad = evaluate_gradient(loss_function, example, params)
        params = params - learning_rate * params_grad
```
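Mini-batch gradient descent performs an update for every mini-batch of $n$ training examples, combining aspects of the two variants above:

$$\theta = \theta - \eta \cdot \nabla_{\theta} J\left(\theta ; x^{(i:i+n)} ; y^{(i:i+n)}\right)$$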
```python
for i in range(nb_epochs):
    np.random.shuffle(data)
    for batch in get_batches(data, batch_size=50):
        params_grad = evaluate_gradient(loss_function, batch, params)
        params = params - learning_rate * params_grad
```
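The pseudocode above relies on a `get_batches` helper that is not shown; a minimal sketch of what such a helper might look like (an assumption, not the original implementation):

```python
def get_batches(data, batch_size=50):
    # Illustrative helper (assumed): yield successive mini-batches
    # from the already shuffled dataset.
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]
```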
This part gives Python implementations of batch gradient descent, stochastic gradient descent, and mini-batch gradient descent.
The code uses the simplest linear model as an example:
```python
import numpy as np
import time
import matplotlib.pyplot as plt


# Batch gradient descent algorithm
def batch_gradient_descent(input_data, output_data, eta, tolerance):
    time_start = time.time()
    w = np.ones((1, 2))        # initial parameters
    old_w = np.zeros((1, 2))   # parameters from the previous iteration
    iteration = 1
    loss_function = []
    # Stop once the parameter change falls below the tolerance
    while np.sqrt(np.sum(np.square(w - old_w))) > tolerance:
    # while iteration <= 1000:  # alternative stopping rule: fixed number of iterations
        # Gradient is computed on the whole dataset in every iteration
        error = output_data - np.dot(w, input_data)
        loss_function.append(np.sum(np.square(error)) / input_data.shape[1])
        old_w = w
        w = w + eta * np.dot(error, input_data.T) / input_data.shape[1]
        iteration = iteration + 1
        if iteration % 500 == 0:
            print("iteration {}, parameters {}".format(iteration, w))
    time_end = time.time()
    print("time elapsed {}\niterations {}".format(time_end - time_start, iteration))
    print("result: parameters {}".format(w.tolist()[0]))
    result = {
        "time": time_end - time_start, "iterations": iteration,
        "w": w.tolist()[0], "loss_functions": loss_function}
    return result
```
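A hedged sketch of how this function might be called: the model is $y \approx w_1 x + w_0$, so the input matrix carries the feature in its first row and a constant 1 in its second row for the bias term; `n_samples`, `true_w`, the noise level, and the hyperparameters below are illustrative assumptions, not values from the original.

```python
# Synthetic data for the linear model (illustrative, assumed values)
np.random.seed(0)
n_samples = 200
x = 2 * np.random.rand(1, n_samples)                      # feature row
input_data = np.vstack([x, np.ones((1, n_samples))])      # shape (2, n_samples): feature + bias row
true_w = np.array([[3.0, 1.5]])                           # assumed "true" parameters
output_data = (np.dot(true_w, input_data)
               + 0.05 * np.random.randn(1, n_samples))[0]  # 1-D targets with small noise

result = batch_gradient_descent(input_data, output_data, eta=0.05, tolerance=1e-6)
plt.plot(result["loss_functions"])                         # loss curve over iterations
plt.xlabel("iteration")
plt.ylabel("mean squared error")
plt.show()
```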
# Stochastic gradient descent algorithm
def random_gradient_descent(input_data, output_data, eta, tolerance):
    time_start = time.time()
    w = np.ones((1, 2))        # initial parameters
    old_w = np.zeros((1, 2))   # parameters from the previous update
    iteration = 1
    loss_function = []
    while np.sqrt(np.sum(np.square(w - old_w))) > tolerance:
        # Visit the training examples in a random order and update per example
        for i in np.random.permutation(input_data.shape[1]):
            col_rand_x = input_data[:, i]
            col_rand_y = output_data[i]
            error = col_rand_y - np.dot(w, col_rand_x)
            # Track the loss on the full dataset for comparison with the other methods
            loss_function.append(np.sum(np.square(output_data - np.dot(w, input_data))) / input_data.shape[1])
            old_w = w
            w = w + eta * error * col_rand_x.T
            iteration = iteration + 1
            if iteration % 500 == 0:
                print("iteration {}, parameters {}".format(iteration, w))
    time_end = time.time()