import numpy as np

# Generate random data
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

# Add the bias term
X_b = np.c_[np.ones((100, 1)), X]

# Hyperparameters
eta = 0.1            # learning rate
n_iterations = 1000  # number of iterations
m = 100              # number of samples

# Randomly initialize the model parameters
theta = np.random.randn(2, 1)

# Train the model with SGD: at each step, pick one random sample and
# update theta with the gradient of the squared error on that sample
for iteration in range(n_iterations):
    random_index = np.random.randint(m)
    xi = X_b[random_index:random_index+1]
    yi = y[random_index:random_index+1]
    gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
    theta = theta - eta * gradients

# Print the learned parameters (they should be close to [4, 3])
print(theta)
import torch
import torch.nn as nn
import torch.optim as optim

# Define a small feed-forward network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(10, 5)
        self.fc2 = nn.Linear(5, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Random training data and targets
data = torch.randn(100, 10)
labels = torch.randn(100, 1)

# Initialize the network and the Adagrad optimizer
net = Net()
optimizer = optim.Adagrad(net.parameters(), lr=0.01)

# Training loop
for i in range(100):
    optimizer.zero_grad()
    output = net(data)
    loss = nn.functional.mse_loss(output, labels)
    loss.backward()
    optimizer.step()

# Print the trained parameters
print(net.state_dict())
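Under the hood, Adagrad keeps a per-parameter running sum of squared gradients and divides each step by its square root. A minimal NumPy sketch of that update rule (the function adagrad_step and the toy objective are illustrative, not part of PyTorch's API):

import numpy as np

def adagrad_step(param, grad, grad_sq_sum, lr=0.01, eps=1e-10):
    # Accumulate the squared gradient, then scale the step by its root
    grad_sq_sum = grad_sq_sum + grad ** 2
    param = param - lr * grad / (np.sqrt(grad_sq_sum) + eps)
    return param, grad_sq_sum

# Toy usage: minimize f(w) = (w - 3)^2, whose gradient is 2 * (w - 3)
w = np.array([0.0])
acc = np.zeros_like(w)
for _ in range(500):
    g = 2 * (w - 3)
    w, acc = adagrad_step(w, g, acc, lr=0.5)
print(w)  # moves toward 3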
# Adadelta update (rho is the decay rate, eps a small smoothing constant)
g_t = compute_gradient(params)
E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
delta_x_t = - RMS[delta_x]_{t-1} / RMS[g]_t * g_t      # RMS[z]_t = sqrt(E[z^2]_t + eps)
E[delta_x^2]_t = rho * E[delta_x^2]_{t-1} + (1 - rho) * delta_x_t^2
params = params + delta_x_t
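A minimal NumPy sketch of this update, assuming the caller supplies a gradient function (the names adadelta and grad_func are illustrative):

import numpy as np

def adadelta(grad_func, x_init, rho=0.95, eps=1e-6, max_iter=1000):
    # Adadelta has no global learning rate; the step size adapts from
    # running averages of squared gradients and squared updates
    x = np.array(x_init, dtype=float)
    eg2 = np.zeros_like(x)   # E[g^2]
    edx2 = np.zeros_like(x)  # E[delta_x^2]
    for _ in range(max_iter):
        g = grad_func(x)
        eg2 = rho * eg2 + (1 - rho) * g ** 2
        delta_x = -np.sqrt(edx2 + eps) / np.sqrt(eg2 + eps) * g
        edx2 = rho * edx2 + (1 - rho) * delta_x ** 2
        x += delta_x
    return x

# Toy usage: minimize f(x) = sum((x - 1)^2), gradient 2 * (x - 1)
print(adadelta(lambda x: 2 * (x - 1), np.zeros(3)))  # each coordinate moves toward 1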
import tensorflow as tf

# Define a simple neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Compile the model with the Adam optimizer
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
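# NOTE: the original does not say where train_images / train_labels come from.
# One assumption that matches input_shape=(784,) and the 10-way softmax is MNIST:
(train_images, train_labels), _ = tf.keras.datasets.mnist.load_data()
train_images = train_images.reshape(-1, 784).astype("float32") / 255.0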
# Train the model
model.fit(train_images, train_labels, epochs=5)
The update rules are:

m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
v_t = max(beta2 * v_{t-1}, abs(g_t))
theta_t = theta_{t-1} - alpha / (1 - beta1^t) * m_t / (v_t + epsilon)
Example:

import tensorflow as tf

# Define the loss function and the Adamax optimizer
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001)

# Training loop (model, train_dataset and num_epochs are assumed to be defined)
for epoch in range(num_epochs):
    for x_batch, y_batch in train_dataset:
        with tf.GradientTape() as tape:
            logits = model(x_batch, training=True)
            loss_value = loss_fn(y_batch, logits)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
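For reference, a minimal NumPy sketch of the Adamax update rules above (the function adamax_step and the toy objective are illustrative, not TensorFlow API):

import numpy as np

def adamax_step(theta, g, m, v, t, alpha=0.002, beta1=0.9, beta2=0.999, eps=1e-8):
    # First moment as in Adam, infinity norm of the gradient as the scale
    m = beta1 * m + (1 - beta1) * g
    v = np.maximum(beta2 * v, np.abs(g))
    theta = theta - alpha / (1 - beta1 ** t) * m / (v + eps)
    return theta, m, v

# Toy usage: minimize f(theta) = sum(theta^2), gradient 2 * theta
theta = np.ones(3)
m, v = np.zeros_like(theta), np.zeros_like(theta)
for t in range(1, 1001):
    g = 2 * theta
    theta, m, v = adamax_step(theta, g, m, v, t)
print(theta)  # moves toward 0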
import numpy as np

def nadam(grad_func, x_init, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, max_iter=1000):
    # Initialize parameters and moment estimates
    x = np.array(x_init, dtype=float)  # copy so the caller's array is not modified in place
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    t = 0
    # Initial gradient
    grad = grad_func(x)
    while t < max_iter:
        t += 1
        # Update the first and second moment estimates
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * np.square(grad)
        # Bias-corrected moment estimates
        m_hat = m / (1 - np.power(beta1, t))
        v_hat = v / (1 - np.power(beta2, t))
        # Nesterov-style momentum: blend the corrected momentum with the current gradient
        nesterov_m = beta1 * m_hat + (1 - beta1) * grad
        # Parameter update
        x -= lr * nesterov_m / (np.sqrt(v_hat) + eps)
        # Gradient at the new point
        grad = grad_func(x)
    return x
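A quick sanity check of the nadam function above on a toy quadratic (the objective, starting point, and hyperparameters are illustrative):

# Minimize f(x) = sum((x - 2)^2), whose gradient is 2 * (x - 2)
x_opt = nadam(lambda x: 2 * (x - 2), np.zeros(3), lr=0.05, max_iter=2000)
print(x_opt)  # each coordinate moves toward 2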
import numpy as np

def rmsprop(parameters, gradients, cache, learning_rate, decay_rate):
    """
    :param parameters: list of network parameter arrays
    :param gradients: list of gradients, one per parameter
    :param cache: RMSprop cache (running average of squared gradients)
    :param learning_rate: learning rate
    :param decay_rate: decay rate for the running average
    :return: updated parameters and cache
    """
    eps = 1e-8
    for i in range(len(parameters)):
        # Exponential moving average of squared gradients
        cache[i] = decay_rate * cache[i] + (1 - decay_rate) * gradients[i] ** 2
        # Scale each step by the root of the cache
        parameters[i] -= learning_rate * gradients[i] / (np.sqrt(cache[i]) + eps)
    return parameters, cache
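A toy call of the rmsprop function above with a single parameter (the objective and values are illustrative):

# Single parameter w, minimizing f(w) = (w - 5)^2 with gradient 2 * (w - 5)
params = [np.array([0.0])]
cache = [np.zeros(1)]
for _ in range(500):
    grads = [2 * (params[0] - 5)]
    params, cache = rmsprop(params, grads, cache, learning_rate=0.1, decay_rate=0.9)
print(params[0])  # ends up close to 5, oscillating at roughly the learning-rate scale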