The structure of the two-layer NN in this assignment:

input → fully-connected layer → ReLU → fully-connected layer → softmax

The hidden layer effectively uses ReLU as its activation, and the last layer applies softmax to obtain the loss. Backpropagation is then used for the gradients; its purpose is to compute the derivative of an arbitrary function with respect to its inputs.

Structure of the Two-Layer Neural Network as I understand it, together with the derivation (if there are any mistakes, please point them out):
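A compact sketch of the forward pass and the softmax loss that the code below implements (N samples, C classes; W1, b1, W2, b2 are the parameters of the two layers, and reg is written as λ):

$$z_1 = XW_1 + b_1,\qquad a_1 = \max(0, z_1),\qquad s = a_1 W_2 + b_2$$

$$p_{i,c} = \frac{e^{s_{i,c}}}{\sum_{j} e^{s_{i,j}}},\qquad L = -\frac{1}{N}\sum_{i=1}^{N}\log p_{i,y_i} + \frac{\lambda}{2}\left(\lVert W_1\rVert^2 + \lVert W_2\rVert^2\right)$$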
Code (correct_scores below serves as the reference scores that our implementation is compared against):
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.neural_net import TwoLayerNet
input_size = 4
hidden_size = 10
num_classes = 3
num_inputs = 5
def rel_error(x, y):
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))
def init_toy_model():
    np.random.seed(0)
    return TwoLayerNet(input_size, hidden_size, num_classes, std=1e-1)

def init_toy_data():
    np.random.seed(1)
    X = 10 * np.random.randn(num_inputs, input_size)
    y = np.array([0, 1, 2, 2, 1])
    return X, y
net = init_toy_model()
X, y = init_toy_data()
# Forward pass: compute scores
scores = net.loss(X)
print('Your scores:')
print(scores)
print()
print('correct scores:')
correct_scores = np.asarray([
    [-0.81233741, -1.27654624, -0.70335995],
    [-0.17129677, -1.18803311, -0.47310444],
    [-0.51590475, -1.01354314, -0.8504215 ],
    [-0.15419291, -0.48629638, -0.52901952],
    [-0.00618733, -0.12435261, -0.15226949]])
print(correct_scores)
print()
# The difference should be very small. We get < 1e-7
print('Difference between your scores and correct scores:')
print(np.sum(np.abs(scores - correct_scores)))
loss, _ = net.loss(X, y, reg=0.05)
correct_loss = 1.30378789133
print('Difference between your loss and correct loss:')
print(np.sum(np.abs(loss - correct_loss)))
1. First, the forward pass to compute the loss.
In the loss method of neural_net.py:
# Unpack variables from the params dictionary
W1, b1 = self.params['W1'], self.params['b1']
W2, b2 = self.params['W2'], self.params['b2']
N, D = X.shape
# Compute the forward pass
scores = None
#############################################################################
# TODO: Perform the forward pass, computing the class scores for the input. #
# Store the result in the scores variable, which should be an array of #
# shape (N, C). #
#############################################################################
# Compute the class scores
z1 = X.dot(W1) + b1
a1 = np.maximum(0, z1)
scores = a1.dot(W2) + b2
#############################################################################
# END OF YOUR CODE #
#############################################################################
# If the targets are not given then jump out, we're done
if y is None:
return scores
# Compute the loss
loss = None
#############################################################################
# TODO: Finish the forward pass, and compute the loss. This should include #
# both the data loss and L2 regularization for W1 and W2. Store the result #
# in the variable loss, which should be a scalar. Use the Softmax #
# classifier loss. #
#############################################################################
# Softmax on the final layer
exp_scores = np.exp(scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
loss = np.sum(-np.log(probs[np.arange(N), y]))
loss /= N
loss += 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
#############################################################################
# END OF YOUR CODE #
#############################################################################
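One detail worth noting: computing np.exp(scores) directly can overflow when the scores are large (this is the overflow that the grid-search section warns about later when reg is too small). A common, numerically safer variant, shown as a minimal sketch below, shifts each row by its maximum before exponentiating; this does not change the probabilities because softmax is invariant to adding a constant per row.

# Numerically stable softmax: subtracting the per-row max leaves probs unchanged
shifted_scores = scores - np.max(scores, axis=1, keepdims=True)
exp_scores = np.exp(shifted_scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)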
2. Next, use backpropagation to compute the gradients, and verify them against numerical gradients.
# Backward pass
from cs231n.gradient_check import eval_numerical_gradient
loss, grads = net.loss(X, y, reg=0.05)
# these should all be less than 1e-8 or so
for param_name in grads:
    f = lambda W: net.loss(X, y, reg=0.05)[0]
    param_grad_num = eval_numerical_gradient(f, net.params[param_name], verbose=False)
    print('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))
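For reference, the idea behind eval_numerical_gradient is simply to perturb each parameter entry and measure the change in the loss. A minimal centered-difference sketch of the same idea (my own illustration, not the cs231n implementation verbatim):

def numerical_gradient(f, x, h=1e-5):
    # Centered-difference estimate of df/dx for a scalar-valued f and an array x
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old_value = x[idx]
        x[idx] = old_value + h      # f(x + h)
        fxph = f(x)
        x[idx] = old_value - h      # f(x - h)
        fxmh = f(x)
        x[idx] = old_value          # restore the original value
        grad[idx] = (fxph - fxmh) / (2 * h)
        it.iternext()
    return grad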
In the loss method of neural_net.py (the derivation steps are explained in the comments):
# Backward pass: compute gradients
grads = {}
#############################################################################
# TODO: Compute the backward pass, computing the derivatives of the weights #
# and biases. Store the results in the grads dictionary. For example, #
# grads['W1'] should store the gradient on W1, and be a matrix of same size #
#############################################################################
# Compute the gradients
# softmax: gradient of the loss with respect to the scores
dscores = probs
dscores[range(N), y] -= 1
dscores /= N  # average over the batch
# W2 and b2
grads['W2'] = np.dot(a1.T, dscores)  # see the derivation at the beginning, or the lecture slides
grads['b2'] = np.sum(dscores, axis=0)  # sum over the batch so the shape matches b2
# the df/dx that is propagated backward to the hidden layer
dhidden = np.dot(dscores, W2.T)
# gradient of ReLU
dhidden[a1 <= 0] = 0
# W1 and b1
grads['W1'] = np.dot(X.T, dhidden)
grads['b1'] = np.sum(dhidden, axis=0)
# add the gradient of the regularization term
grads['W2'] += reg * W2
grads['W1'] += reg * W1
#############################################################################
# END OF YOUR CODE #
#############################################################################
The gradients we want here are those with respect to the parameters, i.e. df/dW and df/db; what the backward pass propagates onward to the previous layer is df/dx.
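In equation form, the backward pass above computes the following (a sketch consistent with the code, using the notation from the forward-pass sketch at the top; ⊙ is element-wise multiplication and 1[·] an indicator):

$$\frac{\partial L}{\partial s_{i,c}} = \frac{1}{N}\bigl(p_{i,c} - 1[c = y_i]\bigr)$$

$$\frac{\partial L}{\partial W_2} = a_1^{\top}\frac{\partial L}{\partial s} + \lambda W_2,\qquad \frac{\partial L}{\partial b_2} = \sum_i \frac{\partial L}{\partial s_i}$$

$$\frac{\partial L}{\partial a_1} = \frac{\partial L}{\partial s}\,W_2^{\top},\qquad \frac{\partial L}{\partial z_1} = \frac{\partial L}{\partial a_1}\odot 1[z_1 > 0]$$

$$\frac{\partial L}{\partial W_1} = X^{\top}\frac{\partial L}{\partial z_1} + \lambda W_1,\qquad \frac{\partial L}{\partial b_1} = \sum_i \frac{\partial L}{\partial (z_1)_i}$$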
3. Train on the toy data and plot the loss curve
# Train the network
net = init_toy_model()
stats = net.train(X, y, X, y,
                  learning_rate=1e-1, reg=5e-6,
                  num_iters=100, verbose=False)
print('Final training loss: ', stats['loss_history'][-1])
# plot the loss history
plt.plot(stats['loss_history'])
plt.xlabel('iteration')
plt.ylabel('training loss')
plt.title('Training Loss history')
plt.show()
The predict method:
def predict(self, X):
    y_pred = None
    ###########################################################################
    # TODO: Implement this function; it should be VERY simple!                #
    ###########################################################################
    scores = self.loss(X)
    y_pred = np.argmax(scores, axis=1)
    ###########################################################################
    #                            END OF YOUR CODE                             #
    ###########################################################################
    return y_pred
Mini-batch SGD is still used here.
def train(self, X, y, X_val, y_val,
          learning_rate=1e-3, learning_rate_decay=0.95,
          reg=5e-6, num_iters=100,
          batch_size=200, verbose=False):
    num_train = X.shape[0]
    iterations_per_epoch = max(num_train / batch_size, 1)

    # Use SGD to optimize the parameters in self.model
    loss_history = []
    train_acc_history = []
    val_acc_history = []

    # Personally I think one epoch should go through all of the training data in
    # batches; sampling randomly like this should give roughly the same result
    # (see the sketch after this function).
    for it in range(num_iters):
        X_batch = None
        y_batch = None

        #########################################################################
        # TODO: Create a random minibatch of training data and labels, storing  #
        # them in X_batch and y_batch respectively.                             #
        #########################################################################
        sample_indices = np.random.choice(np.arange(num_train), batch_size)
        X_batch = X[sample_indices]
        y_batch = y[sample_indices]
        #########################################################################
        #                            END OF YOUR CODE                           #
        #########################################################################

        # Compute loss and gradients using the current minibatch
        loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
        loss_history.append(loss)

        #########################################################################
        # TODO: Use the gradients in the grads dictionary to update the         #
        # parameters of the network (stored in the dictionary self.params)      #
        # using stochastic gradient descent. You'll need to use the gradients   #
        # stored in the grads dictionary defined above.                         #
        #########################################################################
        # gradient descent
        self.params['W1'] += -learning_rate * grads['W1']
        self.params['b1'] += -learning_rate * grads['b1']
        self.params['W2'] += -learning_rate * grads['W2']
        self.params['b2'] += -learning_rate * grads['b2']
        #########################################################################
        #                            END OF YOUR CODE                           #
        #########################################################################

        if verbose and it % 100 == 0:
            print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        # Every epoch, check train and val accuracy and decay learning rate.
        if it % iterations_per_epoch == 0:
            # Check accuracy
            train_acc = (self.predict(X_batch) == y_batch).mean()
            val_acc = (self.predict(X_val) == y_val).mean()
            train_acc_history.append(train_acc)
            val_acc_history.append(val_acc)

            # Decay learning rate
            learning_rate *= learning_rate_decay

    return {
        'loss_history': loss_history,
        'train_acc_history': train_acc_history,
        'val_acc_history': val_acc_history,
    }
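As noted in the comment at the top of the loop, np.random.choice here samples with replacement, so within one nominal epoch some examples may be picked twice and others not at all. A common alternative (my own sketch, not part of the assignment code; num_epochs is a hypothetical parameter) is to shuffle the indices once per epoch and take consecutive slices, so that every example is used exactly once per epoch:

# Epoch-style mini-batching: shuffle once per epoch, then slice consecutive batches.
# num_train, batch_size, X and y are assumed to be defined as in train() above;
# num_epochs is a hypothetical parameter for this sketch.
for epoch in range(num_epochs):
    perm = np.random.permutation(num_train)
    for start in range(0, num_train, batch_size):
        batch_idx = perm[start:start + batch_size]
        X_batch = X[batch_idx]
        y_batch = y[batch_idx]
        # ... compute loss/grads on (X_batch, y_batch) and update the parameters
        # exactly as in the SGD step of train() above ...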
These toy-data results show that the model and the computations are basically correct.
1. Load the data
# Load the data
from cs231n.data_utils import load_CIFAR10
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    # Load the raw CIFAR-10 data (raw string avoids backslash-escape issues on Windows)
    cifar10_dir = r'F:\pycharmFile\KNN\cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Reshape data to rows
    X_train = X_train.reshape(num_training, -1)
    X_val = X_val.reshape(num_validation, -1)
    X_test = X_test.reshape(num_test, -1)

    return X_train, y_train, X_val, y_val, X_test, y_test
# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
2. Train, test, and tune
# Train a network
input_size = 32 * 32 * 3
hidden_size = 50
num_classes = 10
#net = TwoLayerNet(input_size, hidden_size, num_classes)
# Train the network
# stats = net.train(X_train, y_train, X_val, y_val,
# num_iters=1000, batch_size=200,
# learning_rate=1e-4, learning_rate_decay=0.95,
# reg=0.25, verbose=True)
#
# # Predict on the validation set
# val_acc = (net.predict(X_val) == y_val).mean()
# print('Validation accuracy: ', val_acc)
# Plot the loss function and train / validation accuracies
def plot_loss_acc(stats):
    plt.subplot(2, 1, 1)
    plt.plot(stats['loss_history'])
    plt.title('Loss history')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')

    plt.subplot(2, 1, 2)
    plt.plot(stats['train_acc_history'], label='train')
    plt.plot(stats['val_acc_history'], label='val')
    plt.title('Classification accuracy history')
    plt.xlabel('Epoch')
    plt.ylabel('Classification accuracy')
    plt.legend()
    plt.show()
from cs231n.vis_utils import visualize_grid
# Visualize the weights of the network
def show_net_weights(net):
    W1 = net.params['W1']
    W1 = W1.reshape(32, 32, 3, -1).transpose(3, 0, 1, 2)
    plt.imshow(visualize_grid(W1, padding=3).astype('uint8'))
    plt.gca().axis('off')
    plt.show()
#show_net_weights(net)
# find the best nn
best_val = -1
best_nn = None
best_stats = None
results = {}
#### ----------- If reg is set too small here, some values may overflow during training ----------- ####
learning_rates = [1e-3]
regularization_strengths = [0.6, 0.8]
for lr in learning_rates:
    for rg in regularization_strengths:
        net = TwoLayerNet(input_size, hidden_size, num_classes)
        stats = net.train(X_train, y_train, X_val, y_val,
                          num_iters=2500, batch_size=200,
                          learning_rate=lr, learning_rate_decay=0.95,
                          reg=rg)
        y_train_pred = net.predict(X_train)
        acc_train = np.mean(y_train_pred == y_train)
        y_val_pred = net.predict(X_val)
        acc_val = np.mean(y_val_pred == y_val)
        results[(lr, rg)] = (acc_train, acc_val)
        if best_val < acc_val:
            best_nn = net
            best_val = acc_val
            best_stats = stats

for lr, reg in results:
    train_acc, val_acc = results[(lr, reg)]
    print('lr: ', lr, ' reg: ', reg, ' train acc: ', train_acc, ' val_acc: ', val_acc)
print('The best validation accuracy: ', best_val)
# Run on the test set
test_acc = (best_nn.predict(X_test) == y_test).mean()
print('Test accuracy: ', test_acc)
plot_loss_acc(best_stats)
show_net_weights(best_nn)
Besides the hyperparameters swept above (lr, reg), the number of hidden units, the number of iterations/epochs, the batch size, and the learning-rate decay all affect the training result to some extent.
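For example, the grid search above can be extended to also sweep the hidden-layer size and the learning-rate decay; a rough sketch (the candidate values are illustrative only, not tuned results):

# Hypothetical extended sweep; reuses results / best_val / best_nn / best_stats from above.
hidden_sizes = [50, 100, 150]   # illustrative candidates
decays = [0.95, 0.98]           # illustrative candidates
for hs in hidden_sizes:
    for lr in learning_rates:
        for rg in regularization_strengths:
            for decay in decays:
                net = TwoLayerNet(input_size, hs, num_classes)
                stats = net.train(X_train, y_train, X_val, y_val,
                                  num_iters=2500, batch_size=200,
                                  learning_rate=lr, learning_rate_decay=decay,
                                  reg=rg)
                acc_val = np.mean(net.predict(X_val) == y_val)
                results[(hs, lr, rg, decay)] = acc_val
                if acc_val > best_val:
                    best_val, best_nn, best_stats = acc_val, net, stats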
The results I obtained: