class TwoLayerNet(object):
    """Two-layer fully connected classifier trained with a softmax loss.

    Architecture: input -> affine -> ReLU -> affine -> softmax. The loss
    adds L2 regularization on both weight matrices (biases are not
    regularized). Parameters are stored in ``self.params`` under the keys
    ``'W1'``, ``'b1'``, ``'W2'`` and ``'b2'``.
    """

    def __init__(self, input_size, hidden_size, output_size, std=1e-4):
        """Initialize weights to small Gaussian noise and biases to zero.

        Args:
            input_size: Dimension D of each input row.
            hidden_size: Number of hidden units H.
            output_size: Number of classes C.
            std: Scale applied to the standard-normal weight initialization.
        """
        self.params = {}
        self.params['W1'] = std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def loss(self, X, y=None, reg=0.0):
        """Compute class scores, or the loss and gradients when labels are given.

        Args:
            X: Array of shape (N, D) holding N inputs.
            y: Optional integer array of shape (N,) with labels in [0, C).
                When None, only the forward pass is run.
            reg: L2 regularization strength.

        Returns:
            If ``y`` is None, the raw score matrix of shape (N, C).
            Otherwise a tuple ``(loss, grads)`` where ``loss`` is the mean
            softmax cross-entropy plus ``reg * (sum(W1**2) + sum(W2**2))``
            and ``grads`` maps each parameter name to a gradient with the
            same shape as that parameter.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        N = X.shape[0]

        # Forward pass.
        hidden_pre = np.dot(X, W1) + b1            # (N, H)
        hidden = np.maximum(0, hidden_pre)         # ReLU
        scores = np.dot(hidden, W2) + b2           # (N, C)

        # Without targets there is nothing more to compute.
        if y is None:
            return scores

        # Softmax; subtracting the row-wise max keeps exp() from overflowing.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        rows = np.arange(N)
        loss = -np.log(probs[rows, y]).sum() / N
        loss += reg * np.sum(W1 ** 2)
        loss += reg * np.sum(W2 ** 2)

        # Backward pass.
        grads = {}
        dscores = probs.copy()
        dscores[rows, y] -= 1                      # d(loss)/d(scores) * N
        dscores /= N

        grads['W2'] = np.dot(hidden.T, dscores) + 2 * reg * W2
        grads['b2'] = np.sum(dscores, axis=0)

        dhidden = np.dot(dscores, W2.T)
        dhidden = (hidden_pre > 0) * dhidden       # gradient flows only through active ReLUs
        grads['W1'] = np.dot(X.T, dhidden) + 2 * reg * W1
        grads['b1'] = np.sum(dhidden, axis=0)

        return loss, grads

    def train(self, X, y, X_val, y_val,
              learning_rate=1e-3, learning_rate_decay=0.95,
              reg=5e-6, num_iters=100,
              batch_size=200, verbose=False):
        """Train the network with minibatch stochastic gradient descent.

        Args:
            X: Training inputs of shape (N, D).
            y: Training labels of shape (N,).
            X_val: Validation inputs.
            y_val: Validation labels.
            learning_rate: Initial SGD step size.
            learning_rate_decay: Factor applied to the learning rate once
                per epoch.
            reg: L2 regularization strength.
            num_iters: Number of SGD steps.
            batch_size: Number of examples drawn (with replacement) per step.
            verbose: When true, print the loss every 100 iterations.

        Returns:
            Dict with ``'loss_history'`` (one entry per iteration) and
            ``'train_acc_history'`` / ``'val_acc_history'`` (one entry per
            epoch; training accuracy is measured on the current minibatch).
        """
        num_train = X.shape[0]
        # Integer division: with true division the epoch length can be
        # fractional, and `it % iterations_per_epoch == 0` then fires on the
        # wrong schedule (or almost never).
        iterations_per_epoch = max(num_train // batch_size, 1)

        loss_history = []
        train_acc_history = []
        val_acc_history = []

        for it in range(num_iters):
            # Sample a minibatch with replacement.
            idx = np.random.choice(num_train, batch_size)
            X_batch = X[idx]
            y_batch = y[idx]

            # Compute loss and gradients using the current minibatch.
            loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
            loss_history.append(loss)

            # Vanilla SGD update.
            for name in ('W1', 'W2', 'b1', 'b2'):
                self.params[name] -= learning_rate * grads[name]

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f' % (it, num_iters, loss))

            # Every epoch, record train/val accuracy and decay the learning rate.
            if it % iterations_per_epoch == 0:
                train_acc = (self.predict(X_batch) == y_batch).mean()
                val_acc = (self.predict(X_val) == y_val).mean()
                train_acc_history.append(train_acc)
                val_acc_history.append(val_acc)

                learning_rate *= learning_rate_decay

        return {
            'loss_history': loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history,
        }

    def predict(self, X):
        """Return the highest-scoring class index for each row of ``X``.

        Args:
            X: Array of shape (N, D).

        Returns:
            Integer array of shape (N,) with predicted class indices.
        """
        scores = self.loss(X)
        return np.argmax(scores, axis=1)
hidden_size : 50
learning_rate : 1e-3
regularization : 0.25
num_iters : 2000
batch_size : 200
learning_rate_decay : 0.95
Test accuracy: 0.51; best validation accuracy: 0.501
Inline Question
Now that you have trained a Neural Network classifier, you may find that your testing accuracy is much lower than the training accuracy. In what ways can we decrease this gap? Select all that apply.
Your answer: 1 和 3
Your explanation: 增大训练数据集和增加正则化强度都能够提高模型的泛化能力,从而缩小训练与测试准确率之间的差距;而增加隐藏节点会提高模型容量,使模型更容易过拟合。