For details, see CS231n Course Notes 1: Introduction.
Everything in this post is the author's own thinking; its correctness has not been verified, and corrections are welcome.
This part is an extension of the two-layer network; for details, see CS231n Assignment Notes 2.1: Layer-wise Implementation of a Two-Layer Fully-Connected Network.
Note: this part does not cover dropout or batch normalization.
Note: np.random.normal is used here for weight initialization; np.random.randn would work just as well.
# First layer: input_dim -> hidden_dims[0]
self.params['W1'] = np.random.normal(scale=weight_scale,size=[input_dim,hidden_dims[0]])
self.params['b1'] = np.zeros(hidden_dims[0])
# Remaining hidden layers: hidden_dims[index-1] -> hidden_dims[index]
for index in range(1,len(hidden_dims)):
    self.params['W'+str(index+1)] = np.random.normal(scale=weight_scale,size=[hidden_dims[index-1],hidden_dims[index]])
    self.params['b'+str(index+1)] = np.zeros(hidden_dims[index])
# Output layer: hidden_dims[-1] -> num_classes
self.params['W'+str(len(hidden_dims)+1)] = np.random.normal(scale=weight_scale,size=[hidden_dims[-1],num_classes])
self.params['b'+str(len(hidden_dims)+1)] = np.zeros(num_classes)
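For reference, the randn alternative mentioned above just scales a standard-normal sample by weight_scale; a minimal sketch of the first layer with the same shapes:

# Equivalent alternative: randn draws from N(0, 1), so multiply by weight_scale.
self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dims[0])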
A dictionary cache is used to store the intermediate values that the backward pass will need.
cache = {}
hidden_value = None
# First layer: affine + ReLU
hidden_value,cache['fc1'] = affine_forward(X,self.params['W1'],self.params['b1'])
hidden_value,cache['relu1'] = relu_forward(hidden_value)
# Remaining hidden layers: affine + ReLU
for index in range(2,self.num_layers):
    hidden_value,cache['fc'+str(index)] = affine_forward(hidden_value,self.params['W'+str(index)],self.params['b'+str(index)])
    hidden_value,cache['relu'+str(index)] = relu_forward(hidden_value)
# Output layer: affine only, producing the class scores
scores,cache['score'] = affine_forward(hidden_value,self.params['W'+str(self.num_layers)],self.params['b'+str(self.num_layers)])
Note: to emphasize once more, the bias terms are not regularized.
loss, grads = 0.0, {}
# Softmax loss and gradient with respect to the scores
loss,dscores = softmax_loss(scores,y)
# Add L2 regularization for the weights only (biases are not regularized)
for index in range(1,self.num_layers+1):
    loss += 0.5*self.reg*np.sum(self.params['W'+str(index)]**2)
# Backprop through the output affine layer
dhidden_value,grads['W'+str(self.num_layers)],grads['b'+str(self.num_layers)] = affine_backward(dscores,cache['score'])
# Backprop through the hidden ReLU + affine layers
for index in range(self.num_layers-1,1,-1):
    dhidden_value = relu_backward(dhidden_value,cache['relu'+str(index)])
    dhidden_value,grads['W'+str(index)],grads['b'+str(index)] = affine_backward(dhidden_value,cache['fc'+str(index)])
dhidden_value = relu_backward(dhidden_value,cache['relu1'])
dhidden_value,grads['W1'],grads['b1'] = affine_backward(dhidden_value,cache['fc1'])
# Add the regularization gradient for the weights
for index in range(1,self.num_layers+1):
    grads['W'+str(index)] += self.reg * self.params['W'+str(index)]
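As an aside, the paired affine/ReLU calls above can usually be collapsed with the fused helpers in the assignment's layer_utils.py (affine_relu_forward / affine_relu_backward). A minimal sketch of the hidden-layer loops rewritten that way, assuming those helpers and one cache entry per layer:

# Sketch only: assumes affine_relu_forward / affine_relu_backward from
# cs231n/layer_utils.py, each fusing one affine layer with a ReLU.
hidden_value = X
for index in range(1, self.num_layers):
    hidden_value, cache['layer'+str(index)] = affine_relu_forward(
        hidden_value, self.params['W'+str(index)], self.params['b'+str(index)])
scores, cache['score'] = affine_forward(
    hidden_value, self.params['W'+str(self.num_layers)], self.params['b'+str(self.num_layers)])

# Backward pass with the same fused helper.
dhidden_value, grads['W'+str(self.num_layers)], grads['b'+str(self.num_layers)] = \
    affine_backward(dscores, cache['score'])
for index in range(self.num_layers-1, 0, -1):
    dhidden_value, grads['W'+str(index)], grads['b'+str(index)] = \
        affine_relu_backward(dhidden_value, cache['layer'+str(index)])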
This part exercises the Solver API; just read through solver.py yourself.
As for reaching an accuracy above 50%, trying the usual hyperparameter values is enough.
model = TwoLayerNet()
solver = Solver(model,
                data,
                update_rule='sgd',
                optim_config={
                    'learning_rate': 1e-3,
                },
                lr_decay=0.95,
                num_epochs=10, batch_size=100,
                print_every=100)
solver.train()
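After training, whether the 50% target is met can be read off the histories the Solver records (train_acc_history / val_acc_history in the stock solver.py; treat the names as an assumption if your copy differs):

# Assumes the stock Solver, which records per-epoch accuracies during train().
print(solver.train_acc_history[-1])  # final training accuracy
print(solver.val_acc_history[-1])    # final validation accuracy, should exceed 0.5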
This part tests and uses the multi-layer network; note that hyperparameter tuning is harder for deep networks than for shallow ones.
Trying the usual values is enough.
weight_scale = 1e-2
learning_rate = 7e-3
model = FullyConnectedNet([100, 100],
                          weight_scale=weight_scale, dtype=np.float64)
solver = Solver(model, small_data,
                print_every=10, num_epochs=20, batch_size=25,
                update_rule='sgd',
                optim_config={
                    'learning_rate': learning_rate,
                }
                )
solver.train()
You need to write a hyperparameter-search routine. Note that the deeper the network, the harder the tuning: it is easy to get stuck in a poor local minimum, and the good hyperparameters lie in a very small region, so even the coarse search should use a small step size.
# Search ranges for learning rate and weight scale
# (already narrowed down here, so lower and upper bounds coincide).
learning_rate = [1.8e-2, 1.8e-2]
weight_scale = [3.25e-2, 3.25e-2]
# Sample random (learning_rate, weight_scale) pairs uniformly within the ranges.
random_params = np.random.rand(1, 2)
random_params[:, 0] = random_params[:, 0]*(learning_rate[1]-learning_rate[0]) + learning_rate[0]
random_params[:, 1] = random_params[:, 1]*(weight_scale[1]-weight_scale[0]) + weight_scale[0]
results = {}
best_train_acc = -1
best_solver = None
validation = False
for lr, ws in random_params:
    model = FullyConnectedNet([100, 100, 100, 100],
                              weight_scale=ws, dtype=np.float64)
    solver = Solver(model, small_data,
                    print_every=10, num_epochs=20, batch_size=25,
                    update_rule='sgd',
                    optim_config={
                        'learning_rate': lr,
                    },
                    verbose=not validation
                    )
    solver.train()
    # Record final loss and training accuracy for this configuration.
    results[(lr, ws)] = (solver.loss_history[-1], solver.train_acc_history[-1])
    if solver.train_acc_history[-1] > best_train_acc:
        best_train_acc = solver.train_acc_history[-1]
        best_solver = solver
if validation:
    # Hyperparameter-search mode: report all configurations and visualize them.
    for lr, ws in results:
        print("learning rate: %.6e weight scale: %.6e train acc: %.6e" % (lr, ws, results[(lr, ws)][1]))
    print("best train accuracy: %.6e" % (best_train_acc))
    import math
    x_scatter = [math.log10(x[0]) for x in results]
    y_scatter = [math.log10(x[1]) for x in results]
    colors = [results[x][1] for x in results]
    plt.scatter(x_scatter, y_scatter, 100, c=colors)
    plt.xlabel("log learning rate")
    plt.ylabel("log weight scale")
    plt.colorbar()
    plt.show()
else:
    # Final-training mode: plot the loss history of the last run.
    plt.plot(solver.loss_history, 'o')
    plt.title('Training loss history')
    plt.xlabel('Iteration')
    plt.ylabel('Training loss')
    plt.show()
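Once a promising (lr, ws) pair is found, the same setup can be rerun on the full dataset rather than small_data. A minimal sketch, assuming the full data dict is the data variable used in the earlier TwoLayerNet example:

# Sketch: pick the configuration with the best recorded training accuracy
# and retrain it on the full dataset (`data` is assumed from the earlier example).
best_lr, best_ws = max(results, key=lambda k: results[k][1])
model = FullyConnectedNet([100, 100, 100, 100],
                          weight_scale=best_ws, dtype=np.float64)
solver = Solver(model, data,
                update_rule='sgd',
                optim_config={'learning_rate': best_lr},
                lr_decay=0.95,
                num_epochs=10, batch_size=100,
                print_every=100)
solver.train()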