Dropout.ipynb
Code: cs231n/layers.py
def dropout_forward(x, dropout_param):
    """Performs the forward pass for (inverted) dropout.

    Inputs:
    - x: Input data, of any shape
    - dropout_param: A dictionary with the following keys:
      - p: Dropout parameter. We keep each neuron output with probability p.
      - mode: 'test' or 'train'. If the mode is train, then perform dropout;
        if the mode is test, then just return the input.
      - seed: Seed for the random number generator. Passing seed makes this
        function deterministic, which is needed for gradient checking but not
        in real networks.

    Outputs:
    - out: Array of the same shape as x.
    - cache: tuple (dropout_param, mask). In training mode, mask is the dropout
      mask that was used to multiply the input; in test mode, mask is None.

    NOTE: Please implement **inverted** dropout, not the vanilla version of dropout.
    See http://cs231n.github.io/neural-networks-2/#reg for more details.

    NOTE 2: Keep in mind that p is the probability of **keeping** a neuron
    output; this might be contrary to some sources, where it is referred to
    as the probability of dropping a neuron output.
    """
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask = None
    out = None

    if mode == 'train':
        # Keep each unit with probability p and rescale by 1/p (inverted dropout)
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    elif mode == 'test':
        out = x

    cache = (dropout_param, mask)
    out = out.astype(x.dtype, copy=False)

    return out, cache
Test:
np.random.seed(231)
x = np.random.randn(500, 500) + 10

for p in [0.25, 0.4, 0.7]:
    out, _ = dropout_forward(x, {'mode': 'train', 'p': p})
    out_test, _ = dropout_forward(x, {'mode': 'test', 'p': p})

    print('Running tests with p = ', p)
    print('Mean of input: ', x.mean())
    print('Mean of train-time output: ', out.mean())
    print('Mean of test-time output: ', out_test.mean())
    print('Fraction of train-time output set to zero: ', (out == 0).mean())
    print('Fraction of test-time output set to zero: ', (out_test == 0).mean())
    print()
OUT:
Running tests with p = 0.25
Mean of input: 10.000207878477502
Mean of train-time output: 10.014059116977283
Mean of test-time output: 10.000207878477502
Fraction of train-time output set to zero: 0.749784
Fraction of test-time output set to zero: 0.0
Running tests with p = 0.4
Mean of input: 10.000207878477502
Mean of train-time output: 9.977917658761159
Mean of test-time output: 10.000207878477502
Fraction of train-time output set to zero: 0.600796
Fraction of test-time output set to zero: 0.0
Running tests with p = 0.7
Mean of input: 10.000207878477502
Mean of train-time output: 9.987811912159426
Mean of test-time output: 10.000207878477502
Fraction of train-time output set to zero: 0.30074
Fraction of test-time output set to zero: 0.0
We can see that roughly a fraction 1 - p of the neurons are zeroed out at training time, while the 1/p rescaling keeps the train-time output mean close to the input mean.
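As a quick sanity check of the inverted scaling on its own, here is a minimal standalone sketch (plain NumPy, independent of the assignment code) that compares the mean of a masked-and-rescaled activation with the original mean:

import numpy as np

np.random.seed(0)
x = np.random.randn(1000, 1000) + 10       # activations with mean around 10
p = 0.4                                    # keep probability

mask = (np.random.rand(*x.shape) < p) / p  # inverted-dropout mask
out = x * mask

print('mean of x:       ', x.mean())           # ~10
print('mean of out:     ', out.mean())          # also ~10: the 1/p rescaling preserves the expectation
print('fraction zeroed: ', (out == 0).mean())   # ~1 - p = 0.6

Because the expectation is already corrected at training time, the test-time pass can simply return the input unchanged.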
def dropout_backward(dout, cache):
    """
    Perform the backward pass for (inverted) dropout.

    Inputs:
    - dout: Upstream derivatives, of any shape
    - cache: (dropout_param, mask) from dropout_forward.
    """
    dropout_param, mask = cache
    mode = dropout_param['mode']

    dx = None
    if mode == 'train':
        # The forward pass multiplied by the (already rescaled) mask,
        # so the gradient is routed through the same mask.
        dx = dout * mask
    elif mode == 'test':
        dx = dout
    return dx
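A quick numerical check of dropout_backward; this is a standalone sketch using central differences and only the two functions above (passing a seed in dropout_param makes every forward call reuse the same mask, so the finite-difference estimate is well defined):

import numpy as np

np.random.seed(231)
x = np.random.randn(10, 10) + 10
dout = np.random.randn(*x.shape)
dropout_param = {'mode': 'train', 'p': 0.8, 'seed': 123}

out, cache = dropout_forward(x, dropout_param)
dx = dropout_backward(dout, cache)

# Central-difference estimate of d(sum(out * dout)) / dx.
dx_num = np.zeros_like(x)
h = 1e-5
it = np.nditer(x, flags=['multi_index'])
while not it.finished:
    idx = it.multi_index
    old = x[idx]
    x[idx] = old + h
    pos = dropout_forward(x, dropout_param)[0]
    x[idx] = old - h
    neg = dropout_forward(x, dropout_param)[0]
    x[idx] = old
    dx_num[idx] = np.sum((pos - neg) * dout) / (2 * h)
    it.iternext()

print('max relative error:',
      np.max(np.abs(dx - dx_num) / np.maximum(1e-8, np.abs(dx) + np.abs(dx_num))))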
Problem 1: adding dropout to FullyConnectedNet
class FullyConnectedNet(object):
    """
    A fully-connected neural network with an arbitrary number of hidden layers,
    ReLU nonlinearities, and a softmax loss function. This will also implement
    dropout and batch/layer normalization as options. For a network with L layers,
    the architecture will be

    {affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine - softmax

    where batch/layer normalization and dropout are optional, and the {...} block is
    repeated L - 1 times.

    Similar to the TwoLayerNet above, learnable parameters are stored in the
    self.params dictionary and will be learned using the Solver class.
    """

    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=1, normalization=None, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        """
        Initialize a new FullyConnectedNet.

        Inputs:
        - hidden_dims: A list of integers giving the size of each hidden layer.
        - input_dim: An integer giving the size of the input.
        - num_classes: An integer giving the number of classes to classify.
        - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=1 then
          the network should not use dropout at all.
        - normalization: What type of normalization the network should use. Valid values
          are "batchnorm", "layernorm", or None for no normalization (the default).
        - reg: Scalar giving L2 regularization strength.
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - dtype: A numpy datatype object; all computations will be performed using
          this datatype. float32 is faster but less accurate, so you should use
          float64 for numeric gradient checking.
        - seed: If not None, then pass this random seed to the dropout layers. This
          will make the dropout layers deterministic so we can gradient check the
          model.
        """
        self.normalization = normalization
        self.use_dropout = dropout != 1
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        ############################################################################
        # TODO: Initialize the parameters of the network, storing all values in    #
        # the self.params dictionary. Store weights and biases for the first layer #
        # in W1 and b1; for the second layer use W2 and b2, etc.                   #
        # When using batch normalization, store scale and shift parameters for the #
        # first layer in gamma1 and beta1; for the second layer use gamma2 and     #
        # beta2, etc. Scale parameters should be initialized to ones and shift     #
        # parameters should be initialized to zeros.                               #
        ############################################################################
        input_size = input_dim
        for i in range(len(hidden_dims)):
            output_size = hidden_dims[i]
            self.params['W' + str(i+1)] = np.random.randn(input_size, output_size) * weight_scale
            self.params['b' + str(i+1)] = np.zeros(output_size)
            if self.normalization:
                self.params['gamma' + str(i+1)] = np.ones(output_size)
                self.params['beta' + str(i+1)] = np.zeros(output_size)
            input_size = output_size  # input size for the next layer
        # Output layer: no batch/layer normalization here
        self.params['W' + str(self.num_layers)] = np.random.randn(input_size, num_classes) * weight_scale
        self.params['b' + str(self.num_layers)] = np.zeros(num_classes)

        # When using dropout we need to pass a dropout_param dictionary to each
        # dropout layer so that the layer knows the dropout probability and the mode
        # (train / test). You can pass the same dropout_param to each dropout layer.
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        # With batch normalization we need to keep track of running means and
        # variances, so we need to pass a special bn_param object to each batch
        # normalization layer. You should pass self.bn_params[0] to the forward pass
        # of the first batch normalization layer, self.bn_params[1] to the forward
        # pass of the second batch normalization layer, etc.
        self.bn_params = []
        if self.normalization == 'batchnorm':
            self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]
        if self.normalization == 'layernorm':
            self.bn_params = [{} for i in range(self.num_layers - 1)]

        # Cast all parameters to the correct datatype
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.normalization == 'batchnorm':
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        ############################################################################
        # TODO: Implement the forward pass for the fully-connected net, computing  #
        # the class scores for X and storing them in the scores variable.          #
        #                                                                          #
        # When using dropout, you'll need to pass self.dropout_param to each       #
        # dropout forward pass.                                                    #
        #                                                                          #
        # When using batch normalization, you'll need to pass self.bn_params[0] to #
        # the forward pass for the first batch normalization layer, pass           #
        # self.bn_params[1] to the forward pass for the second batch normalization #
        # layer, etc.                                                              #
        ############################################################################
        cache = {}          # caches needed for the backward pass
        cache_dropout = {}  # dropout caches, stored separately
        hidden = X
        for i in range(self.num_layers - 1):
            if self.normalization == 'batchnorm':
                hidden, cache[i+1] = affine_bn_relu_forward(hidden,
                                                            self.params['W' + str(i+1)],
                                                            self.params['b' + str(i+1)],
                                                            self.params['gamma' + str(i+1)],
                                                            self.params['beta' + str(i+1)],
                                                            self.bn_params[i])
            elif self.normalization == 'layernorm':
                hidden, cache[i+1] = affine_ln_relu_forward(hidden,
                                                            self.params['W' + str(i+1)],
                                                            self.params['b' + str(i+1)],
                                                            self.params['gamma' + str(i+1)],
                                                            self.params['beta' + str(i+1)],
                                                            self.bn_params[i])
            else:
                hidden, cache[i+1] = affine_relu_forward(hidden,
                                                         self.params['W' + str(i+1)],
                                                         self.params['b' + str(i+1)])
            if self.use_dropout:
                hidden, cache_dropout[i+1] = dropout_forward(hidden, self.dropout_param)
        # The last layer has no activation (and no dropout)
        scores, cache[self.num_layers] = affine_forward(hidden,
                                                        self.params['W' + str(self.num_layers)],
                                                        self.params['b' + str(self.num_layers)])

        # If test mode return early
        if mode == 'test':
            return scores

        ############################################################################
        # TODO: Implement the backward pass for the fully-connected net. Store the #
        # loss in the loss variable and gradients in the grads dictionary. Compute #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        #                                                                          #
        # When using batch/layer normalization, you don't need to regularize the   #
        # scale and shift parameters.                                              #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################
        loss, grads = 0.0, {}
        loss, dS = softmax_loss(scores, y)

        # The last layer has no ReLU
        dhidden, grads['W' + str(self.num_layers)], grads['b' + str(self.num_layers)] = \
            affine_backward(dS, cache[self.num_layers])
        loss += 0.5 * self.reg * np.sum(self.params['W' + str(self.num_layers)] *
                                        self.params['W' + str(self.num_layers)])
        grads['W' + str(self.num_layers)] += self.reg * self.params['W' + str(self.num_layers)]

        # Backpropagate through the remaining layers in reverse order
        for i in range(self.num_layers - 1, 0, -1):
            loss += 0.5 * self.reg * np.sum(self.params['W' + str(i)] * self.params['W' + str(i)])
            if self.use_dropout:
                dhidden = dropout_backward(dhidden, cache_dropout[i])
            if self.normalization == 'batchnorm':
                dhidden, dw, db, dgamma, dbeta = affine_bn_relu_backward(dhidden, cache[i])
                grads['gamma' + str(i)] = dgamma
                grads['beta' + str(i)] = dbeta
            elif self.normalization == 'layernorm':
                dhidden, dw, db, dgamma, dbeta = affine_ln_relu_backward(dhidden, cache[i])
                grads['gamma' + str(i)] = dgamma
                grads['beta' + str(i)] = dbeta
            else:
                dhidden, dw, db = affine_relu_backward(dhidden, cache[i])
            grads['W' + str(i)] = dw + self.reg * self.params['W' + str(i)]
            grads['b' + str(i)] = db

        return loss, grads
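Before training, it is worth gradient-checking the network with dropout enabled. The sketch below mirrors the notebook's check; it assumes the assignment's layer utilities (affine_relu_forward, softmax_loss, etc.) are importable as usual and that eval_numerical_gradient is available from cs231n.gradient_check. The rel_error helper is defined inline for completeness.

from cs231n.gradient_check import eval_numerical_gradient

def rel_error(x, y):
    """Maximum relative error between two arrays."""
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))

np.random.seed(231)
N, D, H1, H2, C = 2, 15, 20, 30, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))

for dropout in [1, 0.75, 0.5]:
    print('Running check with dropout =', dropout)
    model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,
                              weight_scale=5e-2, dtype=np.float64,
                              dropout=dropout, seed=123)

    loss, grads = model.loss(X, y)
    print('Initial loss:', loss)

    # Compare analytic gradients with numerical gradients for every parameter.
    for name in sorted(grads):
        f = lambda _: model.loss(X, y)[0]
        grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)
        print('%s relative error: %.2e' % (name, rel_error(grad_num, grads[name])))
    print()

Passing seed=123 makes the dropout masks deterministic, which is what allows the numerical gradient to match the analytic one.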
As an experiment, we will train networks with several dropout settings, as well as one without dropout, on 500 training examples. We will then visualize the training and validation accuracy of these networks over time.
# Train several identical nets with different dropout settings
np.random.seed(231)
num_train = 500
small_data = {
    'X_train': data['X_train'][:num_train],
    'y_train': data['y_train'][:num_train],
    'X_val': data['X_val'],
    'y_val': data['y_val'],
}

solvers = {}
# The original notebook only uses p = 1 and p = 0.25; more keep probabilities are tried here.
dropout_choices = [1, 0.75, 0.5, 0.25, 0.1]
for dropout in dropout_choices:
    model = FullyConnectedNet([500], dropout=dropout)
    print(dropout)

    solver = Solver(model, small_data,
                    num_epochs=25, batch_size=100,
                    update_rule='adam',
                    optim_config={
                        'learning_rate': 5e-4,
                    },
                    verbose=False, print_every=100)
    solver.train()
    solvers[dropout] = solver
Training results:
# Plot train and validation accuracies of the models
train_accs = []
val_accs = []
for dropout in dropout_choices:
    solver = solvers[dropout]
    print(dropout, "train_acc", max(solver.train_acc_history))
    print(dropout, "val_acc", max(solver.val_acc_history))
    train_accs.append(solver.train_acc_history[-1])
    val_accs.append(solver.val_acc_history[-1])

plt.subplot(3, 1, 1)
for dropout in dropout_choices:
    plt.plot(solvers[dropout].train_acc_history, 'o', label='%.2f dropout' % dropout)
plt.title('Train accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(ncol=2, loc='lower right')

plt.subplot(3, 1, 2)
for dropout in dropout_choices:
    plt.plot(solvers[dropout].val_acc_history, 'o', label='%.2f dropout' % dropout)
plt.title('Val accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(ncol=2, loc='lower right')

plt.gcf().set_size_inches(15, 15)
plt.show()
For example, the best training accuracy and best validation accuracy reported by the code above for each keep probability p:
1 train_acc 0.994
1 val_acc 0.317
0.75 train_acc 0.988
0.75 val_acc 0.317
0.5 train_acc 0.99
0.5 val_acc 0.329
0.25 train_acc 0.944
0.25 val_acc 0.337
0.1 train_acc 0.74
0.1 val_acc 0.342
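To make the regularization effect easier to read, here is a small sketch that tabulates the gap between best train and validation accuracy, with the values hard-coded from the output above:

# Values copied from the output above: keep probability -> (best train acc, best val acc)
results = {
    1.0:  (0.994, 0.317),
    0.75: (0.988, 0.317),
    0.5:  (0.990, 0.329),
    0.25: (0.944, 0.337),
    0.1:  (0.740, 0.342),
}

print('p      train    val      gap')
for p, (train_acc, val_acc) in results.items():
    print('%-6s %-8s %-8s %.3f' % (p, train_acc, val_acc, train_acc - val_acc))

As the keep probability decreases, training accuracy drops while validation accuracy improves slightly, i.e. the train/validation gap shrinks: on this small 500-example training set, dropout acts as a regularizer.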