数据集一共分为50000训练集,10000测试集。但是我们为了速度考虑选择5000训练,500测试。
input_dim:输入数据的维度,每张图片是32*32的彩色图(3个通道),展开后为3*32*32。hidden_dim:隐藏层神经元的个数(默认100个)。num_classes:输出十个类别的可能性(得分)。weight_scale:权重初始化的缩放系数,取小一些。reg:正则化惩罚力度。
#初始化w,b
def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
             weight_scale=1e-3, reg=0.0):
    """Create the parameters of a two-layer fully connected network.

    Weights are drawn from a standard normal distribution and multiplied by
    ``weight_scale``; biases start at zero.

    Args:
        input_dim: Number of rows of the first weight matrix ``W1``.
        hidden_dim: Number of columns of ``W1`` and rows of ``W2``.
        num_classes: Number of columns of the second weight matrix ``W2``.
        weight_scale: Multiplier applied to the random weight draws.
        reg: Regularization strength, stored as ``self.reg``.
    """
    self.params = layer_params = {}
    self.reg = reg

    # First layer: input -> hidden.
    layer_params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
    layer_params['b1'] = np.zeros((1, hidden_dim))

    # Second layer: hidden -> class scores.
    layer_params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
    layer_params['b2'] = np.zeros((1, num_classes))
数据输入进来,先通过第一个全连接层,再通过激活层(用的ReLU函数),然后经过第二个全连接层输出得分值,最后计算损失值
首先拿到初始化的w,b,将x,w,b传入函数,进行前向传播的过程。最后可以求得得分值。
# Placeholder; overwritten with the network output at the end of this forward pass.
scores = None
# Size of the first axis of X (presumably the batch size -- confirm against the caller).
N = X.shape[0]
# Unpack variables from the params dictionary
W1, b1 = self.params['W1'], self.params['b1']
W2, b2 = self.params['W2'], self.params['b2']
# The two calls differ in that the second one applies no activation function:
# affine_relu_forward is affine + ReLU, affine_forward is the affine step only.
h1, cache1 = affine_relu_forward(X, W1, b1)
out, cache2 = affine_forward(h1, W2, b2)
# Class scores
scores = out
进入affine_relu_forward函数,这个函数计算出中间层的输出。
def affine_relu_forward(x, w, b):
    """Run an affine layer followed by a ReLU.

    Args:
        x: Input passed to ``affine_forward``.
        w: Weights passed to ``affine_forward``.
        b: Biases passed to ``affine_forward``.

    Returns:
        A tuple ``(out, cache)``: ``out`` is the ReLU output and ``cache`` is
        ``(fc_cache, relu_cache)``, the caches returned by the two layers.
    """
    pre_activation, affine_cache = affine_forward(x, w, b)
    activated, activation_cache = relu_forward(pre_activation)
    # Keep both intermediate caches (affine layer and ReLU) because the
    # backward pass needs them.
    return activated, (affine_cache, activation_cache)
在affine_forward中进行计算,x*w+b
# Affine transform: matrix product of x_row with w, plus the bias b.
# NOTE(review): x_row is defined outside this excerpt -- presumably x reshaped
# to 2-D the same way affine_backward does; confirm.
out = np.dot(x_row, w) + b
relu_forward功能:进行ReLU操作,也就是逐元素算出max(0,x)
def relu_forward(x):
    """Compute the forward pass of a ReLU layer.

    Args:
        x: Input array.

    Returns:
        A tuple ``(out, cache)``: ``out`` is ``max(0, x)`` taken element-wise
        and ``cache`` is the input ``x`` itself, kept for the backward pass.
    """
    # Same expression the ReLU helper evaluates: negative entries become zero.
    return np.maximum(0, x), x
def ReLU(x):
    """Apply the rectified linear unit element-wise.

    Args:
        x: Scalar or array input.

    Returns:
        ``max(0, x)`` evaluated element-wise.
    """
    rectified = np.maximum(0, x)
    return rectified
调用softmax_loss
# softmax_loss returns the loss value and the gradient of that loss with respect to the scores.
data_loss, dscores = softmax_loss(scores, y)
def softmax_loss(x, y):
    """Compute the softmax cross-entropy loss and its gradient.

    The log-probabilities are computed directly (log-softmax) rather than as
    ``log(softmax(x))``. Taking the log of an already exponentiated value
    returns ``-inf`` once the probability of the correct class underflows to
    zero, which makes the loss infinite.

    Args:
        x: Scores of shape (N, C); row ``i`` holds the class scores of sample ``i``.
        y: Integer array of shape (N,) with the correct class index per sample.

    Returns:
        A tuple ``(loss, dx)``: ``loss`` is the mean of ``-log(p_correct)``
        over the N samples, and ``dx`` is the gradient of ``loss`` with
        respect to ``x`` (same shape as ``x``).
    """
    N = x.shape[0]
    rows = np.arange(N)

    # Normalize the scores: subtracting the row maximum keeps exp() from
    # overflowing and does not change the resulting probabilities.
    shifted = x - np.max(x, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    # Loss is -log(probability assigned to the correct class), averaged over
    # the batch.
    loss = -np.sum(log_probs[rows, y]) / N

    # Gradient: softmax probabilities minus the one-hot labels, divided by N
    # to match the averaging in the loss.
    dx = np.exp(log_probs)
    dx[rows, y] -= 1
    dx /= N

    # Return both the loss value and the gradient.
    return loss, dx
# L2 regularization penalty: 0.5 * reg * sum(W^2), added up over both weight matrices
reg_loss = 0.5 * self.reg * np.sum(W1*W1) + 0.5 * self.reg * np.sum(W2*W2)
# Total loss = data loss + regularization penalty
loss = data_loss + reg_loss
算完softmax的梯度之后该算前一层的梯度了,第二层的w2和b2。
调用affine_backward,然后调用affine_relu_backward。
# Back-propagate through the second (affine-only) layer first, starting from the gradient of the scores.
dh1, dW2, db2 = affine_backward(dscores, cache2)
# Then through the first (affine + ReLU) layer, using the gradient with respect to its output h1.
dX, dW1, db1 = affine_relu_backward(dh1, cache1)
对x求梯度:out = x·w + b 对x求导就是w,所以dx等于上面传下来的梯度dout乘以w的转置,即dx = dout·w.T,代码如①。计算w的梯度同理,dw = x.T·dout,如②。对b求导就是1,所以db就是把上面传下来的dout沿样本维度(axis=0)求和。
def affine_backward(dout, cache):
    """Compute the backward pass of an affine (fully connected) layer.

    Args:
        dout: Upstream gradient of shape (N, M). For the second layer this is
            the gradient handed back by the softmax loss.
        cache: Tuple ``(x, w, b)``.

    Returns:
        A tuple ``(dx, dw, db)``: ``dx`` has the same shape as ``x``, ``dw``
        has shape (D, M) and ``db`` has shape (1, M).
    """
    x, w, _ = cache  # the bias value itself is not needed for the gradients

    # Flatten every sample into one row so the layer is a plain matrix product.
    flat_x = x.reshape(x.shape[0], -1)  # (N, D)

    # (1) Gradient w.r.t. the input: dout times w transposed, then restored
    # to the original shape of x, (N, d1, ..., d_k).
    grad_x = np.reshape(np.dot(dout, w.T), x.shape)

    # (2) Gradient w.r.t. the weights.
    grad_w = np.dot(flat_x.T, dout)  # (D, M)

    # The bias is added to every sample, so its gradient sums dout over the batch.
    grad_b = np.sum(dout, axis=0, keepdims=True)  # (1, M)

    return grad_x, grad_w, grad_b
affine_relu_backward函数,先对relu进行反向传播。然后再次调用affine_backward。
def affine_relu_backward(dout, cache):
    """Backward pass for the affine-relu convenience layer.

    Args:
        dout: Upstream gradient with respect to the layer's output.
        cache: Tuple ``(fc_cache, relu_cache)``.

    Returns:
        The tuple ``(dx, dw, db)`` produced by ``affine_backward``.
    """
    affine_cache, activation_cache = cache
    # Undo the layers in reverse order: the ReLU first, then the affine layer.
    grad_pre_activation = relu_backward(dout, activation_cache)
    grad_x, grad_w, grad_b = affine_backward(grad_pre_activation, affine_cache)
    return grad_x, grad_w, grad_b
对于relu层,前向传播是通过max(0,x),所以求导时x>0时导数为1也就是传进来的梯度,当x≤0时,导数就是0,那么梯度也为0.
def relu_backward(dout, cache):
    """Compute the backward pass of a ReLU layer.

    The gradient passes through unchanged where the forward input was
    positive and is zero everywhere else.

    Args:
        dout: Upstream gradient, same shape as the forward input. It is not
            modified.
        cache: The forward input ``x``.

    Returns:
        ``dx``, a new array holding the gradient with respect to ``x``.
    """
    x = cache
    # Work on a copy: masking dout in place would overwrite the caller's
    # upstream gradient, which may still be needed elsewhere.
    dx = np.array(dout, copy=True)
    dx[x <= 0] = 0
    return dx
加上正则化惩罚项,就完成了完整的反向传播
# Add the gradient of the penalty 0.5 * reg * sum(W * W), which is reg * W.
dW2 += self.reg * W2
dW1 += self.reg * W1
保存梯度值
# Store each gradient under the same key as its parameter in self.params.
# NOTE(review): `grads` is created outside this excerpt.
grads['W1'] = dW1
grads['b1'] = db1
grads['W2'] = dW2
grads['b2'] = db2