import numpy as np

def init_parameters(layer_dims, initialization):
    np.random.seed(3)
    parameters = {}
    if initialization == 'zeros':
        for i in range(1, len(layer_dims)):
            parameters['W' + str(i)] = np.zeros((layer_dims[i], layer_dims[i - 1]))
            parameters['b' + str(i)] = np.zeros((layer_dims[i], 1))
    elif initialization == 'random':
        for i in range(1, len(layer_dims)):
            parameters['W' + str(i)] = np.random.randn(layer_dims[i], layer_dims[i - 1])
            parameters['b' + str(i)] = np.zeros((layer_dims[i], 1))
    elif initialization == 'he':
        # He initialization, from He et al., "Delving Deep into Rectifiers:
        # Surpassing Human-Level Performance on ImageNet Classification"
        for i in range(1, len(layer_dims)):
            parameters['W' + str(i)] = np.random.randn(layer_dims[i], layer_dims[i - 1]) * np.sqrt(2 / layer_dims[i - 1])
            parameters['b' + str(i)] = np.zeros((layer_dims[i], 1))
    else:
        print("Invalid initialization method! Exiting.")
        exit()
    for i in range(1, len(layer_dims)):
        assert parameters['W' + str(i)].shape == (layer_dims[i], layer_dims[i - 1])
        assert parameters['b' + str(i)].shape == (layer_dims[i], 1)
    return parameters
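As a quick sanity check, here is a minimal usage sketch; the layer sizes are arbitrary choices for illustration:

# Hypothetical 3-layer network: 2 inputs, two hidden layers, 1 output unit.
params = init_parameters([2, 10, 5, 1], 'he')
print(params['W1'].shape)  # (10, 2)
print(params['b3'].shape)  # (1, 1)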
Because $W^{[l]}$ is an $n^{[l]} \times n^{[l-1]}$ matrix, where $n^{[l]}$ denotes the number of neurons in layer $l$ and $n^{[l-1]}$ the number of neurons in layer $l-1$, we have:

$\sum_{l=1}^{L}\left\| W^{[l]} \right\|_{F}^{2}=\sum_{l=1}^{L}\sum_{i=1}^{n^{[l]}}\sum_{j=1}^{n^{[l-1]}}\left(w_{ij}^{[l]}\right)^{2}$

With L2 regularization the gradient of each weight matrix gains an extra term:

$dW^{[l]}=\frac{\partial L}{\partial W^{[l]}}+\frac{\lambda}{m}W^{[l]}$

Weight update:

$W^{[l]}=W^{[l]}-\alpha\,dW^{[l]}=W^{[l]}-\alpha\left(\frac{\partial L}{\partial W^{[l]}}+\frac{\lambda}{m}W^{[l]}\right)=\left(1-\frac{\alpha\lambda}{m}\right)W^{[l]}-\alpha\frac{\partial L}{\partial W^{[l]}}$

From this last expression we can see that L2 regularization multiplies $W^{[l]}$ by a coefficient slightly less than 1, shrinking the weights, which is why L2 regularization is also called "weight decay". For a detailed discussion of why regularization helps prevent overfitting, see Section 1.5 of Week 1 in Course 2 (reference: http://www.ai-start.com/dl2017/html/lesson2-week1.html#header-n89).
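To make the "weight decay" reading concrete, here is a tiny numeric sketch; the values of the learning rate, lambda and m are made up for illustration:

alpha, lambd, m = 0.1, 0.7, 300   # hypothetical hyperparameters
decay = 1 - alpha * lambd / m     # coefficient multiplying W[l] in the update
print(decay)                      # about 0.99977, slightly less than 1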
The training set used in the regularization experiments is generated as follows:

import matplotlib.pyplot as plt

def datagen(m, lambd, is_plot):
    np.random.seed(1)
    N = int(m / 2)          # two classes
    D = 2                   # number of features (dimensions) per sample
    X = np.zeros((m, D))    # initialize sample coordinates
    Y = np.zeros((m, 1))    # initialize sample labels
    for j in range(2):
        ix = range(N * j, N * (j + 1))
        t = np.random.randn(N) * lambd
        r = np.random.randn(N) * lambd
        if j == 0:
            X[ix] = np.c_[t - 0.4, r - 0.4]
        else:
            X[ix] = np.c_[t + 0.4, r + 0.4]
        Y[ix] = j           # red or blue
    if is_plot:
        fig = plt.figure()
        plt.rcParams['figure.figsize'] = (7.0, 4.0)
        plt.rcParams['image.interpolation'] = 'nearest'
        plt.rcParams['image.cmap'] = 'gray'
        plt.title('training dataset')
        plt.scatter(X[:, 0], X[:, 1], c=np.squeeze(Y), s=40, cmap=plt.cm.Spectral)
        plt.show()
    return X.T, Y.T
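A minimal call sketch; the sample count and noise scale below are arbitrary, and `lambd` here only controls the spread of the two Gaussian clusters:

# Generate 300 two-dimensional samples split into two classes and plot them.
train_X, train_Y = datagen(300, 0.4, True)
print(train_X.shape, train_Y.shape)  # (2, 300) (1, 300)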
The updated code after adding L2 regularization:

Compute the cost:

def compute_cost_with_regulation(A, Y, parameters, lambd):
    m = Y.shape[1]
    weight_square_sum = 0.
    cross_entropy_cost = compute_cost(A[-1], Y)
    # Sum of the squared Frobenius norms of all weight matrices
    for i in range(len(A)):
        weight_square_sum += np.sum(np.square(parameters["W" + str(i + 1)]))
    L2_regularization_cost = lambd * weight_square_sum / (2 * m)
    cost = cross_entropy_cost + L2_regularization_cost
    return cost
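`compute_cost` is the plain cross-entropy cost from the un-regularized version; it is not shown in this section, so the following is only a sketch of what it is assumed to look like for binary classification with a sigmoid output:

def compute_cost(AL, Y):
    # Cross-entropy cost; AL is the output-layer activation (assumed shape (1, m)).
    m = Y.shape[1]
    cost = -np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL)) / m
    return np.squeeze(cost)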
Backward propagation:

def backward_propagation_with_regulation(X, Y, Z, A, W, lambd, derivate_function):
    # Hidden layers are assumed to use ReLU here, so derivate_function is not used.
    l = len(W)
    dZ = list(range(l))
    dA = list(range(l))
    dW = list(range(l))
    db = list(range(l))
    m = Y.shape[1]
    grads = {}
    dZ[l - 1] = A[l - 1] - Y              # gradient at the output layer
    for i in range(l - 1, -1, -1):
        if i > 0:
            dW[i] = (1 / m) * np.dot(dZ[i], A[i - 1].T) + (lambd * W[i] / m)  # L2 term added
        else:
            dW[i] = (1 / m) * np.dot(dZ[i], X.T) + (lambd * W[i] / m)
        db[i] = (1 / m) * np.sum(dZ[i], axis=1, keepdims=True)
        if i > 0:
            dA[i - 1] = np.dot(W[i].T, dZ[i])
            dZ[i - 1] = np.multiply(dA[i - 1], np.int64(A[i - 1] > 0))  # ReLU derivative
    for i in range(len(dW)):
        grads["dW" + str(i + 1)] = dW[i]
        grads["db" + str(i + 1)] = db[i]
    return grads
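The gradients returned above are consumed by a gradient-descent update; the update function is not part of this section, so the following is only a sketch of the assumed interface:

def update_parameters(parameters, grads, learning_rate):
    # Plain gradient descent; with L2 regularization the weight-decay term
    # is already folded into grads["dW..."] by the backward pass above.
    L = len(parameters) // 2
    for i in range(1, L + 1):
        parameters["W" + str(i)] -= learning_rate * grads["dW" + str(i)]
        parameters["b" + str(i)] -= learning_rate * grads["db" + str(i)]
    return parameters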
The most common way to implement dropout is inverted dropout. For each layer of the network we set a keep probability, keep_prob; for example, with keep_prob = 0.8 any given hidden unit is dropped with probability 0.2. Taking a three-layer network as an example, to apply dropout to layer 3 we first define a mask d3:

d3 = np.random.rand(A3.shape[0], A3.shape[1]) < keep_prob
A3 = np.multiply(A3, d3) / keep_prob

The first line produces a boolean array d3 whose entries are True or False; dividing by keep_prob on the second line keeps the expected value of $A^{[3]}$ unchanged.
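Before the full implementation, here is a standalone toy sketch of the inverted-dropout mask (the shape and keep_prob are arbitrary), showing that dividing by keep_prob keeps the mean activation roughly unchanged:

np.random.seed(1)
A3 = np.random.rand(5, 1000)                                # toy activations
keep_prob = 0.8
d3 = np.random.rand(A3.shape[0], A3.shape[1]) < keep_prob   # boolean mask
A3_drop = np.multiply(A3, d3) / keep_prob                   # inverted dropout
print(A3.mean(), A3_drop.mean())                            # the two means are close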
Now let's see how to implement this in code and what the effect is (again using the training set from the L2 regularization example above):

Dropout is not applied to the last layer. The forward-propagation and backward-propagation code is as follows:

Forward propagation:
def forward_propagation_with_dropout(X, parameters, activate_fun, keep_prob):
    # retrieve parameters
    W = []
    b = []
    for i in range(1, len(parameters) // 2 + 1):
        W.append(parameters["W" + str(i)])
        b.append(parameters["b" + str(i)])
    # compute forward propagation
    Z = []
    A = []
    D = []  # dropout masks, one per hidden layer
    for i in range(len(W)):
        if i == 0:
            sZ = np.dot(W[i], X) + b[i]
        else:
            sZ = np.dot(W[i], A[i - 1]) + b[i]
        sA = activate_fun[i](sZ)
        if i < (len(W) - 1):                                       # no dropout on the output layer
            sD = np.random.rand(sA.shape[0], sA.shape[1]) < keep_prob
            sA = np.multiply(sA, sD) / keep_prob                   # inverted dropout: rescale to keep the expectation
            D.append(sD)
        Z.append(sZ)
        A.append(sA)
    return Z, A, W, D
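A minimal call sketch, reusing train_X from the datagen example above. `activate_fun` is a list of per-layer activation functions; the `relu`/`sigmoid` helpers, the layer sizes and keep_prob below are assumptions, since they are not defined in this section:

def relu(z):
    return np.maximum(0, z)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

parameters = init_parameters([2, 20, 3, 1], 'he')
activate_fun = [relu, relu, sigmoid]     # ReLU hidden layers, sigmoid output
Z, A, W, D = forward_propagation_with_dropout(train_X, parameters, activate_fun, keep_prob=0.86)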
Backward propagation:

def backward_propagation_with_dropout(X, Y, Z, A, W, D, keep_prob):
    l = len(W)
    dZ = list(range(l))
    dA = list(range(l - 1))
    dW = list(range(l))
    db = list(range(l))
    m = Y.shape[1]
    grads = {}
    dZ[l - 1] = A[l - 1] - Y              # gradient at the output layer
    for i in range(l - 1, -1, -1):
        if i > 0:
            dW[i] = (1 / m) * np.dot(dZ[i], A[i - 1].T)
        else:
            dW[i] = (1 / m) * np.dot(dZ[i], X.T)
        db[i] = (1 / m) * np.sum(dZ[i], axis=1, keepdims=True)
        if i > 0:
            # apply the same dropout mask used in the forward pass, then rescale
            dA[i - 1] = np.dot(W[i].T, dZ[i]) * D[i - 1] / keep_prob
            dZ[i - 1] = np.multiply(dA[i - 1], np.int64(A[i - 1] > 0))  # ReLU derivative
    for i in range(len(dW)):
        grads["dW" + str(i + 1)] = dW[i]
        grads["db" + str(i + 1)] = db[i]
    return grads
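Putting the dropout pieces together, a single training iteration might look like the sketch below, reusing the helper sketches and variables introduced above (`update_parameters`, `compute_cost`, `activate_fun`, train_X/train_Y); the learning rate and keep_prob are arbitrary:

learning_rate, keep_prob = 0.1, 0.86
Z, A, W, D = forward_propagation_with_dropout(train_X, parameters, activate_fun, keep_prob)
cost = compute_cost(A[-1], train_Y)
grads = backward_propagation_with_dropout(train_X, train_Y, Z, A, W, D, keep_prob)
parameters = update_parameters(parameters, grads, learning_rate)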