Neural Network Initialization, Regularization, and Optimization in Python

Initialization:

def model(..., initialization='he'):
    ...
    if initialization == 'he':
        parameters = initialize_parameters_he(layers_dims)

-------------------------------------------------

def initialize_parameters_he(layers_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layers_dims) - 1    # number of layers, excluding the input layer
    for l in range(1, L + 1):
        # He initialization: scale by sqrt(2 / size of the previous layer)
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2. / layers_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
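As a quick sanity check (my own sketch, not part of the original post; the layer sizes are made up), He initialization scales each weight matrix by sqrt(2 / n_prev), so the empirical standard deviation of W1 should sit roughly at sqrt(2 / layers_dims[0]):

import numpy as np

layers_dims = [4, 5, 3, 1]                                 # hypothetical layer sizes
params = initialize_parameters_he(layers_dims)
print(params['W1'].shape)                                  # (5, 4)
print(params['W1'].std(), np.sqrt(2. / layers_dims[0]))    # both roughly 0.7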

Regularization:

def model(..., lambd=0, keep_prob=1):
    ...
    for i in range(0, num_iterations):
        # forward propagation, with or without dropout
        if keep_prob == 1:
            a3, cache = forward_propagation(X, parameters)
        elif keep_prob < 1:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        # cost, with or without the L2 penalty
        if lambd == 0:
            cost = compute_cost(a3, Y)
        else:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)
        ...
    return parameters

----------------------------------------------------------

def compute_cost_with_regularization(A3, Y, parameters, lambd):
    m = Y.shape[1]
    W1 = parameters['W1']
    W2 = parameters['W2']
    W3 = parameters['W3']
    cross_entropy_cost = compute_cost(A3, Y)
    # L2 penalty: lambd / (2m) times the sum of squared weights over all layers
    L2_regularization_cost = (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))) * lambd / (2 * m)
    cost = cross_entropy_cost + L2_regularization_cost
    return cost
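For reference, this is the standard L2-regularized cost that the function implements, together with the derivative of the penalty term, which is what adds the extra lambd / m * W to each dW in the backward pass below:

$$J_{regularized} = J_{cross\text{-}entropy} + \frac{\lambda}{2m}\sum_{l}\lVert W^{[l]}\rVert_F^2, \qquad \frac{\partial}{\partial W^{[l]}}\left(\frac{\lambda}{2m}\lVert W^{[l]}\rVert_F^2\right) = \frac{\lambda}{m}W^{[l]}$$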

-----------------------------------------------------------

def backward_propagation_with_regularization(X, Y, cache, lambd):
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache
    dZ3 = A3 - Y
    # each dW gets the extra L2 term lambd / m * W
    dW3 = 1. / m * np.dot(dZ3, A2.T) + lambd / m * W3
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))    # ReLU derivative
    dW2 = 1. / m * np.dot(dZ2, A1.T) + lambd / m * W2
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T) + lambd / m * W1
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)
    gradients = {'dZ3': dZ3, 'dW3': dW3, 'db3': db3, 'dA2': dA2, ...}
    return gradients

-------------------------------------------------------------------

def forward_propagation_with_dropout(X, parameters, keep_prob=0.5):
    np.random.seed(1)
    W1 = parameters['W1']
    b1 = parameters['b1']
    ...
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    D1 = np.random.rand(A1.shape[0], A1.shape[1])    # random matrix of the same shape as A1
    D1 = D1 < keep_prob                              # keep a unit with probability keep_prob
    A1 = A1 * D1                                     # shut down the units where D1 is 0
    A1 = A1 / keep_prob                              # inverted dropout: rescale to preserve the expected value
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    D2 = np.random.rand(A2.shape[0], A2.shape[1])
    D2 = D2 < keep_prob
    A2 = A2 * D2
    A2 = A2 / keep_prob
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)
    cache = (Z1, D1, A1, W1, b1, ...)
    return A3, cache
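The pair of lines A1 = A1 * D1 and A1 = A1 / keep_prob is the "inverted dropout" trick: dividing by keep_prob keeps the expected value of the activations unchanged, so nothing has to be rescaled at test time. A tiny standalone check (my own sketch, not from the original post):

import numpy as np

np.random.seed(0)
keep_prob = 0.8
A = np.ones((5, 100000))                    # hypothetical activations, all equal to 1
D = np.random.rand(*A.shape) < keep_prob    # dropout mask
A_drop = A * D / keep_prob                  # inverted dropout
print(A.mean(), A_drop.mean())              # both very close to 1.0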

---------------------------------------------------------------------

def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    ...
    dZ3 = A3 - Y
    dW3 = 1. / m * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T, dZ3)
    dA2 = dA2 * D2              # apply the same mask D2 that was used in the forward pass
    dA2 = dA2 / keep_prob       # and the same rescaling
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    ...
    gradients = {...}
    return gradients

Optimization:

def update_parameters_with_gd(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters['W' + str(l+1)] = parameters['W' + str(l+1)] - learning_rate * grads['dW' + str(l+1)]
        parameters['b' + str(l+1)] = parameters['b' + str(l+1)] - learning_rate * grads['db' + str(l+1)]
    return parameters

------------------------------------------------------------------------

def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    np.random.seed(seed)
    m = X.shape[1]
    mini_batches = []
    # step 1: shuffle the columns of X and Y with the same permutation
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))
    # step 2: partition into complete mini-batches
    num_complete_minibatches = math.floor(m / mini_batch_size)
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    # handle the last, smaller mini-batch if m is not a multiple of mini_batch_size
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size :]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size :]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    return mini_batches
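A minimal usage sketch (my addition; the shapes are made up): 148 examples with a batch size of 64 should yield two full mini-batches of 64 and one final mini-batch of 20:

import math
import numpy as np

X = np.random.randn(12, 148)                          # 12 features, 148 examples (hypothetical)
Y = (np.random.rand(1, 148) > 0.5).astype(int)
minibatches = random_mini_batches(X, Y, mini_batch_size=64, seed=0)
print(len(minibatches))                               # 3
print([mb_X.shape[1] for mb_X, mb_Y in minibatches])  # [64, 64, 20]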

---------------------------------------------------------------------------------

def initialize_velocity(parameters):
    L = len(parameters) // 2
    v = {}
    for l in range(L):
        v['dW' + str(l+1)] = np.zeros(parameters['W' + str(l+1)].shape)
        v['db' + str(l+1)] = np.zeros(parameters['b' + str(l+1)].shape)
    return v

----------------------------------------------------------------------------------

def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        # exponentially weighted average of the gradients
        v['dW' + str(l+1)] = beta * v['dW' + str(l+1)] + (1 - beta) * grads['dW' + str(l+1)]
        v['db' + str(l+1)] = beta * v['db' + str(l+1)] + (1 - beta) * grads['db' + str(l+1)]
        # update parameters with the velocity
        parameters['W' + str(l+1)] = parameters['W' + str(l+1)] - learning_rate * v['dW' + str(l+1)]
        parameters['b' + str(l+1)] = parameters['b' + str(l+1)] - learning_rate * v['db' + str(l+1)]
    return parameters, v
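In equation form, the loop above implements the standard momentum update (an exponentially weighted average of the gradients, then a step along it), with the analogous update for b, where beta is the momentum coefficient and alpha the learning rate:

$$v_{dW^{[l]}} = \beta\, v_{dW^{[l]}} + (1-\beta)\, dW^{[l]}, \qquad W^{[l]} := W^{[l]} - \alpha\, v_{dW^{[l]}}$$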

---------------------------------------------------------------------------------

def initialize_adam(parameters):
    L = len(parameters) // 2
    v = {}    # first moment (moving average of the gradients)
    s = {}    # second moment (moving average of the squared gradients)
    for l in range(L):
        v['dW' + str(l+1)] = np.zeros(parameters['W' + str(l+1)].shape)
        v['db' + str(l+1)] = np.zeros(parameters['b' + str(l+1)].shape)
        s['dW' + str(l+1)] = np.zeros(parameters['W' + str(l+1)].shape)
        s['db' + str(l+1)] = np.zeros(parameters['b' + str(l+1)].shape)
    return v, s

--------------------------------------------------------------------------------------------

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    for l in range(L):
        # moving average of the gradients (first moment)
        v['dW' + str(l+1)] = beta1 * v['dW' + str(l+1)] + (1 - beta1) * grads['dW' + str(l+1)]
        v['db' + str(l+1)] = beta1 * v['db' + str(l+1)] + (1 - beta1) * grads['db' + str(l+1)]
        # bias correction for the first moment
        v_corrected['dW' + str(l+1)] = v['dW' + str(l+1)] / (1 - np.power(beta1, t))
        v_corrected['db' + str(l+1)] = v['db' + str(l+1)] / (1 - np.power(beta1, t))
        # moving average of the squared gradients (second moment)
        s['dW' + str(l+1)] = beta2 * s['dW' + str(l+1)] + (1 - beta2) * np.power(grads['dW' + str(l+1)], 2)
        s['db' + str(l+1)] = beta2 * s['db' + str(l+1)] + (1 - beta2) * np.power(grads['db' + str(l+1)], 2)
        # bias correction for the second moment
        s_corrected['dW' + str(l+1)] = s['dW' + str(l+1)] / (1 - np.power(beta2, t))
        s_corrected['db' + str(l+1)] = s['db' + str(l+1)] / (1 - np.power(beta2, t))
        # parameter update
        parameters['W' + str(l+1)] = parameters['W' + str(l+1)] - learning_rate * v_corrected['dW' + str(l+1)] / (np.sqrt(s_corrected['dW' + str(l+1)]) + epsilon)
        parameters['b' + str(l+1)] = parameters['b' + str(l+1)] - learning_rate * v_corrected['db' + str(l+1)] / (np.sqrt(s_corrected['db' + str(l+1)]) + epsilon)
    return parameters, v, s
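These are the standard Adam equations that the loop above implements, with the bias-corrected first and second moments (the same holds for b):

$$v^{corrected}_{dW^{[l]}} = \frac{v_{dW^{[l]}}}{1-\beta_1^{\,t}}, \qquad s^{corrected}_{dW^{[l]}} = \frac{s_{dW^{[l]}}}{1-\beta_2^{\,t}}, \qquad W^{[l]} := W^{[l]} - \alpha\,\frac{v^{corrected}_{dW^{[l]}}}{\sqrt{s^{corrected}_{dW^{[l]}}}+\varepsilon}$$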

-------------------------------------------------------------------------------------------------

def model(..., mini_batch_size=64, beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True):
    ....
    if optimizer == 'momentum':
        v = initialize_velocity(parameters)
    elif optimizer == 'adam':
        v, s = initialize_adam(parameters)
    for i in range(num_epochs):
        seed = seed + 1    # re-shuffle differently at each epoch
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            (minibatch_X, minibatch_Y) = minibatch
            a3, cache = forward_propagation(minibatch_X, parameters)
            cost = compute_cost(a3, minibatch_Y)
            grads = backward_propagation(minibatch_X, minibatch_Y, cache)
            if optimizer == 'momentum':
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == 'adam':
                t = t + 1    # Adam counter, used for bias correction
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1, beta2, epsilon)
        if print_cost and i % 1000 == 0:
            print('cost after epoch %i: %f' % (i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title('learning rate = ' + str(learning_rate))
    plt.show()
    return parameters
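A hypothetical call, assuming the elided arguments in the signature are X, Y, layers_dims, optimizer, and learning_rate (my guess; the post does not show the full signature, and train_X / train_Y are made-up names):

layers_dims = [train_X.shape[0], 5, 2, 1]    # hypothetical architecture
parameters = model(train_X, train_Y, layers_dims, optimizer='adam',
                   learning_rate=0.0007, mini_batch_size=64, num_epochs=10000)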


