# Package imports
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
# Display plots inline and change default figure size
# NOTE: "%matplotlib inline" is IPython/Jupyter magic syntax, not plain
# Python, so this line only works when the code runs as a notebook cell.
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)
要训练模型,首先就要生成一个数据集。很庆幸,scikit-learn 有很多有用的数据集生成器,我们这里直接使用make_moons这个函数来生成我们的数据集。
# Generate a dataset and plot it
# make_moons is asked for 200 samples with noise 0.20; the two columns of X
# are used as plot coordinates and y colours the points.
X, y = sklearn.datasets.make_moons(200, noise=0.20)
plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral)
# Helper function to plot a decision boundary.
# If you don't fully understand this function don't worry,
# it just generates the contour plot below.
def plot_decision_boundary(pred_func, h=0.01):
    """Draw the decision regions of ``pred_func`` over the global dataset.

    The classifier is evaluated on a regular grid that covers the range of
    the module-level ``X`` (padded by 0.5 on every side). The predictions are
    drawn as filled contours with the points ``X`` (coloured by ``y``) on top.

    Args:
        pred_func: Callable that takes an ``(n, 2)`` array of points and
            returns ``n`` predicted labels.
        h: Distance between neighbouring grid points. Defaults to 0.01, the
            value that used to be hard-coded.
    """
    # Set min and max values and give it some padding
    pad = .5
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid (one row per grid point)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = pred_func(grid_points).reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
为了证明我的观点,我将训练一个逻辑回归分类器。这个分类器接收x,y的输入,然后输出对应的分类(0 或者 1)。为了简便,这里直接使用scikit-learn提供的模型来实现。
# Train the logistic regression classifier
# (scikit-learn's cross-validated variant, configured with cv=5)
clf = sklearn.linear_model.LogisticRegressionCV(cv=5)
clf.fit(X, y)
# Plot the decision boundary
plot_decision_boundary(lambda x: clf.predict(x))
plt.title("Logistic Regression")
因为我们想要我们的神经网络输出概率,所以输出层的激活函数需要使用softmax,它提供了将得分转换为概率的途径。如果你对logistic 函数很熟悉,那么你可以将softmax函数看作它在多分类问题上的扩展。
我们的神经网络通过前向传播来进行预测,也就是一堆的矩阵乘法和对我们所定义的激活函数的应用。假设x是一个2维向量,那么我们通过如下的方式来计算 $\hat{y}$:
$$z_{1}=xW_{1}+b_{1}$$
$$a_{1}=\tanh(z_{1})$$
$$z_{2}=a_{1}W_{2}+b_{2}$$
$$a_{2}=\hat{y}=\mathrm{softmax}(z_{2})$$
$z_{i}$是第i层的输入,$a_{i}$是第i层经过激活函数的作用后的输出。$W_{1}, b_{1}, W_{2}, b_{2}$是神经网络的参数,需要我们从数据集中来学习。可以把它们看作是神经网络不同层之间的数据传输矩阵。通过矩阵乘法的定义,我们可以决定这些矩阵的维度。假如隐层有500个神经元,那么 $W_{1}\in \mathbb{R}^{2\times 500}$, $b_{1}\in \mathbb{R}^{500}$, $W_{2}\in \mathbb{R}^{500\times 2}$, $b_{2}\in \mathbb{R}^{2}$。现在你应该能够发现为什么隐层的神经元的个数越多,我们的参数就越多了。
对参数进行学习,也就是寻找能够最小化训练集上的误差的($W_{1}, b_{1}, W_{2}, b_{2}$)。所以问题就变成了如何来定义这个误差?我们会定义一个损失函数来描述这个误差。输出使用softmax,那么对应的损失函数通常定义为cross-entropy-loss(也叫做负log likelihood)。假如我们拥有N个训练数据和C个类别,那么我们所预测的 $\hat{y}$ 与真正的标签y之间的损失函数定义为:
$$L(y,\hat{y})=-\frac{1}{N}\sum_{n\in N}\sum_{i\in C}y_{n,i}\log\hat{y}_{n,i}$$
这个公式看起来很复杂,但实际上它做的事情就是如果我们在某个样本上预测错误,就累计该错误。然后在整个样本上求和。y(实际值)和 $\hat{y}$(预测值)相差越大,损失函数的值就会越大。最小化损失函数,其实就是在数据集上最大化似然函数。
作为输入,梯度下降法需要损失函数对我们的参数的梯度(导数的向量): $\frac{\partial L}{\partial W_{1}}$, $\frac{\partial L}{\partial b_{1}}$, $\frac{\partial L}{\partial W_{2}}$, $\frac{\partial L}{\partial b_{2}}$。我们使用著名的误差逆传播算法来求这些梯度。这里我不会细说误差逆传播算法是如何工作的,你可以参考这些在网上流传很广的解释传送门、传送门。
$$\delta_{3}=\hat{y}-y$$
$$\delta_{2}=(1-\tanh^{2}z_{1})\circ\delta_{3}W_{2}^{T}$$
$$\frac{\partial L}{\partial W_{2}}=a_{1}^{T}\delta_{3}$$
$$\frac{\partial L}{\partial b_{2}}=\delta_{3}$$
$$\frac{\partial L}{\partial W_{1}}=x^{T}\delta_{2}$$
$$\frac{\partial L}{\partial b_{1}}=\delta_{2}$$
# Module-level settings read by the loss/predict/build_model helpers
num_examples = len(X) # training set size
nn_input_dim = 2 # input layer dimensionality
nn_output_dim = 2 # output layer dimensionality
# Gradient descent parameters (I picked these by hand)
epsilon = 0.01 # learning rate for gradient descent
reg_lambda = 0.01 # regularization strength
# Helper function to evaluate the total loss on the dataset
def calculate_loss(model):
    """Return the regularised cross-entropy loss of ``model`` on ``X``/``y``.

    Runs a tanh-hidden-layer / softmax-output forward pass over the
    module-level dataset ``X`` and scores it against the labels ``y``.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.

    Returns:
        The summed negative log-likelihood plus
        ``reg_lambda / 2 * (sum(W1**2) + sum(W2**2))``, all divided by
        ``num_examples``.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation to calculate our predictions
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    # Log-softmax through the log-sum-exp trick: subtracting the row maximum
    # keeps np.exp from overflowing and avoids log(0) for tiny probabilities.
    shifted = z2 - np.max(z2, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    # Calculating the loss: pick the log-probability of each true class
    data_loss = -np.sum(log_probs[np.arange(num_examples), y])
    # Add regularization term to loss (optional)
    data_loss += reg_lambda/2 * (np.sum(np.square(W1))
                                 + np.sum(np.square(W2)))
    return 1./num_examples * data_loss
# Helper function to predict an output (0 or 1)
def predict(model, x):
    """Return the most probable class index for every row of ``x``.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.
        x: Array with one sample per row.

    Returns:
        1-D integer array with the arg-max class of each row.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    # Shift by the row maximum before exponentiating: large scores otherwise
    # overflow to inf, turn the probabilities into NaN and make argmax
    # silently answer class 0.
    exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
# This function learns parameters for the
# neural network and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes
#   through the training data for gradient descent
# - print_loss: If True, print the loss every 1000 iterations
def build_model(nn_hdim, num_passes=20000, print_loss=False):
    """Train a one-hidden-layer tanh/softmax network with batch gradient descent.

    Every pass uses the whole module-level dataset ``X``/``y``; the step size
    is the global ``epsilon`` and the L2 strength the global ``reg_lambda``.

    Args:
        nn_hdim: Number of nodes in the hidden layer.
        num_passes: Number of gradient descent iterations.
        print_loss: If True, print the loss every 1000 iterations.

    Returns:
        Dict with the learned parameters ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
    """
    # Initialize the parameters to random values. We need to learn these.
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))
    # This is what we return at the end. The arrays are updated in place
    # below, so the dict is built once and always sees the current values
    # (it is also a usable model when num_passes is 0).
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    rows = np.arange(num_examples)
    # Gradient descent. For each batch...
    for i in range(num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        # Subtract the row maximum so np.exp cannot overflow
        exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation: delta3 = probs - one_hot(y), computed in place
        delta3 = probs
        delta3[rows, y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        # 1 - a1**2 is the derivative of tanh
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)
        # Add regularization terms (b1 and b2 don't
        # have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        # Optionally print the loss.
        # This is expensive because it uses the whole dataset,
        # so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" %(i, calculate_loss(model)))
    return model
# Build a model with a 3-dimensional hidden layer
# (print_loss=True makes build_model report the loss while training)
model = build_model(3, print_loss=True)
# Plot the decision boundary
plot_decision_boundary(lambda x: predict(model, x))
plt.title("Decision Boundary for hidden layer size 3")
# Train one network per hidden layer size and draw each decision boundary
# in its own cell of a 5x2 subplot grid.
plt.figure(figsize=(16, 32))
hidden_layer_dimensions = [1, 2, 3, 4, 5, 20, 50]
for i, nn_hdim in enumerate(hidden_layer_dimensions):
    plt.subplot(5, 2, i+1)
    plt.title('Hidden Layer size %d' % nn_hdim)
    model = build_model(nn_hdim)
    # The lambda is called right away inside plot_decision_boundary, so it
    # always sees the model trained in this iteration.
    plot_decision_boundary(lambda x: predict(model, x))
import random
# NOTE(review): the original signature was cut off after ``print_loss=False,``.
# The body requires a ``batch_size`` parameter; its default of 32 is a
# reconstruction and should be confirmed against the intended value.
def build_model_batch(nn_hdim, num_passes=50000, print_loss=False,
                      batch_size=32):
    """Train the tanh/softmax network with mini-batch gradient descent.

    Each iteration draws ``batch_size`` distinct samples from the
    module-level dataset ``X``/``y`` and takes one gradient step on them.

    Args:
        nn_hdim: Number of nodes in the hidden layer.
        num_passes: Number of mini-batch iterations.
        print_loss: If True, print the full-dataset loss every 1000 iterations.
        batch_size: Size of each mini-batch.

    Returns:
        Dict with the learned parameters ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.

    Raises:
        ValueError: From ``random.sample`` when ``batch_size`` is negative or
            larger than ``num_examples``.
    """
    # Index list of the training set to sample from
    indexes = list(range(num_examples))
    # Initialize the parameters to random values. We need to learn these.
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))
    # This is what we return at the end; the arrays are updated in place,
    # so the dict always refers to the current parameters.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    rows = np.arange(batch_size)
    # Gradient descent. For each batch...
    for i in range(num_passes):
        # Randomly pick batch_size samples (without replacement) to train on
        train_indexes = random.sample(indexes, batch_size)
        X_batch = X[train_indexes, :]
        y_batch = y[train_indexes]
        # Forward propagation
        z1 = X_batch.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        # Subtract the row maximum so np.exp cannot overflow
        exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation
        delta3 = probs
        delta3[rows, y_batch] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X_batch.T, delta2)
        db1 = np.sum(delta2, axis=0)
        # Add regularization terms (b1 and b2 don't
        # have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        # Optionally print the loss.
        # This is expensive because it uses
        # the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" %(i, calculate_loss(model)))
    return model
# Build a model with a 3-dimensional hidden layer with MiniBatch
model = build_model_batch(3, print_loss=False)
# Plot the decision boundary
plot_decision_boundary(lambda x: predict(model, x))
plt.title("Decision Boundary for hidden layer size 3 with MiniBatch")
# Initial learning rate (the annealing schedule starts here)
max_epsilon = 0.01
# Final learning rate (the annealing schedule never goes below this)
min_epsilon = 0.001
def build_model_annealing(nn_hdim, num_passes=80000,
                          print_loss=False, explore=100):
    """Train the tanh/softmax network with a linearly annealed learning rate.

    The learning rate starts at the global ``max_epsilon``. Every
    ``explore`` iterations (including iteration 0) it is lowered by
    ``(max_epsilon - min_epsilon) / explore`` until it reaches the global
    ``min_epsilon``, where it stays.

    Args:
        nn_hdim: Number of nodes in the hidden layer.
        num_passes: Number of gradient descent iterations.
        print_loss: If True, print loss and learning rate every 1000 iterations.
        explore: Annealing period, i.e. number of iterations between two
            learning-rate reductions.

    Returns:
        Dict with the learned parameters ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
    """
    # Initialize the parameters to random values. We need to learn these.
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))
    # This is what we return at the end; the arrays are updated in place,
    # so the dict always refers to the current parameters.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    rows = np.arange(num_examples)
    # Start with the largest learning rate
    tem_epsilon = max_epsilon
    # Gradient descent. For each batch...
    for i in range(num_passes):
        # Anneal: one linear step down, clamped at min_epsilon
        if tem_epsilon > min_epsilon and i % explore == 0:
            tem_epsilon -= (max_epsilon - min_epsilon) / explore
            tem_epsilon = max(tem_epsilon, min_epsilon)
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        # Subtract the row maximum so np.exp cannot overflow
        exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation
        delta3 = probs
        delta3[rows, y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)
        # Regularization (weights only)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update with the annealed rate
        W1 += -tem_epsilon * dW1
        b1 += -tem_epsilon * db1
        W2 += -tem_epsilon * dW2
        b2 += -tem_epsilon * db2
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f,the now learning rate is %s"
                  %(i, calculate_loss(model), tem_epsilon))
    return model
# Build a model with a 3-dimensional hidden layer with Annealing
# (print_loss=True also reports the current learning rate)
model = build_model_annealing(3, print_loss=True)
# Plot the decision boundary
plot_decision_boundary(lambda x: predict(model, x))
plt.title("Decision Boundary for hidden layer size 3 with Annealing")
# Define the sigmoid function
def sigmoid(z):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-z))``.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        Values in ``[0, 1]`` with the shape of ``z``.
    """
    z = np.asarray(z, dtype=float)
    # For very negative z, exp(-z) overflows to inf and 1 / inf is the
    # correct limit 0.0, so the overflow warning is noise and is silenced.
    with np.errstate(over='ignore'):
        s = 1.0 / (1 + np.exp(-z))
    return s
# Plot the sigmoid curve
def draw_sigmoid():
    """Plot ``sigmoid`` on [-6, 6] with both axes passing through the origin."""
    fig = plt.figure(figsize=(6,4))
    ax = fig.add_subplot(111)
    x = np.linspace(-6, 6, 1000)  # 1000 evenly spaced x values in [-6, 6]
    y = sigmoid(x)  # sigmoid applied to all 1000 values at once
    plt.yticks([0,0.5,1.0],[0,0.5,1.0])  # ticks shown on the y axis
    plt.plot(x, y, color='darkblue')  # draw the curve through the 1000 points
    ax.spines['right'].set_color('none')  # hide the right border
    ax.spines['top'].set_color('none')  # hide the top border
    ax.spines['bottom'].set_position(('data', 0))  # move the x axis to y = 0
    ax.spines['left'].set_position(('data', 0))  # move the y axis to x = 0
# Helper function to evaluate the total loss on the dataset
# sigmoid edition
def calculate_loss_sigmoid(model):
    """Return the regularised cross-entropy loss for the sigmoid network.

    Same computation as the tanh variant, but the hidden layer uses
    ``sigmoid``. Evaluated on the module-level dataset ``X``/``y``.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.

    Returns:
        The summed negative log-likelihood plus the L2 penalty, divided by
        ``num_examples``.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation to calculate our predictions
    z1 = X.dot(W1) + b1
    # The activation function is sigmoid here
    a1 = sigmoid(z1)
    z2 = a1.dot(W2) + b2
    # Log-softmax through the log-sum-exp trick (no overflow, no log(0))
    shifted = z2 - np.max(z2, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    # Calculating the loss
    data_loss = -np.sum(log_probs[np.arange(num_examples), y])
    # Add regularization term to loss (optional).
    # NOTE(review): the original expression was cut off after the W1 term;
    # the W2 term is restored to match calculate_loss - confirm.
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) +
                                 np.sum(np.square(W2)))
    return 1./num_examples * data_loss
# Helper function to predict an output (0 or 1)
# sigmoid edition
def predict_sigmoid(model, x):
    """Return the most probable class per row of ``x`` for the sigmoid network.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.
        x: Array with one sample per row.

    Returns:
        1-D integer array with the arg-max class of each row.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation
    z1 = x.dot(W1) + b1
    # The activation function is sigmoid here
    a1 = sigmoid(z1)
    z2 = a1.dot(W2) + b2
    # Shift by the row maximum so np.exp cannot overflow into NaN probabilities
    exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
# sigmoid edition of building model
def build_model_sigmoid(nn_hdim, num_passes=50000, print_loss=False):
    """Train a one-hidden-layer sigmoid/softmax network with gradient descent.

    Uses the module-level dataset ``X``/``y``, learning rate ``epsilon`` and
    L2 strength ``reg_lambda``.

    Args:
        nn_hdim: Number of nodes in the hidden layer.
        num_passes: Number of gradient descent iterations.
        print_loss: If True, print the loss every 1000 iterations.

    Returns:
        Dict with the learned parameters ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
    """
    # Initialize the parameters to random values. We need to learn these.
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))
    # This is what we return at the end; the arrays are updated in place,
    # so the dict always refers to the current parameters.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    rows = np.arange(num_examples)
    # Gradient descent. For each batch...
    for i in range(num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        # The activation function is sigmoid here
        a1 = sigmoid(z1)
        z2 = a1.dot(W2) + b2
        # Subtract the row maximum so np.exp cannot overflow
        exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation
        delta3 = probs
        delta3[rows, y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        # The derivative is now that of sigmoid, i.e. a1 * (1 - a1)
        delta2 = delta3.dot(W2.T)*a1*(1-a1)
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        # The loss is computed with the sigmoid edition.
        # NOTE(review): the original print call was cut off after ``%(i,``;
        # the second argument is restored from the comment above - confirm.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" %(i,
                  calculate_loss_sigmoid(model)))
    return model
# Build a model with a 3-dimensional hidden layer with sigmoid
model = build_model_sigmoid(3, print_loss=True)
# Plot the decision boundary (predictions come from the sigmoid forward pass)
plot_decision_boundary(lambda x: predict_sigmoid(model, x))
plt.title("Decision Boundary for hidden layer size 3 with sigmoid")
def draw_ReLU():
    """Plot the ReLU function for the integers -10..9 with axes through the origin."""
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(111)
    x = np.arange(-10, 10)
    # ReLU: keep positive values, replace everything else with 0
    y = np.where(x > 0, x, 0)
    plt.xlim(-11, 11)
    plt.ylim(-11, 11)
    # Move both axes so they cross at the origin
    ax.spines['bottom'].set_position(('data', 0))
    ax.set_xticks([-10, -5, 0, 5, 10])
    ax.spines['left'].set_position(('data', 0))
    ax.set_yticks([-10, -5, 5, 10])
    plt.plot(x, y, label="ReLU", color="blue")
# Helper function to evaluate the total loss on the dataset
# ReLU edition
def calculate_loss_ReLU(model):
    """Return the regularised cross-entropy loss for the ReLU network.

    Same computation as the tanh variant, but the hidden layer uses ReLU.
    Evaluated on the module-level dataset ``X``/``y``.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.

    Returns:
        The summed negative log-likelihood plus the L2 penalty, divided by
        ``num_examples``.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation to calculate our predictions
    z1 = X.dot(W1) + b1
    # The activation function is ReLU here
    a1 = np.where(z1>0, z1, 0)
    z2 = a1.dot(W2) + b2
    # Log-softmax through the log-sum-exp trick. ReLU activations are
    # unbounded, so a plain np.exp(z2) can overflow.
    shifted = z2 - np.max(z2, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    # Calculating the loss
    data_loss = -np.sum(log_probs[np.arange(num_examples), y])
    # Add regularization term to loss (optional).
    # NOTE(review): the original expression was cut off after the W1 term;
    # the W2 term is restored to match calculate_loss - confirm.
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) +
                                 np.sum(np.square(W2)))
    return 1./num_examples * data_loss
# Helper function to predict an output (0 or 1)
# ReLU edition
def predict_ReLU(model, x):
    """Return the most probable class per row of ``x`` for the ReLU network.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.
        x: Array with one sample per row.

    Returns:
        1-D integer array with the arg-max class of each row.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation
    z1 = x.dot(W1) + b1
    # The activation function is ReLU here
    a1 = np.where(z1>0, z1, 0)
    z2 = a1.dot(W2) + b2
    # ReLU activations are unbounded, so shift by the row maximum before
    # exponentiating; otherwise inf / inf yields NaN and argmax returns 0.
    exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
# ReLU edition of building model
def build_model_ReLU(nn_hdim, num_passes=20000, print_loss=False):
    """Train a one-hidden-layer ReLU/softmax network with gradient descent.

    Uses the module-level dataset ``X``/``y``, learning rate ``epsilon`` and
    L2 strength ``reg_lambda``.

    Args:
        nn_hdim: Number of nodes in the hidden layer.
        num_passes: Number of gradient descent iterations.
        print_loss: If True, print the loss every 1000 iterations.

    Returns:
        Dict with the learned parameters ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
    """
    # Initialize the parameters to random values. We need to learn these.
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))
    # This is what we return at the end; the arrays are updated in place,
    # so the dict always refers to the current parameters.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    rows = np.arange(num_examples)
    # Gradient descent. For each batch...
    for i in range(num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        # The activation function is ReLU here
        a1 = np.where(z1>0, z1, 0)
        z2 = a1.dot(W2) + b2
        # Subtract the row maximum so np.exp cannot overflow
        exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation
        delta3 = probs
        delta3[rows, y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T)
        # ReLU derivative: pass the gradient where z1 > 0, block it elsewhere
        delta2 = np.where(z1 > 0, delta2, 0)
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        # The loss is computed with the ReLU edition.
        # NOTE(review): the original print call was cut off after ``%(i,``;
        # the second argument is restored from the comment above - confirm.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" %(i,
                  calculate_loss_ReLU(model)))
    return model
# Build a model with a 3-dimensional hidden layer with ReLU
model = build_model_ReLU(3, print_loss=True)
# Plot the decision boundary (predictions come from the ReLU forward pass)
plot_decision_boundary(lambda x: predict_ReLU(model, x))
plt.title("Decision Boundary for hidden layer size 3 with ReLU")
# Generate a dataset and plot it
# NOTE(review): the original call was cut off after ``n_samples=300,``.
# The remaining arguments are reconstructed from the constants below
# (2 input features, 3 output classes); n_clusters_per_class=1 is needed
# because make_classification requires
# n_classes * n_clusters_per_class <= 2 ** n_informative. Confirm.
X1, y1 = sklearn.datasets.make_classification(n_samples=300,
                                              n_features=2,
                                              n_informative=2,
                                              n_redundant=0,
                                              n_classes=3,
                                              n_clusters_per_class=1)
plt.scatter(X1[:,0], X1[:,1], s=40, c=y1, cmap=plt.cm.Spectral)
num_examples1 = len(X1) # training set size
nn_input_dim1 = 2 # input layer dimensionality
nn_output_dim1 = 3 # output layer dimensionality
# Helper function to evaluate the total loss on the dataset
# Only the dataset was swapped here (X1/y1 instead of X/y)
def calculate_loss1(model):
    """Return the regularised cross-entropy loss of ``model`` on ``X1``/``y1``.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.

    Returns:
        The summed negative log-likelihood plus the L2 penalty, divided by
        ``num_examples1``.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation to calculate our predictions
    z1 = X1.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    # Log-softmax through the log-sum-exp trick (no overflow, no log(0))
    shifted = z2 - np.max(z2, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    # Calculating the loss
    data_loss = -np.sum(log_probs[np.arange(num_examples1), y1])
    # Add regularization term to loss (optional).
    # NOTE(review): the original expression was cut off after the W1 term;
    # the W2 term is restored to match calculate_loss - confirm.
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) +
                                 np.sum(np.square(W2)))
    return 1./num_examples1 * data_loss
# Helper function to predict an output (class index)
# Same forward pass as the two-class version; nothing dataset-specific here
def predict1(model, x):
    """Return the most probable class index for every row of ``x``.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``
            and ``'b2'``.
        x: Array with one sample per row.

    Returns:
        1-D integer array with the arg-max class of each row.
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    # Shift by the row maximum so np.exp cannot overflow into NaN probabilities
    exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
def build_model1(nn_hdim, num_passes=20000, print_loss=False):
    """Train the tanh/softmax network on the dataset ``X1``/``y1``.

    Layer sizes come from the globals ``nn_input_dim1`` and
    ``nn_output_dim1``; the learning rate is ``epsilon`` and the L2 strength
    ``reg_lambda``.

    Args:
        nn_hdim: Number of nodes in the hidden layer.
        num_passes: Number of gradient descent iterations.
        print_loss: If True, print the loss every 1000 iterations.

    Returns:
        Dict with the learned parameters ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
    """
    # Initialize the parameters to random values. We need to learn these.
    W1 = np.random.randn(nn_input_dim1, nn_hdim) / np.sqrt(nn_input_dim1)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim1) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim1))
    # This is what we return at the end; the arrays are updated in place,
    # so the dict always refers to the current parameters.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
    rows = np.arange(num_examples1)
    # Gradient descent. For each batch...
    for i in range(num_passes):
        # Forward propagation
        z1 = X1.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        # Subtract the row maximum so np.exp cannot overflow
        exp_scores = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation
        delta3 = probs
        delta3[rows, y1] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X1.T, delta2)
        db1 = np.sum(delta2, axis=0)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" %(i, calculate_loss1(model)))
    return model
def plot_decision_boundary1(pred_func, h=0.01):
    """Draw the decision regions of ``pred_func`` over the dataset ``X1``/``y1``.

    The classifier is evaluated on a regular grid that covers the range of
    the module-level ``X1`` (padded by 0.5 on every side). The predictions
    are drawn as filled contours with the points ``X1`` (coloured by ``y1``)
    on top.

    Args:
        pred_func: Callable that takes an ``(n, 2)`` array of points and
            returns ``n`` predicted labels.
        h: Distance between neighbouring grid points. Defaults to 0.01, the
            value that used to be hard-coded.
    """
    # Set min and max values and give it some padding
    pad = .5
    x_min, x_max = X1[:, 0].min() - pad, X1[:, 0].max() + pad
    y_min, y_max = X1[:, 1].min() - pad, X1[:, 1].max() + pad
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid (one row per grid point)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = pred_func(grid_points).reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X1[:, 0], X1[:, 1], c=y1, cmap=plt.cm.Spectral)
# Build a model with a 5-dimensional hidden layer
model = build_model1(5, print_loss=True)
# Plot the decision boundary
plot_decision_boundary1(lambda x: predict1(model, x))
# The title now states the hidden layer size that is actually trained (5)
plt.title("Decision Boundary for hidden layer size 5")
def calculate_loss2(model):
    """Return the regularised cross-entropy loss of the two-hidden-layer network.

    Runs a tanh -> tanh -> softmax forward pass over the module-level
    dataset ``X`` and scores it against the labels ``y``.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``,
            ``'b2'``, ``'W3'`` and ``'b3'``.

    Returns:
        The summed negative log-likelihood plus the L2 penalty, divided by
        ``num_examples``.
    """
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    # Forward propagation to calculate our predictions
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    a2 = np.tanh(z2)
    z3 = a2.dot(W3) + b3
    # Log-softmax through the log-sum-exp trick (no overflow, no log(0))
    shifted = z3 - np.max(z3, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    # Calculating the loss
    data_loss = -np.sum(log_probs[np.arange(num_examples), y])
    # Add regularization term to loss (optional).
    # NOTE(review): the original expression was cut off after the W1 term;
    # the W2 and W3 terms are restored because build_model2 regularises all
    # three weight matrices - confirm.
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) +
                                 np.sum(np.square(W2)) +
                                 np.sum(np.square(W3)))
    return 1./num_examples * data_loss
def predict2(model, x):
    """Return the most probable class per row of ``x`` for the two-hidden-layer network.

    Args:
        model: Dict holding the parameters ``'W1'``, ``'b1'``, ``'W2'``,
            ``'b2'``, ``'W3'`` and ``'b3'``.
        x: Array with one sample per row.

    Returns:
        1-D integer array with the arg-max class of each row.
    """
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    # Forward propagation
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    a2 = np.tanh(z2)
    z3 = a2.dot(W3) + b3
    # Shift by the row maximum so np.exp cannot overflow into NaN probabilities
    exp_scores = np.exp(z3 - np.max(z3, axis=1, keepdims=True))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
def build_model2(nn_hdim, nn_hdim1, num_passes=20000, print_loss=False):
    """Train a two-hidden-layer tanh/softmax network with gradient descent.

    Uses the module-level dataset ``X``/``y``, learning rate ``epsilon`` and
    L2 strength ``reg_lambda``.

    Args:
        nn_hdim: Number of nodes in the first hidden layer.
        nn_hdim1: Number of nodes in the second hidden layer.
        num_passes: Number of gradient descent iterations.
        print_loss: If True, print the loss every 1000 iterations.

    Returns:
        Dict with the learned parameters ``'W1'``, ``'b1'``, ``'W2'``,
        ``'b2'``, ``'W3'`` and ``'b3'``.
    """
    # Initialize the parameters to random values. We need to learn these.
    # W1 is scaled by nn_input_dim, the input size of *this* dataset. The
    # original divided by sqrt(nn_input_dim1), the other dataset's constant.
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_hdim1) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_hdim1))
    W3 = np.random.randn(nn_hdim1, nn_output_dim) / np.sqrt(nn_hdim1)
    b3 = np.zeros((1, nn_output_dim))
    # This is what we return at the end; the arrays are updated in place,
    # so the dict always refers to the current parameters.
    model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2, 'W3': W3, 'b3': b3}
    rows = np.arange(num_examples)
    # Gradient descent. For each batch...
    for i in range(num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        a2 = np.tanh(z2)
        z3 = a2.dot(W3) + b3
        # Subtract the row maximum so np.exp cannot overflow
        exp_scores = np.exp(z3 - np.max(z3, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation, from the output layer back to the first layer
        delta4 = probs
        delta4[rows, y] -= 1
        dW3 = (a2.T).dot(delta4)
        db3 = np.sum(delta4, axis=0, keepdims=True)
        delta3 = delta4.dot(W3.T) * (1 - np.power(a2, 2))
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0, keepdims=True)
        # Regularization (weights only)
        dW3 += reg_lambda * W3
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        W3 += -epsilon * dW3
        b3 += -epsilon * db3
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" %(i, calculate_loss2(model)))
    return model
# Build a model with two hidden layers of 3 and 4 nodes
model = build_model2(3,4, print_loss=True)
# Plot the decision boundary
plot_decision_boundary(lambda x: predict2(model, x))
plt.title("Decision Boundary for hidden layer size 3-4")