This post is the fourth programming assignment of Andrew Ng's Machine Learning course: Neural Network Back Propagation, implemented in Python.
ex4.py is the entry point of the program.
The assignment files and training data can be downloaded here: https://github.com/toanoyx/MachineLearning-AndrewNg-coursera-python/tree/master/ex4%20NN%20back%20propagation
The source code of each file follows:
ex4.py
import scipy.optimize as opt
from sklearn.metrics import classification_report
from loadData import *
from displayData import *
from feedForward import *
from nnCostFunction import *
from computeNumericalGradient import *
from checkNNGradients import *
""" 第1部分 可视化数据集 """
X, _ = loadData('ex4data1.mat')
displayData(X)
plt.show()
""" 第2部分 模型表示 """
X_raw, y_raw = loadData('ex4data1.mat', transpose=False)
X = np.insert(X_raw, 0, np.ones(X_raw.shape[0]), axis=1)
def expand_y(y):
res = []
for i in y:
y_array = np.zeros(10)
y_array[i - 1] = 1
res.append(y_array)
return np.array(res)
y = expand_y(y_raw)
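# expand_y one-hot encodes the labels: label k becomes a 10-vector with a 1 at index k-1,
# e.g. 5 -> [0,0,0,0,1,0,0,0,0,0] and 10 (the digit '0') -> [0,0,0,0,0,0,0,0,0,1]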
def load_weight(path):
data = sio.loadmat(path)
return data['Theta1'], data['Theta2']
t1, t2 = load_weight('ex4weights.mat')
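# the pre-trained weights have shapes Theta1: (25, 401) and Theta2: (10, 26),
# i.e. 400 input pixels + bias, 25 hidden units + bias, 10 output classes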
""" 第3部分 前向传播和代价函数 """
theta = np.concatenate((np.ravel(t1), np.ravel(t2)))
_, _, _, _, h = feedForward(theta, X)
print("cost function: " + str(nnCostFunction(theta, X, y)) + "(this should be 0.287629)")
""" 第4部分 正则化代价函数 """
t1, t2 = deserialize(theta)
m = X.shape[0]
l = 1
reg_t1 = (l / (2 * m)) * np.power(t1[:, 1:], 2).sum()
reg_t2 = (l / (2 * m)) * np.power(t2[:, 1:], 2).sum()
regularizedCost = nnCostFunction(theta, X, y) + reg_t1 + reg_t2
print("regularized cost function: " + str(regularizedCost) + "(this should be 0.383770)")
""" 第5部分 反向传播 """
print("sigmoid gradient at 0: " + str(sigmoid_gradient(0)) + " (this should be 0.25)")
d1, d2 = deserialize(computeNumericalGradient(theta, X, y))
checkNNGradients(theta, X, y, epsilon=0.0001)
checkNNGradients(theta, X, y, epsilon=0.0001, regularized=True)
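# checkNNGradients compares the backpropagation gradient against a central-difference
# numerical approximation; a tiny relative difference means backprop is implemented correctly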
""" 第6部分 """
def random_init(size):
return np.random.uniform(-0.12, 0.12, size)
def nn_training(X, y):
init_theta = random_init(10285) # 25*401 + 10*26
res = opt.minimize(fun=regularized_cost,
x0=init_theta,
args=(X, y, 1),
method='TNC',
jac=regularized_gradient,
options={'maxiter': 400})
return res
res = nn_training(X, y)
print(str(res))
_, y_answer = loadData('ex4data1.mat')
print(str(y_answer[:20]))
final_theta = res.x
def show_accuracy(theta, X, y):
_, _, _, _, h = feedForward(theta, X)
y_pred = np.argmax(h, axis=1) + 1
print(classification_report(y, y_pred))
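# report precision/recall/F1 on the training set with the learned parameters
show_accuracy(final_theta, X, y_answer)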
def plot_hidden_layer(theta):
final_theta1, _ = deserialize(theta)
hidden_layer = final_theta1[:, 1:] # get rid of the bias term
fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))
for r in range(5):
for c in range(5):
ax_array[r, c].matshow(hidden_layer[5 * r + c].reshape((20, 20)),
cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
plot_hidden_layer(final_theta)
plt.show()
checkNNGradients.py
from nnCostFunction import *
from computeNumericalGradient import *
def checkNNGradients(theta, X, y, epsilon, regularized=False):
def a_numeric_grad(plus, minus, regularized=False):
if regularized:
return (regularized_cost(plus, X, y) - regularized_cost(minus, X, y)) / (epsilon * 2)
else:
return (nnCostFunction(plus, X, y) - nnCostFunction(minus, X, y)) / (epsilon * 2)
theta_matrix = expand_array(theta) # expand to (10285, 10285)
epsilon_matrix = np.identity(len(theta)) * epsilon
plus_matrix = theta_matrix + epsilon_matrix
minus_matrix = theta_matrix - epsilon_matrix
numeric_grad = np.array([a_numeric_grad(plus_matrix[i], minus_matrix[i], regularized)
for i in range(len(theta))])
analytic_grad = regularized_gradient(theta, X, y) if regularized else computeNumericalGradient(theta, X, y)
diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad)
print(
        'If your backpropagation implementation is correct,\nthe relative difference will be less than 1e-9 (assuming epsilon=0.0001).\nRelative Difference: {}\n'.format(
diff))
def regularized_cost(theta, X, y, l=1):
t1, t2 = deserialize(theta)
m = X.shape[0]
reg_t1 = (l / (2 * m)) * np.power(t1[:, 1:], 2).sum() # this is how you ignore first col
reg_t2 = (l / (2 * m)) * np.power(t2[:, 1:], 2).sum()
return nnCostFunction(theta, X, y) + reg_t1 + reg_t2
def expand_array(arr):
return np.array(np.matrix(np.ones(arr.shape[0])).T @ np.matrix(arr))
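# expand_array tiles theta into an (n, n) matrix whose rows are copies of theta,
# so that a single parameter per row can be perturbed when forming the numerical gradient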
def regularized_gradient(theta, X, y, l=1):
m = X.shape[0]
delta1, delta2 = deserialize(computeNumericalGradient(theta, X, y))
t1, t2 = deserialize(theta)
t1[:, 0] = 0
reg_term_d1 = (l / m) * t1
delta1 = delta1 + reg_term_d1
t2[:, 0] = 0
reg_term_d2 = (l / m) * t2
delta2 = delta2 + reg_term_d2
return np.concatenate((np.ravel(delta1), np.ravel(delta2)))
computeNumericalGradient.py
from feedForward import *
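# Note: despite its name, this function implements backpropagation and returns the
# analytic gradient; the finite-difference check itself lives in checkNNGradients.py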
def computeNumericalGradient(theta, X, y):
t1, t2 = deserialize(theta) # t1: (25,401) t2: (10,26)
m = X.shape[0]
delta1 = np.zeros(t1.shape) # (25, 401)
delta2 = np.zeros(t2.shape) # (10, 26)
a1, z2, a2, z3, h = feedForward(theta, X)
for i in range(m):
a1i = a1[i, :] # (1, 401)
z2i = z2[i, :] # (1, 25)
a2i = a2[i, :] # (1, 26)
hi = h[i, :] # (1, 10)
yi = y[i, :] # (1, 10)
d3i = hi - yi # (1, 10)
z2i = np.insert(z2i, 0, np.ones(1)) # make it (1, 26) to compute d2i
d2i = np.multiply(t2.T @ d3i, sigmoid_gradient(z2i)) # (1, 26)
# careful with np vector transpose
delta2 += np.matrix(d3i).T @ np.matrix(a2i) # (1, 10).T @ (1, 26) -> (10, 26)
delta1 += np.matrix(d2i[1:]).T @ np.matrix(a1i) # (1, 25).T @ (1, 401) -> (25, 401)
delta1 = delta1 / m
delta2 = delta2 / m
return np.concatenate((np.ravel(delta1), np.ravel(delta2)))
displayData.py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
def displayData(X):
size = int(np.sqrt(X.shape[1]))
sample_idx = np.random.choice(np.arange(X.shape[0]), 100) # 100*400
sample_images = X[sample_idx, :]
fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))
for r in range(10):
for c in range(10):
ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)),
cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
feedForward.py
import numpy as np
from sigmoid import *
def feedForward(theta, X):
t1, t2 = deserialize(theta)
m = X.shape[0]
a1 = X
z2 = a1 @ t1.T
a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1)
z3 = a2 @ t2.T
h = sigmoid(z3)
return a1, z2, a2, z3, h
def deserialize(seq):
return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)
loadData.py
import scipy.io as sio
import numpy as np
def loadData(path, transpose=True):
data = sio.loadmat(path)
y = data.get('y')
y = y.reshape(y.shape[0])
X = data.get('X')
if transpose:
        X = np.array([im.reshape((20, 20)).T for im in X])  # the .mat images are stored column-major (MATLAB order), so transpose each 20x20 patch
X = np.array([im.reshape(400) for im in X])
return X, y
nnCostFunction.py
from feedForward import *
def nnCostFunction(theta, X, y):
m = X.shape[0]
_, _, _, _, h = feedForward(theta, X)
pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
return pair_computation.sum() / m
sigmoid.py
import numpy as np
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def sigmoid_gradient(z):
return np.multiply(sigmoid(z), 1 - sigmoid(z))
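As a quick sanity check (a minimal sketch, assuming the files above are on the Python path; the random inputs below are made up for illustration), the helpers can be exercised on their own:
import numpy as np
from sigmoid import sigmoid_gradient
from feedForward import feedForward
# random parameter vector with the same layout as in ex4.py: 25*401 + 10*26 entries
theta = np.random.uniform(-0.12, 0.12, 25 * 401 + 10 * 26)
# 5 fake examples of 400 pixels each, with the bias column prepended
X = np.insert(np.random.rand(5, 400), 0, np.ones(5), axis=1)
print(sigmoid_gradient(0))  # should print 0.25
_, _, _, _, h = feedForward(theta, X)
print(h.shape)  # (5, 10): one probability per class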