import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.io import loadmat
import scipy.optimize as opt
# Read the training set and the pre-trained network weights from disk.
data = loadmat('ex4data1.mat')
weight = loadmat('ex4weights')
X, Y = data['X'], data['y']
theta1, theta2 = weight['Theta1'], weight['Theta2']
# Unroll both weight matrices into one column vector: Theta1 first, Theta2 after.
theta = np.vstack([w.reshape(-1, 1) for w in (theta1, theta2)])
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
def hypothesis(theta, X):
    """Return the linear pre-activation ``X . theta^T``.

    ``theta`` holds one row of weights per output unit, so it is transposed
    before being multiplied by the input rows in ``X``.
    """
    return X.dot(theta.T)
def g_function(a):
    """Apply the network's activation function (the sigmoid) to ``a``."""
    activation = sigmoid(a)
    return activation
def forward(theta, X):
    """Run one forward pass through the two-layer network.

    ``theta`` is the unrolled parameter vector: the first 25 * 401 entries
    are reshaped to the (25, 401) hidden-layer weights and the following
    10 * 26 entries to the (10, 26) output-layer weights.

    Returns the tuple ``(a1, z2, a2, z3, a3)`` where ``a1`` and ``a2`` carry a
    leading bias column of ones and ``a3`` is the output-layer activation.
    """
    hidden_units, input_units, output_units = 25, 400, 10
    split = hidden_units * (input_units + 1)          # 25 * 401
    end = split + output_units * (hidden_units + 1)   # 10285
    theta1 = theta[:split].reshape(hidden_units, input_units + 1)
    theta2 = theta[split:end].reshape(output_units, hidden_units + 1)

    # One bias column, reused for the input layer and the hidden layer.
    bias = np.ones((X.shape[0], 1))

    a1 = np.hstack((bias, X))
    z2 = hypothesis(theta1, a1)
    a2 = np.hstack((bias, g_function(z2)))
    z3 = hypothesis(theta2, a2)
    a3 = g_function(z3)
    return a1, z2, a2, z3, a3
def cost(theta, X, Y, k):
    """Return the unregularized cross-entropy cost of the network.

    Parameters
    ----------
    theta : unrolled parameter vector, passed straight to ``forward``.
    X     : input examples, one per row.
    Y     : labels in ``1..k``; either a flat vector or a single column.
    k     : number of classes (columns of the one-hot label matrix).

    Returns
    -------
    The cross-entropy summed over all examples and classes, divided by the
    number of examples.
    """
    m = X.shape[0]
    a3 = forward(theta, X)[-1]

    # One-hot encode the labels in a single vectorized comparison:
    # column c is 1.0 exactly where the label equals c + 1.
    labels = np.asarray(Y).reshape(-1, 1)
    classY = (labels == np.arange(1, k + 1)).astype(float)

    loss = -classY * np.log(a3) - (1 - classY) * np.log(1 - a3)
    return np.sum(loss) / m
# print(cost(theta,X,Y,k=10))
# 0.2876291651613189
def regularized_cost(theta, X, Y, k=10 , lam = 1):
    """Return the cross-entropy cost plus an L2 penalty on the weights.

    The penalty is ``lam / (2 * m)`` times the sum of squared weights of both
    layers, with the first (bias) column of each weight matrix excluded.
    """
    hidden_block = 25 * 401
    theta1 = theta[:hidden_block].reshape(25, 401)
    theta2 = theta[hidden_block:10285].reshape(10, 26)
    m = X.shape[0]

    def squared_weights(weights):
        # Zero the bias column on a copy so the caller's array is untouched.
        masked = weights.copy()
        masked[:, 0] = 0
        return np.sum(masked ** 2)

    base_cost = cost(theta, X, Y, k)
    penalty = squared_weights(theta1) + squared_weights(theta2)
    return base_cost + lam * penalty / (2 * m)
# print(regularized_cost(theta, X, Y, k=10 , lam = 1))
# 0.38376985909092365
def sigmoid_gradient(z):
    """Return the derivative of the sigmoid at ``z``: ``s * (1 - s)`` with ``s = sigmoid(z)``."""
    activation = sigmoid(z)
    return np.multiply(activation, 1 - activation)
# print(sigmoid_gradient(0))
# 0.25
def backpropagation(theta, X, Y, k=10, lam = 1):
    """Return the unregularized gradients ``(D1, D2)`` of the cost.

    Parameters
    ----------
    theta : unrolled parameter vector (same layout as in ``forward``).
    X     : input examples, one per row.
    Y     : labels in ``1..k``; either a flat vector or a single column.
    k     : number of classes used for the one-hot label matrix.
    lam   : accepted for signature compatibility only; this function does
            not apply any regularization.

    Returns
    -------
    ``(D1, D2)``: gradients with the same shapes as the hidden-layer and
    output-layer weight matrices, each averaged over the examples.
    """
    theta2 = theta[25 * 401:10285].reshape(10, 26)
    m = X.shape[0]
    a1, z2, a2, _z3, a3 = forward(theta, X)

    # One-hot encode the labels: column c is 1.0 where the label equals c + 1.
    labels = np.asarray(Y).reshape(-1, 1)
    classY = (labels == np.arange(1, k + 1)).astype(float)

    # Output-layer error, then propagate it back through the output weights
    # (bias column skipped because the bias unit has no incoming connection).
    delta3 = a3 - classY
    delta2 = np.multiply(delta3.dot(theta2[:, 1:]), sigmoid_gradient(z2))

    D2 = np.dot(delta3.T, a2) / m
    D1 = np.dot(delta2.T, a1) / m
    return D1, D2
# print(backpropagation(theta, X, Y, k = 10 , lam = 1) .shape)
# ((25, 401), (10, 26))
def gradient_checking(theta,X,Y, epsilon):
    """Compare the backpropagation gradient with a central-difference estimate.

    Each parameter is perturbed on its own by ``+epsilon`` and ``-epsilon``
    and the unregularized ``cost`` is re-evaluated, giving one numerical
    partial derivative per parameter. The relative difference
    ``||num - grad|| / ||num + grad||`` is printed and also returned.

    Note: a second function with this same name is defined further down the
    file and replaces this one once the module has finished loading.

    This performs two cost evaluations per parameter, so it is slow for the
    full parameter vector.
    """
    D1, D2 = backpropagation(theta, X, Y, k=10, lam=1)
    # Unroll in the same order as the parameter vector: D1 first, then D2.
    grad = np.concatenate([np.ravel(D1), np.ravel(D2)])

    theta = np.asarray(theta, dtype=float)
    n_params = theta.size
    num_grad = np.zeros(n_params)
    e = np.zeros(n_params)
    for i in range(n_params):
        # Perturb only parameter i; the step keeps theta's original shape so
        # the addition cannot broadcast onto other parameters.
        e[i] = epsilon
        step = e.reshape(theta.shape)
        j1 = cost(theta + step, X, Y, k=10)
        j2 = cost(theta - step, X, Y, k=10)
        num_grad[i] = (j1 - j2) / (2 * epsilon)
        e[i] = 0

    diff = np.linalg.norm(num_grad - grad) / np.linalg.norm(num_grad + grad)
    print('Relative Difference: {}\n'.format(diff))
    return diff
# Took 40 min to run
# gradient_checking(theta,X,Y,epsilon=1e-4)
# Relative Difference: 0.9998057514099036
def regularized_gradient(theta, X, Y, k=10 , lam = 1):
    """Return the gradient of ``regularized_cost`` as an unrolled column vector.

    The backpropagation gradients are augmented with ``(lam / m) * weight``
    for every weight except those in the first (bias) column of each layer.

    Parameters
    ----------
    theta : unrolled parameter vector (same layout as in ``forward``).
    X, Y  : training inputs and labels, forwarded to ``backpropagation``.
    k     : number of classes, forwarded to ``backpropagation``.
    lam   : regularization strength.

    Returns
    -------
    Array of shape (n, 1): the hidden-layer gradient followed by the
    output-layer gradient.
    """
    # Forward the caller's k and lam instead of hard-coded values.
    D1, D2 = backpropagation(theta, X, Y, k=k, lam=lam)
    theta1 = theta[0:25 * 401].reshape(25, 401)
    theta2 = theta[25 * 401:10285].reshape(10, 26)
    m = X.shape[0]

    # Work on copies with the bias column zeroed so bias weights are not
    # penalized and the caller's parameter vector is left untouched.
    _theta1 = theta1.copy()
    _theta1[:, 0] = 0
    _theta2 = theta2.copy()
    _theta2[:, 0] = 0

    D1 = D1 + (lam / m) * _theta1
    D2 = D2 + (lam / m) * _theta2
    return np.vstack((D1.reshape(-1, 1), D2.reshape(-1, 1)))
# Gradient checking with the regularization term
def gradient_checking(theta,X,Y, epsilon):
    """Compare the regularized analytic gradient with a numerical estimate.

    Each parameter is perturbed on its own by ``+epsilon`` and ``-epsilon``
    and ``regularized_cost`` (with ``k=10``, ``lam=1``) is re-evaluated,
    giving one central-difference partial derivative per parameter. The
    relative difference ``||num - grad|| / ||num + grad||`` is printed and
    also returned.

    This performs two cost evaluations per parameter, so it is slow for the
    full parameter vector.
    """
    # Flatten so the comparison below is element-wise; subtracting a column
    # vector from a flat vector would broadcast to an (n, n) matrix.
    grad = np.ravel(regularized_gradient(theta, X, Y, k=10, lam=1))

    theta = np.asarray(theta, dtype=float)
    n_params = theta.size
    num_grad = np.zeros(n_params)
    e = np.zeros(n_params)
    for i in range(n_params):
        # Perturb only parameter i; the step keeps theta's original shape so
        # the addition cannot broadcast onto other parameters.
        e[i] = epsilon
        step = e.reshape(theta.shape)
        j1 = regularized_cost(theta + step, X, Y, k=10, lam=1)
        j2 = regularized_cost(theta - step, X, Y, k=10, lam=1)
        num_grad[i] = (j1 - j2) / (2 * epsilon)
        e[i] = 0

    diff = np.linalg.norm(num_grad - grad) / np.linalg.norm(num_grad + grad)
    print('Relative Difference: {}\n'.format(diff))
    return diff
# gradient_checking(theta,X,Y, epsilon = 1e-4)
# Relative Difference: 0.9998060640429633
def random_initialization(L_out, L_in, epsilon_init=0.12):
    """Return random initial weights for a layer, including its bias column.

    Parameters
    ----------
    L_out        : number of units in the layer being fed (rows).
    L_in         : number of incoming units; one extra column is added for
                   the bias term.
    epsilon_init : half-width of the sampling interval.

    Returns
    -------
    Array of shape ``(L_out, L_in + 1)`` with entries drawn uniformly from
    ``[-epsilon_init, epsilon_init)``.
    """
    # rand() samples [0, 1); scale and shift it onto the symmetric interval.
    return np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init
# Draw small random starting weights for both layers and unroll them into
# one column vector (hidden-layer weights first, output-layer weights after).
initial_theta1 = random_initialization(25,400)
initial_theta2 = random_initialization(10,25)
initial_theta = np.vstack((initial_theta1.reshape(-1,1),initial_theta2.reshape(-1,1)))
# SciPy documents both x0 and the value returned by jac as 1-D arrays of
# shape (n,), so the column vectors used in this file are flattened at the
# call boundary.
res = opt.minimize(fun=regularized_cost,
                   x0=initial_theta.ravel(),
                   args=(X, Y, 10, 1),
                   method='tnc',
                   jac=lambda t, *extra: np.ravel(regularized_gradient(t, *extra)))
# print(res)
# fun: 0.2998890271866287
# jac: array([[1.43661603e-07],
# [-1.89136207e-16],
# [-9.27484145e-14],
# ...,
# [5.57536194e-08],
# [1.27154957e-07],
# [1.72745129e-07]])
# message: 'Converged (|f_n-f_(n-1)| ~= 0)'
# nfev: 4136
# nit: 171
# status: 1
# success: True
# x: array([-1.15034591e+00, -9.45681033e-13, -4.63742073e-10, ...,
# 1.61928813e+00, 2.07152753e+00, 4.04669213e-01])
# Optimized, unrolled parameter vector.
final_theta = res.x
def predict(X):
    """Return the 1-based index of the largest entry in each row of ``X``.

    ``X`` is treated as a matrix of per-class scores (one row per example);
    the result is a flat array holding, for every row, the column position
    of its maximum plus one.
    """
    return np.argmax(X, axis=1) + 1
# Training-set accuracy: fraction of examples whose predicted label equals Y.
# NOTE(review): this evaluates the weights loaded from file (`theta`), not the
# optimizer result (`final_theta`) -- confirm that this is intended.
a1, z2, a2, z3, a3 = forward(theta, X)
m = X.shape[0]
y_pre = predict(a3)
# 1.0 where the prediction matches the label, 0.0 otherwise.
correct = (y_pre == np.ravel(Y)).astype(float)
accuracy = correct.sum() / len(a3)
# print(accuracy)
# 0.9752
def plot_hidden_layer(theta):
    """Draw the 25 hidden-unit weight vectors as a 5x5 grid of 20x20 images.

    Parameters
    ----------
    theta : unrolled parameter vector; its first 25 * 401 entries are
            reshaped to the (25, 401) hidden-layer weight matrix.

    The bias column is dropped before plotting. The figure is created but
    not shown; the caller is expected to call ``plt.show()``.
    """
    # Use the vector that was passed in rather than a module-level global.
    theta1 = np.asarray(theta).reshape(-1)[:25 * 401].reshape(25, 401)
    hidden_layer = theta1[:, 1:]

    fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))
    for r in range(5):
        for c in range(5):
            ax = ax_array[r, c]
            ax.matshow(hidden_layer[5 * r + c].reshape((20, 20)),
                       cmap=matplotlib.cm.binary)
            # Hide the tick marks; the pixel coordinates carry no meaning here.
            ax.set_xticks([])
            ax.set_yticks([])
# Visualize the hidden layer using the parameters returned by the optimizer,
# then display the figure.
plot_hidden_layer(final_theta)
plt.show()