In [1]:
import numpy as np
In [2]:
def sigmoid(z): return 1 / (1 + np.exp(-z)) # 对sigmoid函数求导,反向传播中用 def sigmoidDerivative(a): return np.multiply(a, (1 - a))
In [3]:
def initThetas(hiddenNum, unitNum, inputSize, classNum, epsilon): ''' 对边权矩阵初始化 Args: hiddenNum 隐层数目 unitNum 每个隐层的神经元数目 inputSize 输入层规模 classNum 分类数目 epsilon epsilon Returns: Thetas 权值矩阵序列 ''' hiddens = [unitNum for i in range(hiddenNum)] # 隐层所有神经元 units = [inputSize] + hiddens + [classNum] Thetas = [] for idx, unit in enumerate(units): if idx == len(units) - 1: break nextUnit = units[idx + 1] Theta = np.random.rand(nextUnit, unit + 1) * 2 * epsilon - epsilon Thetas.append(Theta) return Thetas
In [4]:
def computeCost(Thetas, y, theLambda, X=None, a=None): """计算代价 Args: Thetas 权值矩阵序列 X 样本 y 标签集 a 各层激活值 Returns: J 预测代价 """ m = y.shape[0] if a is None: a = fp(Thetas, X) # 注意,计算代价的时候,我们只需要关注整个网络的预测和标注之间的差异即可,因此只需要看a[-1] # 另外一个注意点是:标注y已经被向量化了,有且仅有一位是1,其他都是0 error = -np.sum(np.multiply(y.T,np.log(a[-1]))+np.multiply((1-y).T, np.log(1-a[-1]))) # 正则化项,但不包括偏置项。Θ的下标i是下一层的神经元编号,下标j是当前层的节点编号。所以偏置项在第二维的第0个位置 reg = -np.sum([np.sum(Theta[:, 1:]) for Theta in Thetas]) return (1.0 / m) * error + (1.0 / (2 * m)) * theLambda * reg
In [5]:
def adjustLabels(y): """标签向量化 Args: y 标签集 Returns: yAdjusted 向量化后的标签 """ if y.shape[1] == 1: classes = set(np.ravel(y)) classNum = len(classes) minClass = min(classes) if classNum > 2: # 多分类,使用向量标注,对应类别位置设置为1 yAdjusted = np.zeros((y.shape[0], classNum), np.float64) for row, label in enumerate(y): yAdjusted[row, label - minClass] = 1 else: # 二分类 yAdjusted = np.zeros((y.shape[0], 1), np.float64) for row, label in enumerate(y): if label != minClass: yAdjusted[row, 0] = 1.0 return yAdjusted return y
In [6]:
def unroll(matrixes): """参数展开 Args: matrixes 矩阵 Return: vec 向量 """ vec = [] for matrix in matrixes: vector = matrix.reshape(1, -1)[0] vec = np.concatenate((vec, vector)) return vec def roll(vector, shapes): """参数恢复 Args: vector 向量 shapes shape list Returns: matrixes 恢复的矩阵序列 """ matrixes = [] begin = 0 for shape in shapes: end = begin + shape[0] * shape[1] matrix = vector[begin:end].reshape(shape) begin = end matrixes.append(matrix) return matrixes
In [7]:
def fp(Thetas, X): """前向反馈过程 Args: Thetas 权值矩阵 X 输入样本 Returns: a 各层激活向量 """ layers = range(len(Thetas) + 1) layerNum = len(layers) # 激活向量序列 a = list(range(layerNum)) # 要的仅仅是定长list结构,内部元素在下面for循环被重新赋值 # 前向传播计算各层输出 for l in layers: if l == 0: a[l] = X.T else: z = Thetas[l - 1] * a[l - 1] a[l] = sigmoid(z) # 除输出层外,需要添加偏置 if l != layerNum - 1: a[l] = np.concatenate((np.ones((1, a[l].shape[1])), a[l])) return a
In [8]:
def bp(Thetas, a, y, theLambda): """反向传播过程 Args: a 激活值 y 标签 Returns: D 权值梯度 """ m = y.shape[0] layers = range(len(Thetas) + 1) layerNum = len(layers) d = list(range(len(layers))) delta = [np.zeros(Theta.shape) for Theta in Thetas] for l in layers[::-1]: # 反向遍历层 if l == 0: # 输入层不计算误差 break if l == layerNum - 1: # 输出层误差 d[l] = a[l] - y.T else: # 忽略偏置 d[l] = np.multiply((Thetas[l][:,1:].T * d[l + 1]), sigmoidDerivative(a[l][1:, :])) for l in layers[0:layerNum - 1]: delta[l] = d[l + 1] * (a[l].T) D = [np.zeros(Theta.shape) for Theta in Thetas] for l in range(len(Thetas)): Theta = Thetas[l] # 偏置更新增量 D[l][:, 0] = (1.0 / m) * (delta[l][0:, 0].reshape(1, -1)) # 权值更新增量 D[l][:, 1:] = (1.0 / m) * (delta[l][0:, 1:] + theLambda * Theta[:, 1:]) return D
In [9]:
def updateThetas(m, Thetas, D, alpha, theLambda): """更新权值 Args: m 样本数 Thetas 各层权值矩阵 D 梯度 alpha 学习率 theLambda 正规化参数 Returns: Thetas 更新后的权值矩阵 """ for l in range(len(Thetas)): Thetas[l] = Thetas[l] - alpha * D[l] return Thetas
In [10]:
def gradientDescent(Thetas, X, y, alpha, theLambda): """梯度下降 Args: X 样本 y 标签 alpha 学习率 theLambda 正规化参数 Returns: J 预测代价 Thetas 更新后的各层权值矩阵 """ # 样本数,特征数 m, n = X.shape # 前向传播计算各个神经元的激活值 a = fp(Thetas, X) # 反向传播计算梯度增量 D = bp(Thetas, a, y, theLambda) # 计算预测代价 J = computeCost(Thetas,y,theLambda,a=a) # 更新权值 Thetas = updateThetas(m, Thetas, D, alpha, theLambda) if np.isnan(J): J = np.inf return J, Thetas
In [11]:
def gradientCheck(Thetas,X,y,theLambda): """梯度校验 Args: Thetas 权值矩阵 X 样本 y 标签 theLambda 正则化参数 Returns: checked 是否检测通过 """ m, n = X.shape # 前向传播计算各个神经元的激活值 a = fp(Thetas, X) # 反向传播计算梯度增量 D = bp(Thetas, a, y, theLambda) # 计算预测代价 J = computeCost(Thetas, y, theLambda, a=a) DVec = unroll(D) # 数值化计算梯度 epsilon = 1e-4 # 注意,这个epsilon的意义 gradApprox = np.zeros(DVec.shape) ThetaVec = unroll(Thetas) shapes = [Theta.shape for Theta in Thetas] for i,item in enumerate(ThetaVec): ThetaVec[i] = item - epsilon JMinus = computeCost(roll(ThetaVec,shapes),y,theLambda,X=X) ThetaVec[i] = item + epsilon JPlus = computeCost(roll(ThetaVec,shapes),y,theLambda,X=X) gradApprox[i] = (JPlus-JMinus) / (2*epsilon) # 平均差距 diff = np.average(gradApprox - DVec) print('gradient checking diff:', diff) # 3.21615931121e-06 if diff < 1e-5: return True else: return False
In [12]:
def train(X, y,checkFlag=False, Thetas=None, hiddenNum=0, unitNum=5, epsilon=1, alpha=1, theLambda=0, precision=0.0001, maxIters=50): """网络训练 Args: X 训练样本 y 标签集 checkFlag 是否进行梯度校验,默认为False,即不进行校验。梯度校验费时 Thetas 初始化的Thetas,如果为None,由系统随机初始化Thetas hiddenNum 隐藏层数目 unitNum 隐藏层的单元数 epsilon 初始化权值的范围[-epsilon, epsilon] alpha 学习率 theLambda 正规化参数 precision 误差精度 maxIters 最大迭代次数 """ # 样本数,特征数 m, n = X.shape # 标注标签向量化,比如多分类标签要转成向量 y = adjustLabels(y) classNum = y.shape[1] # 初始化Theta if Thetas is None: Thetas = initThetas( inputSize=n, hiddenNum=hiddenNum, unitNum=unitNum, classNum=classNum, epsilon=epsilon ) # 梯度校验 print('Doing Gradient Checking....') if checkFlag: checked = gradientCheck(Thetas, X, y, theLambda) else: checked=True print('Gradient Checked.') if checked: last_error = np.inf for i in range(maxIters): error, Thetas = gradientDescent( Thetas, X, y, alpha=alpha, theLambda=theLambda) if abs(error-last_error) < precision: last_error = error break if error == np.inf: last_error = error break last_error = error return { 'error': error, 'Thetas': Thetas, 'iters': i } else: print('Error: Gradient Cheching Failed!!!') return { 'error': None, 'Thetas': None, 'iters': 0 }
In [13]:
def predict(X, Thetas): """预测函数 Args: X: 样本 Thetas: 训练后得到的参数 Return: a """ a = fp(Thetas,X) return a[-1]
In [14]:
from scipy.io import loadmat from matplotlib import pyplot %matplotlib inline
In [15]:
data = loadmat('data/handwritten_digits.mat')
In [16]:
data['X'][0].shape
Out[16]:
(400,)
In [17]:
pyplot.imshow(data['X'][2200].reshape(20,20).T) # 因为mat数据存储的问题,需要转置一下 print(data['y'][2200])
[4]
In [18]:
Thetas = loadmat('data/init_weights.mat') Thetas = [Thetas['Theta1'], Thetas['Theta2']]
In [19]:
X = np.mat(data['X']) y = np.mat(data['y'])
In [20]:
res = train(X,y,checkFlag=True, hiddenNum=1,unitNum=25,Thetas=Thetas,maxIters=500)
Doing Gradient Checking.... gradient checking diff: 3.2161593109687145e-06 Gradient Checked.
In [21]:
res['iters'], res['error'] # 迭代次数和返回epsilon值
Out[21]:
(499, 0.19417985808360613)
In [22]:
def readable_predict(idx, X, Thetas): print('predict:', (np.argmax(predict(X[idx], Thetas))+1)) # 网络的标签从0开始 print('real tag:', y[idx].ravel()) # 真实的标签把0标记成了10 pyplot.imshow(X[idx].reshape(20,20).T)
In [23]:
readable_predict(3522, X, res['Thetas'])
predict: 7 real tag: [[7]]