"Deep Learning" Study Diary: Techniques Related to Learning -- Parameter Updates

2023.1.20

In the chapter on neural network learning, we used gradient descent to update the parameters, with the goal of finding parameters that make the value of the loss function as small as possible; solving this kind of problem is called optimization.

Because the parameter space is very complex and the number of parameters is huge, this optimization is difficult.

1, SGD:

Recall stochastic gradient descent, abbreviated as SGD:

W\leftarrow W-\eta \frac{\partial L}{\partial W}

Here W is the weight to be updated, and the gradient of the loss function with respect to W is written as \frac{\partial L}{\partial W} . η is the learning rate; the arrow means the value on the right replaces the value on the left.
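
As a quick numerical check (the values here are made up purely for illustration): with W = 2.0, \frac{\partial L}{\partial W} = 0.5 and \eta = 0.1, a single SGD step gives W \leftarrow 2.0 - 0.1\times 0.5 = 1.95 .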

Python implementation of SGD:

class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):  # params holds weights and biases such as w1, w2, b1, b2
        for key in params.keys():
            params[key] -= self.lr * grads[key]

Implementing the optimization method as a separate class like this makes it easy to modularize the functionality:

import sys, os

sys.path.append(os.pardir)  # make the parent directory importable before loading dataset.mnist

import numpy as np
from collections import OrderedDict
from dataset.mnist import load_mnist


# Numerical differentiation
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])  # np.nditer() iterates over a multidimensional array
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)

        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val  # restore the original value
        it.iternext()

    return grad


# Loss function
def cross_entropy_error(y, t):
    delta = 1e-7
    return -1 * np.sum(t * np.log(y + delta))


# Activation functions
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)
    return np.exp(x) / np.sum(np.exp(x))


def sigmoid(x1):
    return 1 / (1 + np.exp(-x1))


# Addition layer, multiplication layer, activation layer, Affine layer, Softmax-with-loss layer
class Addyer:  # addition node
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y
        return out

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy


class Mullyer:  # multiplication node
    def __init__(self):  # __init__() initializes the instance variables
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = y
        self.y = x
        out = x * y

        return out

    def backward(self, dout):
        dx = dout * self.x
        dy = dout * self.y

        return dx, dy


class ReLU:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout

        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size

        return dx


class Affine:
    def __init__(self, w, b):
        self.w = w
        self.b = b
        self.x = None
        self.dw = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.w) + self.b

        return out

    def backward(self, dout):
        dx = np.dot(dout, self.w.T)
        self.dw = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        return dx


class TwoLayerNet:
    def __init__(self, input, hidden, output, weight__init__std=0.01):
        # Initialize the weights with small random values
        self.params = {}
        self.params['w1'] = weight__init__std * np.random.randn(input, hidden)
        self.params['b1'] = np.zeros(hidden)
        self.params['w2'] = weight__init__std * np.random.randn(hidden, output)
        self.params['b2'] = np.zeros(output)

        # Create the layers
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['w1'], self.params['b1'])
        self.layers['ReLU1'] = ReLU()
        self.layers['Affine2'] = Affine(self.params['w2'], self.params['b2'])

        self.lastlayer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)

        return x

    def loss(self, x, t):  # x: input data; t: teacher (label) data
        y = self.predict(x)

        return self.lastlayer.forward(y, t)

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)  # predicted labels
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_grandient(self, x, t):  # x: input data; t: teacher (label) data
        loss_w = lambda w: self.loss(x, t)

        grads = {}
        grads['w1'] = numerical_gradient(loss_w, self.params['w1'])
        grads['b1'] = numerical_gradient(loss_w, self.params['b1'])
        grads['w2'] = numerical_gradient(loss_w, self.params['w2'])
        grads['b2'] = numerical_gradient(loss_w, self.params['b2'])

        return grads

    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastlayer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        # reverse the list so that backpropagation visits the layers in the opposite order of the forward pass
        for layer in layers:
            dout = layer.backward(dout)

        # setting
        grads = {}
        grads['w1'] = self.layers['Affine1'].dw
        grads['b1'] = self.layers['Affine1'].db
        grads['w2'] = self.layers['Affine2'].dw
        grads['b2'] = self.layers['Affine2'].db

        return grads


# Stochastic gradient descent
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):  # params holds weights and biases such as w1, w2, b1, b2
        for key in params.keys():
            params[key] -= self.lr * grads[key]


# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

networks = TwoLayerNet(input=784, hidden=50, output=10)
optimizer = SGD()

iters_num = 10000
train_size = x_train.shape[0]  # 60000
batch_size = 100  # mini-batch size
learning_rate = 0.1

for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)  # mini-batch sampling
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grads = networks.gradient(x_batch, t_batch)  # gradients via backpropagation
    params = networks.params
    print(params, grads)
    optimizer.update(params, grads)
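
Because the update logic lives in its own class, any object with an update(params, grads) method can be dropped into the loop above without touching the rest of the code. Here is a minimal sketch with dummy parameters (not the MNIST network; the numbers are arbitrary):

import numpy as np

params = {'w1': np.array([1.0, -2.0]), 'b1': np.array([0.5])}
grads = {'w1': np.array([0.1, -0.3]), 'b1': np.array([0.05])}

optimizer = SGD(lr=0.1)  # could just as well be Momentum(), AdaGrad(), or Adam() from later in this article
optimizer.update(params, grads)
print(params)  # -> w1 ≈ [0.99, -1.97], b1 ≈ [0.495]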

SGD is simple and easy to implement, but it also has drawbacks. Let's illustrate them with a function:

For example, take the function f\left ( x,y\right )=\frac{1}{20}x^{2}+y^{2}

Plotting it with MATLAB:

x = linspace(-10,10);
y = linspace(-10,10);
[X,Y] = meshgrid(x,y);
Z = (1/20)*X.^2+Y.^2;
Fig = mesh(X,Y,Z);

[Figure 1: MATLAB surface plot of f(x, y) = x²/20 + y²]

It is easy to see that the minimum of this function is at (0, 0, 0). The colors in the plot show the gradient: along the y-axis the color changes quickly, meaning the slope is steep, while along the x-axis it changes slowly, meaning the gradient is small. SGD simply steps in the direction of the gradient at the current point. If we start the search from (x, y) = (-10, -10), that direction mostly does not point toward (0, 0), so the path oscillates strongly along y while crawling slowly along x. In other words, the root cause of SGD's inefficiency here is that the update direction does not point toward the minimum.
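
To see this concretely, here is a minimal sketch (not from the textbook) that applies plain SGD updates to f(x, y) = \frac{1}{20}x^{2}+y^{2} starting from (-10, -10); the learning rate 0.9 is an arbitrary illustrative choice:

x, y = -10.0, -10.0  # starting point
lr = 0.9             # illustrative learning rate

for i in range(5):
    gx, gy = x / 10.0, 2.0 * y  # gradient of f(x, y) = x**2 / 20 + y**2
    x -= lr * gx
    y -= lr * gy
    print(f"step {i + 1}: x = {x:7.3f}, y = {y:7.3f}")
# y overshoots 0 and flips sign every step, while x shrinks by only about 9% per step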

So let's now study some more efficient methods.

2, Momentum:

Momentum means "momentum" in the physical sense:

Its update rule: v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W} ; \quad W \leftarrow W + v ;

Here v is a new variable corresponding to "velocity": v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W} says that the object receives a force along the gradient, and under this force the object's velocity increases.

The \alpha v term in v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W} gradually slows the object down when no force acts on it, so \alpha is set to a value in [0, 1), typically around 0.9:

Code implementation: the instance variable v holds the velocity of the object. At initialization self.v = None, meaning nothing is stored yet; the first time update() is called, v is created as a dict with the same structure as the parameters.

class Momentum:
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)

        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]

Compared with SGD, Momentum greatly reduces the zigzagging: if SGD's update path is a jagged "Z" shape, Momentum's path is a smoother "S" shape. Because the gradient along the x-axis is small but consistent while the gradient along the y-axis is large but keeps changing sign, the x direction can be seen as receiving a constant force and the y direction a varying force, so the velocity accumulates along x and partly cancels along y.
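
A small sketch of this effect, using the Momentum class above on the same function (the learning rate 0.9 is again just an illustrative choice, not the book's setting):

opt = Momentum(lr=0.9, momentum=0.9)
params = {'x': -10.0, 'y': -10.0}
grads = {'x': 0.0, 'y': 0.0}

for i in range(5):
    grads['x'], grads['y'] = params['x'] / 10.0, 2.0 * params['y']  # gradient of f(x, y) = x**2 / 20 + y**2
    opt.update(params, grads)
    print(f"step {i + 1}: v_x = {opt.v['x']:+.2f}, v_y = {opt.v['y']:+.2f}")
# v_x keeps the same sign and steadily grows, while v_y keeps flipping sign and partly cancels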

In this article: https://blog.csdn.net/m0_72675651/article/details/128729159 the parameters were updated with SGD. Now let's see the result and accuracy when Momentum is used for the parameter update instead. (The code appears at the end of this article.)

[Figure 2: accuracy results when training with Momentum]

3, AdaGrad:

In neural network learning, the value of the learning rate \eta matters a great deal. If it is too small, learning takes far too long; if it is too large, learning diverges and cannot proceed correctly.

The learning rate is discussed in more detail in this article: https://blog.csdn.net/m0_72675651/article/details/128635260

There is a technique in neural network learning called "learning rate decay": simply put, learn a lot at first and less later on, letting the value of \eta go from large to small.
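
As a concrete illustration of what such a schedule can look like (this is just one common pattern, step decay, not the book's method):

lr = 0.1           # initial learning rate
decay_rate = 0.5   # halve the learning rate ...
decay_every = 5    # ... every 5 epochs (illustrative values)

for epoch in range(20):
    if epoch > 0 and epoch % decay_every == 0:
        lr *= decay_rate
    # ... one epoch of training with the current lr ...

print(lr)  # halved three times: 0.1 -> 0.05 -> 0.025 -> 0.0125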

AdaGrad "tailors" the learning rate for each individual parameter (Ada comes from Adaptive).

Its update rule:

h\leftarrow h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W} ;

W \leftarrow W -\eta \frac{1}{\sqrt{h}}\frac{\partial L}{\partial W} ;

Here \odot denotes element-wise multiplication of matrices; h is a new variable that accumulates the squared gradients, and when the parameters are updated, multiplying by \frac{1}{\sqrt{h}} rescales the update. This means that for parameter elements whose gradients have been large, the learning rate becomes small, which achieves the "tailored" per-parameter effect;

AdaGrad records the sum of squares of all past gradients. Therefore, the longer learning goes on, the smaller the updates become; if learning continues indefinitely, the update amount approaches 0 and the parameters effectively stop changing.

To improve on this, the RMSProp method can be used. It gradually forgets past gradients and, when accumulating, reflects the information of the new gradient more strongly. This is called an exponential moving average: the scale of past gradients decays exponentially. (A minimal sketch is given after the AdaGrad code below.)

Code implementation:

import numpy as np


class AdaGrad:
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

The 1e-7 added here prevents division by zero when h contains zeros;
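
The RMSProp idea mentioned above can be written in the same style as the other optimizer classes in this article. This is only a minimal sketch, and the decay_rate of 0.99 is a commonly used default rather than a value taken from the textbook:

import numpy as np


class RMSProp:
    def __init__(self, lr=0.01, decay_rate=0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():
            # exponential moving average: older gradients are gradually forgotten
            self.h[key] *= self.decay_rate
            self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)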

Let's look at the results:

[Figure 3: accuracy results when training with AdaGrad]

I saved the results of two runs; AdaGrad seems to do no better than Momentum or SGD here. I wondered whether the problem was that 1e-7 is not close enough to 0 in the h = 0 guard, so I also tried 1e-9 and 1e-50 and compared the results.

[Figure 4: accuracy results with the constant changed to 1e-9 and 1e-50]

Not much changed. In principle AdaGrad should be better than SGD and Momentum at moving efficiently toward the minimum (after all, a model's quality is judged by the loss function, not only by the accuracy). The gap may come from the dataset, or from not using the RMSProp fix described above for the vanishing update amount. Suggestions are welcome; I'll dig into the exact cause later.

That said, AdaGrad seems to take less time than Momentum;

4, Adam:

The basic idea of Adam is to combine Momentum and AdaGrad; an important feature of it is "bias correction".

Adam is a method proposed in 2015. Its theory is fairly involved and a bit hard for a beginner (the code below is taken from the textbook without modification). Adam uses three hyperparameters: the learning rate, the first-order momentum coefficient \beta_1, and the second-order momentum coefficient \beta_2.
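
For reference, the update rule from the Adam paper (linked in the code below) can be written as follows; the implementation folds the two bias corrections into a rescaled learning rate lr_t instead of computing \hat{m} and \hat{v} explicitly:

m \leftarrow \beta_1 m + (1-\beta_1)\frac{\partial L}{\partial W} ; \quad v \leftarrow \beta_2 v + (1-\beta_2)\left ( \frac{\partial L}{\partial W} \right )^{2}

\hat{m} = \frac{m}{1-\beta_1^{t}} , \quad \hat{v} = \frac{v}{1-\beta_2^{t}} , \quad W \leftarrow W - \eta \frac{\hat{m}}{\sqrt{\hat{v}}+\varepsilon}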

Code implementation:

class Adam:
    """Adam (http://arxiv.org/abs/1412.6980v8)"""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for key in params.keys():
            # self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key]
            # self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2)
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])

            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

            # unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias
            # unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias
            # params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7)

Still, we can plug it into the earlier code and see how it does:

[Figure 5: accuracy results when training with Adam]

Sure enough, Adam's accuracy is fairly high (above 0.97), and it runs fairly fast.

5, Summary:

We have now studied four parameter-update methods and experimented with them on the MNIST dataset. First, let's look at the update paths of the four methods;

[Figure 6: update paths of SGD, Momentum, AdaGrad, and Adam on f(x, y) = x²/20 + y²]

In theory, AdaGrad works best on this function (shortest path), but the result changes when the hyperparameters and other settings differ, as in the experiments above. So there is no method that performs best on every problem; you need to run multiple experiments and compare.
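
To reproduce this kind of comparison, here is a minimal sketch that traces each optimizer's path on f(x, y) = \frac{1}{20}x^{2}+y^{2}. It assumes the SGD, Momentum, AdaGrad, and Adam classes from this article are defined (or un-commented) in the same file and that matplotlib is installed; the learning rates and the starting point are illustrative choices:

import matplotlib.pyplot as plt


def df(x, y):
    # gradient of f(x, y) = x**2 / 20 + y**2
    return x / 10.0, 2.0 * y


optimizers = {
    "SGD": SGD(lr=0.95),
    "Momentum": Momentum(lr=0.1),
    "AdaGrad": AdaGrad(lr=1.5),
    "Adam": Adam(lr=0.3),
}

for name, opt in optimizers.items():
    params = {'x': -7.0, 'y': 2.0}  # starting point
    grads = {'x': 0.0, 'y': 0.0}
    xs, ys = [], []
    for _ in range(30):
        xs.append(params['x'])
        ys.append(params['y'])
        grads['x'], grads['y'] = df(params['x'], params['y'])
        opt.update(params, grads)
    plt.plot(xs, ys, 'o-', label=name)

plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()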

Experiment code: use "Ctrl+/ to comment or uncomment" as needed and select the optimizer for the experiment you want.

import sys, os

sys.path.append(os.pardir)  # make the parent directory importable before loading dataset.mnist

import numpy as np
from collections import OrderedDict  # ordered dict: remembers the order in which items were added
from dataset.mnist import load_mnist


# Numerical differentiation
def numerical_gradient(f, x):
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])  # np.nditer() iterates over a multidimensional array
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)  # f(x+h)

        x[idx] = tmp_val - h
        fxh2 = f(x)  # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2 * h)

        x[idx] = tmp_val  # restore the original value
        it.iternext()

    return grad


# Loss function
def cross_entropy_error(y, t):
    delta = 1e-7
    return -1 * np.sum(t * np.log(y + delta))


# Activation functions
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)
    return np.exp(x) / np.sum(np.exp(x))


def sigmoid(x1):
    return 1 / (1 + np.exp(-x1))


# Addition layer, multiplication layer, activation layer, Affine layer, Softmax-with-loss layer
class Addyer:  # addition node
    def __init__(self):
        pass

    def forward(self, x, y):
        out = x + y
        return out

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy


class Mullyer:  # multiplication node
    def __init__(self):  # __init__() initializes the instance variables
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = y
        self.y = x
        out = x * y

        return out

    def backward(self, dout):
        dx = dout * self.x
        dy = dout * self.y

        return dx, dy


class ReLU:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout

        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        dx = (self.y - self.t) / batch_size

        return dx


class Affine:
    def __init__(self, w, b):
        self.w = w
        self.b = b
        self.x = None
        self.dw = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(x, self.w) + self.b

        return out

    def backward(self, dout):
        dx = np.dot(dout, self.w.T)
        self.dw = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)

        return dx


class TwoLayerNet:
    def __init__(self, input, hidden, output, weight__init__std=0.01):
        # Initialize the weights with small random values
        self.params = {}
        self.params['w1'] = weight__init__std * np.random.randn(input, hidden)
        self.params['b1'] = np.zeros(hidden)
        self.params['w2'] = weight__init__std * np.random.randn(hidden, output)
        self.params['b2'] = np.zeros(output)

        # Create the layers
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['w1'], self.params['b1'])
        self.layers['ReLU1'] = ReLU()
        self.layers['Affine2'] = Affine(self.params['w2'], self.params['b2'])

        self.lastlayer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)

        return x

    def loss(self, x, t):  # x: input data; t: teacher (label) data
        y = self.predict(x)

        return self.lastlayer.forward(y, t)

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)  # predicted labels
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_grandient(self, x, t):  # x: input data; t: teacher (label) data
        loss_w = lambda w: self.loss(x, t)

        grads = {}
        grads['w1'] = numerical_gradient(loss_w, self.params['w1'])
        grads['b1'] = numerical_gradient(loss_w, self.params['b1'])
        grads['w2'] = numerical_gradient(loss_w, self.params['w2'])
        grads['b2'] = numerical_gradient(loss_w, self.params['b2'])

        return grads

    def gradient(self, x, t):
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastlayer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        # reverse the list so that backpropagation visits the layers in the opposite order of the forward pass
        for layer in layers:
            dout = layer.backward(dout)

        # setting
        grads = {}
        grads['w1'] = self.layers['Affine1'].dw
        grads['b1'] = self.layers['Affine1'].db
        grads['w2'] = self.layers['Affine2'].dw
        grads['b2'] = self.layers['Affine2'].db

        return grads


# Save the output results to a file
def Result_save(name):
    path = "C:\\Users\\zzh\\Deshtop\\"
    full_path = path + name + '.txt'  # a .doc Word file could also be created here
    file = open(full_path, 'w')

    return file


# class Momentum:
#     def __init__(self, lr=0.01, momentum=0.9):
#         self.lr = lr
#         self.momentum = momentum
#         self.v = None
#
#     def update(self, params, grads):
#         if self.v is None:
#             self.v = {}
#             for key, val in params.items():
#                 self.v[key] = np.zeros_like(val)
#
#         for key in params.keys():
#             self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
#             params[key] += self.v[key]


# class AdaGrad:
#     def __init__(self, lr=0.01):
#         self.lr = lr
#         self.h = None
#
#     def update(self, params, grads):
#         if self.h is None:
#             self.h = {}
#             for key, val in params.items():
#                 self.h[key] = np.zeros_like(val)
#
#         for key in params.keys():
#             self.h[key] += grads[key] * grads[key]
#             params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-50)  # guard against h == 0

class Adam:
    """Adam (http://arxiv.org/abs/1412.6980v8)"""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for key in params.keys():
            # self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key]
            # self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2)
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])

            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

            # unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias
            # unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias
            # params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7)


filename = 'MNIST_RESULT'
output = sys.stdout
outputfile = Result_save(filename)  # open the result file once and reuse the handle
sys.stdout = outputfile

# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

networks = TwoLayerNet(input=784, hidden=50, output=10)

# # Gradient check: numerical differentiation is simple, slow, and hard to get wrong; backpropagation is complex, fast, and easy to get wrong, so we compare the two and check that the outputs agree;
# x_batch = x_train[:3]  # 3 images (3 digits); numerical differentiation is very slow, so we only check 3 samples
# t_batch = t_train[:3]
#
# grad_numberical = networks.numerical_grandient(x_batch, t_batch)  # numerical differentiation
# grad_backprop = networks.gradient(x_batch, t_batch)  # backpropagation
#
# # Mean absolute difference of each weight/bias gradient
# print("gradient recognition:", '\n')
# for key in grad_numberical.keys():
#     diff = np.average(np.abs(grad_backprop[key] - grad_numberical[key]))
#     print(key + ":" + str(diff), file=outputfile)
#
# # w1:0.0008062789370314258
# # b1:0.007470903158435932
# # w2:0.007911547556927193
# # b2:0.4162550575209752

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]  # 60000
batch_size = 100  # mini-batch size
learning_rate = 0.1

# optimizer = Momentum()  # Momentum
# optimizer = AdaGrad()  # AdaGrad
optimizer = Adam()
train_acc_list = []
test_acc_list = []
train_loss_list = []
iter_per_epoch = max(train_size / batch_size, 1)

print("MNIST classification", '\n', "Nerual Network is learning weight and bias", file=outputfile)
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)  # mini-batch sampling
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grads = networks.gradient(x_batch, t_batch)  # gradients via backpropagation
    # Update the weight and bias parameters
    # Plain SGD alternative (uncomment this and comment out optimizer.update below):
    # for key in ('w1', 'b1', 'w2', 'b2'):
    #     networks.params[key] -= learning_rate * grads[key]
    params = networks.params
    optimizer.update(params, grads)  # Momentum / AdaGrad / Adam, whichever is selected above

    loss = networks.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    # Accuracy per epoch: this is the overall accuracy on the whole set, not the probability for a single image
    if i % iter_per_epoch == 0:
        train_acc = networks.accuracy(x_train, t_train)
        test_acc = networks.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc |" + str(train_acc) + ",", str(test_acc), file=outputfile)

# Output probabilities: run inference with the updated parameters
print("the shape of weight and bias:", '\n', file=outputfile)
print(networks.params['w1'].shape, file=outputfile)  # (784, 50)
print(networks.params['b1'].shape, file=outputfile)  # (50,)
print(networks.params['w2'].shape, file=outputfile)  # (50, 10)
print(networks.params['b2'].shape, file=outputfile)  # (10,)

accuracy_cnt = 0

for i in range(x_test.shape[0]):
    y = networks.predict(x_test[i])
    print("the ", i + 1, " times:", '\n', file=outputfile)
    print("the probability of picture:", '\n', y, file=outputfile)
    print("the right label:", '\n', t_test[i], file=outputfile)
    result = np.argmax(y)
    answer = np.argmax(t_test[i])
    if result == answer:
        print("classified successfully, this picture is", result, '\n', file=outputfile)
        accuracy_cnt += 1
    else:
        print("classified unsuccessfully", file=outputfile)

    print('\n', file=outputfile)

print("the Accuracy:" + str(float(accuracy_cnt) / x_test.shape[0]), file=outputfile)
outputfile.close()

Code to load the MNIST dataset: save this file as mnist.py inside a folder named dataset, placed in the same directory as the code above;

# coding: utf-8
try:
    import urllib.request
except ImportError:
    raise ImportError('You should use Python 3.x')
import os.path
import gzip
import pickle
import os
import numpy as np


url_base = 'http://yann.lecun.com/exdb/mnist/'
key_file = {
    'train_img':'train-images-idx3-ubyte.gz',
    'train_label':'train-labels-idx1-ubyte.gz',
    'test_img':'t10k-images-idx3-ubyte.gz',
    'test_label':'t10k-labels-idx1-ubyte.gz'
}

dataset_dir = os.path.dirname(os.path.abspath(__file__))
save_file = dataset_dir + "/mnist.pkl"

train_num = 60000
test_num = 10000
img_dim = (1, 28, 28)
img_size = 784


def _download(file_name):
    file_path = dataset_dir + "/" + file_name
    
    if os.path.exists(file_path):
        return

    print("Downloading " + file_name + " ... ")
    urllib.request.urlretrieve(url_base + file_name, file_path)
    print("Done")
    
def download_mnist():
    for v in key_file.values():
       _download(v)
        
def _load_label(file_name):
    file_path = dataset_dir + "/" + file_name
    
    print("Converting " + file_name + " to NumPy Array ...")
    with gzip.open(file_path, 'rb') as f:
            labels = np.frombuffer(f.read(), np.uint8, offset=8)
    print("Done")
    
    return labels

def _load_img(file_name):
    file_path = dataset_dir + "/" + file_name
    
    print("Converting " + file_name + " to NumPy Array ...")    
    with gzip.open(file_path, 'rb') as f:
            data = np.frombuffer(f.read(), np.uint8, offset=16)
    data = data.reshape(-1, img_size)
    print("Done")
    
    return data
    
def _convert_numpy():
    dataset = {}
    dataset['train_img'] =  _load_img(key_file['train_img'])
    dataset['train_label'] = _load_label(key_file['train_label'])    
    dataset['test_img'] = _load_img(key_file['test_img'])
    dataset['test_label'] = _load_label(key_file['test_label'])
    
    return dataset

def init_mnist():
    download_mnist()
    dataset = _convert_numpy()
    print("Creating pickle file ...")
    with open(save_file, 'wb') as f:
        pickle.dump(dataset, f, -1)
    print("Done!")

def _change_one_hot_label(X):
    T = np.zeros((X.size, 10))
    for idx, row in enumerate(T):
        row[X[idx]] = 1
        
    return T
    

def load_mnist(normalize=True, flatten=True, one_hot_label=False):
    """读入MNIST数据集
    
    Parameters
    ----------
    normalize : 将图像的像素值正规化为0.0~1.0
    one_hot_label : 
        one_hot_label为True的情况下,标签作为one-hot数组返回
        one-hot数组是指[0,0,1,0,0,0,0,0,0,0]这样的数组
    flatten : 是否将图像展开为一维数组
    
    Returns
    -------
    (训练图像, 训练标签), (测试图像, 测试标签)
    """
    if not os.path.exists(save_file):
        init_mnist()
        
    with open(save_file, 'rb') as f:
        dataset = pickle.load(f)
    
    if normalize:
        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32)
            dataset[key] /= 255.0
            
    if one_hot_label:
        dataset['train_label'] = _change_one_hot_label(dataset['train_label'])
        dataset['test_label'] = _change_one_hot_label(dataset['test_label'])
    
    if not flatten:
         for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].reshape(-1, 1, 28, 28)

    return (dataset['train_img'], dataset['train_label']), (dataset['test_img'], dataset['test_label']) 


if __name__ == '__main__':
    init_mnist()
