1. BP Neural Networks
Neural networks, also known as multilayer perceptrons, are trained with two core procedures: forward propagation and backpropagation. For a given task we build hidden layers with an appropriate number of units and pair them with suitable activation functions (sigmoid, softmax, tanh, ReLU, etc.). A loss function (cross entropy, mean squared error, etc.) is then minimized with a gradient descent algorithm (stochastic gradient descent, mini-batch gradient descent, etc.), trying to avoid local minima and converge toward the global optimum.
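For reference, the activation and loss functions mentioned above are, in standard notation (with $a^{L}$ the network output and $y$ the target):

$$\sigma(z) = \frac{1}{1 + e^{-z}}, \qquad \tanh(z) = \frac{e^{z} - e^{-z}}{e^{z} + e^{-z}}, \qquad \mathrm{ReLU}(z) = \max(0, z), \qquad \mathrm{softmax}(z)_{i} = \frac{e^{z_{i}}}{\sum_{j} e^{z_{j}}}$$

$$C_{\text{MSE}} = \frac{1}{2n}\sum_{x}\lVert y(x) - a^{L}(x)\rVert^{2}, \qquad C_{\text{CE}} = -\frac{1}{n}\sum_{x}\sum_{i}\left[y_{i}\ln a^{L}_{i} + (1 - y_{i})\ln\!\left(1 - a^{L}_{i}\right)\right]$$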
Forward propagation. Forward propagation is the process by which the input signal passes through the weights, biases, and activations of each hidden layer to produce the network output. The activation output of each layer serves as the input of the next layer, and no network parameters are adjusted during the forward pass.
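In symbols, with $W^{l}$ and $b^{l}$ the weights and biases of layer $l$ and $\sigma$ the activation function, the forward pass computes, layer by layer,

$$a^{0} = x, \qquad z^{l} = W^{l} a^{l-1} + b^{l}, \qquad a^{l} = \sigma(z^{l}), \qquad l = 1, \dots, L,$$

which is exactly what the loop in the code below implements.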
In the handwritten digit recognition example, the forward propagation code is as follows:
def _forward_prop(self, x):
    '''
    Forward propagation.
    :param x: input vector
    :return:
    '''
    # Input layer
    self._activations[0] = x
    # Compute layer by layer
    for i in range(1, self.num_layers):
        # _zs = w * x + b
        self._zs[i] = (self.weights[i].dot(
            self._activations[i - 1]) + self.biases[i])
        # Activation value
        self._activations[i] = self._activate_func[0](self._zs[i])
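The activation functions referenced through self._activate_func come from a separate activate module that is not listed in this post. A minimal sketch of such a module, assuming a plain sigmoid network (sigmoidDerivative is the name used later in NeutralNetwork.py):

import numpy as np

def sigmoid(z):
    # Logistic sigmoid: 1 / (1 + e^(-z))
    return 1.0 / (1.0 + np.exp(-z))

def sigmoidDerivative(z):
    # Derivative of the sigmoid, expressed through the sigmoid itself
    s = sigmoid(z)
    return s * (1.0 - s)

With this, self._activate_func would be [sigmoid, sigmoidDerivative], matching the indices [0] and [-1] used in the forward and backward passes.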
- Backpropagation. How to tune the weight parameters is a core question in a BP neural network. Early in training, the output z obtained by passing the input signal through the network layers generally deviates from the true value y; the difference z - y is called the error. The output-layer error is related to the preceding hidden layers through the weights, so the error can be propagated backward through the network to obtain the error of each hidden-layer unit. For a detailed derivation, see the linked reference.
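For the quadratic cost $C = \tfrac{1}{2}\lVert a^{L} - y\rVert^{2}$ used in the code below, the standard BP equations for the layer errors and gradients are

$$\delta^{L} = (a^{L} - y) \odot \sigma'(z^{L}), \qquad \delta^{l} = \big((W^{l+1})^{T} \delta^{l+1}\big) \odot \sigma'(z^{l}),$$

$$\frac{\partial C}{\partial b^{l}} = \delta^{l}, \qquad \frac{\partial C}{\partial W^{l}} = \delta^{l}\,(a^{l-1})^{T},$$

which correspond line by line to the _back_prob code below.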
The weights of each layer are then updated from these per-layer errors. The update procedure is straightforward: 1) compute the gradient; 2) take a gradient descent step. Computing the gradient relies on the chain rule, as written out below.
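Concretely, for a single weight $w^{l}_{jk}$ the chain rule splits the gradient through the pre-activation $z^{l}_{j}$ and the activation $a^{l}_{j}$, and the weights are then moved against the gradient with learning rate $\eta$, averaged over a mini-batch of size $m$ (this is the update used in fit further below):

$$\frac{\partial C}{\partial w^{l}_{jk}} = \frac{\partial C}{\partial a^{l}_{j}} \cdot \frac{\partial a^{l}_{j}}{\partial z^{l}_{j}} \cdot \frac{\partial z^{l}_{j}}{\partial w^{l}_{jk}} = \delta^{l}_{j}\, a^{l-1}_{k}$$

$$W^{l} \leftarrow W^{l} - \frac{\eta}{m} \sum_{x} \frac{\partial C_{x}}{\partial W^{l}}, \qquad b^{l} \leftarrow b^{l} - \frac{\eta}{m} \sum_{x} \frac{\partial C_{x}}{\partial b^{l}}$$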
The backpropagation code in the handwritten digit recognition example:
def _back_prob(self, x, y):
    '''
    Backward error propagation.
    :param x: input vector
    :param y: sample label
    :return: gradients of b and w for every layer
    '''
    # Initialize the gradient holders for all layers' w, b
    nabla_b = [np.zeros(bias.shape) for bias in self.biases]
    nabla_w = [np.zeros(weight.shape) for weight in self.weights]
    # Output-layer error: (sigmoid(w*x+b) - y) * sigmoid'(w*x+b)
    error = (self._activations[-1] - y) * \
        self._activate_func[-1](self._zs[-1])
    nabla_b[-1] = error
    nabla_w[-1] = error.dot(self._activations[-2].transpose())
    # Propagate the error backward through the hidden layers
    for l in range(1, self.num_layers - 1)[::-1]:
        error = np.multiply(
            self.weights[l + 1].transpose().dot(error),
            self._activate_func[-1](self._zs[l]))
        nabla_b[l] = error
        nabla_w[l] = error.dot(self._activations[l - 1].transpose())
    return nabla_b, nabla_w
- Gradient descent algorithms
The main variants in use are the following. Batch gradient descent uses the entire training set to determine the gradient direction and step size; it converges steadily and can approach the optimum. Stochastic gradient descent (SGD, also called on-line GD) uses a small number of samples in place of the full set, so it does not always descend along the steepest direction; it is fast when the data volume is large and suits on-line settings, but it can easily stall in local convergence, and the learning rate is a very influential parameter. Finally, mini-batch GD splits the full set into chunks of mini-batch size and runs a gradient descent step on each batch, as sketched below.
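As a rough illustration (not the project's code), one epoch of mini-batch gradient descent could be sketched as follows, where grad is a hypothetical function returning the batch-averaged gradient:

import numpy as np

def mini_batch_gd_epoch(params, X, Y, grad, eta=0.3, batch_size=10):
    # Shuffle the training set once per epoch
    perm = np.random.permutation(len(X))
    X, Y = X[perm], Y[perm]
    # Walk over the data in chunks of batch_size
    for start in range(0, len(X), batch_size):
        xb, yb = X[start:start + batch_size], Y[start:start + batch_size]
        # One gradient descent step with the batch-averaged gradient
        params = params - eta * grad(params, xb, yb)
    return params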
In the handwritten digit recognition example the training set has 55000 samples and the validation set 5000, and training proceeds epoch by epoch using the mini-batch method:
def fit(self, train_data, validation_data=None):
    '''
    Train W, b.
    :param train_data: training data
    :param validation_data: validation (development) data
    :return:
    '''
    accuracy = 0.0
    for epoch in range(self.epochs):
        # Load the next batch of samples
        train_data_images, train_data_labels = train_data.next_batch(
            self.mini_batch_size)
        # Format the data
        mini_batches = self.formData(train_data_images, train_data_labels)
        # Initialize the w, b gradient accumulators for this batch
        nabla_b = [np.zeros(bias.shape) for bias in self.biases]
        nabla_w = [np.zeros(weight.shape) for weight in self.weights]
        for sample in mini_batches:
            # Each sample and its label
            x, y = sample
            # Forward propagation
            self._forward_prop(x)
            # Backward error propagation, yielding the w, b gradients
            data_nabla_b, data_nabla_w = self._back_prob(x, y)
            nabla_b = [nb + dnb for nb,
                       dnb in zip(nabla_b, data_nabla_b)]
            nabla_w = [nw + dnw for nw,
                       dnw in zip(nabla_w, data_nabla_w)]
        # Update W, b with the batch-averaged gradients
        self.weights = [
            w - (self.eta / self.mini_batch_size) * dw for w, dw in zip(self.weights, nabla_w)]
        self.biases = [
            b - (self.eta / self.mini_batch_size) * db for b, db in zip(self.biases, nabla_b)]
        # Evaluate the current model on the validation set
        if validation_data:
            accuracy = self.validate(validation_data) * 100.0
            print('Epoch {0}, accuracy {1} %.'.format(
                epoch + 1, accuracy))
        else:
            print('Processed epoch {0}.'.format(epoch))
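fit calls a validate method that the post does not show. A minimal sketch of what it could look like, assuming validation_data exposes images, labels and num_examples like the DataSet class in the next section and that the labels are one-hot encoded:

def validate(self, validation_data):
    '''
    Fraction of validation samples whose predicted digit matches the label.
    '''
    correct = 0
    samples = self.formData(validation_data.images, validation_data.labels)
    for x, y in samples:
        self._forward_prop(x)
        if np.argmax(self._activations[-1]) == np.argmax(y):
            correct += 1
    return correct / validation_data.num_examples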
2. MNIST Digit Recognition [github]
Having covered the basic principles of BP neural networks, we now put them into practice on the MNIST dataset.
Data processing. Download the training and test sets from the official site and format them accordingly. Here we follow TensorFlow's approach: the code checks whether the four data files already exist in the local directory, downloads and unpacks them if not, and formats them into a form that can be fed to the network's input layer. Below is the content of load.py:
from __future__ import absolute_import, print_function
import os
import gzip
import numpy
from six.moves import urllib, xrange
# the datafile url
SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
def _isDownload(file_name, work_dir):
"""
Download the data file if it doesn't exist.
Return the file path.
"""
if not os.path.exists(work_dir):
os.mkdir(work_dir)
file_path = os.path.join(work_dir, file_name)
if not os.path.exists(file_path):
# download file
file_path, _ = urllib.request.urlretrieve(
SOURCE_URL + file_name, file_path)
state_info = os.stat(file_path)
print('Successfully downloaded!!', file_name,
state_info.st_size, 'bytes.')
return file_path
def _read32(bytestream):
dt = numpy.dtype(numpy.uint32).newbyteorder('>')
return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]
def extractImages(file_name):
"""
Extract the images into a 4D uint8 numpy.array
of shape [index, y, x, depth].
"""
print('Extracting ', file_name)
with gzip.open(file_name) as bytestream:
magic = _read32(bytestream)
if magic != 2051:
raise ValueError(
'Invalid magic number %d in MNIST image file:%s' %
(magic, file_name)
)
num_images = _read32(bytestream)
rows = _read32(bytestream)
cols = _read32(bytestream)
buf = bytestream.read(rows * cols * num_images)
data = numpy.frombuffer(buf, dtype=numpy.uint8)
data = data.reshape(num_images, rows, cols, 1)
return data
def extractLabels(file_name, one_hot=False):
"""
Extract the labels into a 1D uint8 numpy.array [index].
"""
print('Extracting ', file_name)
with gzip.open(file_name) as bytestream:
magic = _read32(bytestream)
if magic != 2049:
raise ValueError(
'Invalid magic number %d in MNIST image file:%s' %
(magic, file_name)
)
num_items = _read32(bytestream)
buf = bytestream.read(num_items)
labels = numpy.frombuffer(buf, dtype=numpy.uint8)
if one_hot:
return denseToOneHot(labels)
return labels
def denseToOneHot(labels_dense, number_classes=10):
"""
class labels ==> one-hot vectors
"""
number_labels = labels_dense.shape[0]
index_offset = numpy.arange(number_labels) * number_classes
labels_one_hot = numpy.zeros((number_labels, number_classes))
labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
return labels_one_hot
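# For example (illustrative values only):
# denseToOneHot(numpy.array([3, 0]), 10) returns
# [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
#  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]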
class DataSet(object):
def __init__(self, images, labels, fake_data=False):
if fake_data:
self._num_examples = 10000
else:
assert images.shape[0] == labels.shape[0], (
'images.shape:%s labels.shape:%s' % (
images.shape, labels.shape)
)
self._num_examples = images.shape[0]
# [num examples,rows ,cols,depth] ====> [num examples,rows * cols] assuming depth=1
assert images.shape[3] == 1
images = images.reshape(images.shape[0],
images.shape[1] * images.shape[2]
)
# [0,255] ===> [0.0,1.0]
images = images.astype(numpy.float32)
images = numpy.multiply(images, 1.0 / 255.0)
self._images = images
self._labels = labels
self._epochs_completed = 0
self._index_in_epoch = 0
# built-in property getters (fget)
@property
def images(self):
return self._images
@property
def labels(self):
return self._labels
@property
def num_examples(self):
return self._num_examples
@property
def epoch_completed(self):
return self._epochs_completed
def next_batch(self, batch_size, fake_data=False):
"""
Return the next `batch_size` examples from this data set.
"""
if fake_data:
# 28 * 28 =784
fake_image = [1.0 for _ in xrange(784)]
fake_label = 0
return[fake_image for _ in xrange(batch_size)], [fake_label for _ in xrange(batch_size)]
start = self._index_in_epoch
self._index_in_epoch += batch_size
# finished one pass over the data: reshuffle and start the next epoch
if self._index_in_epoch > self._num_examples:
self._epochs_completed += 1
perm = numpy.arange(self._num_examples)
# shuffle the data
numpy.random.shuffle(perm)
self._images = self._images[perm]
self._labels = self._labels[perm]
# start next epoch
start = 0
self._index_in_epoch = batch_size
assert batch_size <= self._num_examples
end = self._index_in_epoch
return self._images[start:end], self._labels[start:end]
def readDataSets(train_dir, fake_data=False, one_hot=False):
class DataSets(object):
pass
data_sets = DataSets()
if fake_data:
data_sets.train = DataSet([], [], fake_data=True)
data_sets.validation = DataSet([], [], fake_data=True)
data_sets.test = DataSet([], [], fake_data=True)
return data_sets
TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
VALIDATION_SIZE = 5000
local_file = _isDownload(TRAIN_IMAGES, train_dir)
train_images = extractImages(local_file)
local_file = _isDownload(TRAIN_LABELS, train_dir)
train_labels = extractLabels(local_file, one_hot=one_hot)
local_file = _isDownload(TEST_IMAGES, train_dir)
test_images = extractImages(local_file)
local_file = _isDownload(TEST_LABELS, train_dir)
test_labels = extractLabels(local_file, one_hot=one_hot)
validation_images = train_images[:VALIDATION_SIZE]
validation_labels = train_labels[:VALIDATION_SIZE]
train_images = train_images[VALIDATION_SIZE:]
train_labels = train_labels[VALIDATION_SIZE:]
data_sets.train = DataSet(train_images, train_labels)
data_sets.validation = DataSet(validation_images, validation_labels)
data_sets.test = DataSet(test_images, test_labels)
return data_sets
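As a quick sanity check of the loader, the following usage mirrors what main.py does later (shapes assume the standard MNIST files and the 5000-sample validation split):

import load as input_data

mnist = input_data.readDataSets('data', one_hot=True)
print(mnist.train.num_examples)       # 55000 after the validation split
print(mnist.validation.num_examples)  # 5000
images, labels = mnist.train.next_batch(10)
print(images.shape, labels.shape)     # (10, 784) (10, 10) with one_hot=True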
- Building the network model
Here a fairly flexible network model is constructed: the number of hidden layers and their unit counts can be set freely through the sizes parameter, and the meaning of the basic functions corresponds to the BP principles explained above. Below is the content of NeutralNetwork.py, which defines a class.
import time
from activate import *
import load_data as input_data
class NeuralNetwork(object):
"""
Three-layer (one hidden layer) NN model with Cost_func = 0.5 * (z - y) ** 2.
Trained with mini-batch gradient descent.
"""
def __init__(self, in_units, hidden_units, out_units):
"""Returns a new 3-layer neural network with the specified layer sizes."""
# Hyper parameters
self.input_size = in_units
self.output_size = out_units
self.hidden_size = hidden_units
self.activate_func = [sigmoid, sigmoidDerivative]
# Learning parameters
self.rate = 6.0
# Weight parameters, randomly initialized
self.W1 = np.random.uniform(-0.5, 0.5,
(self.input_size, self.hidden_size))
self.W2 = np.random.uniform(-0.5, 0.5,
(self.hidden_size, self.output_size))
def configure(self, rate=None):
"""Change the learning parameters of the network."""
self.rate = self.rate if rate is None else rate
def init_weights(self):
"""Initialize weights using Nguyen-Widrow."""
self.W1 = np.random.uniform(-0.5, 0.5,
(self.input_size, self.hidden_size))
self.W2 = np.random.uniform(-0.5, 0.5,
(self.hidden_size, self.output_size))
# Initialize the hidden layer weights
beta = 0.7 * (self.hidden_size ** (1.0 / self.input_size))
for n in range(self.hidden_size):
norm_val = np.linalg.norm(self.W1[:, n])
self.W1[:, n] = np.multiply(self.W1[:, n], beta / norm_val)
# Initialize the output layer weights
beta = 0.7 * (self.output_size ** (1.0 / self.hidden_size))
for n in range(self.output_size):
norm_val = np.linalg.norm(self.W2[:, n])
self.W2[:, n] = np.multiply(self.W2[:, n], beta / norm_val)
def forward(self, sample):
"""Forward propagation through the network.
sample: ndarray of shape (n, input_size), where n is number of samples
"""
self.Z2 = np.dot(sample.T, self.W1).T
self.A2 = self.activate_func[0](self.Z2)
self.Z3 = np.dot(self.A2.T, self.W2).T
self.y_hat = self.activate_func[0](self.Z3)
return self.y_hat
def cost(self, estimate, target):
"""Sum Squared Error cost function.
estimate: ndarray of shape (output_size,n), where n is number of samples
target : ndarray of shape (output_size,n)
"""
return np.mean(np.mean((target - estimate) ** 2, axis=0))
def cost_prime(self, sample, target, estimate):
"""Gradient descent derivative.
sample : ndarray of shape (input_size, n), where n is the number of samples
target : ndarray of shape (output_size, n)
estimate: ndarray of shape (output_size, n)
"""
total = len(sample)
delta3 = np.multiply(-(target - estimate),
self.activate_func[-1](self.Z3))
dW2 = np.multiply(np.dot(self.A2, delta3.T), 2 / total)
delta2 = np.dot(self.W2, delta3) * self.activate_func[-1](self.Z2)
dW1 = np.multiply(np.dot(sample, delta2.T), 2 / total)
return dW1, dW2
def evaluate(self, sample, target):
"""Evaluate network performace using given data."""
results = self.forward(sample.T)
pairs = [(np.argmax(x), np.argmax(y))
for x, y in zip(results.T, target.T)]
correct = sum(int(x == y) for x, y in pairs)
return correct
def backprop(self, images, labels):
"""Update weights using batch backpropagation."""
size = len(labels)
dW1s = []
dW2s = []
for i in range(size):
label = labels[i]
image = images[i]
estimate = self.forward(image)
dW1, dW2 = self.cost_prime(image, label, estimate)
dW1s.append(dW1)
dW2s.append(dW2)
self.W1 = self.W1 - (self.rate / size) * sum(dW1s)
self.W2 = self.W2 - (self.rate / size) * sum(dW2s)
def train(self, train_data, epochs, batch_size, test_set=None):
"""Train the neural network using given data and parameters."""
if test_set is not None:
size_test = len(test_set.labels)
size = len(train_data.labels)
print("num training data: {}".format(size))
self.costs = []
start = time.time()
for r in range(epochs):
batch_datas = []
for i in range(10):
# Load the next batch of samples
train_data_images, train_data_labels = train_data.next_batch(
batch_size)
# Format the data
mini_batches = self.formData(
train_data_images, train_data_labels)
batch_datas.append(mini_batches)
for batch_data in batch_datas:
images, labels = [], []
for data in batch_data:
images.append(data[0])
labels.append(data[1])
self.backprop(images, labels)
# target = train_data.labels
# sample = train_data.images
# estimate = self.forward(sample.T)
# cost = self.cost(estimate, target.T)
# self.costs.append(cost)
# print("Epoch {} complete: cost {}".format(r, cost))
if test_set is not None:
target = test_set.labels
sample = test_set.images
correct = self.evaluate(sample, target.T)
print(" {} / {}".format(correct, size_test))
stop = time.time()
elapsed = stop - start
print("Time elapsed: {} sec".format(elapsed))
def formData(self, data_images, data_labels):
'''
Format the data.
:param data_images: set of image vectors
:param data_labels: set of image labels
:return: formatted data
'''
data = zip([np.reshape(x, (784, 1)) for x in data_images],
[np.reshape(y, (10, 1)) for y in data_labels])
return data
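As a usage sketch for this class (the 784 input and 10 output units are fixed by MNIST; the hidden size, learning rate, epochs, and batch size below are illustrative rather than the post's settings):

import load as input_data
from NeutralNetwork import NeuralNetwork  # file name as given above

mnist = input_data.readDataSets('data', one_hot=True)
nn = NeuralNetwork(in_units=784, hidden_units=200, out_units=10)
nn.init_weights()            # Nguyen-Widrow initialization
nn.configure(rate=3.0)       # illustrative learning rate
nn.train(mnist.train, epochs=30, batch_size=10, test_set=mnist.test)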
- Training. The training entry point is in main.py:
import load as input_data
# import MNIST
import network
def train():
mnist = input_data.readDataSets('data', one_hot=True)
train_data = mnist.train
validation_data = mnist.validation
nn = network.NN(sizes=[784, 200, 10], epochs=50000, mini_batch_size=10, learning_rate=0.3)
nn.fit(train_data, validation_data=validation_data)
nn.save()
train()
The training output looks like this:
$ python3 main.py
Extracting data/train-images-idx3-ubyte.gz
Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz
Epoch 1, accuracy 16.900000000000002 %.
Epoch 2, accuracy 16.900000000000002 %.
Epoch 3, accuracy 17.02 %.
Epoch 4, accuracy 16.939999999999998 %.
Epoch 5, accuracy 17.02 %.
Epoch 6, accuracy 17.0 %.
Epoch 7, accuracy 16.98 %.
Epoch 8, accuracy 17.0 %.
Epoch 9, accuracy 17.06 %.
Epoch 10, accuracy 17.080000000000002 %.
Epoch 11, accuracy 17.06 %.
Epoch 12, accuracy 17.1 %.