import numpy as np
import struct
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
# You can run help(struct) in a Jupyter notebook to see how to use struct
help(struct)
with open("./dataset/t10k-labels-idx1-ubyte", "rb") as f:
# 如果没有rb, 会读成str格式的文件, 需要rb转换成二进制的文件
data = f.read()
magic_number, num_data = struct.unpack(">ii", data[:8])
magic_number, num_data, np.frombuffer(data[8:], dtype = np.uint8)
(2049, 10000, array([7, 2, 1, ..., 4, 5, 6], dtype=uint8))
with open("./dataset/train-images-idx3-ubyte", "rb") as f:
data = f.read()
magic_number, num_data, rows, cols = struct.unpack(">iiii", data[:16])
magic_number, num_data, rows, cols , np.frombuffer(data[16:], dtype = np.uint8).shape
(2051, 60000, 28, 28, (47040000,))
From this we can see that MNIST is a small dataset: the images are only 28x28. Training it with ResNet would be problematic, because ResNet's first layer is a 7x7 convolution (stride 2) immediately followed by max pooling, so the input would shrink to about 7x7 right away and the feature extraction would be meaningless. In this post we train a BP (fully connected) neural network instead.
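Before wrapping things up, a quick look at one sample never hurts; here is a minimal sketch (assuming the same ./dataset/ paths as above) that renders the first training digit with matplotlib:
# Sketch: visualize the first 28x28 training digit (matplotlib was imported above)
with open("./dataset/train-images-idx3-ubyte", "rb") as f:
    data = f.read()
pixels = np.frombuffer(data, dtype=np.uint8, offset=16)  # skip the 16-byte header
plt.imshow(pixels[:28 * 28].reshape(28, 28), cmap="gray")  # first image
plt.show()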
Now that the parsing logic works, let's wrap it into functions.
def load_labels(file):
    with open(file, "rb") as f:
        data = f.read()
    magic_number, num_data = struct.unpack(">ii", data[:8])
    if magic_number != 2049:
        print(f"Not a label file: magic number should be 2049, got {magic_number}")
        return None
    labels = np.frombuffer(data[8:], dtype = np.uint8)
    return labels

def load_images(file):
    with open(file, "rb") as f:
        data = f.read()
    magic_number, num_data, rows, cols = struct.unpack(">iiii", data[:16])
    if magic_number != 2051:
        print(f"Not an image file: magic number should be 2051, got {magic_number}")
        return None
    images = np.frombuffer(data[16:], dtype = np.uint8).reshape(num_data, -1)
    return images
labels_train = load_labels("./dataset/train-labels-idx1-ubyte")
images_train = load_images("./dataset/train-images-idx3-ubyte")
labels_train.shape, images_train.shape # 60000 samples, each flattened to 784 = 28 * 28
((60000,), (60000, 784))
The image pixels are in 0-255; normalize them so the values lie between -0.5 and 0.5.
# Normalization
train_images = load_images("./dataset/train-images-idx3-ubyte")
a = np.max(train_images)
b = np.min(train_images)
print(a, b) # expect 255, 0
train_images = train_images / 255 - 0.5
a = np.max(train_images)
b = np.min(train_images)
# verify the range
print(a, b) # expect 0.5, -0.5
255 0
0.5 -0.5
# load train data
train_images = load_images("./dataset/train-images-idx3-ubyte")
train_images = train_images / 255 - 0.5
train_labels = load_labels("./dataset/train-labels-idx1-ubyte")
# load val data
val_images = load_images("./dataset/t10k-images-idx3-ubyte")
val_images = val_images / 255 - 0.5
val_labels = load_labels("./dataset/t10k-labels-idx1-ubyte")
# display
print(f"train_images.shape: {train_images.shape}")
print(f"train_labels.shape: {train_labels.shape}")
print(f"val_images.shape: {val_images.shape}")
print(f"val_labels.shape: {val_labels.shape}")
train_images.shape: (60000, 784)
train_labels.shape: (60000,)
val_images.shape: (10000, 784)
val_labels.shape: (10000,)
There are 60000 samples in total, each with 784 dimensions, which matches our expectation: 28 * 28 = 784.
Take label = 3 with classes = 10 as the running example.
The naive goal: push the probability of class 3 as close to 1 as possible and every other class as close to 0 as possible.
The image-classification view: probability theory has no absolute zero probability, so we let the other classes keep a little probability mass; after all, classification is really a similarity comparison.
Stopping the model from becoming over-confident makes training harder and counters overfitting.
This trick (label smoothing) is used all the time when building classifiers and can improve accuracy.
But no trick is a cure-all!
# DU Version
# rewrite the label
label = 3
classes = 10
one_hot = np.zeros(shape = (1, classes))
one_hot[0, label] = 1
print(one_hot, one_hot.shape)
# label smoothing
e = 0.2 # smoothing coefficient: take 0.2 away from the 1 and spread it evenly over all classes
one_hot[0, label] = 1 - e
eoff = e / classes
one_hot += eoff
print(one_hot, one_hot.shape)
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]] (1, 10)
[[0.02 0.02 0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02]] (1, 10)
train_labels, train_labels.shape
(array([5, 0, 4, ..., 5, 6, 8], dtype=uint8), (60000,))
def one_hot(labels, classes, label_smoothing = 0.2):
    n = len(labels)
    eoff = label_smoothing / classes
    output = np.ones((n, classes), dtype = np.float32) * eoff
    for row, label in enumerate(labels):
        output[row, label] = 1 - label_smoothing + eoff
    return output
one_hot(train_labels, 10, 0.3)
array([[0.03, 0.03, 0.03, ..., 0.03, 0.03, 0.03],
[0.73, 0.03, 0.03, ..., 0.03, 0.03, 0.03],
[0.03, 0.03, 0.03, ..., 0.03, 0.03, 0.03],
...,
[0.03, 0.03, 0.03, ..., 0.03, 0.03, 0.03],
[0.03, 0.03, 0.03, ..., 0.03, 0.03, 0.03],
[0.03, 0.03, 0.03, ..., 0.03, 0.73, 0.03]], dtype=float32)
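The row loop above can also be written with NumPy fancy indexing; an equivalent sketch (the name one_hot_vectorized is my own):
def one_hot_vectorized(labels, classes, label_smoothing = 0.2):
    # same math as the loop version, expressed with fancy indexing
    eoff = label_smoothing / classes
    output = np.full((len(labels), classes), eoff, dtype = np.float32)
    output[np.arange(len(labels)), labels] = 1 - label_smoothing + eoff
    return output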
class Dataset:
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels

    def __getitem__(self, index):
        return self.images[index], self.labels[index]

    def __len__(self): # lets len(dataset) report the number of samples
        return self.images.shape[0]

dataset = Dataset(train_images, train_labels)
print(len(dataset))
# dataset[0] shows a single (image, label) pair
60000
To deepen understanding we write this by hand instead of using a framework.
During training we usually shuffle the dataset. Rather than shuffling the data itself, we shuffle an index list and walk through it with a cursor:
self.current_index_list_cursor 0 1 2 3 4 5 the original dataset indices
self.random_index_list         5 3 2 0 1 4 the shuffled indices
Use the shuffled self.random_index_list to fetch values from the dataset (randomly).
Use self.current_index_list_cursor to walk through self.random_index_list in order; a small sketch of the idea follows.
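As a quick illustration of the cursor-over-shuffled-indices idea (a sketch only; the real DataLoader below shuffles a NumPy arange with random.shuffle):
# shuffled index list, e.g. [5 3 2 0 1 4]
random_index_list = np.random.permutation(6)
for cursor in range(len(random_index_list)):  # the in-order cursor
    index = random_index_list[cursor]         # the shuffled index used to read the dataset
    print(cursor, "->", index)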
# DataLoader pulls one batch at a time from the dataset and decides whether to shuffle,
# so it needs three parameters: dataset, batch_size, shuffle
# one full pass over the iterator covers the whole dataset
# The iterator yields one batch at a time and also detects when an epoch is finished (StopIteration)
# It keeps pulling data from the dataset:
# shuffle all indices first, then fetch batch after batch with the cursor
class DataLoader:
    def __init__(self, dataset, batch_size, shuffle = True):
        self.dataset = dataset
        self.shuffle = shuffle
        self.batch_size = batch_size

    def __iter__(self):
        return DataLoaderIterator(self.dataset, self.batch_size, self.shuffle)

class DataLoaderIterator:
    def __init__(self, dataset, batch_size, shuffle):
        self.dataset = dataset # keep a reference; __next__ reads from it
        self.dataset_size = len(dataset)
        self.batch_size = batch_size # e.g. batch_size = 16
        self.current_index_list_cursor = 0 # the in-order cursor that walks self.random_index_list
        self.random_index_list = np.arange(0, self.dataset_size) # all indices; these get shuffled, e.g. 0 - 59999
        if shuffle:
            random.shuffle(self.random_index_list)

    def __next__(self): # called once per batch
        if self.current_index_list_cursor >= self.dataset_size: # not an error, just the stop condition
            raise StopIteration
        # To read the dataset in shuffled order we built an index list as large as the dataset
        # (self.random_index_list), shuffled it, and now walk it with self.current_index_list_cursor.
        # The cursor always points at the next unread index.
        # With 10 samples and batch_size 5:
        #   cursor at 0 -> we can take 5 samples
        #   cursor at 8 -> only 2 samples are left, not 5
        begin = self.current_index_list_cursor
        end = self.current_index_list_cursor + self.batch_size
        end = min(end, self.dataset_size) # clamp the last batch so we never run past the dataset
        batch_image, batch_label = [], []
        # collect one batch
        for i in range(begin, end):
            index = self.random_index_list[i] # walk the shuffled index list
            image, label = self.dataset[index] # Dataset.__getitem__ returns (image, label), defined above
            batch_image.append(image)
            batch_label.append(label)
        self.current_index_list_cursor = end
        return np.vstack(batch_image), np.vstack(batch_label)
dataset = Dataset(train_images, one_hot(train_labels, 10))
dataloader = DataLoader(dataset, 5, True) # batch_size = 5 for this demo, matching the output below
for index, (batch_images, batch_labels) in enumerate(dataloader):
    print(index, batch_images.shape)
    print(batch_labels)
0 (5, 784)
[[0.82 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.02 0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.02 0.02 0.02 0.02 0.82 0.02 0.02 0.02 0.02]
[0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02]]
1 (5, 784)
[[0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.82 0.02]
[0.02 0.02 0.02 0.02 0.02 0.02 0.82 0.02 0.02 0.02]
[0.02 0.02 0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.02 0.02 0.02 0.02 0.02 0.82 0.02 0.02 0.02]
[0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.82]]
2 (5, 784)
[[0.02 0.02 0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.02 0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.82]
[0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02]]
....
11999 (5, 784)
[[0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.82]
[0.82 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02]
[0.02 0.02 0.02 0.02 0.02 0.82 0.02 0.02 0.02 0.02]
[0.02 0.02 0.02 0.02 0.02 0.02 0.82 0.02 0.02 0.02]
[0.02 0.02 0.82 0.02 0.02 0.02 0.02 0.02 0.02 0.02]]
Ten classes (digits 0-9); batch_size, epochs, lr, etc. are the usual deep-learning hyperparameters.
classes = 10 # 10 classes
batch_size = 12 # images per batch
epochs = 10 # stopping policy: see the full dataset 10 times
lr = 1e-3 # learning-rate policy
num_data, data_dims = train_images.shape
num_data, data_dims
(60000, 784)
train_data = DataLoader(Dataset(train_images, one_hot(train_labels, classes)), batch_size, shuffle = True)
Every neuron applies an activation.
A layer is essentially the set of parameters sitting in between.
A layer maps inputs to outputs; the activation itself can also be treated as a layer, just another box.
The abstract picture: the first layer maps input to output; the second layer maps input through an activation function to output.
The final output activation is fused with the loss, so the sigmoid and the loss are differentiated together.
We organize the code this way to stay close to PyTorch.
class Module:
    def __init__(self, name):
        self.name = name

    def __call__(self, *args):
        return self.forward(*args)

class LinearLayer(Module):
    def __init__(self, input_feature, output_feature):
        super().__init__("Linear") # initialize the parent class
        self.weight = np.random.normal(size = (input_feature, output_feature))
        self.bias = np.zeros((1, output_feature)) # every neuron has its own bias

    def forward(self, x): # save x because backward will need it
        self.x_save = x.copy()
        print(f"forward operation")
        return x @ self.weight # + self.bias

input_to_hidden = LinearLayer(784, 256)
x = input_to_hidden(train_images)
forward operation
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

sigmoid(10000), sigmoid(-10000)
:2: RuntimeWarning: overflow encountered in exp
return 1 / (1 + np.exp(-x))
(1.0, 0.0)
def sigmoid(x):
    # x is a NumPy array
    x = x.copy()
    p0 = x < 0
    p1 = ~p0 # complement of p0
    x[p1] = 1 / (1 + np.exp(-x[p1]))            # safe when x >= 0
    x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0])) # safe when x < 0: exp of a negative number cannot overflow
    return x

var = np.array([1, 2, 3, -3000], np.float32)
sigmoid(var)
array([0.7310586 , 0.880797 , 0.95257413, 0. ], dtype=float32)
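For reference, SciPy ships the same numerically stable sigmoid as scipy.special.expit (assuming SciPy is installed; it is not used elsewhere in this post):
from scipy.special import expit
print(expit(np.array([1, 2, 3, -3000], np.float32)))  # should match our sigmoid above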
# We call every layer (whether or not it is a "real" layer) a module.
# Module itself does nothing fancy: it just dispatches __call__ to forward()
class Module:
    def __init__(self, name):
        self.name = name

    def __call__(self, *args):
        return self.forward(*args)

# Container for a parameter and its gradient
class Parameter:
    def __init__(self, value):
        self.value = value
        self.delta = np.zeros(value.shape) # delta has the same shape as value, because value is updated by delta

    def zero_grad(self):
        self.delta[...] = 0

class LinearLayer(Module):
    def __init__(self, input_feature, output_feature):
        super().__init__("Linear") # initialize the parent class
        self.weight = Parameter(np.random.normal(size = (input_feature, output_feature)))
        self.bias = Parameter(np.zeros((1, output_feature))) # every neuron has its own bias

    def forward(self, x): # save x because backward needs it
        self.x_save = x.copy()
        return x @ self.weight.value # + self.bias

    def backward(self, G):
        # For C = A @ B with upstream gradient G:
        #   dA = G @ B.T
        #   dB = A.T @ G
        # Parameter.delta starts at zeros, so we can write into it directly; G is reset every batch
        self.weight.delta[...] = self.x_save.T @ G # we only compute gradients here; updating is the optimizer's job (SGD: weight = weight - lr * delta)
        return G @ self.weight.value.T

class ReLULayer(Module): # inplace modifies the tensor directly, avoiding an extra copy
    def __init__(self, inplace=True):
        super().__init__("ReLU")
        self.inplace = inplace

    def forward(self, x):
        self.mask = x <= 0 # positions to zero out
        if not self.inplace: # not inplace, so work on a copy
            x = x.copy()
        x[self.mask] = 0 # the ReLU curve: clamp negatives to 0
        return x

    def backward(self, G):
        # where mask is True the derivative is also 0
        if not self.inplace:
            G = G.copy()
        G[self.mask] = 0
        return G

# The output activation and the loss are fused into one layer
class SigmoidCrossEntropyLossLayer(Module):
    def __init__(self):
        super().__init__("SigmoidCrossEntropyLoss")

    def sigmoid(self, x):
        x = x.copy()
        p0 = x < 0
        p1 = ~p0
        x[p1] = 1 / (1 + np.exp(-x[p1]))
        x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0]))
        return x

    # forward(): sigmoid >>> cross entropy (with clipping against log(0) overflow)
    def forward(self, predict, labels): # the returned loss is only for monitoring; it does not take part in the update
        self.labels = labels
        self.prob = self.sigmoid(predict) # turn predict into probabilities; self.prob is reused in backward
        self.batch_size = self.prob.shape[0]
        # guard against log(0) overflow in the cross entropy
        eps = 1e-6
        self.prob = np.clip(self.prob, a_min = eps, a_max = 1 - eps)
        return -np.sum(labels * np.log(self.prob) + (1 - labels) * np.log(1 - self.prob)) / self.batch_size

    def backward(self):
        return (self.prob - self.labels) / self.batch_size
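As a sanity check on the backward formulas (dA = G @ B.T, dB = A.T @ G), here is a small numerical-gradient sketch of my own, comparing LinearLayer's analytic weight gradient against finite differences under a simple sum loss:
np.random.seed(0)
layer = LinearLayer(4, 3)
x = np.random.normal(size = (2, 4))

# analytic gradient: loss = sum(layer(x)), so the upstream gradient G is all ones
out = layer(x)
layer.backward(np.ones_like(out))
analytic = layer.weight.delta.copy()

# finite-difference gradient for the single weight entry [0, 0]
eps = 1e-6
layer.weight.value[0, 0] += eps
loss_plus = np.sum(layer(x))
layer.weight.value[0, 0] -= 2 * eps
loss_minus = np.sum(layer(x))
layer.weight.value[0, 0] += eps # restore the weight
numeric = (loss_plus - loss_minus) / (2 * eps)
print(analytic[0, 0], numeric)  # the two values should agree to several decimals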
Here we build a network with a single hidden layer of 256 nodes.
classes = 10 # 10 classes
batch_size = 12 # images per batch
epochs = 10 # stopping policy: see the full dataset 10 times
lr = 1e-3 # learning-rate policy
num_data, data_dims = train_images.shape # 60000, 784
num_hidden_nodes = 256 # number of hidden neurons
# Because Module implements __call__, we can write layer(x) instead of layer.forward(x)
# 784 >>> 256 >>> 10
# theta1 is 784 x 256, theta2 is 256 x 10; the loss sees 10 outputs (one per class)
input_to_hidden = LinearLayer(data_dims, num_hidden_nodes)
activation_hidden = ReLULayer()
hidden_to_output = LinearLayer(num_hidden_nodes, classes)
loss_func = SigmoidCrossEntropyLossLayer()
# for epoch in range(epochs):
for batch_index, (images, labels) in enumerate(train_data):
    # forward pass
    x = input_to_hidden(images)        # input to hidden layer
    x = activation_hidden(x)           # hidden-layer activation
    predict = hidden_to_output(x)      # hidden layer to output layer
    loss = loss_func(predict, labels)  # output activation + loss, fused in one class

    # backward pass
    G = loss_func.backward()           # the gradient that updates the parameters, shape (batch_size, classes); the loss value itself is just a training-progress signal
    G = hidden_to_output.backward(G)
    G = activation_hidden.backward(G)
    G = input_to_hidden.backward(G)
    print(loss, G.shape)
    break
58.472900737465665 (12, 784)
This is rather loose; let's wrap the structure, forward pass, and backward pass into a Model class.
# The Model owns the network structure, the forward pass, and the backward pass
class Model(Module):
    def __init__(self, input_feature, num_hidden_nodes, num_output): # num_output plays the role of classes at the output layer
        super().__init__("model")
        self.input_to_hidden = LinearLayer(input_feature, num_hidden_nodes)
        self.activation_hidden = ReLULayer()
        self.hidden_to_output = LinearLayer(num_hidden_nodes, num_output)

    def forward(self, x):
        x = self.input_to_hidden(x)
        x = self.activation_hidden(x)
        x = self.hidden_to_output(x)
        return x

    def backward(self, G):
        G = self.hidden_to_output.backward(G)
        G = self.activation_hidden.backward(G)
        G = self.input_to_hidden.backward(G)
        return G

    def update(self, lr): # kept for completeness; the optimizer below does the same job
        self.input_to_hidden.weight.value -= lr * self.input_to_hidden.weight.delta
        self.hidden_to_output.weight.value -= lr * self.hidden_to_output.weight.delta

class Optimizer:
    def __init__(self, model, lr):
        self.model = model
        self._lr = lr

    @property # read access
    def lr(self):
        return self._lr

    @lr.setter # write access: if the learning-rate schedule changes lr, just assign to it
    def lr(self, value):
        self._lr = value

class SGD(Optimizer):
    def __init__(self, model, lr):
        super().__init__(model, lr)

    def update(self): # the counterpart of step() in PyTorch
        self.model.input_to_hidden.weight.value -= self._lr * self.model.input_to_hidden.weight.delta
        self.model.hidden_to_output.weight.value -= self._lr * self.model.hidden_to_output.weight.delta
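Because Optimizer exposes lr as a property, a learning-rate schedule is just an assignment to optim.lr; a minimal step-decay sketch (the milestones are made up for illustration):
model = Model(784, 256, 10)
optim = SGD(model, 1e-3)
for epoch in range(10):
    if epoch in (5, 8):            # hypothetical decay milestones
        optim.lr = optim.lr * 0.1  # goes through the @lr.setter
    print(epoch, optim.lr)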
classes = 10 # 10 classes
batch_size = 12 # images per batch
epochs = 10 # stopping policy: see the full dataset 10 times
lr = 1e-3 # learning-rate policy
num_data, data_dims = train_images.shape # 60000, 784
num_hidden_nodes = 256 # number of hidden neurons
model = Model(data_dims, num_hidden_nodes, classes)
optim = SGD(model, lr)
loss_func = SigmoidCrossEntropyLossLayer()
iters = 0
for epoch in range(epochs):
    for batch_index, (images, labels) in enumerate(train_data):
        iters += 1
        # forward pass
        predict = model(images)
        loss = loss_func(predict, labels) # output activation + loss, fused in one class
        # backward pass and update
        G = loss_func.backward() # gradient of shape (batch_size, classes); the loss value is just a progress signal
        model.backward(G)
        optim.update()
        if iters % 1000 == 0:
            print(f"Iter: {iters}, Loss{loss:.3f}, LR{lr:.5f}")
Iter: 1000, Loss18.699, LR0.00100
Iter: 2000, Loss13.798, LR0.00100
Iter: 3000, Loss12.904, LR0.00100
Iter: 4000, Loss14.906, LR0.00100
Iter: 5000, Loss7.345, LR0.00100
Iter: 6000, Loss7.370, LR0.00100
Iter: 7000, Loss12.224, LR0.00100
Iter: 8000, Loss12.321, LR0.00100
Iter: 9000, Loss5.931, LR0.00100
Iter: 10000, Loss10.206, LR0.00100
Iter: 11000, Loss13.899, LR0.00100
Iter: 12000, Loss9.123, LR0.00100
Iter: 13000, Loss11.403, LR0.00100
Iter: 14000, Loss7.200, LR0.00100
Iter: 15000, Loss11.846, LR0.00100
Iter: 16000, Loss7.050, LR0.00100
Iter: 17000, Loss8.564, LR0.00100
Iter: 18000, Loss7.962, LR0.00100
Iter: 19000, Loss8.171, LR0.00100
Iter: 20000, Loss6.828, LR0.00100
Iter: 21000, Loss8.018, LR0.00100
Iter: 22000, Loss11.010, LR0.00100
Iter: 23000, Loss8.378, LR0.00100
Iter: 24000, Loss6.031, LR0.00100
Iter: 25000, Loss7.636, LR0.00100
Iter: 26000, Loss5.632, LR0.00100
Iter: 27000, Loss6.810, LR0.00100
Iter: 28000, Loss6.750, LR0.00100
Iter: 29000, Loss5.714, LR0.00100
Iter: 30000, Loss5.135, LR0.00100
Iter: 31000, Loss6.204, LR0.00100
Iter: 32000, Loss4.211, LR0.00100
Iter: 33000, Loss6.628, LR0.00100
Iter: 34000, Loss5.161, LR0.00100
Iter: 35000, Loss5.039, LR0.00100
Iter: 36000, Loss5.097, LR0.00100
Iter: 37000, Loss5.381, LR0.00100
Iter: 38000, Loss5.104, LR0.00100
Iter: 39000, Loss4.155, LR0.00100
Iter: 40000, Loss4.377, LR0.00100
Iter: 41000, Loss3.814, LR0.00100
Iter: 42000, Loss3.465, LR0.00100
Iter: 43000, Loss4.394, LR0.00100
Iter: 44000, Loss6.046, LR0.00100
Iter: 45000, Loss3.449, LR0.00100
Iter: 46000, Loss4.723, LR0.00100
Iter: 47000, Loss5.070, LR0.00100
Iter: 48000, Loss4.680, LR0.00100
Iter: 49000, Loss3.365, LR0.00100
Iter: 50000, Loss3.074, LR0.00100
That is the complete flow of training a simple neural network; let's see how it can be improved further.
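As a first check, here is a minimal evaluation sketch (my own addition, reusing the model and the val_images/val_labels loaded earlier) that measures accuracy on the validation set:
predict = model(val_images)             # forward pass over the whole validation set
pred_labels = predict.argmax(axis = 1)  # highest-scoring class per sample
accuracy = np.mean(pred_labels == val_labels)
print(f"val accuracy: {accuracy:.4f}")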