Pytorch实战1—— MNIST 数据集分类

分为五部分:1. 准备数据集  2. 设计模型  3. 构建损失函数和优化器  4. 训练  5. 测试验证

from pathlib import Path
import requests
# Load the dataset: fetch the pickled MNIST archive on the first run and cache it on disk.
DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"

PATH.mkdir(parents=True, exist_ok=True)

URL = "http://deeplearning.net/data/mnist/"
FILENAME = "mnist.pkl.gz"

if not (PATH / FILENAME).exists():
    # Bound the wait, and fail loudly on an HTTP error so that an error page
    # is never cached on disk as if it were the dataset.
    response = requests.get(URL + FILENAME, timeout=60)
    response.raise_for_status()
    # write_bytes opens, writes and closes the file (the old code leaked the handle).
    (PATH / FILENAME).write_bytes(response.content)

import pickle
import gzip

# Read the cached archive. It unpickles to three (inputs, labels) pairs; the
# training and validation pairs are kept and the third element is discarded.
# y_train holds the labels, i.e. which class each image belongs to.
# NOTE(review): pickle.load can execute arbitrary code, and this archive is
# downloaded over plain HTTP — only load it from a source you trust.
archive_path = (PATH / FILENAME).as_posix()
with gzip.open(archive_path, "rb") as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding="latin-1")

from matplotlib import pyplot
import numpy as np
# Sanity-check the loaded data by drawing the first training sample.
# Each sample is stored as a flat vector of 784 values, so it is reshaped
# to 28x28 before being shown as an image.
first_image = x_train[0].reshape((28, 28))
pyplot.imshow(first_image, cmap="gray")
print(x_train.shape)

#把数据转化为tensor格式
import torch
# Convert the four NumPy arrays to torch tensors, keeping their order
x_train, y_train, x_valid, y_valid = (
    torch.tensor(array) for array in (x_train, y_train, x_valid, y_valid)
)
# Loss function: cross-entropy
import torch.nn.functional as F
loss_func = F.cross_entropy

#创建一个模型
from torch import nn

class Mnist_NN(nn.Module):
    """Fully connected classifier with layer sizes 784 -> 128 -> 256 -> 10.

    For an input of N samples the output has N rows and 10 columns: one row
    per sample, one column per class. The column holding the largest value
    in a row is the predicted class for that sample.
    """

    def __init__(self):
        super().__init__()
        # nn.Linear(in_features, out_features): the product of the two numbers
        # is the size of that layer's weight matrix.
        self.hidden1 = nn.Linear(784, 128)
        self.hidden2 = nn.Linear(128, 256)
        self.out = nn.Linear(256, 10)
        # A Dropout layer (e.g. nn.Dropout(0.5)) could be added here to reduce
        # overfitting; it is intentionally left out.

    def forward(self, x):
        """Return the raw class scores for ``x``; they are not converted to probabilities."""
        first_activation = F.relu(self.hidden1(x))
        second_activation = F.relu(self.hidden2(first_activation))
        return self.out(second_activation)

# 准备数据集
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
bs = 64  # mini-batch size used for training

# Pair every input tensor with its label tensor
train_ds = TensorDataset(x_train, y_train)  # training set
valid_ds = TensorDataset(x_valid, y_valid)  # validation set

# Training batches are shuffled; validation batches are twice as large and keep their order
train_dl = DataLoader(dataset=train_ds, batch_size=bs, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=2 * bs)

def get_data(train_ds, valid_ds, bs):
    """Wrap the two datasets in DataLoaders.

    Args:
        train_ds: dataset used for training; served shuffled in batches of ``bs``.
        valid_ds: dataset used for validation; served in order in batches of ``bs * 2``.
        bs: training batch size.

    Returns:
        A ``(train_loader, valid_loader)`` tuple.
    """
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
    valid_loader = DataLoader(valid_ds, batch_size=bs * 2)
    return train_loader, valid_loader

#训练
import numpy as np
# step:训练迭代次数,model训练模型,loss_func损失函数构建,opt使用的优化器,train_dl待训练的数据集  valid_dl待测试的数据集
def fit(steps, model, loss_func, opt, train_dl, valid_dl):
    """Train ``model`` and print the validation loss after every pass.

    Args:
        steps: number of full passes over ``train_dl``.
        model: the module being trained.
        loss_func: callable that computes the loss from predictions and targets.
        opt: optimizer that updates the model's parameters.
        train_dl: iterable of ``(xb, yb)`` training batches.
        valid_dl: iterable of ``(xb, yb)`` validation batches.
    """
    for step in range(steps):
        # Training pass: every batch takes one optimizer step
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        # Validation pass: no optimizer is passed, so the weights are not updated
        model.eval()
        with torch.no_grad():
            batch_results = [loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
        # Split the (loss, batch size) pairs into two parallel sequences
        losses, nums = zip(*batch_results)

        # Batches need not all be the same size (the last one may be smaller),
        # so each batch loss is weighted by its sample count: an element-wise
        # product, summed, then divided by the total number of samples.
        weighted_losses = np.multiply(losses, nums)
        val_loss = np.sum(weighted_losses) / np.sum(nums)

        step_text = '当前step:' + str(step)
        loss_text = '验证集损失:' + str(val_loss)
        print(step_text, loss_text)

from torch import optim
def get_model(lr=0.001):
    """Create a fresh Mnist_NN together with the SGD optimizer that trains it.

    Args:
        lr: learning rate passed to SGD. Defaults to 0.001, the value that was
            previously hard-coded, so existing ``get_model()`` calls are unchanged.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    model = Mnist_NN()
    return model, optim.SGD(model.parameters(), lr=lr)

def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss on one batch and, if an optimizer is given, take one training step.

    Args:
        model: callable that produces predictions for ``xb``.
        loss_func: callable that computes the loss from predictions and targets.
        xb: batch of inputs.
        yb: batch of target values.
        opt: optimizer; when ``None`` the batch is only evaluated.

    Returns:
        A ``(loss value as a Python number, number of samples in the batch)`` tuple.
    """
    predictions = model(xb)  # forward pass
    loss = loss_func(predictions, yb)
    batch_size = len(xb)

    # Evaluation only: nothing to update
    if opt is None:
        return loss.item(), batch_size

    loss.backward()  # backward pass
    opt.step()  # update the weights
    # Gradients accumulate across iterations, so clear them before the next batch
    opt.zero_grad()
    return loss.item(), batch_size

# Build the data loaders, then the model and its optimizer, and train for 25 passes
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
model, opt = get_model()
fit(
    steps=25,
    model=model,
    loss_func=loss_func,  # loss_func is F.cross_entropy (cross-entropy loss)
    opt=opt,
    train_dl=train_dl,
    valid_dl=valid_dl,
)

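# _, predicted = torch.max(outputs.data, 1)
# Each sample has 10 output values and a validation batch holds 128 samples, so the
# maximum is taken 128 times, once per row. The index of a row's largest output
# value is the predicted class for that sample.
# Outputs of one sample: [ Out0, Out1, Out2, Out3, Out4, Out5, Out6, Out7, Out8, Out9 ]
# Index:                 [  0,    1,    2,    3,    4,    5,    6,    7,    8,    9   ]
# The index is the class label: the 128 per-sample maximum values go into "_" and
# the corresponding class indices go into "predicted".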
"""
for xb,yb in valid_dl :   #测试集batch是128
    outputs = model(xb)   #把测试集放入训练好的模型验证,得到神经网络的输出值
    _,predicted = torch.max(outputs.data,1)
    print(_)
    print(predicted)
"""
# Measure classification accuracy of the trained model on the validation set.
model.eval()  # make sure the model is in evaluation mode, independent of earlier code
correct = 0
total = 0
with torch.no_grad():  # evaluation needs no gradients, so do not build the autograd graph
    for xb, yb in valid_dl:  # validation batches hold bs * 2 = 128 samples
        outputs = model(xb)  # raw class scores, one row per sample
        # torch.max along dim 1 returns each row's largest value and its column
        # index; that index is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += yb.size(0)  # number of samples in this batch
        correct += (predicted == yb).sum().item()  # predictions that match the labels
print("10000个测试样本的准确率:", correct * 100 / total)

运行结果:

 

(50000, 784)
当前step:0 验证集损失:2.2777113636016844
当前step:1 验证集损失:2.2423159954071044
当前step:2 验证集损失:2.187604923248291
当前step:3 验证集损失:2.0980567459106445
当前step:4 验证集损失:1.9521116943359376
当前step:5 验证集损失:1.7360260568618775
当前step:6 验证集损失:1.473681090736389
当前step:7 验证集损失:1.224969033432007
当前step:8 验证集损失:1.030651506614685
当前step:9 验证集损失:0.8901778805732727
当前step:10 验证集损失:0.7876866367340087
当前step:11 验证集损失:0.7098233998298645
当前step:12 验证集损失:0.6489719208717346
当前step:13 验证集损失:0.6005636574745178
当前step:14 验证集损失:0.5602233497619629
当前step:15 验证集损失:0.5280175679683685
当前step:16 验证集损失:0.5008529422283172
当前step:17 验证集损失:0.4780473559379578
当前step:18 验证集损失:0.45913500146865843
当前step:19 验证集损失:0.442117148065567
当前step:20 验证集损失:0.4282426989555359
当前step:21 验证集损失:0.41570242791175843
当前step:22 验证集损失:0.40507098932266233
当前step:23 验证集损失:0.3950089809894562
当前step:24 验证集损失:0.38692794737815855
10000个测试样本的准确率: 89.64

你可能感兴趣的:(Pytorch,pytorch,分类,深度学习)