根据刘二大人视频整理。
使用pytorch中的Dataset和DataLoader来自定义数据集类。自定义的数据集继承Dataset类,构造数据集要支持索引操作。需要实现内部的三个方法:__init__(self),__getitem__(self, index),__len__(self)
其中__getitem__方法用来根据索引取出相应的样本,__len__方法返回数据集中样本的总数。
使用DataLoader来实现mini-batch操作
注意:Dataset类是抽象的类,不能实例化,只能被继承。而DataLoader可以实例化
epoch:所有样本训练一轮
batch_size:一次前向传播输入的样本数
iteration:每个epoch内的迭代(参数更新)次数,即 样本总数/batch_size(不能整除时向上取整)
构造数据集DiabetesDataset,继承Dataset类
import torch
import numpy as np
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn
class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-separated numeric text file.

    The whole file is read into memory at construction time. Every column
    except the last is used as a feature; the last column is the target.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and split it into feature and target tensors.

        Args:
            filepath: Path of a comma-delimited numeric file that
                ``np.loadtxt`` can read.
        """
        # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt
        # returns a 1-D array and the column slicing below raises IndexError.
        xy = np.loadtxt(filepath, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = xy.shape[0]
        self.x_data = torch.from_numpy(xy[:, :-1])
        # Indexing with the list [-1] keeps the target as an (N, 1) column.
        self.y_data = torch.from_numpy(xy[:, [-1]])

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows that were loaded."""
        return self.len
filepath为文件路径,delimiter为分隔符,dtype为数据类型
xy.shape[0]得到数据样本的个数(这个数据集是N行9列)
xy[: , :-1]进行切片操作,得到第1到8列(特征)
xy[: , [-1] ]得到最后一列(标签);用[-1]而不是-1,是为了让结果保持为N行1列的矩阵,而不是一维向量
__init__读入数据时有两种情况:1、一次性读入全部数据。适合小数据集
2、对于图像语音等大数据集,不能一次全部读入,会把数据变成一个个文件,__init__方法只做一些初始化工作。
数据集的加载
# Build the in-memory dataset, then wrap it in a loader that yields
# shuffled mini-batches of 32 samples.
dataset = DiabetesDataset(filepath='diabetes.csv.gz')
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
shuffle=True表示每个epoch开始时重新打乱数据的顺序
import torch
import numpy as np
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn
import matplotlib
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-separated numeric text file.

    The whole file is read into memory at construction time. Every column
    except the last is used as a feature; the last column is the target.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and split it into feature and target tensors.

        Args:
            filepath: Path of a comma-delimited numeric file that
                ``np.loadtxt`` can read.
        """
        # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt
        # returns a 1-D array and the column slicing below raises IndexError.
        xy = np.loadtxt(filepath, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = xy.shape[0]
        self.x_data = torch.from_numpy(xy[:, :-1])
        # Indexing with the list [-1] keeps the target as an (N, 1) column.
        self.y_data = torch.from_numpy(xy[:, [-1]])

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows that were loaded."""
        return self.len
# Build the in-memory dataset, then wrap it in a loader that yields
# shuffled mini-batches of 32 samples.
dataset = DiabetesDataset(filepath='diabetes.csv.gz')
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
class Model(torch.nn.Module):
    """Fully connected 8 -> 6 -> 4 -> 1 network with a sigmoid after each layer."""

    def __init__(self):
        """Create the three linear layers and the shared sigmoid activation."""
        super().__init__()
        self.linear1 = nn.Linear(in_features=8, out_features=6)
        self.linear2 = nn.Linear(in_features=6, out_features=4)
        self.linear3 = nn.Linear(in_features=4, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Pass ``x`` through each linear layer followed by the sigmoid.

        Args:
            x: Input tensor whose last dimension has size 8.

        Returns:
            Tensor whose last dimension has size 1, with values in (0, 1).
        """
        for layer in (self.linear1, self.linear2, self.linear3):
            x = self.sigmoid(layer(x))
        return x
model = Model()
# BCELoss takes probabilities as input; the model's final sigmoid supplies them.
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

losses = []  # mean training loss of each epoch
acces = []  # mean training accuracy of each epoch

for epoch in range(100):
    train_loss = 0
    train_acc = 0
    for i, data in enumerate(train_loader, 0):
        # 1. Prepare data
        inputs, labels = data
        # 2. Forward
        y_pred = model(inputs)
        loss = criterion(y_pred, labels)
        batch_loss = loss.item()
        if epoch % 10 == 0:
            print(epoch, i, batch_loss)
        # 3. Backward
        optimizer.zero_grad()
        loss.backward()
        # 4. Update
        optimizer.step()

        # The model emits one probability per sample, so the predicted class
        # comes from thresholding at 0.5. Taking max(1) over the single output
        # column always yields index 0, and comparing that (N,) result with
        # the (N, 1) labels broadcasts to (N, N), giving a meaningless count.
        pred = (y_pred >= 0.5).to(labels.dtype)
        num_correct = (pred == labels).sum().item()
        acc = num_correct / inputs.shape[0]
        train_loss += batch_loss
        train_acc += acc
    # Average the per-batch statistics over the number of batches.
    losses.append(train_loss / len(train_loader))
    acces.append(train_acc / len(train_loader))

# Visualize the training loss recorded for each epoch.
plt.title('train loss')
plt.plot(np.arange(len(losses)), losses)
plt.legend(['Train Loss'], loc='upper right')
plt.show()
# Iterate over the mini-batches; i is the batch index and starts at 0.
for i, data in enumerate(train_loader, 0):
    # Each item yielded by the loader unpacks into (inputs, labels).
    inputs, labels = data
i是batch的序号,从0开始;DataLoader按索引取出batch_size个(x_data[index], y_data[index]),把它们按mini-batch拼成矩阵,并自动转换成张量Tensor
一行是一个样本数据,即取出的数据维度为[batch_size,input_size]