记录一次完全自己实现的深度学习炼丹过程(包括怎么初步设置参数、怎么排错,以及第一次在kaggle的gpu上跑的记录)。声明:本次训练的准确率并不高,因为想自己体验炼丹的过程,卷积层和池化层的kernel_size以及stride这些参数都是我自己手动设置的,并没有用现成的网络,期待下次调参能使准确率变高。
要导入的一些包:
import torch
from torch import nn, optim
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter
# Convert images to tensors and normalise every channel from [0, 1] to [-1, 1].
transform = torchvision.transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
train_dataset = torchvision.datasets.CIFAR10('./p10_dataset', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.CIFAR10('./p10_dataset', train=False, transform=transform, download=True)

# Dataset sizes.
print('训练数据集长度: {}'.format(len(train_dataset)))
print('测试数据集长度: {}'.format(len(test_dataset)))

# Wrap the datasets in DataLoaders that yield shuffled mini-batches of 64.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

# Pull one batch from the test loader to preview a few samples.
examples = enumerate(test_dataloader)
batch_idx, (example_data, example_targets) = next(examples)
classes = ('airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

fig = plt.figure()
for i in range(6):
    plt.subplot(2, 3, i + 1)
    img = example_data[i]
    print(img.shape)
    # Undo Normalize(0.5, 0.5): imshow clips float RGB data to [0, 1], so the
    # normalised [-1, 1] tensor would otherwise be rendered with wrong colours.
    img = (img * 0.5 + 0.5).clamp(0, 1)
    # (C, H, W) -> (H, W, C), the channel-last layout imshow expects.
    img = img.permute(1, 2, 0).numpy()
    plt.imshow(img, interpolation='none')
    # Show the human-readable class name instead of the bare label index.
    plt.title('target: {}'.format(classes[example_targets[i].item()]))
    plt.xticks([])
    plt.yticks([])
plt.show()
下面这两行的作用是使plt能正确显示彩色图像:一般图片张量的shape是(channel, height, width),而plt需要的是(height, width, channel)。
img = img.swapaxes(0, 1)  # (C, H, W) -> (H, C, W)
img = img.swapaxes(1, 2)  # (H, C, W) -> (H, W, C), the channel-last layout plt.imshow expects
结果:
因为是实现AlexNet,所以一些层就直接抄,in_channels和out_channels也直接抄原版的了。
class MyAlexNet(nn.Module):
    """AlexNet-style CNN with kernel sizes and strides adapted to 3x32x32 inputs.

    The channel widths (96-256-384-384-256) follow AlexNet. For a 32x32 input
    the spatial size shrinks 32 -> 15 -> 7 -> 3 across the three max-pool
    layers, so the flattened feature vector has 256 * 3 * 3 elements.

    Args:
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, num_classes: int = 10):
        super().__init__()
        # Convolutional feature extractor. The module order is kept stable so
        # the indices in state_dict keys (layer1.0, layer1.3, ...) do not move.
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),   # 32x32 -> 15x15
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),   # 15x15 -> 7x7
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),   # 7x7 -> 3x3
        )
        # Classifier head operating on the flattened 256x3x3 feature map.
        self.layer2 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 3 * 3, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits of shape (N, num_classes) for a (N, 3, 32, 32) batch."""
        x = self.layer1(x)
        x = self.layer2(x)
        return x
第一个卷积层的in_channels由于我们的数据集是3*32*32的,所以当然是3啦。其他层的in_channels和out_channels直接抄AlexNet的。
这里就要记录一下自己的坑以及总结一下经验和套路了。
一开始在设置kernel_size和stride的时候设置的不恰当的话很容易使输出维度变得越来越小最后报错,比如:
当我的网络第一层初始设置为如下时:
运行如下代码查看各层的形状时,直接报错:
可以看见到ReLU层的输出形状已经是256*1*1了,所以我们得继续调参,最直接的将stride调小
调成如下并进行测试:
结果:
至此,起码调参到不会报错的程度了,剩下的让神经网络自己学习去哈哈。
net = MyAlexNet()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
print("参数数:{}".format(sum(x.numel() for x in net.parameters())))

# Loss function and optimizer.
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Number of optimizer steps taken so far (x-axis of the training curves).
total_train_step = 0
# Number of completed evaluation passes (x-axis of the test curves).
total_test_step = 0

# TensorBoard writer.
writer = SummaryWriter("./logs_train")

for epoch in range(20):
    print("——————第 {} 轮训练开始——————".format(epoch+1))

    # ---- Training ----
    net.train()
    for imgs, targets in train_dataloader:
        imgs = imgs.to(device)
        targets = targets.to(device)
        output = net(imgs)
        Loss = loss(output, targets)

        # Back-propagate and update the weights.
        optimizer.zero_grad()
        Loss.backward()
        optimizer.step()

        pred = output.argmax(dim=1)
        num_correct = (pred == targets).sum().item()
        # Divide by the real batch size: the final batch of an epoch can be
        # smaller than 64, so a hard-coded 64 under-reports its accuracy.
        acc = num_correct / imgs.shape[0]

        total_train_step = total_train_step + 1
        # Log every 100 optimizer steps.
        if total_train_step % 100 == 0:
            print("训练次数:{}, Loss: {}".format(total_train_step, Loss.item()))
            writer.add_scalar("train_loss", Loss.item(), total_train_step)
            writer.add_scalar("train_acc", acc, total_train_step)

    # ---- Evaluation ----
    net.eval()
    eval_loss = 0.0      # sum of per-sample losses over the test set
    eval_correct = 0     # number of correctly classified test samples
    eval_samples = 0     # number of test samples seen
    with torch.no_grad():
        for imgs, targets in test_dataloader:
            imgs = imgs.to(device)
            targets = targets.to(device)
            output = net(imgs)
            Loss = loss(output, targets)
            pred = output.argmax(dim=1)
            batch_size = imgs.shape[0]
            # CrossEntropyLoss averages over the batch by default; weight it by
            # the batch size so a short last batch does not skew the mean, and
            # use .item() so a Python float (not a device tensor) is accumulated.
            eval_loss += Loss.item() * batch_size
            eval_correct += (pred == targets).sum().item()
            eval_samples += batch_size

    eval_losses = eval_loss / eval_samples
    eval_acces = eval_correct / eval_samples
    print("整体测试集上的Loss: {}".format(eval_losses))
    print("整体测试集上的正确率: {}".format(eval_acces))
    writer.add_scalar("test_loss", eval_losses, total_test_step)
    writer.add_scalar("test_accuracy", eval_acces, total_test_step)
    total_test_step = total_test_step + 1

    # Saves the whole pickled module once per epoch (kept as-is so existing
    # checkpoints load the same way).
    torch.save(net, "tudui_{}.pth".format(epoch))
    print("模型已保存")

writer.close()
训练和验证过程的代码比较常规。
完整代码:
import torch
from torch import nn, optim
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter
# Convert images to tensors and normalise every channel from [0, 1] to [-1, 1].
transform = torchvision.transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
train_dataset = torchvision.datasets.CIFAR10('./p10_dataset', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.CIFAR10('./p10_dataset', train=False, transform=transform, download=True)

# Dataset sizes.
print('训练数据集长度: {}'.format(len(train_dataset)))
print('测试数据集长度: {}'.format(len(test_dataset)))

# Wrap the datasets in DataLoaders that yield shuffled mini-batches of 64.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

# Pull one batch from the test loader to preview a few samples.
examples = enumerate(test_dataloader)
batch_idx, (example_data, example_targets) = next(examples)
classes = ('airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

fig = plt.figure()
for i in range(6):
    plt.subplot(2, 3, i + 1)
    img = example_data[i]
    print(img.shape)
    # Undo Normalize(0.5, 0.5): imshow clips float RGB data to [0, 1], so the
    # normalised [-1, 1] tensor would otherwise be rendered with wrong colours.
    img = (img * 0.5 + 0.5).clamp(0, 1)
    # (C, H, W) -> (H, W, C), the channel-last layout imshow expects.
    img = img.permute(1, 2, 0).numpy()
    plt.imshow(img, interpolation='none')
    # Show the human-readable class name instead of the bare label index.
    plt.title('target: {}'.format(classes[example_targets[i].item()]))
    plt.xticks([])
    plt.yticks([])
plt.show()
class MyAlexNet(nn.Module):
    """AlexNet-style CNN with kernel sizes and strides adapted to 3x32x32 inputs.

    The channel widths (96-256-384-384-256) follow AlexNet. For a 32x32 input
    the spatial size shrinks 32 -> 15 -> 7 -> 3 across the three max-pool
    layers, so the flattened feature vector has 256 * 3 * 3 elements.

    Args:
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, num_classes: int = 10):
        super().__init__()
        # Convolutional feature extractor. The module order is kept stable so
        # the indices in state_dict keys (layer1.0, layer1.3, ...) do not move.
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),   # 32x32 -> 15x15
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),   # 15x15 -> 7x7
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),   # 7x7 -> 3x3
        )
        # Classifier head operating on the flattened 256x3x3 feature map.
        self.layer2 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 3 * 3, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits of shape (N, num_classes) for a (N, 3, 32, 32) batch."""
        x = self.layer1(x)
        x = self.layer2(x)
        return x
net = MyAlexNet()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
print("参数数:{}".format(sum(x.numel() for x in net.parameters())))

# Loss function and optimizer.
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Number of optimizer steps taken so far (x-axis of the training curves).
total_train_step = 0
# Number of completed evaluation passes (x-axis of the test curves).
total_test_step = 0

# TensorBoard writer.
writer = SummaryWriter("./logs_train")

for epoch in range(20):
    print("——————第 {} 轮训练开始——————".format(epoch+1))

    # ---- Training ----
    net.train()
    for imgs, targets in train_dataloader:
        imgs = imgs.to(device)
        targets = targets.to(device)
        output = net(imgs)
        Loss = loss(output, targets)

        # Back-propagate and update the weights.
        optimizer.zero_grad()
        Loss.backward()
        optimizer.step()

        pred = output.argmax(dim=1)
        num_correct = (pred == targets).sum().item()
        # Divide by the real batch size: the final batch of an epoch can be
        # smaller than 64, so a hard-coded 64 under-reports its accuracy.
        acc = num_correct / imgs.shape[0]

        total_train_step = total_train_step + 1
        # Log every 100 optimizer steps.
        if total_train_step % 100 == 0:
            print("训练次数:{}, Loss: {}".format(total_train_step, Loss.item()))
            writer.add_scalar("train_loss", Loss.item(), total_train_step)
            writer.add_scalar("train_acc", acc, total_train_step)

    # ---- Evaluation ----
    net.eval()
    eval_loss = 0.0      # sum of per-sample losses over the test set
    eval_correct = 0     # number of correctly classified test samples
    eval_samples = 0     # number of test samples seen
    with torch.no_grad():
        for imgs, targets in test_dataloader:
            imgs = imgs.to(device)
            targets = targets.to(device)
            output = net(imgs)
            Loss = loss(output, targets)
            pred = output.argmax(dim=1)
            batch_size = imgs.shape[0]
            # CrossEntropyLoss averages over the batch by default; weight it by
            # the batch size so a short last batch does not skew the mean, and
            # use .item() so a Python float (not a device tensor) is accumulated.
            eval_loss += Loss.item() * batch_size
            eval_correct += (pred == targets).sum().item()
            eval_samples += batch_size

    eval_losses = eval_loss / eval_samples
    eval_acces = eval_correct / eval_samples
    print("整体测试集上的Loss: {}".format(eval_losses))
    print("整体测试集上的正确率: {}".format(eval_acces))
    writer.add_scalar("test_loss", eval_losses, total_test_step)
    writer.add_scalar("test_accuracy", eval_acces, total_test_step)
    total_test_step = total_test_step + 1

    # Saves the whole pickled module once per epoch (kept as-is so existing
    # checkpoints load the same way).
    torch.save(net, "tudui_{}.pth".format(epoch))
    print("模型已保存")

writer.close()
1、登录kaggle(怎么登录就八仙过海各显神通了)
2、用kaggle的GPU训练(过程如下动图所示)
注意,需要先点击下面这个按钮进行短信验证才能出现右侧的Accelerator和Internet按钮。
并且,不打开Internet按钮我们这个代码是无法运行的,因为我们的数据集是从网上下载的。
运行结果:
可以看到,训练了700次的准确率才0.1,验证集准确率最高为0.68,不是很好,期待下次调参有好结果再更新!
更新:
更新:把核的大小改成和AlexNet一模一样,还加了两个BatchNorm1d层;之后我又发现没有加激活函数,就自己在forward函数里加了激活函数。训练集准确率达到0.95,验证集准确率达到0.75(有点过拟合了哈,不知道还有没有提升空间)。