Dataset for this project: https://www.kaggle.com/datasets/ardamavi/sign-language-digits-dataset
The dataset has already been tidied up nicely: a txt annotation file lists the image paths and their class labels clearly.
The project layout is shown in the figure above; the code is presented file by file below.
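The loader below expects every line of train.txt to look like "./images/train/5/IMG_1181.JPG 5" (image path, a space, then the numeric label). As a rough sketch only, and assuming the images are grouped into per-class subfolders under ./images/train (an assumption about the layout, not something stated in the project), such an annotation file could be generated like this:
import os

# Hypothetical helper: build train.txt from a directory laid out as
# ./images/train/<label>/<image files>. The layout is an assumption;
# adapt the paths to however the dataset was actually unpacked.
IMG_DIR = "./images/train"

with open("./images/train.txt", "w") as f:
    for label in sorted(os.listdir(IMG_DIR)):
        class_dir = os.path.join(IMG_DIR, label)
        if not os.path.isdir(class_dir):
            continue
        for name in sorted(os.listdir(class_dir)):
            # one line per sample: "<image path> <label>"
            f.write(f"{os.path.join(class_dir, name)} {label}\n")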
import torch
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor, Lambda
from PIL import Image

ANNOTATIONS_FILE = "./images/train.txt"
IMG_DIR = "./images/train"

class CustomImageDataset(Dataset):
    def __init__(self):
        with open(ANNOTATIONS_FILE, "r") as f:
            # Read the annotation file: strip the trailing "\n" from each line,
            # then split on the space into (image path, label).
            self.labels = [line.strip('\n').split(" ") for line in f.readlines()]
        self.img_dir = IMG_DIR                             # image directory
        self.transform = ToTensor()                        # image transform
        self.target_transform = Lambda(lambda y: int(y))   # label transform

    # __len__ returns the total number of samples in the dataset.
    def __len__(self):
        return len(self.labels)

    # __getitem__ makes the dataset indexable; it returns one sample.
    def __getitem__(self, idx):
        image = Image.open(self.labels[idx][0])
        label = self.labels[idx][1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        sample = {'image': image, 'label': label}
        return sample
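As a quick sanity check (a sketch, assuming ./images/train.txt exists in the format above), the dataset can be instantiated and a sample inspected directly, or wrapped in a DataLoader:
from torch.utils.data import DataLoader
# from dataset import CustomImageDataset  # if the class above lives in dataset.py

ds = CustomImageDataset()
sample = ds[0]
print(len(ds))                   # number of samples
print(sample['image'].shape)     # e.g. torch.Size([3, 64, 64]) for a 64x64 RGB image
print(sample['label'])           # integer class label

# Wrapped in a DataLoader, samples come out batched as dicts of tensors.
loader = DataLoader(ds, batch_size=4, shuffle=True)
batch = next(iter(loader))
print(batch['image'].shape)      # torch.Size([4, 3, 64, 64])
print(batch['label'])            # tensor of 4 labels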
from torch import nn

class CnnNet(nn.Module):
    def __init__(self):
        super(CnnNet, self).__init__()
        self.conve1 = nn.Sequential(
            # Conv2d: 3 input channels (RGB), 24 output feature maps, 5x5 kernel;
            # padding=2 keeps the spatial size unchanged.
            nn.Conv2d(3, 24, 5, padding=2),
            # Batch normalization: rescales activations to roughly zero mean and
            # unit variance, which stabilizes training and mitigates vanishing gradients.
            nn.BatchNorm2d(24),
            # Activation function
            nn.ReLU()
        )
        # Pooling layer: reduces the number of parameters and helps against overfitting.
        # The original 64x64 image becomes 32x32 here.
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conve2 = nn.Sequential(
            nn.Conv2d(24, 48, 3, padding=1),
            nn.BatchNorm2d(48),
            nn.ReLU()
        )
        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc = nn.Sequential(
            # The feature map is now 16 x 16 with 48 channels, so the first
            # fully connected layer takes 48 * 16 * 16 inputs.
            nn.Linear(48 * 16 * 16, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 6)
        )

    def forward(self, x):
        x = self.conve1(x)
        x = self.pool1(x)
        x = self.conve2(x)
        x = self.pool2(x)
        # Flatten each sample into a 1-D vector (row-major order).
        x = x.view(x.size(0), -1)
        out = self.fc(x)
        return out
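A minimal shape check (a sketch, not part of the original project) confirms the 64 -> 32 -> 16 spatial arithmetic behind the 48 * 16 * 16 figure:
import torch

net = CnnNet()
dummy = torch.randn(1, 3, 64, 64)    # one fake 64x64 RGB image
out = net(dummy)
print(out.shape)                     # torch.Size([1, 6]): one score per class

# Tracing the shapes: conve1 keeps 64x64 (padding=2, kernel 5), pool1 -> 32x32,
# conve2 keeps 32x32 (padding=1, kernel 3), pool2 -> 16x16 with 48 channels,
# hence the nn.Linear(48 * 16 * 16, 1024) input size.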
import torch
from torch import nn
from torch.utils.data import DataLoader
from dataset import CustomImageDataset
from model import CnnNet

def train(dataloader, model, loss_fn, optimizer, device):
    size = len(dataloader.dataset)
    for epoch in range(100):
        for batch, data in enumerate(dataloader):
            pred = model(data['image'].to(device))
            loss = loss_fn(pred, data['label'].to(device))
            # Reset the gradients
            optimizer.zero_grad()
            # Backpropagate and update the parameters
            loss.backward()
            optimizer.step()
            # Print progress every so often
            if batch % 100 == 0:
                loss, current = loss.item(), batch * len(data['image'])
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
    # Save the trained model
    torch.save(model, './model.pkl')

if __name__ == '__main__':
    model = CnnNet()
    device = torch.device('cuda:0')
    model.to(device)
    train_dataloader = DataLoader(CustomImageDataset(), batch_size=32)
    loss_fn = nn.CrossEntropyLoss()  # cross-entropy loss
    learning_rate = 1e-3
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Run the training loop
    train(train_dataloader, model, loss_fn, optimizer, device)
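After training, it is worth measuring accuracy on held-out data. The sketch below assumes a test split loaded the same way as the training set (for example via an annotation file such as "./images/test.txt", which is a hypothetical name, not something created by the code above):
import torch
from torch.utils.data import DataLoader

# Hypothetical evaluation sketch: the test dataloader must yield the same
# {'image': ..., 'label': ...} dicts as CustomImageDataset.
def evaluate(dataloader, model, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for data in dataloader:
            images = data['image'].to(device)
            labels = data['label'].to(device)
            pred = model(images).argmax(dim=1)   # index of the largest logit
            correct += (pred == labels).sum().item()
            total += labels.size(0)
    print(f"accuracy: {correct / total:.2%}")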
The trained model is saved under the ./ path; load it with torch.load() and you can recognize your own images. Run the following test snippet:
import torch
from PIL import Image
from torchvision.transforms import ToTensor

# 1. Load the saved model
model = torch.load('./model.pkl')
# 2. Load the image and convert it to a tensor
transform = ToTensor()
img_in = Image.open("./images/test/signs/img_0008.png")
device = torch.device('cuda:0')
# unsqueeze(0) adds a batch dimension: training used batches of images,
# so a single image has to be wrapped into a batch of one.
img_in = transform(img_in).unsqueeze(0).to(device)
# 3. Feed the image to the model to get the output logits
out = model(img_in)
print(out)
Output:
tensor([[-4.7332, -5.4074, -2.5515,  3.7113,  6.9314,  9.3520]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
The largest logit (9.3520) is at index 5, so the predicted digit is 5.
Checking against the actual image confirms that the prediction is correct.
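To turn the raw logits into a probability distribution and an explicit class prediction, softmax and argmax can be applied to the output above (a small follow-up sketch, not part of the original script):
import torch

probs = torch.softmax(out, dim=1)     # normalize the logits to probabilities
pred = probs.argmax(dim=1).item()     # index of the most likely class
print(probs)
print(f"predicted digit: {pred}")     # 5 for the logits shown above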