Prepare the dataset. Mine has close to ten thousand images, covering roughly three or four styles. The full character set is ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九", "加", "减", "乘", "除", "等", "于", "?", "以"]. There are no multi-digit operands yet; if they show up later, you simply extend the character set in the same way. Why use an RCNN here even without multi-digit numbers? Mainly to stay compatible with one quirk of the data: multiplication and division can be written either as 乘/除 or as 乘以/除以, so the Chinese text of an expression is variable-length, and there isn't much data of that kind. If you need the dataset, just message me. Each section of this article includes the complete code, and the code is detailed and clear.
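For reference, here is that character set written out as a Python list. The code below refers to it as calc_list; the name comes from the code, the definition itself is my reconstruction from the list above:

# Reconstructed from the character set listed above; the code below
# references this list under the name calc_list.
calc_list = ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九",
             "加", "减", "乘", "除", "等", "于", "?", "以"]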
Building the data loader is similar to my earlier articles. The internal preprocessing can be whatever you like; what matters is the final format of the samples and labels. Labels are fixed at 6 characters: for example, 一乘以一等于 is already 6 characters and needs no padding, while 一加一等于 is only 5, so it is padded at the end with the placeholder character _ (which is also the CTC blank, index 0). The dataset additionally returns the true, unpadded label length, which CTCLoss needs as the target length. If you know how RNNs consume sequences, this needs no further explanation. The sample itself is, of course, the image tensor.
import os
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


class NumberDataset(Dataset):
    def __init__(self, path: str, transform=None):
        """
        You could also wrap a train=True/False switch here; up to you.
        :param path: dataset directory
        :param transform: torchvision transform applied to each image
        """
        super(NumberDataset, self).__init__()
        if not transform:
            transform = transforms.Compose([transforms.ToTensor(), ])
        self.transform = transform
        self.path = path
        self.picture_list = list(os.walk(self.path))[0][-1]
        # index 0 is '_', used both as padding and as the CTC blank
        self.label_map = [i for i in "_" + "".join(calc_list)]

    def __len__(self):
        return len(self.picture_list)

    def __getitem__(self, item):
        """
        :param item: index
        :return: (image, label, true label length)
        """
        picture_path_list = self._load_picture()
        img = Image.open(picture_path_list[item])
        img = self.transform(img)
        label = self.picture_list[item].split('_')[0]
        label_length = len(label)  # true length before padding; CTCLoss needs this
        # pad with '_' up to the fixed maximum length of 6 (should be made configurable later)
        for i in range(6 - len(label)):
            label += '_'
        label = [self.label_map.index(i) for i in label]
        label = torch.as_tensor(label, dtype=torch.int64)
        return img, label, label_length

    def _load_picture(self):
        return [self.path + '/' + i for i in self.picture_list]
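To make the label scheme concrete, here is what one sample's label looks like, as a quick sanity check assuming calc_list as defined above:

# '一加一等于' has 5 characters, so one blank '_' is appended to reach length 6
label_map = [i for i in "_" + "".join(calc_list)]
text = "一加一等于" + "_"
print([label_map.index(c) for c in text])  # [2, 11, 2, 15, 16, 0]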
For feature extraction I use ResNet-18, lightly modified: the final fc layer is replaced with an LSTM, with bidirectional=True. Note in the code below that layer3 and layer4 are commented out, so only the first two stages are actually used, which keeps more spatial resolution for the sequence step. And that's all there is to it.
import torch.nn as nn
import torch.nn.functional as F


class RestNetBasicBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride):
        super(RestNetBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        output = self.conv1(x)
        output = F.relu(self.bn1(output))
        output = self.conv2(output)
        output = self.bn2(output)
        return F.relu(x + output)


class RestNetDownBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride):
        super(RestNetDownBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride[0], padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride[1], padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 1x1 conv on the shortcut so the residual shapes match
        self.extra = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride[0], padding=0),
            nn.BatchNorm2d(out_channels)
        )

    def forward(self, x):
        extra_x = self.extra(x)
        output = self.conv1(x)
        out = F.relu(self.bn1(output))
        out = self.conv2(out)
        out = self.bn2(out)
        return F.relu(extra_x + out)


class resnet18(nn.Module):
    def __init__(self):
        super(resnet18, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = nn.Sequential(RestNetBasicBlock(64, 64, 1),
                                    RestNetBasicBlock(64, 64, 1))
        self.layer2 = nn.Sequential(RestNetDownBlock(64, 128, [2, 1]),
                                    RestNetBasicBlock(128, 128, 1))
        self.layer3 = nn.Sequential(RestNetDownBlock(128, 256, [2, 1]),
                                    RestNetBasicBlock(256, 256, 1))
        self.layer4 = nn.Sequential(RestNetDownBlock(256, 512, [2, 1]),
                                    RestNetBasicBlock(512, 512, 1))

    def forward(self, x):
        # note: bn1/maxpool are defined but not applied, matching LstmNet.forward below
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        # out = self.layer3(out)
        # out = self.layer4(out)
        return out
class LstmNet(nn.Module):
    def __init__(self, image_shape, label_map_length):
        super(LstmNet, self).__init__()
        # resnet18 backbone (same layers as above)
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = nn.Sequential(RestNetBasicBlock(64, 64, 1),
                                    RestNetBasicBlock(64, 64, 1))
        self.layer2 = nn.Sequential(RestNetDownBlock(64, 128, [2, 1]),
                                    RestNetBasicBlock(128, 128, 1))
        self.layer3 = nn.Sequential(RestNetDownBlock(128, 256, [2, 1]),
                                    RestNetBasicBlock(256, 256, 1))
        self.layer4 = nn.Sequential(RestNetDownBlock(256, 512, [2, 1]),
                                    RestNetBasicBlock(512, 512, 1))
        # probe the backbone output shape with a dummy forward pass
        x = torch.zeros((1, 3) + image_shape)  # e.g. [1, 3, 100, 300]
        shape = resnet18()(x).shape            # e.g. [1, 128, 25, 75]  BATCH, DIM, HEIGHT, WIDTH
        bone_output_shape = shape[1] * shape[2]
        self.lstm = nn.LSTM(bone_output_shape, bone_output_shape, num_layers=1, bidirectional=True)
        self.fc = nn.Linear(bone_output_shape * 2, label_map_length)

    def forward(self, x):
        x = self.conv1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        # x = self.layer3(x)
        # x = self.layer4(x)
        # treat the feature-map width as the time axis: [W, B, C, H]
        x = x.permute(3, 0, 1, 2)  # e.g. [75, N, 128, 25]
        w, b, c, h = x.shape
        x = x.view(w, b, c * h)    # [time_step, batch_size, input], e.g. [75, N, 3200]
        x, _ = self.lstm(x)
        time_step, batch_size, hidden = x.shape  # e.g. [75, N, 6400]; bidirectional doubles the hidden size
        x = x.view(time_step * batch_size, hidden)
        x = self.fc(x)  # [time_step * batch_size, label_map_length]
        return x.view(time_step, batch_size, -1)  # [time_step, batch_size, label_map_length]
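A quick shape sanity check. The numbers follow from the conv arithmetic: a 100x300 input goes through the stride-2 conv1 and the stride-2 layer2, leaving a 25x75 feature map with 128 channels, and 19 = len("_" + "".join(calc_list)):

model = LstmNet((100, 300), 19)
out = model(torch.zeros(2, 3, 100, 300))
print(out.shape)  # torch.Size([75, 2, 19]) -- 75 time steps, batch of 2, 19 classes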
Since the title promises CTCLoss, the loss function is of course nn.CTCLoss, and Adam is fine as the optimizer. I didn't add a learning-rate schedule, because the model already reaches high accuracy without one. The training loop is an ordinary PyTorch loop, nothing unusual. Up to this point, the only real differences from the simplest fixed-length recognizer are the LSTM on top of the backbone and the switch to CTCLoss, so overall this is still fairly simple. One thing to watch: nn.CTCLoss expects log-probabilities, so the raw logits from the model go through log_softmax before the loss call below; a greedy-decode sketch after the training loop shows how to read out predictions. If any function in the training or model code is unclear, I won't go over it here; search for it or see my earlier articles.
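For orientation, here are the shapes nn.CTCLoss expects (this is the standard PyTorch API, shown with this model's numbers):

ctc = nn.CTCLoss(blank=0)                                 # '_' is index 0 in our label_map
log_probs = torch.randn(75, 2, 19).log_softmax(2)         # [T, N, C] log-probabilities
targets = torch.randint(1, 19, (2, 6))                    # [N, S] label indices (no blanks among the first target_lengths[n])
input_lengths = torch.full((2,), 75, dtype=torch.long)    # all 75 time steps are valid
target_lengths = torch.tensor([5, 6])                     # true label length per sample
loss = ctc(log_probs, targets, input_lengths, target_lengths)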
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

mapping = "_" + "".join(calc_list)
device = torch.device('cuda:1')
model = LstmNet((100, 300), len(mapping)).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_func = nn.CTCLoss()  # blank defaults to index 0, which is '_' in our mapping
if os.path.exists('./models/model_rcnn.pkl'):
    model.load_state_dict(torch.load("./models/model_rcnn.pkl"))
    optimizer.load_state_dict(torch.load("./models/optimizer_rcnn.pkl"))
transform = transforms.Compose(
    [
        transforms.Resize((100, 300)),
        transforms.ToTensor(),  # to tensor
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # normalize
    ]
)
for epoch in range(30):
    train_data = NumberDataset('./datasets_rcnn', transform=transform)
    train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, drop_last=True)
    bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))  # enumerate pairs each batch with its index
    total_loss = []
    model.train()
    for idx, (img, label, label_lengths) in bar:
        # zero the gradients
        optimizer.zero_grad()
        # forward pass
        img = img.to(device)
        label = label.to(device)
        output = model(img)
        # every sample uses the full time_step length as its input length
        predict_lengths = torch.IntTensor([int(output.shape[0])] * label.shape[0])
        # CTC loss; nn.CTCLoss expects log-probabilities, hence the log_softmax
        loss = loss_func(output.log_softmax(2), label, predict_lengths, label_lengths)
        # backpropagation
        loss.backward()
        total_loss.append(loss.item())
        # update the parameters
        optimizer.step()
        bar.set_description("epoch:{} idx:{},loss:{:.6f}".format(epoch, idx, np.mean(total_loss)))
        if idx % 200 == 0:
            torch.save(model.state_dict(), './models/model_rcnn.pkl', _use_new_zipfile_serialization=True)  # save model
            torch.save(optimizer.state_dict(), './models/optimizer_rcnn.pkl', _use_new_zipfile_serialization=True)  # save optimizer
torch.save(model.state_dict(), './models/model_rcnn.pkl', _use_new_zipfile_serialization=True)  # final model save
torch.save(optimizer.state_dict(), './models/optimizer_rcnn.pkl', _use_new_zipfile_serialization=True)  # final optimizer save
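Finally, a minimal greedy (best-path) decode for reading predictions out of the model, assuming the mapping defined above. It collapses repeated characters and drops the blank, which is the standard way to interpret CTC output; this is a sketch, not part of the training code:

def greedy_decode(output, mapping):
    """output: [time_step, batch_size, n_classes] raw logits for one batch."""
    best = output.argmax(dim=2)  # [time_step, batch_size] best class per step
    texts = []
    for b in range(best.shape[1]):
        chars, prev = [], -1
        for t in best[:, b].tolist():
            # collapse repeats and skip the blank (index 0, i.e. '_')
            if t != prev and t != 0:
                chars.append(mapping[t])
            prev = t
        texts.append("".join(chars))
    return texts

model.eval()
with torch.no_grad():
    # reusing the last training batch here purely for illustration
    print(greedy_decode(model(img), mapping))  # e.g. ['一加一等于', ...]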