Check-in record for the Datawhale beginner CV competition (零基础入门CV赛事)
This time the task is street-view character recognition (GitHub link).
After a quick look at the data, the dataset provides both object-detection annotations and class labels, so the task can be solved either with a classification network or with an object-detection network.
With what I currently know, I can only implement the classification approach, so that will be the main route. I haven't run detection networks such as Mask R-CNN or YOLO yet; for this task they should need little modification and could mostly be used as-is. The plan is to finish the task with a classification network first and then consider a detection network.
Difficulty analysis: the main difficulty is that the number of characters varies. If the whole number is treated as a single class, the label space is huge (an image contains up to 6 digits), and a final fully connected layer over that range would be prohibitively large, although it could in principle be done. The other idea is to classify character by character: an image has at most 6 digits, so just predict 6 positions and fill the missing ones with a "blank" class. With this approach the labels provided by the competition need to be re-encoded (a small sketch of the encoding follows). A Keras example of this idea: 卷积神经网络实现多个数字识别 (multi-digit recognition with a CNN).
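A minimal sketch of this encoding idea (the fixed length of 6 and the blank value 10 follow the description above; the code posted later in this log actually pads to 5 slots):

```python
# Minimal sketch of the fixed-length label encoding described above: pad a
# variable-length digit sequence to max_len slots, using class 10 as "blank".
def encode_label(digits, max_len=6, blank=10):
    label = [blank] * max_len
    for i, d in enumerate(digits[:max_len]):
        label[i] = d
    return label

print(encode_label([4, 2]))  # -> [4, 2, 10, 10, 10, 10]
```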
I'll post the dataloader once it's implemented.
To be continued...
Written on 2020.5.22
Model training is finished. Without a validation set I trained for 2000 epochs and the final score was only 0.03... Then I trained again for 50 epochs and scored 0.53. Clearly, without a validation set the model just overfits. I'll keep working on it tomorrow~
From here on I'll post the code step by step.
import torch
from torch.utils.data import Dataset
import cv2
import numpy as np
import json
import os
import sys
from PIL import Image
import torchvision.transforms as transforms
class train_data(Dataset):
    def __init__(self, url, json_url):
        super(train_data, self).__init__()
        # standard torchvision augmentations for the 64x128 input crops
        self.transforms = transforms.Compose([
            transforms.Resize((70, 140)),
            transforms.RandomCrop((64, 128)),
            transforms.ColorJitter(0.3, 0.3, 0.2),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        self.data = []
        json_label = json.load(open(json_url))
        # url is the root directory that holds the training images
        for dirpath, dirnames, filenames in os.walk(url, topdown=True):
            for i in range(len(filenames)):
                if not filenames[i].endswith('png'):
                    continue
                img_path = os.path.join(dirpath, filenames[i])
                label = parse_json(json_label[filenames[i]])
                self.data.append([img_path, label])

    def __getitem__(self, item):
        img_url, label_ = self.data[item]
        image = Image.open(img_url).convert('RGB')
        image = self.transforms(image)
        # 5 character slots, class 10 = "blank"
        label = [10 for i in range(5)]
        for i in range(min(5, len(label_))):
            label[i] = label_[i]
        label = np.array(label).astype(np.uint8)
        return image, label

    def __len__(self):
        return len(self.data)
A quick explanation: torchvision's data augmentation is used here (I usually write my own, but this is a simple classification problem, so calling the library directly is fine). `url` is the root directory where the images are stored. The label is treated as five 11-way classification problems: class 10 means the slot is empty, otherwise it is a digit 0-9.
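`parse_json` is not shown above; it just extracts the digit list from the competition's JSON annotation. A minimal sketch, assuming the mchar annotation format where each image name maps to a dict with `label`, `top`, `left`, `height`, and `width` lists:

```python
# Minimal sketch of parse_json (not shown in the post), assuming each JSON entry
# looks like {"height": [...], "label": [4, 2], "left": [...], "top": [...], "width": [...]}.
# Only the digit list is needed for the classification approach used here.
def parse_json(entry):
    return [int(d) for d in entry['label']]
```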
Written on 2020.5.26
from torch.autograd import Variable
from ocr.models.model_ocr import OCR
from ocr.preprocessing.data_loader import train_data
import torch
import torch.optim as optim
import time
from torch.utils import data
import argparse
from tensorboardX import SummaryWriter
import torch.nn as nn
import os
import sys
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
def val(model):
    model.eval()
    val_url = "/home/guang/ocr/dataset/mchar_val/"
    val_json = "/home/guang/ocr/dataset/val_json.json"
    # note: this reuses train_data, so the validation images also go through
    # the random training augmentations
    val_loader = train_data(val_url, val_json)
    valloader = data.DataLoader(val_loader, batch_size=1)
    true_pred = 0
    with torch.no_grad():
        for i, (images, labels) in enumerate(valloader):
            if torch.cuda.is_available():
                images = Variable(images.float().cuda())
            pred = model(images)
            labels = labels[0].cpu().numpy()
            # decode the per-slot predictions into a number, skipping the blank class (10)
            ans = 0
            for j in range(len(labels)):
                temp = torch.argmax(pred[j], dim=1).cpu().numpy()[0]
                if temp != 10:
                    ans = ans * 10 + temp
            # decode the ground-truth label the same way
            label = 0
            for j in range(len(labels)):
                if labels[j] != 10:
                    label = label * 10 + labels[j]
            if label == ans:
                true_pred += 1
    return true_pred / len(valloader)
def train(args):
    # setup logger
    train_url = "/home/guang/ocr/dataset/mchar_train/"
    train_json = "/home/guang/ocr/dataset/train_json.json"
    cur_time = time.localtime()
    log_dir = './logs/' + "{}-version1-{}-{}-{}-{}-{}-{}".format(args.arch, args.dataset, cur_time.tm_mon,
                                                                 cur_time.tm_mday, cur_time.tm_hour,
                                                                 cur_time.tm_min, cur_time.tm_sec)
    # Setup Dataloader
    train_loader = train_data(train_url, train_json)
    trainloader = data.DataLoader(train_loader, batch_size=args.batch_size, num_workers=2, pin_memory=True,
                                  shuffle=True, drop_last=True)
    model = OCR()
    if torch.cuda.is_available():
        model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
        model.float()
        model.cuda()
    class_criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.l_rate, momentum=0.9, weight_decay=5e-4)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epoch)
    writer = SummaryWriter(logdir=log_dir, comment='fine tune on {}, lr {}, {} epoch in total'.format(
        args.dataset, args.l_rate, args.n_epoch))
    best_prec = 0
    for epoch in range(args.n_epoch):
        loss_sum = 0
        model.train()
        for i, (images, labels) in enumerate(trainloader):
            loss_ = 0
            images = images.float()
            labels = labels.long()
            if torch.cuda.is_available():
                images = images.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            predictions = model(images)
            # one cross-entropy term per character slot, averaged over the slots
            for j in range(len(predictions)):
                loss_ = loss_ + class_criterion(predictions[j], labels[:, j])
            loss_ = loss_ / len(predictions)
            loss_.backward()
            optimizer.step()
            loss_sum = loss_sum + loss_.item()
        mean_loss = loss_sum / len(trainloader)
        prec = val(model)
        writer.add_scalar('train/mean_loss', mean_loss, epoch)
        writer.add_scalar('val/precision', prec, epoch)
        print("Epoch [%d/%d] lr: %.7f. mean_Loss: %.6f. val_prec: %.6f"
              % (epoch + 1, args.n_epoch, optimizer.state_dict()['param_groups'][0]['lr'], mean_loss, prec))
        # keep the checkpoint with the best validation precision
        if prec >= best_prec:
            best_prec = prec
            torch.save(model.module.state_dict(), os.path.join(log_dir, "version.pkl"))
        scheduler_cosine.step()
    torch.save(model.module.state_dict(), os.path.join(log_dir, "{}_{}_{}_{}_{}_v.pkl".format(args.arch, args.dataset,
                                                                                              args.batch_size,
                                                                                              args.l_rate, args.n_epoch)))
    writer.export_scalars_to_json(os.path.join(log_dir, "all_scalars.json"))
    writer.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Hyperparams')
    parser.add_argument('--arch', nargs='?', type=str, default='resnet34')
    parser.add_argument('--dataset', nargs='?', type=str, default='OCR')
    parser.add_argument('--img_rows', nargs='?', type=int, default=64)
    parser.add_argument('--img_cols', nargs='?', type=int, default=128)
    parser.add_argument('--n_epoch', nargs='?', type=int, default=500)
    parser.add_argument('--batch_size', nargs='?', type=int, default=2048)
    parser.add_argument('--l_rate', nargs='?', type=float, default=1e-2)
    args = parser.parse_args()
    train(args)
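The `OCR` model (`ocr.models.model_ocr.OCR`) is not included in this post. Purely as an illustration of the kind of architecture the training loop above expects, here is a rough sketch of a ResNet-34 backbone feeding several parallel 11-way heads, one per character slot; the real implementation may differ:

```python
import torch.nn as nn
import torchvision.models as models

# Rough sketch only -- not the actual ocr.models.model_ocr.OCR. A ResNet-34
# backbone followed by num_chars parallel 11-way heads, one per character slot,
# returning a list of logit tensors as the training loop above expects.
class SimpleOCR(nn.Module):
    def __init__(self, num_chars=5, num_classes=11):
        super().__init__()
        backbone = models.resnet34(pretrained=True)
        self.features = nn.Sequential(*list(backbone.children())[:-1])  # drop the final fc
        self.heads = nn.ModuleList(
            [nn.Linear(backbone.fc.in_features, num_classes) for _ in range(num_chars)]
        )

    def forward(self, x):
        feat = self.features(x).flatten(1)           # (B, 512)
        return [head(feat) for head in self.heads]   # list of (B, 11) logits
```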
Current results:

| index | data | model | result |
|---|---|---|---|
| 0 | train | resnet18 | 0.5467 |
| 1 | train | resnet34 | 0.6489 |
| 2 | train+val | resnet34 | 0.6555 |

train+val means the validation set was merged into the training data (one way to set this up is sketched below).
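One possible way to produce the train+val setting is to concatenate the two datasets; a sketch reusing the paths from the scripts above:

```python
from torch.utils import data
from ocr.preprocessing.data_loader import train_data

# Sketch: build the "train+val" setting by concatenating the two datasets
# (paths reused from the training and validation code above).
train_set = train_data("/home/guang/ocr/dataset/mchar_train/", "/home/guang/ocr/dataset/train_json.json")
val_set = train_data("/home/guang/ocr/dataset/mchar_val/", "/home/guang/ocr/dataset/val_json.json")
trainval_set = data.ConcatDataset([train_set, val_set])
trainval_loader = data.DataLoader(trainval_set, batch_size=2048, shuffle=True, drop_last=True)
```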
Next I'll try adding more training tricks such as warm-up and label smoothing (a quick sketch of label smoothing follows). To be continued...
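For label smoothing, one option is a small hand-rolled criterion that could stand in for `class_criterion` in the training script above (the `label_smoothing` argument of `nn.CrossEntropyLoss` only appeared in later PyTorch versions); a sketch:

```python
import torch.nn as nn
import torch.nn.functional as F

# Sketch of a label-smoothing cross entropy that could replace class_criterion
# in the training script above.
class LabelSmoothingCE(nn.Module):
    def __init__(self, num_classes=11, smoothing=0.1):
        super().__init__()
        self.num_classes = num_classes
        self.smoothing = smoothing

    def forward(self, logits, target):
        log_probs = F.log_softmax(logits, dim=1)                      # (B, C)
        nll = -log_probs.gather(1, target.unsqueeze(1)).squeeze(1)    # usual CE term
        uniform = -log_probs.mean(dim=1)                              # smoothing term
        return ((1.0 - self.smoothing) * nll + self.smoothing * uniform).mean()
```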
Written on 2020.5.30
# encoding:utf-8
import argparse
import sys
import os
import csv
from ocr.models.model_ocr import OCR
from ocr.preprocessing.data_loader import test_data
import numpy as np
import torch
from PIL import Image
from torch.utils import data
from torch.autograd import Variable
import math
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
def val_test(model, valdata, args):
    result_dir = 'result/' + args.models + '/'
    dic = {}
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    with open(os.path.join(result_dir, "ans.csv"), "w") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["file_name", "file_code"])
        with torch.no_grad():
            for i, (img, name) in enumerate(valdata):
                if torch.cuda.is_available():
                    img = Variable(img.float().cuda())
                else:
                    img = Variable(img)
                pred = model(img)
                # decode only the first 4 character slots (see the note below),
                # skipping the blank class (10)
                ans = 0
                for j in range(4):
                    temp = torch.argmax(pred[j], dim=1).cpu().numpy()[0]
                    if temp != 10:
                        ans = ans * 10 + temp
                dic[name[0]] = ans
        for key in sorted(dic):
            csv_writer.writerow([str(key), str(dic[key])])
def ceshi(args):
    loader = test_data()
    test_loader = data.DataLoader(loader, batch_size=args.batch_size, num_workers=0, shuffle=False)
    print("test on %d images" % len(test_loader))
    # Setup Model
    model = OCR()
    weight = torch.load(args.model_path)
    model.load_state_dict(weight, strict=False)
    if torch.cuda.is_available():
        model.to(0)
    model.eval()
    val_test(model, test_loader, args)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Params')
    parser.add_argument('--model_path', nargs='?', type=str,
                        default="logs/resnet34-version1-OCR-5-27-9-47-6/version.pkl",
                        help='Path to the saved model')
    parser.add_argument('--dataset', nargs='?', type=str, default='OCR',
                        help='Dataset to use [\'pascal, Vaihingen, Potsdam etc\']')
    parser.add_argument('--models', nargs='?', type=str, default='resnet-34',
                        help='Name of models')
    parser.add_argument('--batch_size', nargs='?', type=int, default=1,
                        help='Batch Size')
    parser.add_argument('--split', nargs='?', type=str, default='test',
                        help='Split of dataset to test on')
    args = parser.parse_args()
    ceshi(args)
The code above runs inference on the test set and writes the predictions to a CSV file. Here the number of predicted character slots was reduced from 5 to 4, which turned out to work better, reaching a score of 0.69+.