Datawhale 零基础入门CV赛事

参与Datawhale 零基础入门CV赛事打卡记录

一、任务简介

本次的任务为街道字符识别,github链接.

简单浏览后发现,数据集中提供了object detection标注和类别标签。因此可使用分类网络或者目标检测网络完成任务。

二、思路

以目前掌握的来说,只能使用分类网络来实现,因此还是以分类网络来完成任务为主,目标检测Mask-rcnn,YOLO等还没有跑过,以本次任务来说应该无需太多改动,可以直接拿来用。计划在使用分类网络完成任务的情况下,考虑实现检测网络。

难点分析:主要的难点为字符个数不确定,若以整体考虑取值范围太大(图片中最多有6位数),最后使用全连接层的话复杂度有点爆炸,但其实也可以考虑这样做。另外一种思路就是单个字符来分类,一幅图片中最多6位,那就直接预测6位,不足6位的补一个空。因此,使用这种方法的话,题目提供的标签数据需要进行处理。一个keras实现的例子: 卷积神经网络实现多个数字识别

dataloader实现后再发出来。
未完待续。。


作于2020.5.22


三、解决方案

已经完成了模型训练,没有用验证集,训练了2000epoch,最后分数才0.03… 然后又训练了一次50epoch, 最后得分0.53, 很明显,不用验证集的话过拟合了。。 明天接着改吧~

接下来逐步放代码吧

1. dataloader

import torch
from torch.utils.data import Dataset
import cv2
import numpy as np
import json
import os
import sys
from PIL import Image
import torchvision.transforms as transforms
class train_data(Dataset):
    """Image dataset yielding an augmented image tensor and a fixed-length label.

    Every file ending in ``png`` found under ``url`` is paired with its entry
    in the JSON annotation file at ``json_url``. Each sample's label is an
    int64 vector of ``MAX_DIGITS`` entries; positions beyond the real digits
    hold ``BLANK``.
    """

    MAX_DIGITS = 5  # number of digit positions predicted per image
    BLANK = 10      # class index meaning "no digit at this position"

    def __init__(self, url, json_url):
        """Index all images below ``url`` together with their parsed labels.

        Args:
            url: root directory that is walked recursively for ``png`` files.
            json_url: path of the JSON file mapping file names to annotations.

        Raises:
            KeyError: if an image file name has no entry in the JSON file.
        """
        super(train_data, self).__init__()
        self.transforms = transforms.Compose([
            transforms.Resize((70, 140)),
            transforms.RandomCrop((64, 128)),
            transforms.ColorJitter(0.3, 0.3, 0.2),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
        self.data = []
        # Close the annotation file deterministically instead of leaking the handle.
        with open(json_url) as json_file:
            json_label = json.load(json_file)
        for dirpath, _dirnames, filenames in os.walk(url, topdown=True):
            for filename in filenames:
                if not filename.endswith('png'):
                    continue
                img_path = os.path.join(dirpath, filename)
                # NOTE(review): parse_json is not defined in this snippet;
                # presumably it returns the digit sequence of one annotation.
                label = parse_json(json_label[filename])
                self.data.append([img_path, label])

    @staticmethod
    def _encode_label(digits, length=MAX_DIGITS, blank=BLANK):
        """Pad or truncate ``digits`` to ``length`` entries, filling with ``blank``.

        The dtype is pinned to int64: the previous ``label.astype(np.uint8)``
        discarded its result, so labels were never uint8 in practice, and
        downstream integer arithmetic on the labels would overflow in uint8.
        """
        label = np.full(length, blank, dtype=np.int64)
        for position, digit in enumerate(digits[:length]):
            label[position] = digit
        return label

    def __getitem__(self, item):
        """Return ``(image_tensor, label_vector)`` for sample index ``item``."""
        img_url, digits = self.data[item]
        image = Image.open(img_url).convert('RGB')
        image = self.transforms(image)
        return image, self._encode_label(digits)

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.data)

简单解释下, 这里用了torchvision的data augmentation(数据增强;以前一直是自己单独写的,但这是分类问题,比较简单,可以直接调库)。 url指的是存放图片的根目录。 label部分是当作5个11分类问题:类别0-9对应数字,类别10表示该位置为空。


作于2020.5.26


2. train

from torch.autograd import Variable
from ocr.models.model_ocr import OCR
from ocr.preprocessing.data_loader import train_data
import torch
import torch.optim as optim
import time
from torch.utils import data
import argparse
from tensorboardX import SummaryWriter
import torch.nn as nn
import os
import sys

# Expose only GPU indices 0 and 1 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"

def val(model):
    """Return the exact-match accuracy of ``model`` on the validation set.

    A sample counts as correct only when the predicted digit sequence (blank
    class removed) equals the labelled digit sequence. Sequences are compared
    digit by digit rather than as integers, so a leading zero is not silently
    dropped ("05" no longer matches "5").

    Args:
        model: network returning one logits tensor per digit position; each
            is indexed as ``pred[j]`` and reduced with ``argmax(dim=1)``.

    Returns:
        Fraction of validation samples predicted exactly, or ``0.0`` when the
        validation set is empty.
    """
    model.eval()
    val_url = "/home/guang/ocr/dataset/mchar_val/"
    val_json = "/home/guang/ocr/dataset/val_json.json"
    # NOTE(review): train_data may apply random augmentation; if so the score
    # is noisy between calls. Consider a deterministic validation pipeline.
    val_loader = train_data(val_url, val_json)
    valloader = data.DataLoader(val_loader, batch_size=1)
    if len(valloader) == 0:
        return 0.0

    blank = 10  # class index used for "no digit at this position"
    true_pred = 0
    with torch.no_grad():
        for images, labels in valloader:
            if torch.cuda.is_available():
                images = images.float().cuda()
            pred = model(images)
            # batch_size is 1, so the first row is the only sample.
            label_vec = labels[0].cpu().tolist()
            expected = [int(d) for d in label_vec if d != blank]
            predicted = []
            for position in range(len(label_vec)):
                digit = int(torch.argmax(pred[position], dim=1)[0])
                if digit != blank:
                    predicted.append(digit)
            if predicted == expected:
                true_pred += 1
    return true_pred / len(valloader)


def _plain_state_dict(model):
    """Return the weights of ``model`` without a ``DataParallel`` wrapper."""
    # DataParallel keeps the real network in ``.module``; without CUDA the
    # model is never wrapped, so ``model.module`` would raise AttributeError.
    inner = model.module if isinstance(model, nn.DataParallel) else model
    return inner.state_dict()


def train(args):
    """Train the OCR model and keep the checkpoint with the best validation score.

    Args:
        args: parsed CLI namespace; reads ``arch``, ``dataset``,
            ``batch_size``, ``l_rate`` and ``n_epoch``.

    Raises:
        ValueError: if the training loader yields no batches (with
            ``drop_last=True`` this happens when the dataset is smaller than
            ``batch_size``).

    Side effects:
        Writes ``version.pkl`` (best validation accuracy so far), a final
        checkpoint and ``all_scalars.json`` into a timestamped directory
        below ``./logs/``.
    """
    # setup logger
    train_url = "/home/guang/ocr/dataset/mchar_train/"
    train_json = "/home/guang/ocr/dataset/train_json.json"
    cur_time = time.localtime()
    log_dir = './logs/' + "{}-version1-{}-{}-{}-{}-{}-{}".format(args.arch, args.dataset, cur_time.tm_mon,
                                                                 cur_time.tm_mday, cur_time.tm_hour, cur_time.tm_min,
                                                                 cur_time.tm_sec)
    # Setup Dataloader
    train_loader = train_data(train_url, train_json)
    trainloader = data.DataLoader(train_loader, batch_size=args.batch_size, num_workers=2, pin_memory=True,
                                  shuffle=True, drop_last=True)
    if len(trainloader) == 0:
        raise ValueError("training loader is empty: dataset is smaller than batch_size with drop_last=True")

    use_cuda = torch.cuda.is_available()
    model = OCR()
    if use_cuda:
        model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
        model.float()
        model.cuda()
    class_criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=args.l_rate, momentum=0.9, weight_decay=5e-4)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epoch)
    # The original formatted an undefined name (``weight_dir``) into the
    # comment and raised NameError; the dataset name is used instead.
    writer = SummaryWriter(logdir=log_dir, comment='fine tune on {}, lr {},{} epoch in total'.format(
        args.dataset, args.l_rate, args.n_epoch))
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(log_dir, exist_ok=True)

    best_prec = 0
    for epoch in range(args.n_epoch):
        loss_sum = 0
        model.train()  # val() switches the model to eval mode every epoch
        for images, labels in trainloader:
            images = images.float()
            labels = labels.long()
            if use_cuda:
                images = images.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            predictions = model(images)
            # One classification loss per digit position, averaged over positions.
            loss_ = 0
            for position, logits in enumerate(predictions):
                loss_ = loss_ + class_criterion(logits, labels[:, position])
            loss_ = loss_ / len(predictions)

            loss_.backward()
            optimizer.step()
            loss_sum = loss_sum + loss_.item()

        mean_loss = loss_sum / len(trainloader)
        prec = val(model)
        print("Epoch [%d/%d] lr: %.7f. mean_Loss: %.6f. val_prec: %.6f"
              % (epoch + 1, args.n_epoch, optimizer.state_dict()['param_groups'][0]['lr'], mean_loss, prec))
        # Record the curves so the exported scalar JSON is not empty.
        writer.add_scalar('train/mean_loss', mean_loss, epoch + 1)
        writer.add_scalar('val/prec', prec, epoch + 1)

        # ``>=`` keeps the most recent checkpoint among equally good epochs.
        if prec >= best_prec:
            best_prec = prec
            torch.save(_plain_state_dict(model), os.path.join(log_dir, "version.pkl"))
        scheduler_cosine.step()
    torch.save(_plain_state_dict(model),
               os.path.join(log_dir, "{}_{}_{}_{}_{}_v.pkl".format(args.arch, args.dataset, args.batch_size,
                                                                   args.l_rate, args.n_epoch)))
    writer.export_scalars_to_json(os.path.join(log_dir, "./all_scalars.json"))
    writer.close()


if __name__ == '__main__':
    # Command-line hyperparameters as (flag, value type, default) rows.
    cli_options = (
        ('--arch', str, 'resnet34'),
        ('--dataset', str, 'OCR'),
        ('--img_rows', int, 64),
        ('--img_cols', int, 128),
        ('--n_epoch', int, 500),
        ('--batch_size', int, 2048),
        ('--l_rate', float, 1e-2),
    )
    parser = argparse.ArgumentParser(description='Hyperparams')
    for flag, value_type, fallback in cli_options:
        parser.add_argument(flag, nargs='?', type=value_type, default=fallback)
    train(parser.parse_args())

目前的结果:

index data model result
0 train resnet18 0.5467
1 train resnet34 0.6489
2 train+val resnet34 0.6555

train+val代表把验证集一起放入训练。
下面尝试加入更多训练技巧warm-up, 标签平滑等等。 待续。。


作于2020.5.30


3. test

# encoding:utf-8
import argparse
import sys
import os
import csv
from ocr.models.model_ocr import OCR
from ocr.preprocessing.data_loader import test_data
import numpy as np
import torch
from PIL import Image
from torch.utils import data
from torch.autograd import Variable
import math
# Expose only GPU index 1 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

def val_test(model, valdata, args, num_digits=4):
    """Run ``model`` over ``valdata`` and write the predictions to a CSV file.

    The output is ``result/<args.models>/ans.csv`` with a header row followed
    by one ``file_name,file_code`` row per image, sorted by file name.

    Differences from the earlier version:
      * every sample of a batch is recorded, not only the first one, so a
        batch size above 1 no longer drops predictions;
      * the code is built as a digit string, so leading zeros survive (an
        image with no predicted digit yields an empty code);
      * the CSV is opened with ``newline=''`` as the ``csv`` module requires.

    Args:
        model: callable returning one logits tensor per digit position, each
            reduced with ``argmax(dim=1)``.
        valdata: iterable of ``(images, names)`` batches.
            NOTE(review): ``names`` is assumed to be the per-sample sequence
            of file names produced by the default DataLoader collation.
        args: namespace providing ``models`` (used in the output directory).
        num_digits: number of leading digit positions to decode (default 4).
    """
    blank = 10  # class index meaning "no digit at this position"
    result_dir = 'result/' + args.models + '/'
    os.makedirs(result_dir, exist_ok=True)

    dic = {}
    with torch.no_grad():
        for img, name in valdata:
            if torch.cuda.is_available():
                img = img.float().cuda()
            pred = model(img)
            # One list of predicted classes per digit position, each of batch length.
            per_position = [torch.argmax(pred[k], dim=1).cpu().tolist() for k in range(num_digits)]
            for sample_idx, file_name in enumerate(name):
                digits = [position[sample_idx] for position in per_position]
                dic[file_name] = ''.join(str(d) for d in digits if d != blank)

    with open(os.path.join(result_dir, "ans.csv"), "w", newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["file_name", "file_code"])
        for key in sorted(dic):
            csv_writer.writerow([str(key), str(dic[key])])

def ceshi(args):
    """Load a trained OCR checkpoint and write test-set predictions to CSV.

    Args:
        args: parsed CLI namespace; reads ``batch_size`` and ``model_path``
            here and is passed on to ``val_test``.
    """
    loader = test_data()
    test_loader = data.DataLoader(loader, batch_size=args.batch_size, num_workers=0, shuffle=False)
    # Report the number of images, not the number of batches.
    print("test on %d images" % len(loader))

    # Setup Model
    # Fall back to the CPU instead of crashing on ``model.to(0)`` when CUDA is absent.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = OCR()
    # map_location lets a checkpoint saved on a GPU load on this device.
    weight = torch.load(args.model_path, map_location=device)
    # NOTE(review): strict=False ignores missing or unexpected keys, so a
    # mismatched checkpoint loads without error; confirm this is intended.
    model.load_state_dict(weight, strict=False)
    model.to(device)
    model.eval()
    val_test(model, test_loader, args)

if __name__ == '__main__':
    # Command-line options as (flag, value type, default, help text) rows.
    option_table = [
        ('--model_path', str, "logs/resnet34-version1-OCR-5-27-9-47-6/version.pkl",
         'Path to the saved model'),
        ('--dataset', str, 'OCR', "Dataset to use ['pascal, Vaihingen, Potsdam etc']"),
        ('--models', str, 'resnet-34', 'Name of models'),
        ('--batch_size', int, 1, 'Batch Size'),
        ('--split', str, 'test', 'Split of dataset to test on'),
    ]
    parser = argparse.ArgumentParser(description='Params')
    for flag, value_type, fallback, description in option_table:
        parser.add_argument(flag, nargs='?', type=value_type, default=fallback, help=description)
    ceshi(parser.parse_args())

上述代码是对测试集进行测试,并写入csv文件。这里对目标数字的个数由5个改为4个,发现效果更好,可以达到0.69+。

你可能感兴趣的:(Datawhale 零基础入门CV赛事)