在模型训练前,我们完成了数据处理和模型分析。
经过模型分析决定先采用baseline+全连接层的网络先得到初步结果,再根据模型的问题调优。baseline选择resnet18(因为数据集简单一点,先用小网络快速得到结果),采用交叉熵函数作为损失函数,用Adam优化器进行训练。
下面我们将构建模型进行训练。首先定义训练和测试的数据载入,此处利用dataloader来载入数据。
# Root directory of the dataset; replace with the local data location.
dataPath = 'your-data-path'
# Image directories and JSON annotation files for the train / validation splits.
trainImgPath = dataPath+'mchar_train/mchar_train/'
ValImgPath = dataPath+'mchar_val/mchar_val/'
trainLabelPath = dataPath+'mchar_train.json'
ValLabelPath = dataPath+'mchar_val.json'
class TrainDataLoader(Dataset):
    """Training-split Dataset: image filenames from a CSV file, per-image
    digit labels from a JSON file, with augmentation (color jitter + rotation).

    Despite the name this is a torch ``Dataset``, not a ``DataLoader``; it is
    wrapped by ``torch.utils.data.DataLoader`` at training time.
    """

    def __init__(self, root, csvPath, json_Dir):
        # root: directory containing the images.
        # csvPath: CSV with one image filename per row (first column).
        # json_Dir: JSON mapping filename -> annotation dict with a 'label'
        #           list of digit classes.
        data = []
        self.root = root
        with open(csvPath, 'r') as csvfile:
            csv_reader = csv.reader(csvfile)
            for row in csv_reader:
                data.append(row[0])
        with open(json_Dir, 'r') as f:
            info = json.load(f)
        self.dataList = data
        self.InfoDict = info
        self.num = len(self.dataList)

    def __len__(self):
        return self.num

    def ImgProcess(self, img):
        """Resize + augment a PIL image and convert it to a float tensor in [0, 1].

        ColorJitter(brightness, contrast, saturation): random photometric jitter.
        RandomRotation(degrees): random rotation within (-degrees, +degrees).
        ToTensor: PIL Image / ndarray -> tensor of shape (C, H, W), scaled to [0, 1].
        """
        transform = transforms.Compose([transforms.Resize((64, 128)),
                                        transforms.ColorJitter(0.3, 0.3, 0.2),
                                        transforms.RandomRotation(degrees=10, fill=None),
                                        transforms.ToTensor()])
        imTensor = transform(img)  # (C, H, W)
        return imTensor.float()

    def __getitem__(self, idx):
        imgName = self.dataList[idx]
        # NOTE(review): assumes images on disk are RGB; grayscale/RGBA files
        # would produce a different channel count — confirm dataset contents.
        img = Image.open(self.root + imgName)
        imgInfo = self.InfoDict[imgName]
        imgTensor = self.ImgProcess(img)
        # Pad the label to a fixed length of 6 with the "blank" class 10.
        # BUG FIX: the original used `label += ...`, which extends the list
        # stored inside self.InfoDict in place, so every epoch appended
        # another round of padding to the cached annotation.
        label = imgInfo['label']
        label = label + [10] * (6 - len(label))
        label = torch.tensor(label, dtype=torch.long)
        sample = {'image': imgTensor, 'label': label}
        return sample
class ValDataLoader(Dataset):
    """Validation-split Dataset: image filenames from a CSV file, labels
    from a JSON file.

    Unlike the training dataset, no augmentation is applied — only a
    resize to (64, 128) followed by conversion to a float tensor.
    """

    def __init__(self, root, csvPath, json_Dir):
        self.root = root
        names = []
        with open(csvPath, 'r') as csvfile:
            for row in csv.reader(csvfile):
                names.append(row[0])
        with open(json_Dir, 'r') as f:
            annotations = json.load(f)
        self.dataList = names
        self.InfoDict = annotations
        self.num = len(names)

    def __len__(self):
        return self.num

    def ImgProcess(self, img):
        """Resize a PIL image and convert it to a float tensor in [0, 1]."""
        pipeline = transforms.Compose([
            transforms.Resize((64, 128)),
            transforms.ToTensor(),
        ])
        return pipeline(img).float()  # (C, H, W)

    def __getitem__(self, idx):
        name = self.dataList[idx]
        image = Image.open(self.root + name)
        tensor = self.ImgProcess(image)
        # Pad the digit label out to 6 positions with the "blank" class 10.
        digits = self.InfoDict[name]['label']
        padded = digits + [10] * (6 - len(digits))
        target = torch.Tensor(padded).type(torch.LongTensor)
        return {'image': tensor, 'label': target}
定义模型
class model_resnet18(nn.Module):
    """ResNet-18 backbone with six parallel 11-way classification heads,
    one per character position (digit classes 0-9 plus 10 = "blank" padding).
    """

    def __init__(self):
        super(model_resnet18, self).__init__()
        model_conv = models.resnet18()
        # Global average pool to a 1x1 map so the feature vector is 512-d
        # regardless of input spatial size.
        model_conv.avgpool = nn.AdaptiveAvgPool2d(1)
        # Drop the final fully-connected layer; keep conv stack + avgpool.
        model_conv = nn.Sequential(*list(model_conv.children())[:-1])
        # BUG FIX: the original called .cuda() here, which crashes on
        # CPU-only machines; device placement is already handled by the
        # caller via model.to(device).
        self.cnn = model_conv
        self.fc1 = nn.Linear(512, 11)
        self.fc2 = nn.Linear(512, 11)
        self.fc3 = nn.Linear(512, 11)
        self.fc4 = nn.Linear(512, 11)
        self.fc5 = nn.Linear(512, 11)
        self.fc6 = nn.Linear(512, 11)

    def forward(self, img):
        """Return six (batch, 11) logit tensors, one per character position."""
        feat = self.cnn(img)
        feat = feat.view(feat.shape[0], -1)  # flatten to (batch, 512)
        c1 = self.fc1(feat)
        c2 = self.fc2(feat)
        c3 = self.fc3(feat)
        c4 = self.fc4(feat)
        c5 = self.fc5(feat)
        c6 = self.fc6(feat)
        return c1, c2, c3, c4, c5, c6
验证和测试
def validate(val_loader, model, criterion):
    """Run one evaluation pass over val_loader and return the mean batch loss.

    Each batch loss is the sum of the six per-position cross-entropy terms;
    the function returns the mean of those batch losses as a numpy scalar.
    """
    model.eval()  # switch to inference mode (no dropout / BN updates)
    losses = []
    with torch.no_grad():
        for step, sample in enumerate(val_loader):
            images = sample['image'].to(device)
            targets = sample['label'].to(device)
            heads = model(images)
            # Sum cross-entropy over the six character positions.
            batch_loss = sum(criterion(logits, targets[:, pos])
                             for pos, logits in enumerate(heads))
            losses.append(batch_loss.item())
            if step % 100 == 0:
                print('test step: [%d],step loss: [%.4f]' % (step, batch_loss.item()))
    return np.mean(losses)
def predict(test_loader, model, criterion, use_gpu=True):
    """Run the model over test_loader and return the raw logits as an ndarray.

    BUG FIXES vs the original: iterates over the `test_loader` argument (the
    original read a global `val_loader`), feeds the batch images to the model
    (the original passed the `input` builtin), moves outputs to CPU before
    numpy conversion, and accumulates over ALL batches (the original
    returned inside the loop after the first batch).

    Returns an array of shape (6 * num_images, 11): within each batch the six
    per-position logit blocks are concatenated along axis 0, and per-batch
    arrays are concatenated in loader order.

    `criterion` and `use_gpu` are unused; kept for interface compatibility.
    """
    model.eval()  # inference mode
    outputs = []
    with torch.no_grad():
        for sample in test_loader:
            imgData = sample['image'].to(device)
            heads = model(imgData)
            outputs.append(np.concatenate([h.cpu().numpy() for h in heads]))
    return np.concatenate(outputs)
一些用到的函数
def mk_dir(dir_path):
    """Create dir_path (including missing parents) if it does not exist.

    Uses exist_ok=True instead of a separate exists() check, which removes
    the check-then-create race of the original two-step version.
    """
    os.makedirs(dir_path, exist_ok=True)
def weights_init_model(m):
    """Xavier-initialize the weights of Conv2d modules; leave others untouched.

    Intended for use with ``nn.Module.apply()``, which invokes this callback
    on every submodule.
    """
    if m.__class__.__name__.find('Conv2d') != -1:
        init.xavier_uniform_(m.weight.data)
def Adjust_learning_rate(optimizer, epoch, base_lr=1e-4):
    """Step-decay schedule: every 100 epochs, shrink the LR by a factor of 4.

    At epoch = k*100 the learning rate becomes ``base_lr * 0.25**k``
    (an approximate exponential decay); the change is printed. On all other
    epochs the optimizer is left untouched.
    """
    update_range = 100  # apply a decay step every this many epochs
    decay = 0.25        # multiplicative factor per decay step
    if epoch % update_range != 0:
        return
    new_lr = base_lr * decay ** (epoch // update_range)
    print('epoch ', epoch, ' Update LR: ', optimizer.param_groups[0]['lr'], ' -> ', new_lr)
    optimizer.param_groups[0]['lr'] = new_lr
模型训练+验证,用tensorboard来记录训练情况,log文件被保存在log文件夹下。
## 预训练模型下载: https://blog.csdn.net/Jorbo_Li/article/details/106248808
import numpy as np
import os
import csv
import json
import math
from PIL import Image
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch
from torch.nn import init
import torch.nn as nn
from tensorboardX import SummaryWriter
from torch.nn import functional as F
# Select GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): dataPath is expected to be defined earlier in the file.
DataRootPath = dataPath
log_path = dataPath+'log/'          # tensorboard log directory
SavePath = dataPath+'checkpoint/'   # model checkpoint directory
mk_dir(log_path)
mk_dir(SavePath)
BATCH_SIZE = 16
EPOCH_TOTAL = 100
learning_rate = 1e-5
# use_gpu = True
train_mode = 1 # 0-init, 1-continue
## ====== data ====== ##
train_dataset = TrainDataLoader(DataRootPath+'mchar_train/mchar_train/', DataRootPath+'mchar_train/train.csv', DataRootPath+'mchar_train/mchar_train.json')
train_num = len(train_dataset)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataset = ValDataLoader(DataRootPath+'mchar_val/mchar_val/', DataRootPath+'mchar_val/test.csv', DataRootPath+'mchar_val/mchar_val.json')
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)
STEP_MAX = math.ceil(len(train_loader.dataset) / BATCH_SIZE)

model = model_resnet18()

## ====== init/load weight ====== ##
if train_mode:
    # Resume: restore weights and the number of epochs already trained.
    print('continue train: ', train_mode)
    checkpoint = torch.load(SavePath+'model.pth')
    dict_trained = checkpoint['net']
    epochDone = checkpoint['epoch']
else:
    # Fresh start: Xavier-init conv layers, then overlay ImageNet-pretrained
    # weights wherever the parameter names match.
    # NOTE(review): the pretrained resnet18 checkpoint uses keys like
    # 'conv1.weight' while this model's state dict uses 'cnn.0.weight', so
    # the name-matching filter below may load nothing — verify key overlap.
    print('first train', train_mode)
    dict_trained = torch.load(SavePath+'resnet18-5c106cde.pth')
    model.apply(weights_init_model)
model_dict = model.state_dict()
loaded_dict = {k: v for k, v in dict_trained.items() if k in model_dict}
model_dict.update(loaded_dict)
model.load_state_dict(model_dict)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()  # applies log-softmax internally
writer = SummaryWriter(log_path)
best_loss = 1000.0
modelFileName = 'model_resnet18'

for epoch in range(EPOCH_TOTAL):
    if train_mode:
        # Continue epoch numbering from the restored checkpoint.
        epoch += epochDone
    Adjust_learning_rate(optimizer, epoch, learning_rate)
    epoch_loss = 0

    # Validate before this epoch's training pass; checkpoint the best model.
    print('validate epoch %d' % epoch)
    val_loss = validate(test_loader, model, criterion)
    writer.add_scalar(modelFileName + '/data/Val_step_loss', val_loss.item(), epoch)
    if val_loss < best_loss:
        best_loss = val_loss
        state = {'net': model.state_dict(), 'epoch': epoch}
        torch.save(state, SavePath+'/model.pth')

    for step, sample in enumerate(train_loader):
        imgData = sample['image'].to(device)
        label = sample['label'].to(device)
        model.train()
        optimizer.zero_grad()
        c1, c2, c3, c4, c5, c6 = model(imgData)
        # Total loss = sum of the six per-position cross-entropy terms.
        step_loss = criterion(c1, label[:, 0]) + \
                    criterion(c2, label[:, 1]) + \
                    criterion(c3, label[:, 2]) + \
                    criterion(c4, label[:, 3]) + \
                    criterion(c5, label[:, 4]) + \
                    criterion(c6, label[:, 5])
        step_loss.backward()
        optimizer.step()
        epoch_loss += step_loss.item()
        if step % 100 == 0:
            print('epoch: [%d/%d], step: [%d/%d],step loss: [%.4f]' % ( \
                epoch, EPOCH_TOTAL, step, STEP_MAX, step_loss.item()))
            writer.add_scalar(modelFileName + '/data/step_loss', step_loss.item(), step + epoch * STEP_MAX)
    epoch_loss = epoch_loss / (step + 1)
    writer.add_scalar(modelFileName + '/data/epoch_loss', epoch_loss, epoch)

writer.close()
print('Done!')
在使用预训练模型初始化的情况下,学习率不要设置过大,否则容易破坏预训练权重、导致训练不稳定甚至难以收敛。