The complete code is on my GitHub: NLP text classification (with multiple models).
The work can be roughly divided into the following steps:
data preprocessing -> training framework -> model building -> model tuning
If you want to work with Chinese text, plug a word-segmentation function into the Field's tokenize argument:
import jieba
from torchtext import data

def tokenizer(text):  # segment Chinese text with jieba
    return list(jieba.cut(text))

text_field = data.Field(lower=True, tokenize=tokenizer)
label_field = data.Field(sequential=False)
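A quick check of the tokenizer (the sentence is only an illustration; jieba's exact segmentation can vary with its dictionary and version):
print(tokenizer("我爱自然语言处理"))  # e.g. ['我', '爱', '自然语言', '处理']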
from torchtext.vocab import Vectors

# load our own dataset
def weibo(text_field, label_field, rawdata, **kargs):
    train, dev, test = getmydata.mydata.getdataSplit(text_field, label_field, rawdata)
    ### use our own pre-trained word vectors ###
    vector = Vectors(name='./vector_cache/sgns.weibo.word')
    text_field.build_vocab(train, dev, test, vectors=vector)
    label_field.build_vocab(train, dev, test)
    word_embedding = text_field.vocab.vectors  # the pre-trained embedding matrix returned by this function
    print("the embedding", word_embedding)
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(args.batch_size, len(dev), len(test)),
        **kargs)
    return train_iter, dev_iter, test_iter, word_embedding
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CNN(nn.Module):
    def __init__(self, args):
        super(CNN, self).__init__()
        self.args = args
        V = args.embed_num    # vocabulary size
        D = args.embed_dim    # embedding dimension
        C = args.class_num    # number of classes
        Ci = 1                # input channels
        Co = args.kernel_num  # number of kernels per size
        Ks = args.kernel_sizes
        self.embed = nn.Embedding(V, D)
        # nn.Parameter can be thought of as a conversion: it wraps a plain, non-trainable Tensor
        # into a Parameter and registers it on this module, making it a parameter the optimizer
        # can update during training. Here it is used to load the pre-trained word vectors;
        # with requires_grad=False they are frozen rather than fine-tuned.
        self.embed.weight = nn.Parameter(args.word_embedding, requires_grad=False)
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        self.dropout = nn.Dropout(args.dropout)
        self.fc1 = nn.Linear(len(Ks) * Co, C)
    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        if self.args.static:
            x = Variable(x)
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]    # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = self.fc1(x)  # (N, C)
        return logit
Note:
torch.nn only supports mini-batches; you cannot feed a single sample at a time, the input must always be a batch.
The input to nn.Conv2d must be 4-dimensional, of the form nSamples x nChannels x Height x Width.
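For example, to push a single sample through the CNN above you must first add a batch dimension. A minimal sketch (the sequence length and token ids are placeholders, and `model` is assumed to be an instance of the CNN class):
single = torch.randint(0, 100, (30,))  # one sequence of 30 token ids from a toy vocabulary
batch = single.unsqueeze(0)            # shape (1, 30): a mini-batch containing exactly one sample
logit = model(batch)                   # forward(): embed -> (1, 30, D), unsqueeze(1) -> (1, 1, 30, D) for nn.Conv2d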
def train(train_iter, dev_iter, model, args):
    if args.cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        for batch in train_iter:
            feature, target = batch.text, batch.label
            with torch.no_grad():
                feature.t_(), target.sub_(1)  # transpose to (batch, seq_len); shift labels to start from 0
            if args.cuda:
                feature, target = feature.cuda(), target.cuda()
            if feature.size()[0] != args.batch_size:  # drop the incomplete final batch
                continue
            optimizer.zero_grad()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            clip_gradient(model, 1e-1)
            optimizer.step()
            steps += 1
            if steps % args.log_interval == 0:
                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
                accuracy = 100.0 * corrects / batch.batch_size
                sys.stdout.write('\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                    steps, loss.data, accuracy, corrects, batch.batch_size))
            if steps % args.test_interval == 0:
                dev_acc = eval(dev_iter, model, args)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    if args.save_best:
                        save(model, args.save_dir, 'best', steps)
                else:
                    if steps - last_step >= args.early_stop:
                        print('early stop by {} steps.'.format(args.early_stop))
            elif steps % args.save_interval == 0:
                save(model, args.save_dir, 'snapshot', steps)
If you follow loss backwards through the .grad_fn attribute, you will see a computation graph like the following (shown here for a generic conv net with an MSE loss):
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
-> view -> linear -> relu -> linear -> relu -> linear
-> MSELoss
-> loss
When loss.backward() is called, the whole graph is differentiated with respect to the loss, and every tensor in the graph that requires gradients has its gradient accumulated into its .grad attribute.
Another thing to keep in mind is that gradients accumulate: before calling loss.backward() for a new batch you have to clear the existing gradients, otherwise the new gradients are added on top of the old ones.
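This is why train() above calls optimizer.zero_grad() before every backward pass. The pattern in isolation (the Linear model and the random data below are just placeholders):
import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 2)                          # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))    # placeholder batch

optimizer.zero_grad()                # clear gradients left over from the previous step
loss = F.cross_entropy(model(x), y)
loss.backward()                      # gradients are accumulated into each parameter's .grad
optimizer.step()                     # apply the update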
def eval(data_iter, model, args):
    model.eval()
    corrects, avg_loss = 0, 0
    for batch in data_iter:
        feature, target = batch.text, batch.label
        with torch.no_grad():
            feature.t_(), target.sub_(1)
        if args.cuda:
            feature, target = feature.cuda(), target.cuda()
        logit = model(feature)
        loss = F.cross_entropy(logit, target, reduction='sum')  # sum over the batch (size_average=False is deprecated)
        avg_loss += loss.data
        corrects += (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()
    size = len(data_iter.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects / size
    print('\nEvaluation - loss: {:.6f} acc: {:.4f}%({}/{}) \n'.format(avg_loss, accuracy, corrects, size))
    return accuracy
def predict(text, model, text_field, label_field, cuda_flag):
    assert isinstance(text, str)
    model.eval()
    text = text_field.preprocess(text)
    text = [[text_field.vocab.stoi[x] for x in text]]
    x = torch.tensor(text)
    x = autograd.Variable(x)
    if cuda_flag:
        x = x.cuda()
    output = model(x)
    _, predicted = torch.max(output, 1)
    return label_field.vocab.itos[predicted.data[0] + 1]  # +1 undoes the target.sub_(1) label shift used in training
def save(model, save_dir, save_prefix, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
    torch.save(model.state_dict(), save_path)
Now we can put everything together: initialize the arguments and run the network we defined.
train_iter, dev_iter, test_iter, word_embedding = weibo(text_field,label_field,rawdata,device=-1,repeat=False)
#print("vocabulary",text_field.vocab.stoi)
# update args and print
args.embed_num = len(text_field.vocab)
args.class_num = len(label_field.vocab)-1
args.cuda = (not args.no_cuda) and torch.cuda.is_available(); del args.no_cuda
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
args.save_dir = os.path.join(args.save_dir,datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
args.word_embedding = word_embedding
print("\nParameters:")
for attr, value in sorted(args.__dict__.items()):
    print("\t{}={}".format(attr.upper(), value))
#model
cnn = CNN.CNN(args)  # the __init__ defined above only takes args
if args.snapshot is not None:
    print('\nLoading model from {}'.format(args.snapshot))
    cnn.load_state_dict(torch.load(args.snapshot))
if args.cuda:
    torch.cuda.set_device(args.device)
    cnn = cnn.cuda()
if args.predict is not None:
    label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)
    print('\n[Text] {}\n[Label] {}\n'.format(args.predict, label))
elif args.test:
    try:
        train.eval(test_iter, cnn, args)
    except Exception as e:
        print("\nSorry. The test dataset doesn't exist.\n")
else:
    try:
        train.train(train_iter, dev_iter, cnn, args)
    except KeyboardInterrupt:
        print('\n' + '-' * 89)
        print('Exiting from training early')
RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
Careful checking showed that the expected size of my h_0 did not match its actual size; digging in further, the batch_size actually being fed in did not match the size defined in the RNN model. That raised a question: why does the CNN not complain while the RNN does? Going through my code again, I found two problems:
(1) When building the batch iterators for the training, validation and test sets, I had set the batch_size of the validation and test sets to their full size, which no longer matched the training batch_size. To validate and test the trained RNN model, the batch_size used for the validation and test sets has to equal the batch_size used during training.
(2) The other thing to note is that the CNN model definition does not involve batch_size at all, whereas the RNN model needs batch_size when defining h_0; that is why the code runs fine with the CNN but crashes with the RNN (see the sketch below).
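To make the mismatch concrete, here is a sketch of the kind of hidden-state initialization that causes it (the layer sizes are placeholders, not my actual model): h_0 is built from a fixed batch_size, so any batch of a different size triggers the error above.
import torch
import torch.nn as nn

num_layers, hidden_size, batch_size = 1, 128, 64                         # placeholder sizes
rnn = nn.LSTM(input_size=300, hidden_size=hidden_size, num_layers=num_layers)
h_0 = torch.zeros(num_layers, batch_size, hidden_size)                   # tied to batch_size = 64
c_0 = torch.zeros(num_layers, batch_size, hidden_size)
# feeding a batch whose size != 64 raises the "Expected hidden size ..." RuntimeError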
Here I took some advice from a post on CSDN:
Option 1: pick a batch size that evenly divides the number of training samples.
For example, if there are 50,000 training samples in total, a batch size of 35 does not fit, but 50 does.
Option 2: discard the samples that do not fill a complete batch.
For example, with a batch size of 50 and 50,025 training samples in total, the last 25 samples are dropped.
Option 3: add more training data.
You can temporarily collect some extra data, or reuse samples from the dataset to pad out the under-filled batch.
Option 4: carry the leftover samples over into the next pass.
This way every sample is used roughly the same number of times.
Option 5: train on the leftover samples as a smaller batch of their own.
Although this batch contains fewer samples, it can still be fed through the network. Concretely, keep a counter while iterating over the dataset: whenever it reaches batch_size, send the batch to the network; and when you hit the end of the dataset, send whatever is left as well.
What I usually use is option 2, as sketched below.
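In this code that is exactly what the size check in train() already does; the relevant lines look like this:
for batch in train_iter:
    feature, target = batch.text, batch.label
    feature.t_(), target.sub_(1)
    if feature.size(0) != args.batch_size:   # option 2: skip the incomplete final batch
        continue
    # ... forward / backward exactly as in train() above ...
If you build batches with torch.utils.data.DataLoader instead of a torchtext iterator, passing drop_last=True achieves the same thing.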
RuntimeError: cudnn RNN backward can only be called in training mode
The reason is that the model is set to model.train() during training but switched to model.eval() when evaluating and saving; when training resumes, the network is still in eval() mode, which produces the error above. Calling model.train() again before every batch fixes it cleanly: just move model.train() to the line right after `for batch in train_iter:`, as in the snippet below.
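The corrected shape of the loop (only the relevant lines):
for epoch in range(1, args.epochs + 1):
    for batch in train_iter:
        model.train()   # switch back to training mode every batch, since eval() may have been called during validation
        # ... forward / backward / periodic eval-and-save as in train() ...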