论文总体结构
使用卷积神经网络处理句子级别文本分类,并在多个数据集上有好的效果
使用预训练词向量和卷积神经网络,提出一种有效分类模型
本文的主要契机:
1、深度学习的发展(2012)
2、预训练词向量方法
3、卷积神经网络的方法
本文的历史意义:
1、开启基于深度学习的文本分类的序幕
2、推动卷积神经网络在自然语言处理的发展
TextCNN模型结构和正则化
模型结构如上所示,先通过卷积操作,计算卷积核和数据卷积的效果,然后进行池化操作,最后进行全连接 + softmax操作
注:全连接之前拼接成的数据维度等于卷积核的个数,模型的通道(channel)个数等于相同维度卷积核的个数
为了防止过拟合,本文提出了两种方法:
1、Dropout:
在神经网络的传播过程中,让某个神经元以一定的概率p停止工作,从而增加模型的泛化能力
2、L2-正则
给所有参数加以限制,使得学习不偏激,增加模型的泛化能力
数据集介绍、实验超参设置以及实验结果
实验参数及模型对比,证明该模型较好。
实验探究、通道个数讨论和词向量使用方法讨论
对全文进行总结
关键点:
1、预训练的词向量 - word2vec、GloVe
2、卷积神经网络结构- 一维卷积、池化
3、超参选择- 卷积核选择、词向量方式选择
创新点:
1、提出基于CNN文本分类模型TextCNN
2、提出多种词向量设置方式
3、在四个文本分类任务上取得最优结果
4、对超参进行大量实验和分析
启发点:
1、在预训练模型的基础上微调能够得到非常好的结果,说明预训练词向量学习到了一些通用的特征
2、在预训练词向量的基础上,使用简单模型比复杂模型表现得还要好
3、对于不在预训练词向量中的词,微调能够学习到更多的意义
# ************* 数据预处理部分 **********
# encoding = 'utf-8'
from torch.utils import data
import os
import random
import numpy as np
from gensim.test.utils import datapath,get_tmpfile
from gensim.models import KeyedVectors
"""加载预训练词向量模型"""
# Pretrained word vectors, loaded from a gzipped binary word2vec-format file.
wvmodel = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz",binary=True)

# Read the experiment data: one sentence per line, positives and negatives in
# separate files. Context managers make sure both handles get closed.
with open("./data/MR/rt-polarity.pos", errors='ignore') as pos_file:
    pos_data = pos_file.readlines()
with open("./data/MR/rt-polarity.neg", errors='ignore') as neg_file:
    neg_data = neg_file.readlines()

# Whitespace-tokenise every sentence; positives come first, then negatives,
# so the label list below lines up with `datas`.
# (The loop variable is deliberately not called `data`, which would clobber
# the `torch.utils.data` module imported at the top of the file.)
datas = [line.split() for line in pos_data + neg_data]
labels = [1] * len(pos_data) + [0] * len(neg_data)

# Build the vocabulary and pad every sentence to the longest sentence length.
# Id 0 is reserved for the empty padding token ''.
max_sentence_length = max(len(sentence) for sentence in datas)
word2id = {'': 0}
for i, sentence in enumerate(datas):
    ids = []
    for word in sentence:
        if word not in word2id:
            # New word: give it the next free id.
            word2id[word] = len(word2id)
        ids.append(word2id[word])
    # Replace the token list by its id list, right-padded with 0.
    datas[i] = ids + [0] * (max_sentence_length - len(ids))
""" 计算已有词向量的均值和方差 """
# Collect the pretrained vector of every vocabulary word the word2vec model
# knows, remembering which embedding row each one belongs to.
tmp = []
known_rows = []
for word, index in word2id.items():
    try:
        vector = wvmodel.get_vector(word)
    except KeyError:
        # Word is missing from the pretrained model: it keeps the random
        # initialisation drawn below. Any other error is a real problem and
        # is no longer silently swallowed.
        continue
    tmp.append(vector)
    known_rows.append(index)

# Scalar mean / standard deviation over all components of the known vectors.
mean = np.mean(np.array(tmp))
std = np.std(np.array(tmp))
print(mean,std)

# Words found in the pretrained model use their pretrained vector; all other
# rows are drawn from a normal distribution with the statistics above.
vocab_size = len(word2id)
embed_size = 300
embedding_weigths = np.random.normal(mean,std,[vocab_size,embed_size])
for index, vector in zip(known_rows, tmp):
    embedding_weigths[index,:] = vector
""" 打乱数据顺序 """
# Shuffle sentences and labels together; the fixed seed keeps the split
# reproducible between runs.
c = list(zip(datas,labels))
random.seed(1)
random.shuffle(c)
datas[:],labels[:] = zip(*c)

# Build the train / validation / test split.
# Tenth number k of the shuffled data is the test set; of the remaining
# samples the first 90% are used for training and the last 10% for validation.
k = 0
fold_start = int(k*len(datas)/10)
fold_end = int((k+1)*len(datas)/10)
train_datas = datas[:fold_start] + datas[fold_end:]
train_labels = labels[:fold_start] + labels[fold_end:]

valid_start = int(0.9*len(train_datas))
valid_datas = np.array(train_datas[valid_start:])
valid_labels = np.array(train_labels[valid_start:])
train_datas = np.array(train_datas[:valid_start])
train_labels = np.array(train_labels[:valid_start])

test_datas = np.array(datas[fold_start:fold_end])
# Fixed: the test labels must be sliced from `labels`; the previous code
# sliced `datas` here, so the "labels" were padded sentences.
test_labels = np.array(labels[fold_start:fold_end])
# *********** 模型构建部分 ***********
# encoding = 'utf-8'
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
class BasicModule(nn.Module):
    """Small ``nn.Module`` base class that adds weight save/load helpers."""

    def __init__(self):
        super(BasicModule, self).__init__()
        # String form of the concrete class, kept as a model identifier.
        self.model_name = str(type(self))

    def load(self, path, map_location=None):
        """Load a state dict from ``path`` into this module.

        Args:
            path: file name or file-like object accepted by ``torch.load``.
            map_location: forwarded to ``torch.load``; pass e.g. ``"cpu"`` to
                load weights that were saved from a GPU on a CPU-only
                machine. ``None`` keeps the previous behaviour.
        """
        self.load_state_dict(torch.load(path, map_location=map_location))

    def save(self, path):
        """Write this module's state dict to ``path`` with ``torch.save``."""
        torch.save(self.state_dict(), path)

    def forward(self):
        """Placeholder that does nothing; subclasses define the real pass."""
        pass
class TextCNN(BasicModule):
    """Convolutional sentence classifier.

    Pipeline: embedding -> three parallel Conv1d + ReLU + max-pool branches
    -> concatenation -> dropout -> linear layer producing class scores.

    Args:
        config: object exposing ``embedding_pretrained`` (weight tensor or
            ``None``), ``n_vocab``, ``embed_size``, ``filter_num``,
            ``filters`` (kernel widths; the first three are used),
            ``sentence_max_size``, ``dropout`` and ``label_num``.
            NOTE(review): the linear layer is sized with
            ``len(config.filters)`` while exactly three branches are built,
            so ``filters`` is expected to hold three widths.
    """

    def __init__(self, config):
        super(TextCNN, self).__init__()
        # Embedding layer: start from pretrained weights (still trainable,
        # freeze=False) when given, otherwise a freshly initialised table.
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed_size)
        # Convolution layers, one per kernel width. The embedding dimension
        # acts as the channel axis.
        # NOTE: the attribute names (including the 'convld_*' spelling) are
        # kept as-is because they are state_dict keys of saved checkpoints.
        self.conv1d_1 = nn.Conv1d(config.embed_size, config.filter_num, config.filters[0])
        self.convld_2 = nn.Conv1d(config.embed_size, config.filter_num, config.filters[1])
        self.convld_3 = nn.Conv1d(config.embed_size, config.filter_num, config.filters[2])
        # Pooling layers: the window equals the conv output length for an
        # input of sentence_max_size tokens, i.e. max over the whole sentence.
        self.max_pool_1 = nn.MaxPool1d(config.sentence_max_size - config.filters[0] + 1)
        self.max_pool_2 = nn.MaxPool1d(config.sentence_max_size - config.filters[1] + 1)
        self.max_pool_3 = nn.MaxPool1d(config.sentence_max_size - config.filters[2] + 1)
        # Dropout layer
        self.dropout = nn.Dropout(config.dropout)
        # Classification layer
        self.fc = nn.Linear(config.filter_num * len(config.filters), config.label_num)

    def forward(self, x):
        """Return class scores for a batch of padded token-id sentences.

        Args:
            x: tensor of token ids, expected as (batch, sentence_max_size);
                it is cast to ``long`` before the embedding lookup.

        Returns:
            Tensor of shape (batch, label_num) with unnormalised scores.
        """
        x = x.long()
        out = self.embedding(x)  # batch_size * sentence_length * embedding_size
        out = out.transpose(1, 2).contiguous()  # batch_size * embedding_size * sentence_length
        x1 = F.relu(self.conv1d_1(out))
        x2 = F.relu(self.convld_2(out))
        x3 = F.relu(self.convld_3(out))
        # Drop only the pooled length axis. A bare squeeze() would also drop
        # the batch axis for a batch of one and break the concatenation below.
        x1 = self.max_pool_1(x1).squeeze(2)
        x2 = self.max_pool_2(x2).squeeze(2)
        x3 = self.max_pool_3(x3).squeeze(2)
        out = torch.cat([x1, x2, x3], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out
class config:
    """Default hyper-parameters for the TextCNN model.

    Args:
        **overrides: optional keyword overrides for any of the fields set
            below, e.g. ``config(n_vocab=5000, sentence_max_size=60)``.
            Calling ``config()`` with no arguments yields the defaults.

    Raises:
        TypeError: if an override names a field that does not exist.
    """

    def __init__(self, **overrides):
        self.embedding_pretrained = None  # pretrained embedding weights, or None
        self.n_vocab = 100  # number of words in the vocabulary
        self.embed_size = 300  # word-vector dimension
        self.cuda = False  # whether to use the GPU
        self.filter_num = 100  # number of kernels per kernel size
        self.filters = [3,4,5]  # convolution kernel sizes
        self.label_num = 2  # number of labels
        self.dropout = 0.5  # dropout probability
        self.sentence_max_size = 50  # maximum sentence length
        for name, value in overrides.items():
            # Reject misspelled option names instead of silently adding them.
            if not hasattr(self, name):
                raise TypeError("config got an unexpected option %r" % name)
            setattr(self, name, value)
# Build a model from the default configuration and print a layer summary.
configs = config()
textcnn = TextCNN(configs)
# Take the input length from the configuration instead of repeating the
# literal 50, so the summary stays valid if sentence_max_size changes.
summary(textcnn,input_size=(configs.sentence_max_size,))
# ******** 模型训练部分 **********
from pytorchtools import EarlyStopping
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from model import TextCNN
from data import MR_Dataset
import numpy as np
import config as argumentparser
# Hyper-parameters come from the project's `config` module; `filters` is
# received as a comma-separated string and turned into a list of ints.
config = argumentparser.ArgumentParser()
config.filters = list(map(int,config.filters.split(",")))

# NOTE(review): `i` is used below as the cross-validation fold index but was
# never defined in this script (presumably an enclosing fold loop was lost).
# Default to fold 0 so the script runs -- confirm the intended fold handling.
i = 0

early_stopping = EarlyStopping(patience=10, verbose=True,cv_index=i)

# Training split; the vocabulary size is taken from the dataset.
training_set = MR_Dataset(state="train",k=i)
config.n_vocab = training_set.n_vocab
training_iter = torch.utils.data.DataLoader(dataset=training_set,
                                            batch_size=config.batch_size,
                                            shuffle=True,
                                            num_workers=2)
if config.use_pretrained_embed:
    # Hand the dataset's embedding matrix to the model as a float tensor.
    config.embedding_pretrained = torch.from_numpy(training_set.weight).float()

# Validation and test splits are iterated in order (no shuffling).
valid_set = MR_Dataset(state="valid", k=i)
valid_iter = torch.utils.data.DataLoader(dataset=valid_set,
                                         batch_size=config.batch_size,
                                         shuffle=False,
                                         num_workers=2)
test_set = MR_Dataset(state="test", k=i)
test_iter = torch.utils.data.DataLoader(dataset=test_set,
                                        batch_size=config.batch_size,
                                        shuffle=False,
                                        num_workers=2)

model = TextCNN(config)
if config.cuda and torch.cuda.is_available():
    # model.cuda() moves all parameters, including the embedding table built
    # from the pretrained weights. The former extra call
    # `config.embedding_pretrained.cuda()` discarded its result and raised
    # AttributeError when no pretrained embedding was set, so it was removed.
    model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

# Running counters used by the training loop for periodic loss logging.
count = 0
loss_sum = 0
def get_test_result(data_iter,data_set):
    """Evaluate the module-level ``model`` on one data split.

    Args:
        data_iter: iterable yielding ``(data, label)`` tensor batches.
        data_set: the dataset behind ``data_iter``; only its length is used,
            as the denominator of the accuracy.

    Returns:
        Tuple ``(data_loss, acc)``: the sum of the per-batch ``criterion``
        values (not averaged) and the fraction of correctly classified
        samples.
    """
    model.eval()
    use_cuda = config.cuda and torch.cuda.is_available()
    data_loss = 0
    true_sample_num = 0
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for data, label in data_iter:
            data = data.long()
            # Flatten to shape (batch,) so the comparison with argmax below
            # is element-wise even if labels arrive as (batch, 1).
            label = label.long().reshape(-1)
            if use_cuda:
                data = data.cuda()
                label = label.cuda()
            out = model(data)
            loss = criterion(out, label)
            data_loss += loss.item()
            true_sample_num += (torch.argmax(out, 1) == label).sum().item()
    acc = true_sample_num / len(data_set)
    return data_loss,acc
use_cuda = config.cuda and torch.cuda.is_available()
for epoch in range(config.epoch):
    # Training phase for this epoch.
    model.train()
    for data, label in training_iter:
        # Same preparation on CPU and GPU: integer ids, labels flattened to
        # shape (batch,). Previously the label was only flattened on CPU.
        data = data.long()
        label = label.long().reshape(-1)
        if use_cuda:
            data = data.cuda()
            label = label.cuda()
        out = model(data)
        # L2 penalty: l2 * sum(w^2).
        # NOTE(review): only the parameter at index 1 of model.parameters()
        # is penalised -- confirm whether all weights were intended.
        l2_loss = config.l2*torch.sum(torch.pow(list(model.parameters())[1],2))
        loss = criterion(out, label)+l2_loss
        loss_sum += loss.item()
        count += 1
        if count % 100 == 0:
            # Report the mean loss of the last 100 batches, then reset.
            print("epoch", epoch, end=' ')
            print("The loss is: %.5f" % (loss_sum / 100))
            loss_sum = 0
            count = 0
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # End of epoch: evaluate on the validation split. early_stopping saves a
    # checkpoint when the validation loss improves and flags when to stop.
    valid_loss,valid_acc = get_test_result(valid_iter,valid_set)
    early_stopping(valid_loss, model)
    print ("The valid acc is: %.5f" % valid_acc)
    if early_stopping.early_stop:
        print("Early stopping")
        break
# Training finished: restore the best checkpoint and evaluate on the test
# split. The fold index is read from early_stopping so the path matches the
# file it wrote.
model.load_state_dict(torch.load('./checkpoints/checkpoint%d.pt'%early_stopping.cv_index))
test_loss, test_acc = get_test_result(test_iter, test_set)
print("The test acc is: %.5f" % test_acc)
""" EarlyStopping """
import numpy as np
import torch
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""

    def __init__(self, patience=7, verbose=False, delta=0, cv_index=0,
                 checkpoint_dir='./checkpoints'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            cv_index (int): Index used in the checkpoint file name
                            (``checkpoint<cv_index>.pt``).
                            Default: 0
            checkpoint_dir (str): Directory the checkpoint is written to; it is
                            created on demand.
                            Default: './checkpoints'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # float('inf') instead of np.Inf: that alias was removed in NumPy 2.0.
        self.val_loss_min = float('inf')
        self.delta = delta
        self.cv_index = cv_index
        self.checkpoint_dir = checkpoint_dir

    def __call__(self, val_loss, model):
        """Record one validation loss; checkpoint on improvement, else count.

        Sets ``early_stop`` to True once ``patience`` consecutive calls have
        failed to improve the best score by at least ``delta``.
        """
        # Higher score is better, so the loss is negated.
        score = -val_loss
        improved = self.best_score is None or score >= self.best_score + self.delta
        if improved:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            return
        self.counter += 1
        print('EarlyStopping counter: %d out of %d'%(self.counter,self.patience))
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        import os

        if self.verbose:
            print('Validation loss decreased (%.5f --> %.5f).  Saving model ...'%(self.val_loss_min,val_loss))
        # torch.save does not create missing directories, so make sure the
        # target directory exists before writing.
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(),
                   os.path.join(self.checkpoint_dir, 'checkpoint%d.pt' % self.cv_index))
        self.val_loss_min = val_loss
完整代码详见:https://github.com/wangtao666666/NLP/tree/master/TextCNN