之前写过一篇基于循环神经网络(RNN)的情感分类文章,这次我们换种思路,采用卷积神经网络(CNN)来进行文本分类任务。倘若对CNN如何在文本上进行卷积还不太了解,可以移步博主的《快速入门CNN在NLP中的使用》一文。话不多说,直接上干货。
本次实验的数据集来源于Github上一个2.4k星的中文NLP开源数据集项目CLUEbenchmark(官方地址),本文选择的是其中的文本分类数据集waimai_10k。该数据集是某外卖平台收集的用户评价,正向 4000 条,负向 7987 条。下面展示该数据集中的部分数据(每行的格式为“标签,评价内容”,其中1表示正向,0表示负向):
1,不错,就是餐盒贵
1,火烧夹肉好咸啊!没法吃。其他还行
1,"味道不错,配送速度比预计快"
0,菜品质量好,味道好,就是百度的问题,总是用运力原因来解释,我也不懂这是什么原因,晚了三个小时呵呵厉害吧!反正订了就退不了,只能干等……
0,分量还可以……就是有点没特色……下回不吃啦
0,没什么味道,送来的晚凉了
步骤一:把数据集划分为训练集、验证集和测试集,划分比例为6:2:2。
def split(data, train_ratio=0.6, valid_ratio=0.2, seed=None):
    """Shuffle ``data`` and split it into train / validation / test parts.

    With the default ratios the split is 6:2:2, and the test part receives
    whatever is left after the train and validation parts are taken.

    Args:
        data: Object exposing ``shape[0]`` and supporting indexing with an
            integer index array (e.g. a NumPy array).
        train_ratio: Fraction of the rows assigned to the training part.
        valid_ratio: Fraction of the rows assigned to the validation part.
        seed: Optional seed. When given, shuffling uses a private
            ``RandomState`` so the split is reproducible; when ``None`` the
            global NumPy random state is used.

    Returns:
        A ``(train, valid, test)`` tuple of subsets of ``data``.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or valid_ratio < 0 or train_ratio + valid_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and valid_ratio must be non-negative and sum to at most 1"
        )

    data_size = data.shape[0]
    indices = np.arange(data_size)
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(indices)

    train_size = int(data_size * train_ratio)
    valid_end = train_size + int(data_size * valid_ratio)
    train_indices = indices[:train_size]
    valid_indices = indices[train_size:valid_end]
    test_indices = indices[valid_end:]  # the remainder, so no row is lost to rounding
    return data[train_indices], data[valid_indices], data[test_indices]
划分完成后,训练集、验证集和测试集的样本数分别为7192、2397和2398。
步骤二:对划分好的训练集、验证集和测试集进行中文分词和删除停用词操作。分词采用jieba分词工具,分词之后再删除停用词。
def tokenization(data, stop_words, save_path):
    """Segment each review with jieba, drop stop words, and save as JSON.

    Reviews that contain no tokens after stop-word removal are left out of
    the saved dataset, since an empty token list carries no signal for the
    classifier.

    Args:
        data: Iterable of ``(label, review)`` pairs, ``review`` being the
            raw text passed to ``jieba.cut``.
        stop_words: Collection of tokens to remove.
        save_path: Path of the JSON file to write. The file holds a list of
            ``[label, tokens]`` entries.
    """
    # Build the lookup once so each membership test is O(1) even when the
    # caller passes a list.
    stop_words = frozenset(stop_words)

    dataset = []
    for label, review in data:
        tokens = [tok for tok in jieba.cut(review) if tok not in stop_words]
        if not tokens:
            # Nothing left after filtering: skip the sample entirely.
            continue
        dataset.append([label, tokens])

    with open(save_path, 'w', encoding='utf-8') as fp:
        json.dump(dataset, fp, ensure_ascii=False)
注意:在分词阶段发现有几条数据删完停用词后为空,对其采取的措施是将其从数据集中删除。
首先统计训练集,验证集和测试集的不同标签类别数据的分布情况:
结论:可以看出划分后训练集、验证集和测试集的标签分布是一致的。
然后,统计了训练集,验证集和测试集的句子长度分布情况:
结论:数据集的三部分的句子长度分布也是基本一致的。
步骤三:在训练集上构建词表,在构建词表的过程中过滤低频词(词频小于5的词)。注意:下面函数的默认参数min_count=1不会过滤任何词,要过滤词频小于5的词,需要在调用时显式传入min_count=5。
def build_vocabulary(data, min_count=1, save_path='data/word2id.json'):
    """Build a word -> id vocabulary from tokenized reviews and save it as JSON.

    Id 0 is reserved for the unknown-word token ``'unk'``; the remaining
    words receive consecutive ids in order of first appearance.

    Args:
        data: Iterable of ``(label, tokens)`` pairs; only ``tokens`` is used.
        min_count: Words occurring fewer than ``min_count`` times are left
            out of the vocabulary.
        save_path: Path of the JSON file the vocabulary is written to.

    Returns:
        The ``word2id`` dictionary that was written to ``save_path``.
    """
    vocabs = defaultdict(int)
    for _, review in data:
        for word in review:
            vocabs[word] += 1

    word2id = {'unk': 0}
    for word, count in vocabs.items():
        if count < min_count:  # drop low-frequency words
            continue
        if word in word2id:
            # A literal 'unk' token in the corpus must not overwrite the
            # reserved id 0; doing so would also hand the same id to the
            # next word because len(word2id) would not grow.
            continue
        word2id[word] = len(word2id)

    with open(save_path, 'w', encoding='utf-8') as fp:
        json.dump(word2id, fp, ensure_ascii=False)
    return word2id
步骤四:在训练集上训练Word2Vec词嵌入,并保存训练好的词嵌入,词嵌入的维度为100维(即gensim中Word2Vec的默认向量维度)。
if __name__ == "__main__":
    # Train Word2Vec embeddings on the tokenized training set and save them.
    train_path = "data/train.json"

    # Load the vocabulary.
    with open('data/word2id.json', 'r', encoding='utf-8') as fp:
        word2id = json.load(fp)

    # Load the training corpus, mapping every word that is not in the
    # vocabulary to 'unk'. Membership is tested with `in` rather than the
    # truthiness of the id, because id 0 is a valid (falsy) id.
    with open(train_path, "r", encoding="utf-8") as fp:
        sents = [
            [w if w in word2id else 'unk' for w in review]
            for _, review in json.load(fp)
        ]

    # Train and save the embeddings. gensim's default min_count is 5, which
    # would silently drop vocabulary words seen fewer than five times; the
    # vocabulary already did the frequency filtering, so keep every token.
    model = word2vec.Word2Vec(sents, min_count=1)
    # NOTE: despite the '.bin' suffix the file is written in text format.
    model.wv.save_word2vec_format('data/word2vec.bin', binary=False)
本文设计了两种堆叠方式的CNN用于文本分类任务:横向堆叠的卷积神经网络(TextCNNH)和纵向堆叠的卷积神经网络(TextCNNV)。
横向堆叠卷积神经网络的大致示意图如下所示:
模型源码为:
import torch
import torch.nn as nn
import torch.nn.functional as F
class GlobalMaxPool1d(nn.Module):
    """Max-pooling layer whose window spans the whole sequence axis."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Input layout:  (batch_size, channel, seq_len)
        # Output layout: (batch_size, channel, 1)
        seq_len = x.size(2)
        return F.max_pool1d(x, kernel_size=seq_len)
class TextCNNH(nn.Module):
    """Text classifier built from parallel ("horizontally stacked") 1-D convolutions.

    Each convolution sees the same embedded sequence with its own kernel
    width; the globally max-pooled outputs are concatenated and fed to a
    linear classifier.

    Args:
        word_count: Number of rows in the embedding table.
        word_dim: Embedding dimension.
        num_filters: Output channels of each convolution.
        ngram_size: Kernel width of each convolution; must have the same
            length as ``num_filters``.
        y_num: Number of output classes.
        drop_prob: Dropout probability applied before the classifier.

    Raises:
        ValueError: If ``num_filters`` and ``ngram_size`` are empty or differ
            in length (``zip`` would silently truncate while the classifier
            is sized from ``sum(num_filters)``).
    """

    def __init__(self, word_count, word_dim, num_filters, ngram_size, y_num, drop_prob):
        super(TextCNNH, self).__init__()
        if len(num_filters) == 0 or len(num_filters) != len(ngram_size):
            raise ValueError(
                "num_filters and ngram_size must be non-empty and of equal length"
            )
        # Word embedding layer.
        self.word_embed = nn.Embedding(word_count, word_dim)
        # One convolution per (channel count, kernel width) pair.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=word_dim, out_channels=c, kernel_size=k)
            for c, k in zip(num_filters, ngram_size)
        ])
        # Built-in global max pooling: (batch, channel, seq) -> (batch, channel, 1).
        self.pooling_layer = nn.AdaptiveMaxPool1d(1)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(sum(num_filters), y_num)
        # Shortest sequence the widest kernel can be applied to.
        self._min_seq_len = max(ngram_size)

    def load_pretrained_word_embedding(self, pre_word_embeddings, updated=False):
        """Replace the embedding weights with ``pre_word_embeddings``.

        Args:
            pre_word_embeddings: Embedding matrix accepted by ``torch.Tensor``.
            updated: Whether the embeddings are fine-tuned during training.
        """
        weights = torch.Tensor(pre_word_embeddings)
        self.word_embed.weight = nn.Parameter(weights, requires_grad=updated)

    def forward(self, word_ids):
        """Return class scores of shape (batch, y_num) for a batch of word-id sequences."""
        # Look up embeddings and move channels first: (batch, word_dim, seq_len).
        word_emb = self.word_embed(word_ids).transpose(1, 2)
        # Zero-pad on the right when the batch is shorter than the widest
        # kernel; Conv1d would otherwise raise on such inputs.
        deficit = self._min_seq_len - word_emb.size(2)
        if deficit > 0:
            word_emb = F.pad(word_emb, (0, deficit))
        # Multi-scale convolution followed by global max pooling.
        pooled = [
            self.pooling_layer(self.activation(conv(word_emb))).squeeze(-1)
            for conv in self.convs
        ]
        final_embed = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(final_embed))
纵向堆叠的卷积神经网络的大致示意图如下所示:
模型源码为:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
class GlobalMaxPool1d(nn.Module):
    """Max-pooling layer whose window spans the whole sequence axis."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Input layout:  (batch_size, channel, seq_len)
        # Output layout: (batch_size, channel, 1)
        seq_len = x.size(2)
        return F.max_pool1d(x, kernel_size=seq_len)
class TextCNNV(nn.Module):
    """Text classifier built from two sequential ("vertically stacked") 1-D convolutions.

    Pipeline: embedding -> Conv1d/ReLU/MaxPool1d(2) -> Conv1d/ReLU/global
    max pooling -> dropout -> linear classifier.

    Args:
        word_count: Number of rows in the embedding table.
        y_num: Number of output classes.
        word_dim: Embedding dimension.
        num_filters: Output channels of the first and second convolution
            (indices 0 and 1 are used).
        ngram_size: Kernel widths of the first and second convolution
            (indices 0 and 1 are used).
        drop_prob: Dropout probability applied before the classifier.
    """

    def __init__(self, word_count, y_num, word_dim, num_filters, ngram_size, drop_prob):
        super(TextCNNV, self).__init__()
        self.word_embed = nn.Embedding(word_count, word_dim)
        self.conv1 = nn.Sequential(
            nn.Conv1d(word_dim, num_filters[0], kernel_size=ngram_size[0]),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(num_filters[0], num_filters[1], kernel_size=ngram_size[1]),
            nn.ReLU(),
            # Built-in global max pooling: (batch, channel, seq) -> (batch, channel, 1).
            nn.AdaptiveMaxPool1d(1),
        )
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(num_filters[1], y_num)
        # Shortest input the stack accepts: conv1 yields L - k0 + 1 steps,
        # MaxPool1d(2) halves that (rounding down), and conv2 needs at
        # least k1 steps, hence L >= k0 - 1 + 2 * k1.
        self._min_seq_len = ngram_size[0] - 1 + 2 * ngram_size[1]
        self._init_weights(mean=0.0, std=0.05)

    def load_pretrained_word_embedding(self, pre_word_embeddings, updated=False):
        """Replace the embedding weights with ``pre_word_embeddings``.

        Args:
            pre_word_embeddings: Embedding matrix convertible by ``np.array``.
            updated: Whether the embeddings are fine-tuned during training.
        """
        weights = torch.Tensor(np.array(pre_word_embeddings))
        self.word_embed.weight = nn.Parameter(weights, requires_grad=updated)

    def _init_weights(self, mean=0.0, std=0.05):
        """Draw the weights of every Conv1d and Linear layer from N(mean, std)."""
        for module in self.modules():
            if isinstance(module, (nn.Conv1d, nn.Linear)):
                module.weight.data.normal_(mean, std)

    def forward(self, input):
        """Return class scores of shape (batch, y_num) for a batch of word-id sequences."""
        # Look up embeddings and move channels first: (batch, word_dim, seq_len).
        embedded = self.word_embed(input).transpose(1, 2)
        # Zero-pad on the right when the batch is too short for the two
        # stacked convolutions; Conv1d would otherwise raise.
        deficit = self._min_seq_len - embedded.size(2)
        if deficit > 0:
            embedded = F.pad(embedded, (0, deficit))
        output = self.conv1(embedded)
        output = self.conv2(output)
        output = output.view(output.size(0), -1)
        return self.fc(self.dropout(output))
模型训练与测评的主函数源码如下所示,在实验过程中通过训练集来训练模型,然后通过验证集来筛选模型,最后将在验证集上表现最好的模型用于测试集的测评。
def main(args):
    """Train the selected CNN, keep the best epoch on the validation set, and test it.

    The model is trained for ``args['n_epochs']`` epochs. After each epoch it
    is evaluated on the validation set, and the copy with the highest
    validation F1 is finally evaluated on the test set. Validation metrics
    are plotted and saved under ``images/``.

    Args:
        args: Dict of settings. Keys read here: ``'model'`` (``'textcnnv'``
            or ``'textcnnh'``), ``'y_num'``, ``'word_dim'``,
            ``'num_filters'``, ``'ngram_size'``, ``'drop_prob'``, ``'lr'``,
            ``'wd'``, ``'n_epochs'``, ``'batch_size'`` and, optionally,
            ``'extra_embedding'`` (load the pretrained Word2Vec vectors).

    Raises:
        ValueError: If ``args['model']`` names an unknown model.
    """
    import os  # local import: only needed to create the plot directory

    # Load the datasets.
    train_set = load('data/train.json')
    valid_set = load('data/valid.json')
    test_set = load('data/test.json')

    if args['model'] == 'textcnnv':
        model = TextCNNV(
            word_count=len(word2id),
            y_num=args['y_num'],
            word_dim=args['word_dim'],
            num_filters=args['num_filters'],
            ngram_size=args['ngram_size'],
            drop_prob=args['drop_prob']
        )
    elif args['model'] == 'textcnnh':
        model = TextCNNH(
            word_count=len(word2id),
            y_num=args['y_num'],
            word_dim=args['word_dim'],
            num_filters=args['num_filters'],
            ngram_size=args['ngram_size'],
            drop_prob=args['drop_prob']
        )
    else:
        # Fail with a clear message instead of a NameError on `model` below.
        raise ValueError(
            "unknown model {!r}; expected 'textcnnv' or 'textcnnh'".format(args['model'])
        )

    # 'extra_embedding' may be absent: the entry-point params in this file
    # spell the option as "embedding": "w2v", so fall back to that key
    # instead of raising KeyError.
    use_pretrained = args.get('extra_embedding', args.get('embedding') == 'w2v')
    if use_pretrained:
        # Load the pretrained word embeddings (fine-tuned during training).
        word_embedding = load_pretrained_embedding('data/word2vec.bin')
        model.load_pretrained_word_embedding(word_embedding, True)

    model.to(device)
    optimizer = optim.Adam(model.parameters(), args['lr'], weight_decay=args['wd'])

    valid_f1, valid_p, valid_r = [], [], []
    best_f1, best_model = 0., None
    for e in range(args['n_epochs']):
        train_loss = train(model, train_set, args['batch_size'], optimizer)
        valid_loss, f1, p, r = evaluate_accuracy(model, valid_set, args['batch_size'])
        valid_f1.append(f1)
        valid_p.append(p)
        valid_r.append(r)
        print('Epoch {} train_loss: {:.6f} valid_loss: {:.6f}, f1 {:.4f}, p {:.4f}, r {:.4f}'.format(
            e + 1, train_loss, valid_loss, f1, p, r
        ))
        # Always keep the first epoch so that a run whose F1 never rises
        # above 0 still has a model to test.
        if best_model is None or best_f1 < f1:
            best_f1 = f1
            best_model = deepcopy(model)
    if best_model is None:
        # No epoch was run (n_epochs == 0): evaluate the untrained model.
        best_model = model

    x = list(range(1, len(valid_f1) + 1))
    plt.title('Metrics On Valid Set')
    plt.plot(x, valid_f1)
    plt.plot(x, valid_p)
    plt.plot(x, valid_r)
    plt.legend(['f1-score', 'precision', 'recall'])
    # savefig does not create missing directories; without this a missing
    # 'images/' folder would abort the run before the test evaluation.
    os.makedirs('images', exist_ok=True)
    plt.savefig('images/{}_outcome.png'.format(args['model']))
    plt.show()

    _, f1, precision, recall = evaluate_accuracy(best_model, test_set, args['batch_size'])
    print('testset: f1 {:.4f}, p {:.4f}, r {:.4f}'.format(f1, precision, recall))
if __name__ == "__main__":
    # Settings for a single training run. The former `try/except: raise`
    # wrapper only re-raised the exception, so it has been dropped.
    params = {
        'model': 'textcnnv',  # ['textcnnv', 'textcnnh']
        'word_dim': 100,
        'ngram_size': [2, 4],
        'num_filters': [64, 64],
        'lr': 1e-4,
        'batch_size': 64,
        'n_epochs': 50,
        "embedding": "w2v",
        # main() reads this key; without it the run fails with KeyError.
        'extra_embedding': True,
        'y_num': 2,
        'wd': 0,
        'drop_prob': 0.5
    }
    print(params)
    main(params)
实验的测评指标包括F1分数、查准率和查全率。对于纵向堆叠的CNN,实验中设置的参数为:
# Settings used for the vertically stacked CNN (TextCNNV) experiment.
params = dict(
    model='textcnnv',  # one of: 'textcnnv', 'textcnnh'
    word_dim=100,
    ngram_size=[2, 4],
    num_filters=[64, 64],
    lr=5e-4,
    batch_size=64,
    n_epochs=50,
    extra_embedding=True,
    y_num=2,
    wd=0,
    drop_prob=0.3,
)
训练过程中验证集上的测评指标随epoch的变化情况如下所示:
对于横向堆叠的CNN,实验中设置的参数为:
# Settings used for the horizontally stacked CNN (TextCNNH) experiment.
params = dict(
    model='textcnnh',  # one of: 'textcnnv', 'textcnnh'
    word_dim=100,
    ngram_size=[2, 3, 4],
    num_filters=[32, 32, 32],
    lr=5e-4,
    batch_size=64,
    n_epochs=50,
    extra_embedding=True,
    y_num=2,
    wd=0,
    drop_prob=0.3,
)
训练过程中验证集上的测评指标随epoch的变化情况如下所示:
两组实验在测试集上的结果如下表所示:
实验设置 | F1-Score | Precision | Recall |
---|---|---|---|
TextCNNV + Word2Vec | 0.7826 | 0.7728 | 0.7927 |
TextCNNH + Word2Vec | 0.7799 | 0.7794 | 0.7804 |
结论:从实验结果可以看出使用CNN进行文本分类确实也能取得不错的性能,限于时间博主并没有进行太过细致的调参实验,有兴趣的小伙伴可以自己去试试。
完整项目下载地址:基于CNN的中文文本分类实战(有条件的可以支持一下)
本文使用的模型图来源于:Convolutional Neural Networks for Text
以上便是本文的全部内容,要是觉得不错的话,可以点个赞或关注一下博主,你们的支持是博主进步的不竭动力,当然要是有问题的话也敬请批评指正!!!