Sentiment analysis of automotive-industry review text using a convolutional neural network (CNN).
Dataset
Owner reviews were crawled from Autohome (汽车之家); the "most satisfied" and "least satisfied" comment fields were extracted and used as the positive and negative sentiment corpora, respectively.
Web-crawling tutorial video: https://pan.baidu.com/s/1ySXWuVmPW79Wa0lDvDKy_Q
The basic corpus statistics (sample counts for the train/dev/test splits) appear in the corpus-loading log in the training section below; the expected file format is illustrated next.
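Each line of a corpus file holds a label (pos or neg) followed by the whitespace-separated, pre-tokenized words of one review, as assumed by load_corpus() and build_word2id() below. The two lines here are purely illustrative:

pos 空间 很 大 油耗 低 性价比 高
neg 噪音 大 车漆 很 薄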
utils.py
Data-preprocessing code.
import os
import time
from datetime import timedelta
from collections import Counter

import numpy as np
import tensorflow.contrib.keras as kr


def time_diff(start_time):
    """Time elapsed since start_time."""
    end_time = time.time()
    diff = end_time - start_time
    return timedelta(seconds=int(round(diff)))


def batch_index(length, batch_size, is_shuffle=True):
    """
    Generate index batches for mini-batch processing.
    :param length: total number of samples
    :param batch_size: batch size
    :param is_shuffle: whether to shuffle the sample order
    :return: a generator yielding one list of sample indices per batch
    """
    index = [idx for idx in range(length)]
    if is_shuffle:
        np.random.shuffle(index)
    for i in range(int(np.ceil(length / batch_size))):
        yield index[i * batch_size:(i + 1) * batch_size]


def cat_to_id(classes=None):
    """
    :param classes: class labels; defaults to pos, neg
    :return: (classes, {class label: id})
    """
    if not classes:
        classes = ['pos', 'neg']
    cat2id = {cat: idx for (idx, cat) in enumerate(classes)}
    return classes, cat2id


def load_corpus(path, word2id, max_sen_len=50):
    """
    :param path: corpus file, one "label word word ..." line per sample
    :param word2id: vocabulary mapping {word: id}
    :param max_sen_len: maximum sentence length; longer sentences are truncated, shorter ones padded
    :return: contents (word-id sequences) and labels (one-hot encoded)
    """
    _, cat2id = cat_to_id()
    contents, labels = [], []
    with open(path, encoding='utf-8') as f:
        for line in f.readlines():
            sp = line.strip().split()
            label = sp[0]
            content = [word2id.get(w, 0) for w in sp[1:]]
            content = content[:max_sen_len]
            if len(content) < max_sen_len:
                content += [word2id['_PAD_']] * (max_sen_len - len(content))
            labels.append(label)
            contents.append(content)
    counter = Counter(labels)
    print('Total number of samples: %d' % len(labels))
    print('Sample count per class:')
    for w in counter:
        print(w, counter[w])
    contents = np.asarray(contents)
    labels = [cat2id[l] for l in labels]
    labels = kr.utils.to_categorical(labels, len(cat2id))
    return contents, labels


def build_word2id(file):
    """
    Build the vocabulary from all files under ./data/ and save it to disk.
    :param file: path where the word-to-id mapping is saved
    :return: None
    """
    word2id = {'_PAD_': 0}
    path = [os.path.join('./data/', w) for w in os.listdir('./data/')]
    for _path in path:
        with open(_path, encoding='utf-8') as f:
            for line in f.readlines():
                sp = line.strip().split()
                for word in sp[1:]:
                    if word not in word2id.keys():
                        word2id[word] = len(word2id)
    with open(file, 'w', encoding='utf-8') as f:
        for w in word2id:
            f.write(w + '\t')
            f.write(str(word2id[w]))
            f.write('\n')

# build_word2id('./data/word_to_id.txt')


def load_word2id(path):
    """
    :param path: path of the word_to_id vocabulary file
    :return: word_to_id: {word: id}
    """
    word_to_id = {}
    with open(path, encoding='utf-8') as f:
        for line in f.readlines():
            sp = line.strip().split()
            word = sp[0]
            idx = int(sp[1])
            if word not in word_to_id:
                word_to_id[word] = idx
    return word_to_id


def build_word2vec(fname, word2id, save_to_path=None):
    """
    :param fname: pre-trained word2vec file (binary format)
    :param word2id: vocabulary of the corpus
    :param save_to_path: optional path to save the corpus word vectors locally
    :return: word vectors for the corpus vocabulary, indexed by word id
    """
    import gensim
    n_words = max(word2id.values()) + 1
    model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
    # Words missing from the pre-trained model keep their random initialization.
    word_vecs = np.array(np.random.uniform(-1., 1., [n_words, model.vector_size]))
    for word in word2id.keys():
        try:
            word_vecs[word2id[word]] = model[word]
        except KeyError:
            pass
    if save_to_path:
        with open(save_to_path, 'w', encoding='utf-8') as f:
            for vec in word_vecs:
                vec = [str(w) for w in vec]
                f.write(' '.join(vec))
                f.write('\n')
    return word_vecs

# word2id = load_word2id('./data/word_to_id.txt')
# w2v = build_word2vec('./data/wiki_word2vec_50.bin', word2id, save_to_path='./data/corpus_word2vec.txt')


def load_corpus_word2vec(path):
    """Load the corpus word vectors (much smaller than the full wiki vectors)."""
    word2vec = []
    with open(path, encoding='utf-8') as f:
        for line in f.readlines():
            sp = [float(w) for w in line.strip().split()]
            word2vec.append(sp)
    return np.asarray(word2vec)
The main functions in utils.py are:
cat_to_id(): returns the class labels and the {label: id} dictionary, i.e. {pos: 0, neg: 1};
build_word2id(): builds the vocabulary and saves it to disk in the form {word: id};
load_word2id(): loads the vocabulary built above;
build_word2vec(): builds word vectors for the words in the training corpus from the pre-trained word2vec model;
load_corpus_word2vec(): loads the corpus word vectors built above;
load_corpus(): loads a corpus file (train/dev/test);
batch_index(): generates batched index sequences.
After preprocessing, the data has the following format:
x: the ids of the words that make up one sentence.
y: the one-hot label: pos -> [1, 0], neg -> [0, 1].
A minimal usage sketch of the preprocessing pipeline is shown below.
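The sketch chains the functions above. The word_to_id and word2vec paths follow the commented-out calls in utils.py; the train/dev/test file names under ./data/ are assumptions:

from utils import build_word2id, load_word2id, build_word2vec, load_corpus

# Build and reload the vocabulary (same paths as the commented examples in utils.py).
build_word2id('./data/word_to_id.txt')
word2id = load_word2id('./data/word_to_id.txt')

# Map every vocabulary word to its pre-trained 50-d vector.
w2v = build_word2vec('./data/wiki_word2vec_50.bin', word2id,
                     save_to_path='./data/corpus_word2vec.txt')

# Load the corpora (file names are assumptions); max_sen_len matches CONFIG below.
x_train, y_train = load_corpus('./data/train.txt', word2id, max_sen_len=75)
x_dev, y_dev = load_corpus('./data/dev.txt', word2id, max_sen_len=75)
x_test, y_test = load_corpus('./data/test.txt', word2id, max_sen_len=75)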
The configurable CNN hyperparameters are listed below.
class CONFIG():
    update_w2v = True            # whether to update the word vectors during training
    vocab_size = 37814           # vocabulary size; must match the size of word2id
    n_class = 2                  # number of classes: pos and neg
    max_sen_len = 75             # maximum sentence length
    embedding_dim = 50           # word-embedding dimension
    batch_size = 100             # batch size
    n_hidden = 256               # number of hidden units in the fully-connected layer
    n_epoch = 10                 # number of training epochs (full passes over the training set)
    opt = 'adam'                 # optimizer: adam or adadelta
    learning_rate = 0.001        # learning rate; not used when opt = 'adadelta'
    drop_keep_prob = 0.5         # dropout keep probability
    num_filters = 256            # number of convolution filters
    kernel_size = 3              # convolution kernel size; 2, 3, 4 or 5 is typical for NLP tasks
    print_per_batch = 100        # print training info every 100 batches
    save_dir = './checkpoints/'  # directory where the trained model is saved
    ...
Model code (TextCNN):
import os
import time

import numpy as np
import tensorflow as tf

from utils import batch_index, time_diff


class TextCNN(object):
    def __init__(self, config, embeddings=None):
        self.update_w2v = config.update_w2v
        self.vocab_size = config.vocab_size
        self.n_class = config.n_class
        self.max_sen_len = config.max_sen_len
        self.embedding_dim = config.embedding_dim
        self.batch_size = config.batch_size
        self.num_filters = config.num_filters
        self.kernel_size = config.kernel_size
        self.n_hidden = config.n_hidden
        self.n_epoch = config.n_epoch
        self.opt = config.opt
        self.learning_rate = config.learning_rate
        self.drop_keep_prob = config.drop_keep_prob

        self.x = tf.placeholder(tf.int32, [None, self.max_sen_len], name='x')
        self.y = tf.placeholder(tf.int32, [None, self.n_class], name='y')

        # self.word_embeddings = tf.constant(embeddings, tf.float32)
        # self.word_embeddings = tf.Variable(embeddings, dtype=tf.float32, trainable=self.update_w2v)
        if embeddings is not None:
            self.word_embeddings = tf.Variable(embeddings, dtype=tf.float32, trainable=self.update_w2v)
        else:
            self.word_embeddings = tf.Variable(
                tf.zeros([self.vocab_size, self.embedding_dim]),
                dtype=tf.float32,
                trainable=self.update_w2v)

        self.build()

    def cnn(self):
        """
        Convolution -> global max pooling -> fully-connected layers.
        :return: fully-connected output before the softmax transform (logits)
        """
        inputs = self.add_embeddings()
        with tf.name_scope("cnn"):
            # convolution layer: [batch, max_sen_len, embedding_dim] -> [batch, steps, num_filters]
            conv = tf.layers.conv1d(inputs, self.num_filters, self.kernel_size, name='conv')
            # global max pooling over the time dimension
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')
            # dropout directly after the convolution layer hurt performance
            # gmp = tf.contrib.layers.dropout(gmp, self.drop_keep_prob)
        with tf.name_scope("score"):
            # fully-connected layer
            fc = tf.layers.dense(gmp, self.n_hidden, name='fc1')
            # dropout
            fc = tf.contrib.layers.dropout(fc, self.drop_keep_prob)
            # nonlinearity
            fc = tf.nn.relu(fc)
            # output layer
            pred = tf.layers.dense(fc, self.n_class, name='fc2')
        return pred

    def add_embeddings(self):
        inputs = tf.nn.embedding_lookup(self.word_embeddings, self.x)
        return inputs

    def add_loss(self, pred):
        cost = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=self.y)
        cost = tf.reduce_mean(cost)
        return cost

    def add_optimizer(self, loss):
        if self.opt == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-6)
        else:
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
        opt = optimizer.minimize(loss)
        return opt

    def add_accuracy(self, pred):
        correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(self.y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        return accuracy

    def get_batches(self, x, y=None, batch_size=100, is_shuffle=True):
        for index in batch_index(len(x), batch_size, is_shuffle=is_shuffle):
            n = len(index)
            feed_dict = {
                self.x: x[index]
            }
            if y is not None:
                feed_dict[self.y] = y[index]
            yield feed_dict, n

    def build(self):
        self.pred = self.cnn()
        self.loss = self.add_loss(self.pred)
        self.accuracy = self.add_accuracy(self.pred)
        self.optimizer = self.add_optimizer(self.loss)

    def train_on_batch(self, sess, feed):
        _, _loss, _acc = sess.run([self.optimizer, self.loss, self.accuracy], feed_dict=feed)
        return _loss, _acc

    def test_on_batch(self, sess, feed):
        _loss, _acc = sess.run([self.loss, self.accuracy], feed_dict=feed)
        return _loss, _acc

    def predict_on_batch(self, sess, feed, prob=True):
        result = tf.argmax(self.pred, 1)
        if prob:
            result = tf.nn.softmax(logits=self.pred, dim=1)
        res = sess.run(result, feed_dict=feed)
        return res

    def predict(self, sess, x, prob=False):
        yhat = []
        for _feed, _ in self.get_batches(x, batch_size=self.batch_size, is_shuffle=False):
            _yhat = self.predict_on_batch(sess, _feed, prob)
            yhat += _yhat.tolist()
            # yhat.append(_yhat)
        return np.array(yhat)

    def evaluate(self, sess, x, y):
        """Compute the loss and accuracy on a given dataset."""
        num = len(x)
        total_loss, total_acc = 0., 0.
        for _feed, _n in self.get_batches(x, y, batch_size=self.batch_size):
            loss, acc = self.test_on_batch(sess, _feed)
            total_loss += loss * _n
            total_acc += acc * _n
        return total_loss / num, total_acc / num

    def fit(self, sess, x_train, y_train, x_dev, y_dev, save_dir=None, print_per_batch=100):
        saver = tf.train.Saver()
        if save_dir:
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)

        sess.run(tf.global_variables_initializer())
        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0            # total batch counter
        best_acc_dev = 0.0         # best accuracy on the dev set
        last_improved = 0          # batch index of the last improvement
        require_improvement = 500  # stop early if no improvement for 500 batches
        flags = False
        for epoch in range(self.n_epoch):
            print('Epoch:', epoch + 1)
            for train_feed, train_n in self.get_batches(x_train, y_train, batch_size=self.batch_size):
                loss_train, acc_train = self.train_on_batch(sess, train_feed)
                loss_dev, acc_dev = self.evaluate(sess, x_dev, y_dev)
                if total_batch % print_per_batch == 0:
                    if acc_dev > best_acc_dev:
                        # save the model that performs best on the dev set
                        best_acc_dev = acc_dev
                        last_improved = total_batch
                        if save_dir:
                            saver.save(sess=sess, save_path=os.path.join(save_dir, 'sa-model'))
                        improved_str = '*'
                    else:
                        improved_str = ''
                    time_dif = time_diff(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' + \
                          ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train, loss_dev, acc_dev, time_dif, improved_str))
                total_batch += 1
                if total_batch - last_improved > require_improvement:
                    print('No optimization for a long time, auto-stopping...')
                    flags = True
                    break
            if flags:
                break
Run training. A driver sketch is given below, followed by the actual training log.
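This is a minimal sketch of a training driver, not the project's actual train script: the ./data/train.txt and ./data/dev.txt file names and the assumption that CONFIG and TextCNN are importable are mine, while the utility calls and the fit() signature come from the code above.

import tensorflow as tf

from utils import load_word2id, build_word2vec, load_corpus

config = CONFIG()
word2id = load_word2id('./data/word_to_id.txt')               # vocabulary built by build_word2id()
w2v = build_word2vec('./data/wiki_word2vec_50.bin', word2id)  # pre-trained 50-d vectors

x_train, y_train = load_corpus('./data/train.txt', word2id, max_sen_len=config.max_sen_len)
x_dev, y_dev = load_corpus('./data/dev.txt', word2id, max_sen_len=config.max_sen_len)

# Build the graph with the pre-trained embeddings and start training.
tf.reset_default_graph()
model = TextCNN(config, embeddings=w2v)
with tf.Session() as sess:
    model.fit(sess, x_train, y_train, x_dev, y_dev,
              save_dir=config.save_dir, print_per_batch=config.print_per_batch)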
Loading word2vec==========================
Loading train corpus========================
Total number of samples: 40000
Sample count per class:
pos 20000
neg 20000
Loading dev corpus==========================
Total number of samples: 10000
Sample count per class:
pos 5000
neg 5000
Loading test corpus=========================
Total number of samples: 20000
Sample count per class:
pos 10000
neg 10000
Training and evaluating...
Epoch: 1
Iter: 0, Train Loss: 0.71, Train Acc: 51.00%, Val Loss: 0.86, Val Acc: 49.96%, Time: 0:00:04 *
Iter: 100, Train Loss: 0.29, Train Acc: 89.00%, Val Loss: 0.26, Val Acc: 89.16%, Time: 0:04:37 *
Iter: 200, Train Loss: 0.22, Train Acc: 93.00%, Val Loss: 0.2, Val Acc: 91.85%, Time: 0:09:05 *
Iter: 300, Train Loss: 0.082, Train Acc: 96.00%, Val Loss: 0.17, Val Acc: 93.26%, Time: 0:13:26 *
Epoch: 2
Iter: 400, Train Loss: 0.16, Train Acc: 96.00%, Val Loss: 0.17, Val Acc: 93.19%, Time: 0:17:52
Iter: 500, Train Loss: 0.11, Train Acc: 97.00%, Val Loss: 0.17, Val Acc: 93.51%, Time: 0:22:11 *
Iter: 600, Train Loss: 0.16, Train Acc: 97.00%, Val Loss: 0.15, Val Acc: 94.22%, Time: 0:26:36 *
Iter: 700, Train Loss: 0.15, Train Acc: 91.00%, Val Loss: 0.15, Val Acc: 94.05%, Time: 0:30:54
Epoch: 3
Iter: 800, Train Loss: 0.11, Train Acc: 95.00%, Val Loss: 0.15, Val Acc: 94.13%, Time: 0:35:13
Iter: 900, Train Loss: 0.058, Train Acc: 97.00%, Val Loss: 0.16, Val Acc: 94.33%, Time: 0:39:37 *
Iter: 1000, Train Loss: 0.048, Train Acc: 98.00%, Val Loss: 0.15, Val Acc: 94.33%, Time: 0:43:53
Iter: 1100, Train Loss: 0.054, Train Acc: 97.00%, Val Loss: 0.16, Val Acc: 94.10%, Time: 0:48:21
Epoch: 4
Iter: 1200, Train Loss: 0.065, Train Acc: 96.00%, Val Loss: 0.16, Val Acc: 94.52%, Time: 0:52:43 *
Iter: 1300, Train Loss: 0.056, Train Acc: 97.00%, Val Loss: 0.17, Val Acc: 94.55%, Time: 0:57:09 *
Iter: 1400, Train Loss: 0.016, Train Acc: 100.00%, Val Loss: 0.17, Val Acc: 94.40%, Time: 1:01:30
Iter: 1500, Train Loss: 0.1, Train Acc: 97.00%, Val Loss: 0.16, Val Acc: 94.90%, Time: 1:05:49 *
Epoch: 5
Iter: 1600, Train Loss: 0.021, Train Acc: 99.00%, Val Loss: 0.16, Val Acc: 94.28%, Time: 1:10:00
Iter: 1700, Train Loss: 0.045, Train Acc: 99.00%, Val Loss: 0.18, Val Acc: 94.40%, Time: 1:14:16
Iter: 1800, Train Loss: 0.036, Train Acc: 98.00%, Val Loss: 0.21, Val Acc: 94.10%, Time: 1:18:36
Iter: 1900, Train Loss: 0.014, Train Acc: 100.00%, Val Loss: 0.2, Val Acc: 94.18%, Time: 1:22:59
The best accuracy on the validation set is 94.90%.
Run test() to evaluate on the test set.
INFO:tensorflow:Restoring parameters from ./checkpoints/sa-model
Precision, Recall and F1-Score...
precision recall f1-score support
pos 0.96 0.96 0.96 10000
neg 0.96 0.96 0.96 10000
avg / total 0.96 0.96 0.96 20000
Confusion Matrix...
[[9597 403]
[ 449 9551]]
Accuracy on the test set reaches 95.74%, and precision, recall, and F1-score all exceed 95% for both classes.
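A sketch of how such a test report could be produced with scikit-learn after restoring the saved checkpoint; the project's actual test script is not listed in this document, so the details below (file names, importing CONFIG and TextCNN) are assumptions.

import numpy as np
import tensorflow as tf
from sklearn import metrics

from utils import load_word2id, load_corpus

config = CONFIG()
word2id = load_word2id('./data/word_to_id.txt')
x_test, y_test = load_corpus('./data/test.txt', word2id, max_sen_len=config.max_sen_len)

tf.reset_default_graph()
model = TextCNN(config)
with tf.Session() as sess:
    tf.train.Saver().restore(sess, './checkpoints/sa-model')
    y_pred = model.predict(sess, x_test, prob=False)  # predicted class ids
    y_true = np.argmax(y_test, 1)                     # one-hot labels -> class ids

print('Precision, Recall and F1-Score...')
print(metrics.classification_report(y_true, y_pred, target_names=['pos', 'neg']))
print('Confusion Matrix...')
print(metrics.confusion_matrix(y_true, y_pred))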
predict.py
Call predict() in predict.py to classify new text; a sketch of a possible implementation follows the example below.
>> test = ['噪音大、车漆很薄', '性价比很高,价位不高,又皮实耐用。']
>> print(predict(test, label=True))
INFO:tensorflow:Restoring parameters from ./checkpoints/sa-model
['neg', 'pos']
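predict() itself is not listed in this document; the following sketch shows one way it could be implemented on top of the code above. The use of jieba for tokenization, the checkpoint path handling, and the helper structure are assumptions.

import jieba  # assumed tokenizer for raw (unsegmented) input text
import numpy as np
import tensorflow as tf

from utils import load_word2id, cat_to_id


def predict(texts, label=False):
    """Hypothetical predict(): tokenize, map to ids, pad, restore the checkpoint, classify."""
    config = CONFIG()
    word2id = load_word2id('./data/word_to_id.txt')

    # Mirror the preprocessing in load_corpus(): truncate/pad to max_sen_len.
    x = []
    for text in texts:
        words = jieba.lcut(text)
        ids = [word2id.get(w, 0) for w in words][:config.max_sen_len]
        ids += [word2id['_PAD_']] * (config.max_sen_len - len(ids))
        x.append(ids)
    x = np.asarray(x)

    tf.reset_default_graph()
    model = TextCNN(config)
    with tf.Session() as sess:
        tf.train.Saver().restore(sess, './checkpoints/sa-model')
        pred = model.predict(sess, x, prob=False)

    if label:
        classes, _ = cat_to_id()  # ['pos', 'neg']
        return [classes[i] for i in pred]
    return pred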
Deep learning video: https://pan.baidu.com/s/13rMKQDEXR3jwf6uOdplnZA (extraction code: D2B2)
Natural language processing video: https://pan.baidu.com/s/19cojEzvTdCBb--K_0pzgYw
Project download: https://download.csdn.net/download/weixin_40651515/10973640