The dataset comes from GitHub (many thanks to the original collector). It consists of JD.com shopping reviews split into two text files, one for positive and one for negative sentiment: 947 positive samples and 2,142 negative samples.
All words in the corpus are kept for word-vector training. The vectors are trained with gensim, which is very convenient and fast; highly recommended.
First, the word-vector training code (vector dimension: 100):
from gensim.models.word2vec import Word2Vec
import textutil
# Load the raw review texts
pos = textutil.load_txt("pos.txt")
neg = textutil.load_txt("neg.txt")
# Segment the reviews into word lists
pos_list = textutil.seg_words(pos)
neg_list = textutil.seg_words(neg)
# Build the word2vec model; the corpus is small, so keep every word (min_count=1)
model = Word2Vec(pos_list + neg_list, sg=1, size=100, window=5, min_count=1, negative=3, sample=0.001, hs=1, workers=4)
# Sanity check: similarity between related words
model.wv.similarity(u"好用", u"不错")
# Save the model
model.save("评论.model")
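After training it is worth checking that the vectors actually capture sentiment-related similarity. A quick sanity check, assuming the gensim version used above (where the vectors are exposed through model.wv):
from gensim.models.word2vec import Word2Vec
# Reload the saved model and inspect the neighbourhood of a sentiment word
model = Word2Vec.load("评论.model")
print(model.wv.most_similar(u"好用", topn=5))   # words closest to "好用" in the embedding space
print(model.wv.similarity(u"好用", u"不错"))    # cosine similarity between two positive words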
After the word vectors are trained, note that the later preprocessing uses CountVectorizer and TfidfVectorizer, and by default both filter out Chinese words of length 1. This is a real pitfall. Following the blog post http://www.cnblogs.com/zz22--/p/9492720.html, I modified the relevant scikit-learn source; see that post for details.
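Editing the library source works, but an alternative that leaves scikit-learn untouched is to override token_pattern: the default regex \b\w\w+\b is exactly what discards single-character tokens. A minimal sketch:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# The default token_pattern r"(?u)\b\w\w+\b" only keeps tokens of length >= 2;
# relaxing it to r"(?u)\b\w+\b" preserves single-character Chinese words as well.
count_vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")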
With the preparation done, we can start training the LSTM for sentiment analysis:
(1) Corpus preprocessing
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
import textutil
# Load the data
pos = textutil.load_txt("pos.txt")
neg = textutil.load_txt("neg.txt")
# Segment the texts
pos = textutil.seg_words_with_blank(pos)
neg = textutil.seg_words_with_blank(neg)
# Merge the two classes, attaching one-hot labels
randIt = []
data = []
labels = []
for i in range(len(pos)):
    randIt.append([pos[i], [0, 1]])  # positive
for i in range(len(neg)):
    randIt.append([neg[i], [1, 0]])  # negative
for i in range(len(randIt)):
    data.append(randIt[i][0])
    labels.append(randIt[i][1])
# Hyperparameter settings
batch_size = 50        # samples per batch
lstm_size = 128        # number of units in each LSTM cell
num_layers = 3         # number of stacked LSTM layers
target_classes = 2     # two output classes (positive / negative)
learning_rate = 0.001
keep_prob = 0.5
num_keywords = 2000
word_classes = 100     # word-vector dimension
# Extract keywords
keywords = textutil.key_words(data, num_keywords)
# Build a sparse term matrix restricted to the keywords
textmatrix = textutil.count_sparse_matrix(data, True)
textmatrix = textmatrix[keywords]
# Rearrange into per-sample word lists plus their lengths (time steps)
wordlists, steps = textutil.matrix_to_wordlists(textmatrix)
worddata = pd.DataFrame({"WORD": wordlists, "STEP": steps})
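The textutil helpers above are the author's own module and are not listed in this post (see the repository linked at the end). For orientation, here is a minimal sketch of what the simpler helpers might look like, assuming jieba for segmentation and TfidfVectorizer for keyword scoring; the actual implementations may differ:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

def load_txt(path):
    # One review per line
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def seg_words(texts):
    # Token lists, the format word2vec training expects
    return [list(jieba.cut(t)) for t in texts]

def seg_words_with_blank(texts):
    # Space-joined tokens, the format CountVectorizer/TfidfVectorizer expect
    return [" ".join(jieba.cut(t)) for t in texts]

def key_words(texts, num_keywords):
    # Rank the vocabulary by total tf-idf weight and keep the top num_keywords terms
    vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf = vec.fit_transform(texts)
    scores = tfidf.sum(axis=0).A1
    ranked = sorted(zip(vec.get_feature_names(), scores), key=lambda x: -x[1])
    return [w for w, _ in ranked[:num_keywords]]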
(2) Split the data into training and test sets, and define a get_batches generator for stochastic gradient descent
# Split into training and test sets
traindata, testdata, trainlabel, testlabel = train_test_split(worddata, labels, test_size=0.2)
# Load the trained word2vec model
model = Word2Vec.load("评论.model")
# Look up the vector for a word, falling back to a zero vector for unknown words
def word_to_vec(word):
    try:
        wordvec = model.wv[word]
    except KeyError:
        print(word, "has no word vector")
        wordvec = np.array([0] * 100)  # 100-dimensional zero vector, matching word_classes
    return wordvec
# get_batches generator; the labels it yields are already one-hot encoded
def get_batches(traindata, trainlabel, batch_size, word_classes):
    n_batches = len(traindata) // batch_size
    traindata = traindata.iloc[:batch_size * n_batches, :]
    for i in range(n_batches):
        dataframe_x = traindata.iloc[i * batch_size : (i + 1) * batch_size]
        batch_y = np.array(trainlabel[i * batch_size : (i + 1) * batch_size])
        step = max(dataframe_x["STEP"])  # longest sequence in this batch
        word_x = dataframe_x["WORD"]
        batch_item = []
        for item in word_x:
            steps_x = []
            for j in range(step):
                if j < len(item):
                    step_word = word_to_vec(item[j])
                else:
                    step_word = [0] * word_classes  # pad with zero vectors
                steps_x.append(step_word)
            batch_item.append(np.array(steps_x))
        batch_x = np.array(batch_item)
        yield batch_x, batch_y, step
# Quick test of get_batches
i = 1
for x, y, step in get_batches(traindata, trainlabel, batch_size, word_classes):
    print("************* Batch", i, "**********************")
    print("step:", step)
    print("x shape:", x.shape)
    print("y shape:", y.shape)
    i += 1
    print("********************************************")
As the output of get_batches shows, the number of time steps differs from batch to batch:
************* Batch 1 **********************
step: 24
x shape: (50, 24, 100)
y shape: (50, 2)
********************************************
************* Batch 2 **********************
step: 52
x shape: (50, 52, 100)
y shape: (50, 2)
********************************************
************* Batch 3 **********************
step: 32
x shape: (50, 32, 100)
y shape: (50, 2)
********************************************
************* Batch 4 **********************
step: 34
x shape: (50, 34, 100)
y shape: (50, 2)
********************************************
************* Batch 5 **********************
step: 55
x shape: (50, 55, 100)
y shape: (50, 2)
********************************************
************* Batch 6 **********************
step: 28
x shape: (50, 28, 100)
y shape: (50, 2)
********************************************
************* Batch 7 **********************
step: 28
x shape: (50, 28, 100)
y shape: (50, 2)
********************************************
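Because the time dimension varies per batch, the inputs placeholder in the next step declares it as None. One thing to be aware of is that the zero-padded positions are still processed by the LSTM; a common refinement (not used in the code below) is to pass the true review lengths to tf.nn.dynamic_rnn through its sequence_length argument so the recurrence stops at the real end of each sequence. A minimal sketch, assuming get_batches were extended to also yield a vector of unpadded lengths:
# Hypothetical variant: seq_len holds the unpadded number of words per review
seq_len = tf.placeholder(tf.int32, [batch_size], name="seq_len")
# cell, inputs and initial_state stand for the corresponding tensors built in CharRNN below
outputs, state = tf.nn.dynamic_rnn(cell, inputs,
                                   sequence_length=seq_len,
                                   initial_state=initial_state)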
(3) Building the LSTM network and the fully connected layers
# Input placeholders
def build_inputs(batch_size, word_classes, target_classes):
    inputs = tf.placeholder(tf.float32, [batch_size, None, word_classes], name="inputs")
    targets = tf.placeholder(tf.float32, [batch_size, target_classes], name="targets")
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    return inputs, targets, keep_prob

# A single LSTM cell with dropout on its output
def lstm_cell(lstm_size, keep_prob):
    cell = tf.contrib.rnn.LSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

# Stack several LSTM layers
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell(lstm_size, keep_prob) for _ in range(num_layers)],
                                       state_is_tuple=True)
    # Initialize the LSTM state (not the weights) to zeros
    initial_state = cell.zero_state(batch_size, tf.float32)
    return cell, initial_state
# Build the output: the last LSTM output fed through two fully connected layers
def build_output(lstm_output, lstm_size, target_classes):
    seq_output = tf.concat(lstm_output, axis=1)
    seq_output = seq_output[:, -1, :]  # take the output of the last time step
    print("LSTM output fed into the dense layers:", seq_output)
    x = tf.reshape(seq_output, [-1, lstm_size])
    print("shape after reshape:", x)
    with tf.variable_scope("softmax"):
        hidden_w = tf.Variable(tf.truncated_normal((lstm_size, lstm_size // 2), stddev=0.1))
        hidden_b = tf.Variable(tf.zeros(lstm_size // 2))
        softmax_w = tf.Variable(tf.truncated_normal((lstm_size // 2, target_classes), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(target_classes))
    hidden = tf.matmul(x, hidden_w) + hidden_b
    hidden = tf.nn.relu(hidden)
    logits = tf.matmul(hidden, softmax_w) + softmax_b
    out = tf.nn.softmax(logits, name="predictions")
    print("out:", out)
    print("logits:", logits)
    return out, logits
# Loss
def build_loss(logits, targets):
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets)
    loss = tf.reduce_mean(loss)
    return loss

# Accuracy
def build_accuracy(logits, targets):
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(targets, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    return accuracy

# Adam optimizer with gradient clipping
def build_optimizer(loss, learning_rate, grad_clip):
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    return optimizer
class CharRNN:
    def __init__(self, word_classes=word_classes, target_classes=target_classes,
                 batch_size=100, lstm_size=20, num_layers=5, learning_rate=0.01,
                 grad_clip=5):
        tf.reset_default_graph()
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, word_classes, target_classes)
        print("input placeholder:", self.inputs)
        # Build the stacked LSTM and run it over the inputs
        cell, self.initial_state = build_lstm(lstm_size, num_layers,
                                              batch_size, self.keep_prob)
        outputs, state = tf.nn.dynamic_rnn(cell, self.inputs,
                                           initial_state=self.initial_state)
        self.final_state = state
        self.prediction, self.logits = build_output(outputs, lstm_size, target_classes)
        self.loss = build_loss(self.logits, self.targets)
        self.accuracy = build_accuracy(self.logits, self.targets)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)
(4) Creating the session and training
# Training
epochs = 10
model_train = CharRNN(word_classes=word_classes, target_classes=target_classes, batch_size=batch_size,
                      lstm_size=lstm_size, num_layers=num_layers, learning_rate=learning_rate)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print("\n************************ training start **************************")
    count = 0
    while True:
        for i in range(epochs):
            new_state = sess.run(model_train.initial_state)
            loss = 0
            for x, y, step in get_batches(traindata, trainlabel, batch_size, word_classes):
                feed = {
                    model_train.inputs: x,
                    model_train.targets: y,
                    model_train.keep_prob: keep_prob,
                    model_train.initial_state: new_state}
                batch_loss, _ = sess.run([model_train.loss, model_train.optimizer], feed_dict=feed)
                loss += batch_loss
            print("Epoch", epochs * count + i + 1, "loss:", loss)
        count += 1
        # Evaluate on the test set
        test_new_state = sess.run(model_train.initial_state)
        accuracy = 0
        for x, y, step in get_batches(testdata, testlabel, batch_size, word_classes):
            feed = {
                model_train.inputs: x,
                model_train.targets: y,
                model_train.keep_prob: 1.,
                model_train.initial_state: test_new_state}
            test_accuracy = sess.run(model_train.accuracy, feed_dict=feed)
            accuracy += test_accuracy
        avg_accuracy = accuracy / (testdata.shape[0] // batch_size)  # mean accuracy over all test batches
        print("test accuracy:", avg_accuracy)
        if avg_accuracy > 0.90:
            print("\n************************ training done **************************")
            break
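Note that the script never persists the trained network, so predictions are only possible while this session is open. If you want to reuse the model later, a checkpoint can be written with tf.train.Saver before the session closes; a minimal sketch (the checkpoint path is arbitrary):
# Inside the `with tf.Session() as sess:` block, after training finishes
saver = tf.train.Saver()
saver.save(sess, "checkpoints/sentiment_lstm.ckpt")
# Later, rebuild the graph with CharRNN(...) and restore the weights:
# saver.restore(sess, "checkpoints/sentiment_lstm.ckpt")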
Source code:
https://github.com/freeingfree/lstm