Sentiment Analysis with Word Vectors + LSTM

The dataset comes from GitHub (many thanks to the original author for collecting it). It consists of JD.com shopping reviews, split into two text files: one with positive reviews and one with negative reviews.

There are 947 positive samples and 2,142 negative samples.

All of the words in the corpus are used to train the word vectors. The vectors are trained with gensim, which is very convenient and fast; highly recommended!

First, the word-vector training code (vector dimension: 100):

from gensim.models.word2vec import Word2Vec
import text_util

pos = text_util.load_txt("pos.txt")
neg = text_util.load_txt("neg.txt")

pos_list = text_util.seg_words(pos)
neg_list = text_util.seg_words(neg)

# Build the word2vec model; the corpus is small, so keep every word (min_count=1)
model = Word2Vec(pos_list + neg_list, sg=1, size=100, window=5, min_count=1, negative=3, sample=0.001, hs=1, workers=4)

# Check how related two word vectors are
model.similarity(u"好用", u"不错")

# Save the model
model.save("评论.model")
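
As a quick sanity check of the trained vectors, gensim can also list the nearest neighbours of a word. A minimal usage sketch (the query word is just an example):

# Nearest neighbours of a word in the trained embedding space
print(model.wv.most_similar(u"好用", topn=5))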

After the word vectors are trained, note one pitfall: the preprocessing below uses CountVectorizer and TfidfVectorizer, and by default both of them drop tokens of length 1, which discards many single-character Chinese words. Following the blog post http://www.cnblogs.com/zz22--/p/9492720.html, I modified the relevant library source code; see that post for details.
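
A less invasive alternative (not what the original post did, but standard scikit-learn usage) is to pass a custom token_pattern so that single-character tokens are kept, for example:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The default token_pattern r"(?u)\b\w\w+\b" requires at least two characters;
# this variant keeps single-character (Chinese) tokens as well.
count_vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")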

With the preparation done, we can start training the LSTM sentiment classifier:

(1) Corpus preprocessing

# Imports used from here on: numpy, pandas, TensorFlow 1.x, scikit-learn, gensim,
# plus the author's helper module (assumed to be the same module as text_util above)
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
import text_util as textutil

# Load the data
pos = textutil.load_txt("pos.txt")
neg = textutil.load_txt("neg.txt")

# Segment the reviews into words
pos = textutil.seg_words_with_blank(pos)
neg = textutil.seg_words_with_blank(neg)

# Merge the two classes into one dataset
randIt = []
data = []
labels = []
for i in range(len(pos)):
    randIt.append([pos[i], [0, 1]])  # positive
for i in range(len(neg)):
    randIt.append([neg[i], [1, 0]])  # negative
for i in range(len(randIt)):
    data.append(randIt[i][0])
    labels.append(randIt[i][1])

# Hyperparameters
batch_size = 50        # samples per batch
lstm_size = 128        # number of units in each LSTM cell
num_layers = 3         # number of stacked LSTM layers
target_classes = 2     # two output classes (positive / negative)
learning_rate = 0.001
keep_prob = 0.5        # dropout keep probability
num_keywords = 2000    # number of keywords to extract
word_classes = 100     # word-vector dimension

# Extract keywords
keywords = textutil.key_words(data, num_keywords)

# Build a sparse count matrix and keep only the keyword columns
textmatrix = textutil.count_sparse_matrix(data, True)
textmatrix = textmatrix[keywords]

# Convert the matrix into per-sample word lists and their lengths (time steps)
wordlists, steps = textutil.matrix_to_wordlists(textmatrix)
worddata = pd.DataFrame({"WORD": wordlists, "STEP": steps})
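
The helper functions above come from the author's own text_util module, which is not shown in this post. For readers who want to follow the pipeline without the repository, here is a rough sketch of what those helpers are assumed to do (function names are taken from how they are called; the implementations are my guesses, not the author's actual code):

import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def load_txt(path):
    # One review per line
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def seg_words(lines):
    # Token lists, the input format gensim's Word2Vec expects
    return [list(jieba.cut(line)) for line in lines]

def seg_words_with_blank(lines):
    # Space-joined tokens, the input format the scikit-learn vectorizers expect
    return [" ".join(jieba.cut(line)) for line in lines]

def key_words(texts, num_keywords):
    # Keep the num_keywords most frequent terms (max_features orders by corpus frequency)
    tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_features=num_keywords)
    tfidf.fit(texts)
    return list(tfidf.get_feature_names())  # get_feature_names_out() on scikit-learn >= 1.0

def count_sparse_matrix(texts, binary):
    # Word-occurrence matrix as a DataFrame so columns can be selected by keyword
    vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b", binary=binary)
    counts = vec.fit_transform(texts)
    return pd.DataFrame(counts.toarray(), columns=vec.get_feature_names())

def matrix_to_wordlists(matrix):
    # For each sample, collect the keywords it contains and the resulting sequence length
    wordlists, steps = [], []
    for _, row in matrix.iterrows():
        words = [w for w in matrix.columns if row[w] > 0]
        wordlists.append(words)
        steps.append(len(words))
    return wordlists, steps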

(2) Split into training and test sets, and create a get_batches method for stochastic gradient descent

# Split into training and test sets
traindata, testdata, trainlabel, testlabel = train_test_split(worddata, labels, test_size=0.2)

# Load the trained word-vector model
model = Word2Vec.load("评论.model")

# Look up the vector for a word (zero vector if the word is not in the vocabulary)
def word_to_vec(word):
    try:
        wordvec = model.wv[word]
    except KeyError:
        print(word, "has no word vector")
        wordvec = np.array([0] * 100)
    return wordvec

# get_batches: yields word-vector batches (zero-padded to the batch's longest sequence) and one-hot labels
def get_batches(traindata, trainlabel, batch_size, word_classes):
    n_batches = len(traindata) // batch_size
    traindata = traindata.iloc[: batch_size * n_batches, :]

    for i in range(n_batches):
        dataframe_x = traindata.iloc[i * batch_size : (i + 1) * batch_size]
        batch_y = np.array(trainlabel[i * batch_size : (i + 1) * batch_size])
        step = max(dataframe_x["STEP"])
        word_x = dataframe_x["WORD"]
        batch_item = []
        for item in word_x:
            steps_x = []
            for j in range(step):
                if j < len(item):
                    step_word = word_to_vec(item[j])
                else:
                    step_word = [0] * word_classes  # zero-padding
                steps_x.append(step_word)
            batch_item.append(np.array(steps_x))
        batch_x = np.array(batch_item)
        yield batch_x, batch_y, step

# Quick check of get_batches
i = 1
for x, y, step in get_batches(traindata, trainlabel, batch_size, word_classes):
    print("************* Batch", i, "*************")
    print("step:", step)
    print("x shape:", x.shape)
    print("y shape:", y.shape)
    i += 1
    print("********************************************")

As the quick check of get_batches shows, the maximum time-step length differs from batch to batch, because each batch is only zero-padded to its own longest sequence:

************* Batch 1 *************
step: 24
x shape: (50, 24, 100)
y shape: (50, 2)
********************************************
************* Batch 2 *************
step: 52
x shape: (50, 52, 100)
y shape: (50, 2)
********************************************
************* Batch 3 *************
step: 32
x shape: (50, 32, 100)
y shape: (50, 2)
********************************************
************* Batch 4 *************
step: 34
x shape: (50, 34, 100)
y shape: (50, 2)
********************************************
************* Batch 5 *************
step: 55
x shape: (50, 55, 100)
y shape: (50, 2)
********************************************
************* Batch 6 *************
step: 28
x shape: (50, 28, 100)
y shape: (50, 2)
********************************************
************* Batch 7 *************
step: 28
x shape: (50, 28, 100)
y shape: (50, 2)
********************************************
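
These varying step counts do not require rebuilding the graph: the inputs placeholder defined in section (3) below leaves the time dimension unspecified, so any of the batch shapes above can be fed to it. A tiny standalone check (TensorFlow 1.x assumed):

import numpy as np
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [50, None, 100], name="inputs_demo")
shape_op = tf.shape(inputs)
with tf.Session() as sess:
    for steps in (24, 52, 32):
        batch = np.zeros((50, steps, 100), dtype=np.float32)
        # The same placeholder accepts batches with different numbers of time steps
        print(sess.run(shape_op, feed_dict={inputs: batch}))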

(3) Build the LSTM network and the fully connected layers

# Input placeholders
def build_inputs(batch_size, word_classes, target_classes):
    inputs = tf.placeholder(tf.float32, [batch_size, None, word_classes], name="inputs")
    targets = tf.placeholder(tf.float32, [batch_size, target_classes], name="targets")

    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    return inputs, targets, keep_prob

# A single LSTM cell wrapped with dropout
def lstm_cell(lstm_size, keep_prob):
    cell = tf.contrib.rnn.LSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

# Stack multiple LSTM layers
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell(lstm_size, keep_prob) for _ in range(num_layers)],
                                       state_is_tuple=True)
    # Initialize the LSTM state (not the weights) to zeros
    initial_state = cell.zero_state(batch_size, tf.float32)
    return cell, initial_state

# Output head: two fully connected layers on top of the last LSTM output
def build_output(lstm_output, lstm_size, target_classes):
    seq_output = tf.concat(lstm_output, axis=1)
    seq_output = seq_output[:, -1, :]  # take the output at the last time step
    print("Shape of the LSTM output fed to the dense layers:", seq_output)
    x = tf.reshape(seq_output, [-1, lstm_size])
    print("Shape after reshape:", x)

    with tf.variable_scope("softmax"):
        hidden_w = tf.Variable(tf.truncated_normal((lstm_size, lstm_size // 2), stddev=0.1))
        hidden_b = tf.Variable(tf.zeros(lstm_size // 2))

        softmax_w = tf.Variable(tf.truncated_normal((lstm_size // 2, target_classes), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(target_classes))

    hidden = tf.matmul(x, hidden_w) + hidden_b
    hidden = tf.nn.relu(hidden)

    logits = tf.matmul(hidden, softmax_w) + softmax_b
    out = tf.nn.softmax(logits, name="predictions")
    print("Shape of out:", out)
    print("Shape of logits:", logits)
    return out, logits

# Loss: softmax cross-entropy averaged over the batch
def build_loss(logits, targets):
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets)
    loss = tf.reduce_mean(loss)
    return loss

# Accuracy
def build_accuracy(logits, targets):
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(targets, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    return accuracy

# Adam optimizer with gradient clipping
def build_optimizer(loss, learning_rate, grad_clip):
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    return optimizer


# Wires the placeholders, stacked LSTM, output head, loss, accuracy and optimizer into one graph
# (the class name CharRNN is kept from the original code)
class CharRNN:
    def __init__(self, word_classes=word_classes, target_classes=target_classes,
                 batch_size=100, lstm_size=20, num_layers=5, learning_rate=0.01,
                 grad_clip=5):
        tf.reset_default_graph()
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, word_classes, target_classes)
        print("Shape of the inputs:", self.inputs)

        # Build the stacked LSTM
        cell, self.initial_state = build_lstm(lstm_size, num_layers,
                                              batch_size, self.keep_prob)

        outputs, state = tf.nn.dynamic_rnn(cell, self.inputs,
                                           initial_state=self.initial_state)

        self.final_state = state
        self.prediction, self.logits = build_output(outputs, lstm_size, target_classes)
        self.loss = build_loss(self.logits, self.targets)
        self.accuracy = build_accuracy(self.logits, self.targets)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)
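
One detail worth noting: shorter reviews inside a batch are padded with zero vectors, and the LSTM above keeps stepping over that padding before the last time step's output is taken. tf.nn.dynamic_rnn accepts an optional sequence_length argument that stops state updates at each sample's true length; a minimal sketch of how it could be wired in (the seq_len placeholder is an addition of mine, not part of the original code):

import tensorflow as tf

batch_size, word_classes, lstm_size = 50, 100, 128

inputs = tf.placeholder(tf.float32, [batch_size, None, word_classes], name="inputs")
seq_len = tf.placeholder(tf.int32, [batch_size], name="seq_len")  # true length of each sample

cell = tf.contrib.rnn.LSTMCell(lstm_size)
initial_state = cell.zero_state(batch_size, tf.float32)

# With sequence_length set, outputs past each sample's true length are zero and the
# returned state is the state at that sample's last real time step.
outputs, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=seq_len,
                                   initial_state=initial_state)

To use this, get_batches would also have to yield the per-sample lengths, for example np.array([min(len(item), step) for item in word_x]).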



(4) Create a session and start training

# Start training
epochs = 10
model_train = CharRNN(word_classes=word_classes, target_classes=target_classes, batch_size=batch_size,
                      lstm_size=lstm_size, num_layers=num_layers, learning_rate=learning_rate)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    print("\n************************ Training start ************************")
    count = 0
    while True:
        for i in range(epochs):
            new_state = sess.run(model_train.initial_state)
            loss = 0
            for x, y, step in get_batches(traindata, trainlabel, batch_size, word_classes):
                feed = {
                    model_train.inputs: x,
                    model_train.targets: y,
                    model_train.keep_prob: keep_prob,
                    model_train.initial_state: new_state}
                batch_loss, _ = sess.run([model_train.loss, model_train.optimizer], feed_dict=feed)
                loss += batch_loss
            print("Epoch", epochs * count + i + 1, "loss:", loss)
        count += 1

        # Evaluate on the test set (dropout disabled)
        test_new_state = sess.run(model_train.initial_state)
        accuracy = 0
        for x, y, step in get_batches(testdata, testlabel, batch_size, word_classes):
            feed = {
                model_train.inputs: x,
                model_train.targets: y,
                model_train.keep_prob: 1.,
                model_train.initial_state: test_new_state}
            test_accuracy = sess.run(model_train.accuracy, feed_dict=feed)
            accuracy += test_accuracy
        avg_accuracy = accuracy / (testdata.shape[0] // batch_size)
        print("Test-set accuracy:", avg_accuracy)
        if avg_accuracy > 0.90:
            print("\n************************ Training finished ************************")
            break
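
The training loop above never persists the trained graph, so nothing survives the session. A minimal sketch of how the model could be saved with tf.train.Saver, placed inside the with tf.Session() block right before the break (the checkpoint path is just an example):

# Create the saver once the CharRNN graph exists, then write a checkpoint
saver = tf.train.Saver()
saver.save(sess, "./checkpoints/sentiment_lstm.ckpt")

# To reuse the model later: rebuild the same CharRNN graph, open a session, then
# saver.restore(sess, "./checkpoints/sentiment_lstm.ckpt")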

Source code:

https://github.com/freeingfree/lstm
