所需环境:Python3.6 + Tensorflow
如果使用cpu版本,可以参考:https://www.jianshu.com/p/da141c730180
如果使用gpu版本,可以参考:https://www.jianshu.com/p/62d414aa843e
还需要安装 Keras:
pip install keras -i https://pypi.tuna.tsinghua.edu.cn/simple/
所需数据集 :
负面情绪:http://ai-download.xmgc360.com/datasets/sentiment_nalysis/neg.xls
正面情绪:http://ai-download.xmgc360.com/datasets/sentiment_nalysis/pos.xls
基于LSTM网络,结构图如下:
代码解释
导入相关模块
import pandas as pd
import numpy as np
import jieba
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.utils import shuffle
from keras.utils import plot_model
读取数据集,分别存储在 DataFrame 里
# Load the two review datasets into separate DataFrames. header=None makes
# pandas treat the first row as data, so the review text lands in column 0.
# The original `index=None` is not a read_excel parameter (newer pandas
# rejects it with a TypeError); index_col=None is the supported way to say
# "do not use any column as the index".
neg = pd.read_excel('neg.xls', header=None, index_col=None)  # negative-sentiment samples
pos = pd.read_excel('pos.xls', header=None, index_col=None)  # positive-sentiment samples
数据集打标签
# Attach the class label as a new 'mark' column:
# 0 = negative sentiment, 1 = positive sentiment.
neg['mark'] = 0
pos['mark'] = 1
合并数据集
# Stack the positive rows on top of the negative rows and renumber the index
# from 0 so that every row has a unique label.
frames = [pos, neg]
df = pd.concat(frames, ignore_index=True)
对数据集进行分词
# Tokenize every sentence (column 0) with jieba and keep the tokens as a list
# in a new 'words' column.
df['words'] = df[0].apply(
    lambda sentence: [token for token in jieba.cut(sentence)]
)
如图:
统计分词后每个词出现的次数(主要是去重)
# Flatten all token lists into one sequence and count each distinct word.
# The resulting frame is indexed by word, one row per distinct word.
all_words = [word for sentence in df['words'] for word in sentence]
word_counts = pd.Series(all_words).value_counts()
df_words = pd.DataFrame(word_counts)
对每个词进行编号:
# Give every distinct word a sequential integer id, starting at 1.
vocab_size = len(df_words)
df_words['id'] = list(range(1, vocab_size + 1))
把每个中文句子转成句子向量(使用简单的编号向量)
# Replace each token with its integer id, turning every sentence into a list
# of ids. `word_ids` is the word -> id lookup (a Series indexed by word).
word_ids = df_words['id']
df['words_vecoter'] = df['words'].apply(
    lambda tokens: list(word_ids[tokens])
)
把句子向量的长度统一到50:长度不够的在前面补0,超过50的从前面截断(只保留最后50个词)
# Bring every id list to exactly 50 entries. The result is a 2-D array, which
# is split back into one row per sentence so it can be stored in the column.
padded = sequence.pad_sequences(df['words_vecoter'], maxlen=50)
df['words_vecoter'] = list(padded)
数据整理完成,获取训练集和测试集
# Split the data in half by alternating rows: even-numbered rows form the
# training set, odd-numbered rows form the test set.
features = np.array(list(df['words_vecoter']))
labels = np.array(list(df['mark']))
x_train, y_train = features[::2], labels[::2]    # training set
x_test, y_test = features[1::2], labels[1::2]    # test set
随机打乱数据集
# Shuffle each split. Samples and labels are passed together so they are
# permuted in the same order and every row keeps its own label.
(x_train, y_train) = shuffle(x_train, y_train)
(x_test, y_test) = shuffle(x_test, y_test)
构造循环网络
# Build the recurrent network. Word ids run from 1 to len(df_words), so the
# embedding table gets len(df_words) + 1 rows to also cover index 0.
dlen = len(df_words) + 1
model = Sequential([
    # The Embedding layer behaves like a fully connected layer that takes a
    # one-hot input and has one hidden unit per embedding dimension; its
    # weight matrix is the word-vector lookup table.
    Embedding(dlen, 256),
    LSTM(128),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Save a diagram of the network architecture.
plot_model(model, show_shapes=True, to_file='sentiment_nalysis.png')
# Print the layer-by-layer summary.
model.summary()
训练模型
# Train for 5 passes over the training set in mini-batches of 16 samples.
# `epochs` is the Keras 2 name of this argument; the Keras 1 spelling
# `nb_epoch` is deprecated and is rejected by newer releases.
model.fit(x_train, y_train, batch_size=16, epochs=5)
评估预测
# Predict a score for every test sample. The network ends in a sigmoid, so
# each score lies between 0 and 1 (label 1 = positive, 0 = negative).
y_predict = model.predict(x_test)
print(y_predict)
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]. Unpack it so the value printed as accuracy really is the
# accuracy instead of the whole list.
loss, acc = model.evaluate(x_test, y_test)
print('Test loss:', loss)
print('Test accuracy:', acc)
保存模型
# Persist the trained model to a file in the working directory.
model_path = 'sentiment_nalysis.h5'
model.save(model_path)
完整代码
import pandas as pd
import numpy as np
import jieba
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.utils import shuffle
from keras.utils import plot_model
# ---------------------------------------------------------------------------
# 1. Load and label the data
# ---------------------------------------------------------------------------
# header=None makes pandas treat the first row as data, so the review text
# lands in column 0. index_col=None replaces the original `index=None`, which
# is not a read_excel parameter and raises TypeError on newer pandas.
neg = pd.read_excel('neg.xls', header=None, index_col=None)  # negative-sentiment samples
pos = pd.read_excel('pos.xls', header=None, index_col=None)  # positive-sentiment samples
pos['mark'] = 1  # positive sentiment
neg['mark'] = 0  # negative sentiment
df = pd.concat([pos, neg], ignore_index=True)

# ---------------------------------------------------------------------------
# 2. Tokenize and encode the sentences
# ---------------------------------------------------------------------------
df['words'] = df[0].apply(lambda x: list(jieba.cut(x)))  # word segmentation
# One row per distinct word (the frame is indexed by word).
df_words = pd.DataFrame(pd.Series([j for i in df['words'] for j in i]).value_counts())
df_words['id'] = list(range(1, len(df_words) + 1))  # sequential ids starting at 1
# Replace every word with its id, then bring all id lists to length 50.
df['words_vecoter'] = df['words'].apply(lambda x: list(df_words['id'][x]))
df['words_vecoter'] = list(sequence.pad_sequences(df['words_vecoter'], maxlen=50))

# ---------------------------------------------------------------------------
# 3. Split into training and test sets (alternating rows) and shuffle
# ---------------------------------------------------------------------------
x_train = np.array(list(df['words_vecoter']))[::2]  # training set
y_train = np.array(list(df['mark']))[::2]
x_test = np.array(list(df['words_vecoter']))[1::2]  # test set
y_test = np.array(list(df['mark']))[1::2]
x_train, y_train = shuffle(x_train, y_train)
x_test, y_test = shuffle(x_test, y_test)

# ---------------------------------------------------------------------------
# 4. Build the network
# ---------------------------------------------------------------------------
# Word ids run from 1 to len(df_words), so the embedding table gets
# len(df_words) + 1 rows to also cover index 0.
dlen = len(df_words) + 1
model = Sequential()
model.add(Embedding(dlen, 256))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
plot_model(model, to_file='sentiment_nalysis.png', show_shapes=True)  # save the architecture diagram
model.summary()  # print the layer-by-layer summary

# ---------------------------------------------------------------------------
# 5. Train, evaluate and save
# ---------------------------------------------------------------------------
# `epochs` is the Keras 2 name of this argument; the Keras 1 spelling
# `nb_epoch` is deprecated and is rejected by newer releases.
model.fit(x_train, y_train, batch_size=16, epochs=5)
y_predict = model.predict(x_test)
print(y_predict)
# With metrics=['accuracy'], evaluate() returns [loss, accuracy]. Unpack it so
# the value printed as accuracy really is the accuracy, not the whole list.
loss, acc = model.evaluate(x_test, y_test)
print('Test loss:', loss)
print('Test accuracy:', acc)
model.save('sentiment_nalysis.h5')