- LSTM(长短期记忆网络)是专门为解决 RNN 的长期依赖问题而设计的
1.数据准备
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import numpy as np
np.random.seed(10)
import re
# Matches any markup tag: a "<", one or more non-">" characters, then ">".
re_tag = re.compile(r'<[^>]+>')


def rm_tags(text):
    """Return *text* with every ``<...>`` tag removed.

    Only the tags themselves are dropped; the text between tags is kept
    unchanged. A lone "<" that is never closed by ">" is left in place.
    """
    return re_tag.sub('', text)
import os
def read_files(filetype, path="data/aclImdb/"):
    """Load the reviews of one dataset split together with their labels.

    Args:
        filetype: Name of the split sub-directory under *path* (the rest of
            this file reads "train" and "test"). It must contain a ``pos``
            and a ``neg`` directory.
        path: Root directory of the dataset. Defaults to the location the
            original code hard-coded.

    Returns:
        A tuple ``(all_labels, all_texts)``. ``all_labels`` holds 1 for every
        file found in ``pos`` followed by 0 for every file found in ``neg``;
        ``all_texts`` holds the tag-stripped content of those files in the
        same order.
    """
    positive_dir = os.path.join(path, filetype, "pos")
    negative_dir = os.path.join(path, filetype, "neg")
    positive_files = [os.path.join(positive_dir, f)
                      for f in os.listdir(positive_dir)]
    negative_files = [os.path.join(negative_dir, f)
                      for f in os.listdir(negative_dir)]
    file_list = positive_files + negative_files
    print('read', filetype, 'files:', len(file_list))

    # Build the labels from the real file counts so they always line up with
    # all_texts, instead of assuming exactly 12500 files per class.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts.append(rm_tags(" ".join(file_input.readlines())))
    return all_labels, all_texts
# Load the raw reviews and labels; without these calls train_text, test_text,
# y_train and y_test are never defined and the steps below raise NameError.
y_train, train_text = read_files("train")
y_test, test_text = read_files("test")

# Build the vocabulary from the training reviews only, limited to 3800 words.
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

# Convert every review into a list of word indices.
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

# Pad/truncate every sequence to a fixed length of 380.
x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test = sequence.pad_sequences(x_test_seq, maxlen=380)
2.建立模型
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation,Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
# Layer stack, in order: embedding -> dropout -> LSTM -> dense(relu) ->
# dropout -> single sigmoid output unit.
# input_dim=3800 and input_length=380 match the Tokenizer vocabulary size and
# the padded sequence length used during data preparation.
model = Sequential([
    Embedding(input_dim=3800, output_dim=32, input_length=380),
    Dropout(0.2),
    LSTM(32),
    Dense(units=256, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid'),
])
model.summary()
3.训练模型
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Keras carves the validation split off the *end* of the data before any
# shuffling, and read_files() returns all positive reviews followed by all
# negative ones. Without shuffling first, the 20% validation set would contain
# negative reviews only, so permute samples and labels together.
shuffle_idx = np.random.permutation(len(x_train))
train_history = model.fit(np.asarray(x_train)[shuffle_idx],
                          np.asarray(y_train)[shuffle_idx],
                          batch_size=100,
                          epochs=10,
                          verbose=1,
                          validation_split=0.2)
%pylab inline
import matplotlib.pyplot as plt
def show_train_history(train_history, train, validation):
    """Plot two recorded training curves against the epoch number.

    Args:
        train_history: Object whose ``history`` attribute maps metric names
            to per-epoch value sequences (here, the result of ``model.fit``).
        train: Key of the training curve; also used as the y-axis label.
        validation: Key of the validation curve.
    """
    curves = train_history.history
    for key in (train, validation):
        plt.plot(curves[key])
    plt.title('Train History')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# Accuracy curves first, then loss curves, each for training vs. validation.
show_train_history(train_history, train='acc', validation='val_acc')
show_train_history(train_history, train='loss', validation='val_loss')
4.模型准确率
# Evaluate on the test set. The model was compiled with metrics=['accuracy'],
# so index 1 is presumably the accuracy (index 0 being the loss).
scores = model.evaluate(x=x_test, y=y_test, verbose=1)
scores[1]
5.结果预测
# Predicted probability of the positive class for every test review.
probility = model.predict(x_test)

# Threshold the probabilities already computed instead of calling
# model.predict_classes(): that would run inference a second time and the
# method does not exist in newer Keras releases. For a single sigmoid output
# this is the same 0.5 cut-off predict_classes applied.
predict = (probility > 0.5).astype('int32')

# Maps a class id to its display text.
SentimentDict = {1: '正面的', 0: '负面的'}
def display_test_Sentiment(i):
    """Print test review *i* with its true label and the predicted label.

    Reads the module-level ``test_text``, ``y_test``, ``predict`` and
    ``SentimentDict``.
    """
    print(test_text[i])
    # The original looked up the undefined name `predict_classes`. Use
    # `predict` and reduce its row to a plain int, because a (1,)-shaped
    # array row cannot be used as a dictionary key.
    predicted_label = int(np.ravel(predict[i])[0])
    print('标签label:', SentimentDict[y_test[i]],
          '预测结果:', SentimentDict[predicted_label])


display_test_Sentiment(2)
6.模型保存
# Make sure the output directory exists; open() below would otherwise fail
# with FileNotFoundError on a fresh checkout.
os.makedirs("SaveModel", exist_ok=True)

# Architecture goes to JSON, weights go to a separate HDF5 file.
model_json = model.to_json()
with open("SaveModel/Imdb_LSTM_model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("SaveModel/Imdb_LSTM_model.h5")
print("Saved model to disk")