import urllib.request
import os
import tarfile
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM
# Download the IMDb review archive unless a local copy already exists.
url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="data/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    # urlretrieve does not create missing directories, so make sure the
    # target folder exists before writing into it.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)
# Unpack the archive once; later runs reuse the extracted folder.
if not os.path.exists('data/aclImdb'):
    # NOTE(review): the archive is fetched over plain HTTP and extractall()
    # trusts the member paths stored inside it. Consider verifying a checksum
    # or passing an extraction filter on Python versions that support one.
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(filepath, 'r:gz') as tfile:
        tfile.extractall('data/')
# Strip HTML tags from the review text with a regular expression.
import re

# Compiled once at import time: rm_tags runs once per review file, so the
# pattern should not be rebuilt on every call.
# Matches '<', one or more characters that are not '>', then '>'.
_TAG_RE = re.compile(r'<[^>]+>')


def rm_tags(text):
    """Return *text* with every HTML-style tag removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which each ``<...>`` span is replaced by the
        empty string. Text without a closing ``>`` is left untouched.
    """
    return _TAG_RE.sub('', text)
# Load the review texts and their sentiment labels.
def read_files(filetype, path='data/aclImdb/'):
    """Read every review of one dataset split and return labels and texts.

    Args:
        filetype: Name of the split sub-directory, 'train' or 'test'.
        path: Root directory of the extracted dataset. It is joined by plain
            string concatenation, so it must end with a path separator.

    Returns:
        A tuple ``(all_labels, all_texts)``. ``all_labels`` holds 1 for each
        file found under ``pos/`` followed by 0 for each file found under
        ``neg/``; ``all_texts`` holds the tag-stripped file contents in the
        same order.
    """
    # Directory of positive reviews, then directory of negative reviews.
    positive_path = path + filetype + '/pos/'
    negative_path = path + filetype + '/neg/'
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore sample indices) reproducible between runs.
    positive_files = [positive_path + f for f in sorted(os.listdir(positive_path))]
    negative_files = [negative_path + f for f in sorted(os.listdir(negative_path))]
    file_list = positive_files + negative_files
    print('read', filetype, 'files:', len(file_list))
    # Derive the labels from the actual file counts instead of assuming
    # 12500 per class, so labels and texts always stay aligned.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)
    # Read every listed file and strip its HTML tags.
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts.append(rm_tags(' '.join(file_input.readlines())))
    return all_labels, all_texts
# Load the training set.
y_train, train_text = read_files('train')
# read_files returns all positive reviews followed by all negative ones.
# Keras takes the validation_split slice from the tail of the data before any
# shuffling, so without this step the validation set would contain negative
# reviews only. Shuffle labels and texts together with a fixed seed so the
# pairs stay aligned and runs stay reproducible.
import random
_train_pairs = list(zip(y_train, train_text))
random.Random(0).shuffle(_train_pairs)
y_train = [label for label, _ in _train_pairs]
train_text = [text for _, text in _train_pairs]
# Load the test set (order is left untouched).
y_test, test_text = read_files('test')
# Build a vocabulary limited to the 2000 most frequent words of the training
# texts; the tokenizer maps each word to an integer index.
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)
# Convert every review into a sequence of word indices.
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)
# Force every sequence to length 100: longer ones lose their leading items,
# shorter ones are zero-padded at the front.
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)
# Build the MLP model: embedding -> flatten -> dense hidden layer -> sigmoid.
model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units=256,
                activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1,
                activation='sigmoid'))
model.summary()
# Configure the loss function, optimizer and reported metric.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Train, holding out 20% of the training data for validation.
train_histroy = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)
# Score the model on the test set.
scores = model.evaluate(x_test, y_test, verbose=1)
# A bare `scores[1]` expression only shows a value in an interactive session;
# print it so the accuracy is also reported when run as a script.
print('MLP test accuracy:', scores[1])
# Predict a class for every test review.
predict = model.predict_classes(x_test)
# Show one test review together with its true label and the predicted label.
SentimentDict = {1:'正面的', 0:'负面的'}
# Flatten the prediction array to 1-D so each entry is a scalar that can be
# used as a key into SentimentDict. The original code referenced
# `predict_classes` without ever defining it.
predict_classes = predict.reshape(-1)


def display_test_Sentimemt(i):
    """Print test review *i* with its true and predicted sentiment.

    Args:
        i: Index into the module-level ``test_text``, ``y_test`` and
            ``predict_classes`` sequences.
    """
    print(test_text[i])
    print('label真实值:', SentimentDict[y_test[i]],
          '预测结果:', SentimentDict[predict_classes[i]])


display_test_Sentimemt(5)
# Build, train and evaluate the SimpleRNN model.
model_rnn = Sequential()
model_rnn.add(Embedding(output_dim=32,
                        input_dim=2000,
                        input_length=100))
model_rnn.add(Dropout(0.2))
model_rnn.add(SimpleRNN(units=16))
model_rnn.add(Dense(units=256, activation='relu'))
model_rnn.add(Dropout(0.35))
model_rnn.add(Dense(units=1, activation='sigmoid'))
model_rnn.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model_rnn_histroy = model_rnn.fit(x_train, y_train, batch_size=100,
                                  epochs=10, verbose=2,
                                  validation_split=0.2)
# Evaluate the RNN itself; the original call scored the MLP `model` again,
# so the RNN was never measured on the test set.
scores = model_rnn.evaluate(x_test, y_test, verbose=1)
# Print the accuracy: a bare `scores[1]` expression has no effect in a script.
print('RNN test accuracy:', scores[1])
# Build, train and evaluate the LSTM model.
model_LSTM = Sequential()
model_LSTM.add(Embedding(output_dim=32,
                         input_dim=2000,
                         input_length=100))
model_LSTM.add(Dropout(0.2))
model_LSTM.add(LSTM(32))
model_LSTM.add(Dense(units=256,
                     activation='relu'))
model_LSTM.add(Dropout(0.2))
model_LSTM.add(Dense(units=1,
                     activation='sigmoid'))
model_LSTM.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
model_LSTM_history = model_LSTM.fit(x_train, y_train, batch_size=100,
                                    epochs=10, verbose=1,
                                    validation_split=0.2)
scores = model_LSTM.evaluate(x_test, y_test, verbose=1)
# Print the accuracy: a bare `scores[1]` expression has no effect in a script.
print('LSTM test accuracy:', scores[1])