'''1. Read the IMDb dataset
2. Build the token dictionary
3. Use the tokenizer to convert the review texts into lists of integers
4. Pad/truncate every integer list to the same length (300)
5. Use an embedding layer to turn each integer list into a list of vectors'''
from keras.preprocessing import sequence         # pad/truncate every integer list to length 300
from keras.preprocessing.text import Tokenizer   # build the token dictionary
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from matplotlib import pyplot as plt
import os
import re
import pandas as pd

def rm_tags(text):
    # strip HTML markup such as <br /> from the raw reviews
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)
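# rm_tags removes the markup but keeps the text, e.g. (illustrative):
# rm_tags("A great film.<br /><br />Loved it.")  ->  "A great film.Loved it."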
path= "D:/pycharm_program/keras_file/aclImdb/"filelist=[]
positive_path= path+filename+"/pos/"
print(os.listdir(positive_path))for f inos.listdir(positive_path):
filelist+= [positive_path+f]
negative_path= path+filename+"/neg/"
print(os.listdir(negative_path))for f inos.listdir(negative_path):
filelist+= [negative_path+f]
all_labels= [1] * 12500 + [0] * 12500all_texts=[]for fi infilelist:
with open(fi,encoding='utf8') as file_input:
all_texts+= [rm_tags("".join(file_input.readlines()))]print(len(filelist))returnall_labels,all_texts#read_files("train")#read_files("test")
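# Assumed directory layout under the path above (the standard aclImdb archive):
# aclImdb/train/pos/*.txt, aclImdb/train/neg/*.txt,
# aclImdb/test/pos/*.txt,  aclImdb/test/neg/*.txt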
pd.set_option('display.max_columns',10000)
train_labels, train_texts = read_files("train")
print(train_texts[0])
test_labels, test_texts = read_files("test")
# Build the tokenizer: rank every English word by how often it occurs across
# all the reviews; the 4,000 most frequent words form the dictionary
token = Tokenizer(num_words=4000)
token.fit_on_texts(train_texts)
print(token.document_count)  # how many documents were read
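# Sanity check (illustrative -- actual indices depend on the corpus):
# token.word_index maps each word to its frequency rank, e.g.
# {'the': 1, 'and': 2, 'a': 3, ...}; texts_to_sequences below keeps only
# words whose rank is below num_words.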
# Convert the review texts into lists of integers
train_texts_seq = token.texts_to_sequences(train_texts)
test_texts_seq = token.texts_to_sequences(test_texts)
# Pad/truncate so every integer list has length 300: keep the last 300
# integers of long lists, zero-pad short lists at the front
x_train = sequence.pad_sequences(train_texts_seq, maxlen=300)
x_test = sequence.pad_sequences(test_texts_seq, maxlen=300)
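# pad_sequences defaults to padding='pre' and truncating='pre', e.g. (illustrative):
# sequence.pad_sequences([[1, 2, 3]], maxlen=5)           -> [[0, 0, 1, 2, 3]]
# sequence.pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5)  -> [[2, 3, 4, 5, 6]]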
def creat_model():
    # Build a linear stack of layers; the layers below are added one by one
    model = Sequential()
    # Embedding layer (must be the first layer): maps each of the 4,000 word
    # indices to a 32-dimensional vector; each input is a sequence of 300 indices
    model.add(Embedding(output_dim=32,
                        input_dim=4000,
                        input_length=300))
    # Dropout layer
    model.add(Dropout(0.2))
    # LSTM layer
    model.add(LSTM(32))
    # Hidden fully connected layer
    model.add(Dense(units=256, activation='relu'))
    # Dropout layer
    model.add(Dropout(0.2))
    # Output layer
    model.add(Dense(units=1, activation="sigmoid"))
    # Print the model summary
    print(model.summary())
    return model
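# Sanity check against the printed summary (arithmetic only):
# Embedding 4000*32 = 128,000; LSTM 4*(32+32+1)*32 = 8,320;
# hidden Dense 32*256 + 256 = 8,448; output Dense 256 + 1 = 257;
# total = 145,025 trainable parameters.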
model = creat_model()
# Configure the model before training
model.compile(loss='binary_crossentropy',  # categorical_crossentropy for multi-class labels
              optimizer='adam',
              metrics=['accuracy'])
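# binary_crossentropy pairs with the single sigmoid output unit above; a model
# with a softmax output of N units and one-hot labels would use
# categorical_crossentropy instead.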
# verbose=2: log one line per epoch; validation_split=0.2: hold out 20% of the
# training data for validation
train_history = model.fit(x_train, train_labels, batch_size=100, epochs=10,
                          verbose=2, validation_split=0.2)
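# Caveat: validation_split slices off the *last* 20% of the data before any
# shuffling, and the labels are ordered [1]*12500 + [0]*12500, so the
# validation set here is all-negative. Shuffling texts and labels together
# first (e.g. with numpy.random.permutation) gives a meaningful split.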
# Evaluate accuracy on the test set
scores = model.evaluate(x_test, test_labels, verbose=1)
print('test loss = %.4f, test accuracy = %.4f' % (scores[0], scores[1]))
def save_model():
    # Save the model architecture as a JSON file, the weights as HDF5
    model_json = model.to_json()
    with open('model.increment.json', 'w') as f:
        f.write(model_json)
    model.save_weights('model.increment.json.h5')
    # Load the model back from the JSON file
    with open('model.increment.json', 'r') as f:
        model_json = f.read()
    from keras.models import model_from_json
    new_model = model_from_json(model_json)
    new_model.load_weights('model.increment.json.h5')
    # Compile the restored model
    new_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    # Continue training, then evaluate the restored model on the test set
    new_model.fit(x_train, train_labels, batch_size=100, epochs=10,
                  verbose=2, validation_split=0.2)
    scores = new_model.evaluate(x_test, test_labels, verbose=1)
    return scores
#save_model()
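# Alternative: Keras can also save architecture, weights and optimizer state
# in one HDF5 file (a sketch, assuming the same Keras version):
# from keras.models import load_model
# model.save('model.h5')
# restored = load_model('model.h5')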
def predict_review(input_text):
    # Predict the sentiment of a single piece of text
    input_seq = token.texts_to_sequences([input_text])  # wrap in a list: one text in, one sequence out
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=300)
    predict_result = model.predict_classes(pad_input_seq)
    return predict_result
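# Example call (illustrative review text):
# print(predict_review("An absolutely wonderful film, the acting is superb."))
# -> e.g. [[1]]; predict_classes returns 1 for positive, 0 for negative.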
# Accuracy history
plt.plot(train_history.history['acc'])
plt.plot(train_history.history['val_acc'])
plt.title("model accuracy")
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','validation'],loc = 'upper left')
plt.show()

# Loss history
plt.plot(train_history.history['loss'])
plt.plot(train_history.history['val_loss'])
plt.title("model loss")
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','validation'],loc = 'upper left')
plt.show()
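# The two plots above follow the same pattern; a small helper avoids the
# duplication (a sketch -- the name show_train_history is our own):
def show_train_history(history, train_metric, val_metric):
    plt.plot(history.history[train_metric])
    plt.plot(history.history[val_metric])
    plt.title('model ' + train_metric)
    plt.ylabel(train_metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# show_train_history(train_history, 'acc', 'val_acc')
# show_train_history(train_history, 'loss', 'val_loss')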