The LSTM network described earlier uses the TimeDistributed() layer, which means the output at every time step of the unrolled RNN is counted toward the final result. Take the sequence "我今天吃了一个大包子" as an example: with the previous network, if each batch holds 3 sequences of 4 characters each, then one sequence in the batch would be "我今天吃" and its corresponding output would be "今天吃了", i.e. the output for each input character is the character that follows it.
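For reference, a minimal sketch of that per-timestep (many-to-many) setup, assuming an Embedding plus stacked LSTMs like the model built later in this post; the layer sizes here are placeholders:
# Hedged sketch: the earlier many-to-many setup, where every time step is scored.
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed

vocab_size = 5000   # placeholder vocabulary size
num_steps = 4       # characters per sequence, as in the example above

model = Sequential()
model.add(Embedding(output_dim=100, input_dim=vocab_size, input_length=num_steps))
model.add(LSTM(units=128, return_sequences=True))   # keep the output of every step
model.add(LSTM(units=128, return_sequences=True))
# one softmax per input character: output shape is (batch, num_steps, vocab_size)
model.add(TimeDistributed(Dense(units=vocab_size, activation='softmax')))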
If we do not care about the intermediate outputs and only want the single word predicted after the whole input sequence, e.g. the input is "我今天吃" and the output is just the one character "了", then the LSTM network structure has to change: set return_sequences=False on the last LSTM layer and simply append a Dense(units=vocabulary_size, activation='softmax') layer after it. Next, let's look at how to generate the training data.
First split the text into train.txt, valid.txt and test.txt. You can also create a predict.txt yourself and put a piece of seed text in it; the model will generate predictions starting from that text.
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 06:14:13 2018
@author: xiaozhen
Split the text into train, valid and test sets; whitespace is also stripped.
"""
import re
#pattern = re.compile(r'[\n,。!?]')
pattern = re.compile(r'\s+')
with open('data/断鸿零雁记.txt', 'r', encoding='utf-8') as f:
    jay_file = pattern.sub('', f.read())
    # jay_file = f.read()
size = len(jay_file)
train_data = jay_file[:int(size*0.7)]
vali_data = jay_file[int(size*0.7):int(size*0.9)]
test_data = jay_file[int(size*0.9):]
for file, data in zip(["data/train.txt", "data/valid.txt", "data/test.txt"],
                      [train_data, vali_data, test_data]):
    with open(file, 'w', encoding='utf8') as f:
        f.write(data)
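predict.txt is not created by the script above; a minimal sketch for seeding it by hand (the seed string here is just the example sentence from earlier; ideally it should be at least num_steps characters long, since the model later reads the last num_steps characters):
# Hedged sketch: write a seed passage into data/predict.txt for later generation.
seed_text = '我今天吃了一个大包子'  # placeholder seed; in practice use a longer passage
with open('data/predict.txt', 'w', encoding='utf8') as f:
    f.write(seed_text)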
Next, build the data generator and the text converter (saved as read_utils.py, which the training script imports later).
import numpy as np
import copy
import time
import pickle
from keras.utils import to_categorical
import matplotlib.pyplot as plt
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
def batch_generator_one_word(arr, n_seqs, n_steps, vocab_size):
    """Yield batches of n_seqs input sequences (n_steps word indices each)
    together with the one-hot encoding of the word that follows each sequence."""
    batch_size = n_seqs * n_steps
    n_batches = len(arr) // batch_size
    arr = arr[:batch_size * n_batches]
    while True:
        idx = 0
        while idx < len(arr) - n_steps - n_seqs - 1:
            train_x = []
            train_y = []
            for n in range(idx, idx + n_seqs):
                x = arr[n:(n + n_steps)]
                y = arr[n + n_steps]  # target is the word right after the input sequence
                train_x.append(x)
                train_y.append(y)
            idx += n_seqs
            train_x = np.array(train_x)
            train_y = to_categorical(train_y, num_classes=vocab_size)
            yield train_x, train_y
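A quick sanity check of what the generator yields, as a sketch with toy values (the shapes follow directly from n_seqs and n_steps):
# Hedged sketch: inspect one batch from the generator with toy parameters.
toy_arr = np.arange(1000)  # stand-in word indices
gen = batch_generator_one_word(toy_arr, n_seqs=3, n_steps=4, vocab_size=1000)
x, y = next(gen)
print(x.shape)  # (3, 4): 3 sequences of 4 word indices
print(y.shape)  # (3, 1000): one one-hot target word per sequence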
class TextConverter(object):
    def __init__(self, filename, max_vocab=None):
        with open(filename, 'r', encoding='utf8') as f:
            text = f.read()
        vocab = set(text)
        print(len(vocab))
        if not max_vocab:
            max_vocab = len(vocab)
        # max_vocab_process
        vocab_count = {}
        for word in vocab:
            vocab_count[word] = 0
        for word in text:
            vocab_count[word] += 1
        vocab_count_list = []
        for word in vocab_count:
            vocab_count_list.append((word, vocab_count[word]))
        vocab_count_list.sort(key=lambda x: x[1], reverse=True)
        if len(vocab_count_list) > max_vocab:
            vocab_count_list = vocab_count_list[:max_vocab]
        vocab = [x[0] for x in vocab_count_list]
        self.vocab = vocab
        self.text = text
        self.word_to_int_table = {c: i for i, c in enumerate(self.vocab)}
        self.int_to_word_table = dict(enumerate(self.vocab))

    @property
    def vocab_size(self):
        return len(self.vocab) + 1

    def word_to_int(self, word):
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)

    def int_to_word(self, index):
        if index == len(self.vocab):
            return ''
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('Unknown index!')

    def text_to_arr(self, text=None, filename=None):
        if filename:
            with open(filename, 'r', encoding='utf8') as f:
                text = f.read()
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)

    def arr_to_text(self, arr):
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return "".join(words)

    def save_to_file(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.vocab, f)
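A quick usage sketch of the converter (run interactively; the round-trip just illustrates the index mapping, and vocab.pkl is an example path):
# Hedged sketch: build the converter and round-trip a short string.
converter = TextConverter(filename='data/断鸿零雁记.txt')
arr = converter.text_to_arr(text='我今天吃')   # characters -> integer indices
print(arr)
print(converter.arr_to_text(arr))              # back to text; unknown characters come back as ''
print(converter.vocab_size)                    # vocabulary size + 1 (the extra slot is for unknown words)
converter.save_to_file('data/vocab.pkl')       # persist the vocab with pickle (example path)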
Below is the model construction and training.
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 21 21:10:42 2018
@author: x00428488
"""
import os
import re
import numpy as np
from read_utils import TextConverter, batch_generator_one_word
from sklearn import preprocessing
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
convert_text = TextConverter(filename='data/断鸿零雁记.txt')
train_arr = convert_text.text_to_arr(filename='data/train.txt')
valid_arr = convert_text.text_to_arr(filename='data/valid.txt')
test_arr = convert_text.text_to_arr(filename='data/predict.txt')
vocab_size = convert_text.vocab_size
num_seqs = 100
num_steps = 30
train_gene = batch_generator_one_word(train_arr, num_seqs, num_steps, vocab_size)
valid_gene = batch_generator_one_word(valid_arr, num_seqs, num_steps, vocab_size)
test_gene = batch_generator_one_word(test_arr, 1, num_steps, vocab_size)
# Sequential model
model = Sequential()
# Embedding layer: effectively replaces each word index with a word vector.
# input_dim is the largest word index + 1, and input_length is the number of
# words (characters) in each sequence; the output shape is
# (batch_size, input_length, output_dim).
model.add(Embedding(output_dim=100,
                    input_dim=vocab_size,
                    input_length=num_steps))
# model.add(Dropout(0.35))
# units is the dimensionality of the output space
model.add(LSTM(units=128, dropout=0.2, return_sequences=True))
model.add(LSTM(units=128, dropout=0.2))
model.add(Dense(units=vocab_size, activation='softmax'))
model.summary()
# Train the model
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['categorical_accuracy'])
checkpointer = ModelCheckpoint(
    filepath='data/models/model-{epoch:02d}.hdf5', verbose=1)
num_epochs = 30
model.fit_generator(train_gene,
                    steps_per_epoch=len(train_arr) - num_steps,
                    epochs=num_epochs,
                    validation_data=valid_gene,
                    validation_steps=len(valid_arr) - num_steps,
                    callbacks=[checkpointer])
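One note on steps_per_epoch: since the generator advances by num_seqs positions per batch, one full pass over the training data is roughly (len(train_arr) - num_steps) // num_seqs batches, so the setting above makes each epoch much longer than a single pass. If you also want the accuracy curves, the show_train_history() helper defined in read_utils can be used; a minimal sketch, assuming the fit_generator call above is captured in a variable:
# Hedged sketch: plot the accuracy curves, assuming the call above was written as
#   train_history = model.fit_generator(...)
from read_utils import show_train_history  # helper defined alongside the generator
show_train_history(train_history, 'categorical_accuracy', 'val_categorical_accuracy')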
# Generate text by predicting one character at a time
num_predict = 100
model = load_model('data/models/model-20.hdf5')  # load the trained model once, not on every iteration
for i in range(num_predict):
    # predict the next character from the last num_steps characters and append it
    startarr = np.array(test_arr[-num_steps:]).reshape(1, -1)
    pred_rst = model.predict(startarr)
    pred_idx = np.argmax(pred_rst[0])
    test_arr = np.append(test_arr, pred_idx)
pred_text = convert_text.arr_to_text(test_arr)
print(pred_text)
with open('data/predict.txt', 'w', encoding='utf8') as f:
    f.write(pred_text)
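Greedy argmax decoding like the above easily gets stuck repeating the same few characters; a common alternative, not used in the script above, is to sample from the softmax output with a temperature. A minimal sketch:
# Hedged sketch: temperature sampling instead of argmax (optional alternative).
def sample_with_temperature(probs, temperature=1.0):
    # Rescale the predicted distribution and draw one index from it.
    probs = np.asarray(probs).astype('float64')
    probs = np.log(probs + 1e-10) / temperature
    probs = np.exp(probs) / np.sum(np.exp(probs))
    return np.random.choice(len(probs), p=probs)

# Inside the generation loop, this would replace np.argmax(pred_rst[0]):
# pred_idx = sample_with_temperature(pred_rst[0], temperature=0.8)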
My computer is too weak: a single epoch takes over an hour to train, and tweaking parameters and retraining several times would be even slower. For now I'm just writing down the method.