Keras: Building an LSTM RNN for Text Generation and Prediction (Part 2)

The LSTM network described in the previous post used a TimeDistributed() layer, which means the output of every timestep of the unrolled RNN counts toward the final result. Take the sequence "我今天吃了一个大包子": with, say, 3 sequences per batch and 4 characters per sequence, one input window in such a batch would be "我今天吃" and its target would be "今天吃了", i.e. each input character's target is simply the character that follows it (a tiny illustration follows).
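Just for orientation (this snippet is not from the original post), here is how that many-to-many pairing lines up for the example sentence:

text = "我今天吃了一个大包子"
n_steps = 4
x = list(text[0:n_steps])        # ['我', '今', '天', '吃']
y = list(text[1:n_steps + 1])    # ['今', '天', '吃', '了'], one target per input character
print(list(zip(x, y)))           # [('我', '今'), ('今', '天'), ('天', '吃'), ('吃', '了')]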

If we do not care about the intermediate outputs and only want the single character predicted after the whole input sequence, e.g. input "我今天吃" with the single output "了", then the network structure has to change: give the last LSTM layer return_sequences=False and follow it directly with Dense(units=vocab_size, activation='softmax'). A minimal sketch of this single-target pairing is shown below; after that, let's look at how to produce the training data.
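In the many-to-one case the same 4-character window has exactly one target, the character that immediately follows it (again an illustrative snippet, not from the original post):

text = "我今天吃了一个大包子"
n_steps = 4
x = list(text[0:n_steps])    # ['我', '今', '天', '吃']
y = text[n_steps]            # '了' -- the only target for the whole window
print(x, '->', y)

This is the same pairing the data generator below produces, except that the characters are first mapped to integer ids.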

First, split the text into train.txt, valid.txt and test.txt. You can also create a predict.txt and put a starting passage of your own in it; the model will use it as the seed for prediction.

# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 06:14:13 2018
@author: xiaozhen
split the text into train, valid and test sets; whitespace can also be stripped
"""
import re
#pattern = re.compile(r'[\n,。!?]')
pattern = re.compile(r'\s+')
 
with open('data/断鸿零雁记.txt', 'r', encoding='utf-8') as f:
    jay_file = pattern.sub('', f.read())
#    jay_file = f.read()
 
size = len(jay_file)
train_data = jay_file[:int(size*0.7)]
vali_data = jay_file[int(size*0.7):int(size*0.9)]
test_data = jay_file[int(size*0.9):]
 
for path, data in zip(["data/train.txt", "data/valid.txt", "data/test.txt"],
                      [train_data, vali_data, test_data]):
    with open(path, 'w', encoding='utf8') as f:
        f.write(data)

Building the data generator

import numpy as np
import pickle
from keras.utils import to_categorical
import matplotlib.pyplot as plt


def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


def batch_generator_one_word(arr, n_seqs, n_steps, vocab_size):
    """Yield (n_seqs, n_steps) input windows and one-hot targets for the next character."""
    batch_size = n_seqs * n_steps
    n_batches = len(arr) // batch_size
    arr = arr[:batch_size * n_batches]

    while True:
        idx = 0
        while idx < len(arr) - n_steps - n_seqs - 1:
            train_x = []
            train_y = []
            for n in range(idx, idx + n_seqs):
                x = arr[n:(n + n_steps)]      # window of n_steps character ids
                y = arr[n + n_steps]          # the character immediately after the window
                train_x.append(x)
                train_y.append(y)
            idx += n_seqs
            train_x = np.array(train_x)
            train_y = to_categorical(train_y, num_classes=vocab_size)
            yield train_x, train_y


class TextConverter(object):
    def __init__(self, filename, max_vocab=None):
        with open(filename, 'r', encoding='utf8') as f:
            text = f.read()
        vocab = set(text)
        print(len(vocab))
        if not max_vocab:
            max_vocab = len(vocab)
        # max_vocab_process
        vocab_count = {}
        for word in vocab:
            vocab_count[word] = 0
        for word in text:
            vocab_count[word] += 1
        vocab_count_list = []
        for word in vocab_count:
            vocab_count_list.append((word, vocab_count[word]))
        vocab_count_list.sort(key=lambda x: x[1], reverse=True)
        if len(vocab_count_list) > max_vocab:
            vocab_count_list = vocab_count_list[:max_vocab]
        vocab = [x[0] for x in vocab_count_list]
        self.vocab = vocab
        self.text = text
        self.word_to_int_table = {c: i for i, c in enumerate(self.vocab)}
        self.int_to_word_table = dict(enumerate(self.vocab))

    @property
    def vocab_size(self):
        # one extra id is reserved for characters that fall outside the vocabulary
        return len(self.vocab) + 1

    def word_to_int(self, word):
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)

    def int_to_word(self, index):
        if index == len(self.vocab):
            return ''
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('Unknown index!')

    def text_to_arr(self, text=None, filename=None):
        if filename:
            with open(filename, 'r', encoding='utf8') as f:
                text = f.read()
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)
    
    def arr_to_text(self, arr):
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return "".join(words)

    def save_to_file(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.vocab, f)
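
A quick way to sanity-check the two pieces above together is a snippet like the following (hypothetical usage, not part of the original read_utils module; the file path is the one used later in the post):

converter = TextConverter(filename='data/断鸿零雁记.txt')
arr = converter.text_to_arr(text=converter.text[:1000])   # first 1000 characters as integer ids
gen = batch_generator_one_word(arr, n_seqs=8, n_steps=30, vocab_size=converter.vocab_size)
bx, by = next(gen)
print(bx.shape, by.shape)    # (8, 30) input windows and (8, vocab_size) one-hot targets
print(converter.arr_to_text(bx[0]), '->', converter.arr_to_text([by[0].argmax()]))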

Below is the construction and training of the model.

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 21 21:10:42 2018

@author: x00428488
"""
import numpy as np
from read_utils import TextConverter, batch_generator_one_word
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint

convert_text = TextConverter(filename='data/断鸿零雁记.txt')

train_arr = convert_text.text_to_arr(filename='data/train.txt')
valid_arr = convert_text.text_to_arr(filename='data/valid.txt')
test_arr = convert_text.text_to_arr(filename='data/predict.txt')  # predict.txt holds the seed text for generation
vocab_size = convert_text.vocab_size
num_seqs = 100
num_steps = 30
train_gene = batch_generator_one_word(train_arr, num_seqs, num_steps, vocab_size)
valid_gene = batch_generator_one_word(valid_arr, num_seqs, num_steps, vocab_size)
test_gene = batch_generator_one_word(test_arr, 1, num_steps, vocab_size)

# Sequential model
model = Sequential()
# Embedding layer: effectively replaces each character index with its word vector.
# input_dim is the largest word index + 1, input_length is the number of
# characters in each input sequence, and the output shape is
# (batch_size, input_length, output_dim).
model.add(Embedding(output_dim=100,
                    input_dim=vocab_size,
                    input_length=num_steps))
# model.add(Dropout(0.35))
# units is the dimensionality of the LSTM output space
model.add(LSTM(units=128, dropout=0.2, return_sequences=True))
model.add(LSTM(units=128, dropout=0.2))
model.add(Dense(units=vocab_size, activation='softmax'))
model.summary()
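# For orientation, model.summary() should report output shapes roughly like the
# following (vocab_size depends on the corpus, so treat this as a sketch rather
# than actual output):
#   embedding  (None, 30, 100)     30 timesteps, 100-dim word vectors
#   lstm_1     (None, 30, 128)     return_sequences=True keeps every timestep
#   lstm_2     (None, 128)         only the last timestep is returned
#   dense      (None, vocab_size)  softmax over the whole vocabulary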
# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['categorical_accuracy'])

checkpointer = ModelCheckpoint(
    filepath='data/models/model-{epoch:02d}.hdf5', verbose=1)
num_epochs = 30
# The generator yields one batch of num_seqs windows per step, so one pass over
# the data takes roughly (len(arr) - num_steps) // num_seqs steps.
train_history = model.fit_generator(train_gene,
                                    steps_per_epoch=(len(train_arr) - num_steps) // num_seqs,
                                    epochs=num_epochs,
                                    validation_data=valid_gene,
                                    validation_steps=(len(valid_arr) - num_steps) // num_seqs,
                                    callbacks=[checkpointer])
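
# show_train_history from read_utils can plot the curves from the captured
# history (a hedged addition; the metric names follow the compiled
# 'categorical_accuracy' metric):
from read_utils import show_train_history
show_train_history(train_history, 'categorical_accuracy', 'val_categorical_accuracy')
show_train_history(train_history, 'loss', 'val_loss')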

# Generate text by repeatedly predicting the next character
num_predict = 100
model = load_model('data/models/model-20.hdf5')   # load the trained checkpoint once
for i in range(num_predict):
    # use the last num_steps ids as the input window, predict the next character,
    # then append it so the window slides forward by one position
    startarr = np.array(test_arr[-num_steps:]).reshape(1, -1)
    pred_rst = model.predict(startarr)
    pred_idx = np.argmax(pred_rst[0])
    test_arr = np.append(test_arr, pred_idx)

pred_text = convert_text.arr_to_text(test_arr)
print(pred_text)
with open('data/predict.txt', 'w', encoding='utf8') as f:
    f.write(pred_text)

My machine is too weak: a single epoch takes more than an hour to train, and tweaking parameters and retraining several times would be even slower. For now I am just writing down the method.

 

 
