import os
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf
# Load the training corpus and build the character <-> integer lookup tables.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used -- confirm the encoding of anna.txt.
with open('anna.txt', 'r') as f:
    text = f.read()

# Sorted list of every distinct character that occurs in the text.
vocab = sorted(set(text))
# Character -> integer index, following the sorted order above.
vocab_to_int = {c: i for i, c in enumerate(vocab)}
# Integer index -> character (the inverse of vocab_to_int).
int_to_vocab = dict(enumerate(vocab))
# The whole text encoded as an int32 array of character indices.
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)
定义函数,读入文章,sequence长度、step长度为超参数
def get_batches(arr, n_seqs, n_steps):
    """Yield ``(x, y)`` training batches cut from an encoded character array.

    Args:
        arr: 1-D sequence of encoded characters (NumPy array or any sequence
            accepted by ``np.asarray``).
        n_seqs: number of sequences per batch (rows of each batch).
        n_steps: number of steps per sequence (columns of each batch).

    Yields:
        Tuples ``(x, y)`` of arrays with shape ``(n_seqs, n_steps)``. ``y`` is
        ``x`` shifted one position to the left; its last column is filled with
        the first column of ``x``.
    """
    # Accept plain Python sequences as well as NumPy arrays.
    arr = np.asarray(arr)

    # Keep only as many characters as fill a whole number of batches;
    # the incomplete tail is dropped.
    characters_per_batch = n_seqs * n_steps
    n_batches = len(arr) // characters_per_batch
    arr = arr[:n_batches * characters_per_batch]

    # One row per sequence; the number of columns is inferred (-1).
    arr = arr.reshape((n_seqs, -1))

    # Slide a window of n_steps columns across the rows.
    for n in range(0, arr.shape[1], n_steps):
        x = arr[:, n:n + n_steps]
        y = np.zeros_like(x)
        # Targets are the inputs shifted by one character. The last target
        # column wraps around to the first input column of the same window.
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
        yield x, y
创建输入、目标值占位符,以及keep_prob的占位符(Dropout层用到)
def build_inputs(batch_size, num_steps):
    """Create the placeholders for inputs, targets and the dropout keep rate.

    Args:
        batch_size: number of sequences per batch (rows of a batch).
        num_steps: number of steps per sequence (columns of a batch).

    Returns:
        Tuple ``(inputs, targets, keep_prob)``: two int32 placeholders of
        shape ``[batch_size, num_steps]`` and a float32 placeholder used as
        the dropout keep probability.
    """
    inputs = tf.placeholder(tf.int32, [batch_size, num_steps], name='inputs')
    targets = tf.placeholder(tf.int32, [batch_size, num_steps], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    return inputs, targets, keep_prob
用tf.contrib.rnn.BasicLSTMCell(num_units)创建基本的LSTM cell
在cell外包裹上Dropout:tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
为什么这么做可以看一下Wojciech Zaremba的论文:Recurrent Neural Network Regularization
对于rnn的部分不进行dropout,也就是说从t-1时候的状态传递到t时刻进行计算时,这个中间不进行memory的dropout;仅在同一个t时刻中,多层cell之间传递信息的时候进行dropout
多层LSTM层堆叠
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    """Build a stack of LSTM cells with dropout applied to each cell's output.

    Args:
        lstm_size: number of units in each LSTM cell.
        num_layers: number of stacked LSTM layers.
        batch_size: batch size used to create the zero initial state.
        keep_prob: dropout keep probability for each cell's output.

    Returns:
        Tuple ``(cell, initial_state)``: the ``MultiRNNCell`` and its float32
        zero state for ``batch_size``.
    """
    def build_cell(lstm_size, keep_prob):
        """Return one basic LSTM cell wrapped with output dropout."""
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        # Dropout is applied to the cell output only (output_keep_prob).
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        return drop

    # A fresh cell object is built for every layer of the stack.
    cell = tf.contrib.rnn.MultiRNNCell(
        [build_cell(lstm_size, keep_prob) for _ in range(num_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)
    return cell, initial_state
将RNN cell连接到一个有softmax输出的全连接层,可以给出一个用于预测下一个字母的概率分布。
如果输入batch的尺寸(sequence个数)为N,步长为M,隐藏层有L个隐藏单元,则输出为一个N×M×L的3维tensor:每个LSTM cell的输出尺寸为L,每个sequence有M个step(每个step对应一个输出),总共有N个sequence,所以总尺寸为N×M×L。
进一步把输出reshape为(N×M)×L的2维tensor,即每行对应一个sequence的一个step(共N×M行),每行的L个值是LSTM cell在该step的输出。
将权值与变量包裹在一个scope中,以便共享已经在LSTM cell中创建的变量。如果不设置scope而重用LSTM cell中的变量名,则会报错
def build_output(lstm_output, in_size, out_size):
    """Attach a fully connected softmax layer to the LSTM output.

    Args:
        lstm_output: output of the RNN; it is flattened to rows of length
            ``in_size`` before the matrix multiplication.
        in_size: size of each flattened row (the caller passes the LSTM size).
        out_size: number of output classes.

    Returns:
        Tuple ``(out, logits)``: the softmax probabilities (op name
        ``'predictions'``) and the raw logits, each with ``out_size`` columns.
    """
    # Flatten the RNN output so that each row has in_size values.
    seq_output = tf.concat(lstm_output, axis=1)
    x = tf.reshape(seq_output, [-1, in_size])

    # Weights and bias of the softmax layer, created under their own scope.
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal((in_size, out_size), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(out_size))

    logits = tf.matmul(x, softmax_w) + softmax_b
    out = tf.nn.softmax(logits, name='predictions')
    return out, logits
计算目标值和预测值的交叉熵损失:
1. 目标值one-hot编码
2. reshape目标值
3. 将输出单元和目标值传递给softmax的交叉熵损失函数
def build_loss(logits, targets, lstm_size, num_classes):
    """Compute the mean softmax cross-entropy between logits and targets.

    Args:
        logits: logits produced by the output layer.
        targets: integer class indices of the target characters.
        lstm_size: unused; kept so existing callers keep working.
        num_classes: number of classes used for the one-hot encoding.

    Returns:
        Scalar tensor holding the mean cross-entropy loss.
    """
    # One-hot encode the targets and reshape them to the logits' shape.
    y_one_hot = tf.one_hot(targets, num_classes)
    y_reshaped = tf.reshape(y_one_hot, logits.get_shape())

    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    loss = tf.reduce_mean(loss)
    return loss
LSTM缓解了梯度消失的问题(常规RNN同时存在梯度爆炸和梯度消失),但梯度的增长仍然没有上界。用gradient clip来处理梯度爆炸的问题:设置一个阈值作为上界,当梯度的全局范数超过阈值时,将梯度按比例缩放到该阈值。使用AdamOptimizer进行学习
def build_optimizer(loss, learning_rate, grad_clip):
    """Build the Adam training op with gradients clipped by global norm.

    Args:
        loss: scalar loss tensor to minimise.
        learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.
        grad_clip: threshold passed to ``tf.clip_by_global_norm``.

    Returns:
        The op that applies the clipped gradients to all trainable variables.
    """
    tvars = tf.trainable_variables()
    # Clip the gradients jointly by their global norm to limit exploding
    # gradients.
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    # The original assigned to a misspelled name and returned an undefined one.
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    return optimizer
使用tf.nn.dynamic_rnn可以允许不同batch的sequence长度不同,不会像tf.nn.rnn一样固定batch大小,用padding补足长度而浪费空间。而且前者是动态建立graph,后者是静态,后者速度慢且占用资源多。
class CharRNN:
    """Character-level LSTM network assembled from the ``build_*`` helpers.

    Constructing an instance resets the default TensorFlow graph and builds
    the network in it. The instance exposes ``inputs``, ``targets``,
    ``keep_prob``, ``initial_state``, ``final_state``, ``prediction``,
    ``logits``, ``loss`` and ``optimizer``.
    """

    def __init__(self, num_classes, batch_size=64, num_steps=50, lstm_size=128,
                 num_layers=2, learning_rate=0.001, grad_clip=5, sampling=False):
        """Build the graph.

        Args:
            num_classes: number of distinct characters (one-hot depth and
                size of the output layer).
            batch_size: number of sequences per batch.
            num_steps: number of steps per sequence.
            lstm_size: number of units in each LSTM layer.
            num_layers: number of stacked LSTM layers.
            learning_rate: learning rate for the optimizer.
            grad_clip: global-norm threshold for gradient clipping.
            sampling: when true, ``batch_size`` and ``num_steps`` are both
                overridden to 1 so characters are fed one at a time.
        """
        # The original tested an undefined, misspelled name here.
        if sampling:
            batch_size, num_steps = 1, 1

        tf.reset_default_graph()

        # Placeholders for inputs, targets and the dropout keep probability.
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # Stacked LSTM cell and its zero initial state.
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        # One-hot encode the input characters and run them through the RNN.
        x_one_hot = tf.one_hot(self.inputs, num_classes)
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
        self.final_state = state

        # Softmax predictions, loss and training op.
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)
涉及到的超参数:
- batch_size:一次输入传递的sequence数,也就是batch的行数
- num_steps:步长,即batch的列数,是一个sequence中的字母个数。一般来说越大越好,因为越多的字母数可以让模型学习到更多的相关性,但是训练时间长。一般设置为100
- lstm_size:隐藏层中的LSTM单元数
- num_layers:隐藏层数
- learning_rate:学习率
- keep_prob:Dropout的保留概率(每个单元被保留的概率),如果模型过拟合,减小keep_prob(即增大Dropout的比例)
Andrej Karpathy大神的设置原则:
- num_layers设置为2或者3
- lstm_size依照数据的规模及模型参数数量来设置:
- 在训练之前打印出模型的参数数量
- 数据集尺寸:1M的文件大概有100万个字母
- 然后设置参数和数据规模到同一个量级,比如:
- 数据集文件100MB,参数有150k,数据集规模远远大于参数数量,那么模型很可能会欠拟合,这种情况下就要把lstm_size设置大一点
- 数据集文件100MB,参数有1000万,这时要注意观测验证集损失,如果比训练集损失大许多,那么可以尝试提高Dropout
- 原文,以上是截取的Approximate number of parameters一节
下面给出一种可能的设置:
# One possible hyperparameter configuration.
# Batch geometry: sequences per batch and steps per sequence.
batch_size, num_steps = 100, 100
# Network size: units per LSTM layer and number of stacked layers.
lstm_size, num_layers = 512, 2
# Optimisation: learning rate and dropout keep probability used in training.
learning_rate, keep_prob = 0.0001, 0.5
将inputs和targets传递到网络,然后运行optimizer优化。每个batch结束时的final LSTM状态会作为下一个batch的初始状态传入;同时每隔一定步数用checkpoint保存一次模型。
epochs = 20
# Save a checkpoint every 200 training steps.
save_every_n = 200

# Saver.save() cannot write into a directory that does not exist, so make
# sure the checkpoint directory is there before training starts.
os.makedirs('checkpoints', exist_ok=True)

model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers,
                learning_rate=learning_rate)

# Keep at most the 100 most recent checkpoint files; older ones are deleted
# as new ones are created.
saver = tf.train.Saver(max_to_keep=100)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    counter = 0
    for e in range(epochs):
        # Every epoch starts again from the zero state.
        new_state = sess.run(model.initial_state)
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            # The final state of the previous batch is fed back in as the
            # initial state of this one.
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss,
                                                 model.final_state,
                                                 model.optimizer],
                                                feed_dict=feed)
            end = time.time()
            print('Epoch:{}/{}...'.format(e+1, epochs),
                  'Training Step:{}...'.format(counter),
                  'Training loss:{:.4f}...'.format(batch_loss),
                  '{:.4f} sec/batch'.format((end-start)))

            if (counter % save_every_n == 0):
                saver.save(sess, 'checkpoints/i{}.ckpt'.format(counter))

    # Always save the final model once all epochs have finished.
    saver.save(sess, 'checkpoints/i{}.ckpt'.format(counter))
将ckpt文件保存在checkpoint文件夹中
# Look up the checkpoint state recorded for the 'checkpoints' directory.
# NOTE(review): the return value is neither assigned nor printed here;
# presumably this line was a notebook cell whose result was displayed -- confirm.
tf.train.get_checkpoint_state('checkpoints')
模型训练完毕后,可以用来生成新的文章段落。在模型中输入一个字母,模型预测生成下一个字母,再用下一个字母预测生成下下个字母。为了减小生成的噪声,使随机性减小,我们只从预测概率最大的前N个字母中进行选择
def pick_top_n(preds, vocab_size, top_n=5):
    """Sample a character index from the ``top_n`` most probable predictions.

    Args:
        preds: predicted probabilities; after squeezing it must be a 1-D
            array with ``vocab_size`` entries.
        vocab_size: number of characters to choose from.
        top_n: how many of the highest-probability entries stay eligible.

    Returns:
        The sampled character index, drawn with ``np.random.choice``.
    """
    # Work on a copy: np.squeeze returns a view of its input, so zeroing
    # entries in place would silently modify the caller's array.
    p = np.squeeze(preds).copy()
    # Zero out everything except the top_n largest probabilities.
    p[np.argsort(p)[:-top_n]] = 0
    # Renormalise so the remaining probabilities sum to one.
    p = p / np.sum(p)
    char = np.random.choice(vocab_size, 1, p=p)[0]
    return char
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime='The '):
    """Generate text from a trained model restored from ``checkpoint``.

    The characters of ``prime`` are fed through the network first to warm up
    the LSTM state. After that, each sampled character is fed back in as the
    next input. Relies on the module-level ``vocab_to_int`` and
    ``int_to_vocab`` tables.

    Args:
        checkpoint: path of the checkpoint to restore.
        n_samples: number of characters to generate after the first sampled
            one.
        lstm_size: LSTM size the checkpoint was trained with.
        vocab_size: number of distinct characters in the vocabulary.
        prime: non-empty seed text.

    Returns:
        The seed text followed by the generated characters, as one string.

    Raises:
        ValueError: if ``prime`` is empty.
    """
    if not prime:
        # Without a seed character there is no prediction to sample from;
        # the original failed later with an unbound-name error.
        raise ValueError('prime must contain at least one character')

    samples = list(prime)
    # Use the vocab_size argument rather than the global vocabulary.
    model = CharRNN(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)

        # Warm up the LSTM state on the seed text.
        for c in prime:
            x = np.zeros((1, 1))
            x[0, 0] = vocab_to_int[c]
            # Dropout is disabled (keep_prob=1) at inference time; the
            # original fed the global training keep_prob here.
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction,
                                         model.final_state],
                                        feed_dict=feed)

        # First generated character, sampled from the prediction made after
        # the last seed character.
        c = pick_top_n(preds, vocab_size)
        samples.append(int_to_vocab[c])

        # Feed each sampled character back in to produce the next one.
        for i in range(n_samples):
            x[0, 0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)
            c = pick_top_n(preds, vocab_size)
            samples.append(int_to_vocab[c])

    return ''.join(samples)
把checkpoint文件夹中的ckpt文件传递给模型,生成2000字母新的文本
# Pick the most recent checkpoint in the 'checkpoints' directory and use it
# to generate 2000 characters seeded with 'The'.
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(
    checkpoint,
    n_samples=2000,
    lstm_size=lstm_size,
    vocab_size=len(vocab),
    prime='The',
)
print(samp)
INFO:tensorflow:Restoring parameters from checkpoints\i3800.ckpt The outntine tore. “Anda was sely wark, whith the south as the simarest insorse, there wo here that wish to den to a selting theme though he drad the coustest an with him, bot the word and the hurs fold her befoul ther the some on the said, bet an wered to ather her and the wist and and a menter and was a ment warked to him hore. Her have thind as she dind to buther and the sainted as the sairs, as the sampat on the said wate and alating on the precissair of the partere aspace the hid her, and as all her thither wored and his said talk in and and a thaid andostant of to thithe alled and at a whangs at that she wit of the her..” “They’s all her that so coure that’s at it tele so do be and hus so did to anden tha marte and stear as is in whe the comprince, “she’s sele thith the mome ano she cusprian of hime fall,. Tho could as she cas a wand of thim. And tall he her hander of her, and athing think and hid bother he had buth horssing and his and her to deer alday, hid a cored her brtairiad hem, and souddy then troute her, her sond anore the caster. He could not him atrentse his befurtale and whate he chanded her and ta see at an with the cruct and the bristed and a to dit wo him. “I’s serther as the consere a dear and stice to the paster a sender on thought he was not in the compiout ta his andensted at as aldot and with his sore tare to the sore, and her and the pare al and and the which here was her a same, shich were than in thin whele and the pains in at hishing a shatted in thinge of the mest of had, sunding with a mant of trear the casions of tith his. He wis to sain he coull has had her had same the midly of the pand that she sand on tee it, and to been the sering anctreat har and same of the sarint to and the has tonk the her thote wile wat say hes woold has. Ande sald not antions.
从以上结果可以看出,介词连词代词都基本没有拼写错误,而且位置用的也合乎语法,只是形容词名词等字母数比较多的,中间会有拼写问题。模型基本跑通了,只差优化了