摘要
这里提供一个使用Theano实现的、基于长短期记忆(LSTM)结构的循环神经网络(RNN)。该模型被用来对Large Movie Review Dataset(即IMDB数据集)中的电影评论进行情感分析。
任务中,给定一个电影评论,模型尝试预测它是正面还是负面的。这是一个二分类问题。
数据
如前提及,提供的脚本被用来训练LSTM循环神经网络。数据集是公开的,我们提供了一个已预处理的版本以适应LSTM实现的需要。运行本教程提供的代码会自动将数据下载到本地文件夹。如果你需要使用自己的数据,请使用以下预处理脚本。
"""
This script is what created the dataset pickled.
1) You need to download this file and put it in the same directory as this file.
https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.
2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory.
3) Then run this script.
"""
dataset_path='/Tmp/bastienf/aclImdb/'
import numpy
import cPickle as pkl
from collections import OrderedDict
import glob
import os
from subprocess import Popen, PIPE
# tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']
def tokenize(sentences):
    """Tokenize sentences with the external Moses ``tokenizer.perl`` script.

    All sentences are joined with newlines and piped through the command in
    ``tokenizer_cmd`` in a single subprocess call; the output is split on
    newlines again.

    :param sentences: list of strings, one sentence (review) per entry.
    :returns: list of tokenized strings, one per output line.
    :raises RuntimeError: if the tokenizer exits with a non-zero status.
    """
    print('Tokenizing..')
    text = "\n".join(sentences)
    # The subprocess pipe carries bytes; encode text first. On Python 2 a
    # plain ``str`` already is ``bytes`` and is passed through unchanged.
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    # Fail loudly instead of silently building an empty/garbage dataset.
    if tokenizer.returncode != 0:
        raise RuntimeError('tokenizer command %r exited with status %d'
                           % (tokenizer_cmd, tokenizer.returncode))
    if not isinstance(tok_text, str):
        tok_text = tok_text.decode('utf-8')
    # Drop the piece that follows the final newline of the output.
    toks = tok_text.split('\n')[:-1]
    print('Done')
    return toks
def build_dict(path):
    """Build the word -> index dictionary from the reviews under ``path``.

    Reads the first line of every ``*.txt`` file in ``path/pos`` and
    ``path/neg``, tokenizes them, counts lower-cased words and assigns
    indices by decreasing frequency, starting at 2 (0 and 1 are left free;
    1 is used for unknown words by ``grab_data``).

    :param path: directory containing the ``pos`` and ``neg`` sub-folders.
    :returns: dict mapping each word to its integer index (>= 2).
    """
    sentences = []
    # Glob with full paths instead of os.chdir(): changing the process-wide
    # working directory is not undone if reading a file raises.
    for label in ('pos', 'neg'):
        for ff in glob.glob(os.path.join(path, label, '*.txt')):
            with open(ff, 'r') as f:
                sentences.append(f.readline().strip())

    sentences = tokenize(sentences)

    print('Building dictionary..')
    wordcount = dict()
    for ss in sentences:
        for w in ss.strip().lower().split():
            wordcount[w] = wordcount.get(w, 0) + 1

    # Materialize as lists: dict views cannot be indexed on Python 3.
    counts = list(wordcount.values())
    keys = list(wordcount.keys())

    # Positions of the words ordered from most to least frequent.
    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx + 2  # leave 0 and 1 (UNK)

    print('%s  total words  %s  unique words'
          % (numpy.sum(counts), len(keys)))
    return worddict
def grab_data(path, dictionary):
    """Convert every review in ``path`` into a list of word indices.

    Reads the first line of each ``*.txt`` file in ``path``, tokenizes the
    texts and maps each lower-cased word through ``dictionary``; words that
    are not in the dictionary are mapped to 1.

    :param path: directory holding the review ``*.txt`` files.
    :param dictionary: word -> index mapping (see ``build_dict``).
    :returns: list with one list of integer indices per review.
    """
    sentences = []
    # Glob with the full path instead of os.chdir(): the working directory
    # would not be restored if reading a file raised.
    for ff in glob.glob(os.path.join(path, '*.txt')):
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())

    sentences = tokenize(sentences)

    seqs = []
    for ss in sentences:
        words = ss.strip().lower().split()
        seqs.append([dictionary.get(w, 1) for w in words])
    return seqs
def main():
    """Build the dictionary and the pickled train/test sets.

    Writes ``imdb.pkl`` (two consecutive pickles: the train ``(x, y)`` pair,
    then the test ``(x, y)`` pair) and ``imdb.dict.pkl`` (the dictionary)
    into the current directory. Positive reviews get label 1, negative 0.
    """
    # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
    path = dataset_path
    dictionary = build_dict(os.path.join(path, 'train'))

    # os.path.join keeps this working when dataset_path has no trailing '/'.
    train_x_pos = grab_data(os.path.join(path, 'train', 'pos'), dictionary)
    train_x_neg = grab_data(os.path.join(path, 'train', 'neg'), dictionary)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_x_pos = grab_data(os.path.join(path, 'test', 'pos'), dictionary)
    test_x_neg = grab_data(os.path.join(path, 'test', 'neg'), dictionary)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # ``with`` guarantees the files are flushed and closed even on error.
    with open('imdb.pkl', 'wb') as f:
        pkl.dump((train_x, train_y), f, -1)
        pkl.dump((test_x, test_y), f, -1)

    with open('imdb.dict.pkl', 'wb') as f:
        pkl.dump(dictionary, f, -1)
if __name__ == '__main__':
main()
模型 LSTM
在传统循环神经网络的梯度反向传播阶段,梯度信号可能会被循环隐藏层神经元之间连接所对应的权重矩阵多次(次数与时间步数相同)相乘。这意味着,转移矩阵中权重的量级对学习过程有很大影响。
如果矩阵的权重很小(更准确地说,矩阵的主特征值小于1.0),会导致所谓的梯度消失:梯度信号变得非常小,学习变得非常缓慢甚至完全停止,同时也使学习数据中的长期依赖关系变得更为困难。相反,如果矩阵的权重很大(矩阵的主特征值大于1.0),梯度信号会变得非常大,导致学习发散,这被称为梯度爆炸。
这些问题是LSTM模型产生的主要动力,模型引进了新的结构称为记忆单元(见下图)。一个记忆单元由四个元素组成:输入门、自我循环连接的神经元、遗忘门和输出门。自我循环连接的权重是1.0,保证了在没有任何外部干扰的情况下,记忆单元的状态可以从一个时间步到下一个时间步保持恒定。门被用来调节记忆单元和所处环境间的互动。输入门决定允许或拒绝输入信号改变记忆单元的状态。另一方面,输出门允许或防止记忆单元的状态对其他神经元产生影响。最后,遗忘门调节记忆单元的自我循环连接,允许单元根据需要记忆或遗忘此前状态。
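下列公式描述了记忆单元层在每个时间步 $t$ 的更新方式,公式中:
* $x_t$ 是时间 $t$ 时记忆单元层的输入
* $W_i$, $W_f$, $W_c$, $W_o$, $U_i$, $U_f$, $U_c$, $U_o$ 和 $V_o$ 是权重矩阵
* $b_i$, $b_f$, $b_c$ 和 $b_o$ 是偏差向量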
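首先我们计算输入门 $i_t$ 的值,和时间 $t$ 的记忆单元状态候选值 $\widetilde{C_t}$:
$$i_t = \sigma(W_i x_t + U_i h_{t-1} + b_i) \qquad (1)$$
$$\widetilde{C_t} = \tanh(W_c x_t + U_c h_{t-1} + b_c) \qquad (2)$$
其次,我们计算时间 $t$ 的记忆单元遗忘门的激活值 $f_t$:
$$f_t = \sigma(W_f x_t + U_f h_{t-1} + b_f) \qquad (3)$$
给定输入门激活值 $i_t$、遗忘门激活值 $f_t$ 和状态候选值 $\widetilde{C_t}$ 这3个值,我们计算时间 $t$ 记忆单元的新状态 $C_t$:
$$C_t = i_t * \widetilde{C_t} + f_t * C_{t-1} \qquad (4)$$
然后计算输出门的值 $o_t$ 和记忆单元的输出 $h_t$:
$$o_t = \sigma(W_o x_t + U_o h_{t-1} + V_o C_t + b_o) \qquad (5)$$
$$h_t = o_t * \tanh(C_t) \qquad (6)$$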
我们的模型
这里我们使用标准LSTM的一个变体。在这个变体中,单元输出门的激活不依赖于记忆单元状态 $C_t$。这使我们能够更高效地执行部分计算(见下面的备注)。这意味着我们没有矩阵 $V_o$,输出门公式(5)变成:
$$o_t = \sigma(W_o x_t + U_o h_{t-1} + b_o) \qquad (7)$$
我们的模型由单一LSTM层和平均池化、逻辑回归层构成,如下图所示。因此从输入序列x0, x1, x2 ,..., xn, LSTM层的记忆单元会生成h0, h1, h2, ... , hn序列表征。这个序列表征在所有时间上取平均得到表征h。最后这个表征传递到逻辑回归层,该层目标是与输入序列相关的类标签。
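备注:在这里所附的代码中,公式(1),(2),(3)和(7)是并行计算的,以使计算更为高效。之所以可以这样做,是因为这些公式都不依赖于其他公式的结果。把4个矩阵 $W_*$ 拼接成一个权重矩阵 $W$,并以同样方式得到矩阵 $U$ 和偏差向量 $b$,非线性前激活可以通过以下公式计算:
$$z = W x_t + U h_{t-1} + b$$
将该结果切片得到 $i$, $f$, $\widetilde{C_t}$ 和 $o$ 的非线性前激活,然后分别对它们独立应用各自的非线性函数。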
代码-引用-联系
lstm.py 主脚本,定义和训练模型
'''
Build a tweet sentiment analyzer
'''
from __future__ import print_function
import six.moves.cPickle as pickle
from collections import OrderedDict
import sys
import time
import numpy
import theano
from theano import config
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import imdb
# Maps a dataset name to its (load_data, prepare_data) pair of functions.
datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
# Set the random number generators' seeds for consistency
SEED = 123
numpy.random.seed(SEED)
def numpy_floatX(data):
    """Return ``data`` as a numpy array whose dtype is ``config.floatX``."""
    target_dtype = config.floatX
    return numpy.asarray(data, dtype=target_dtype)
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """Split the indices ``0 .. n-1`` into consecutive minibatches.

    Used to shuffle the dataset at each iteration.

    :param n: number of examples.
    :param minibatch_size: number of indices per minibatch; the last
        minibatch holds the remainder and may be smaller.
    :param shuffle: if True, shuffle the indices (with ``numpy.random``)
        before splitting.
    :returns: list of ``(minibatch_number, index_array)`` tuples. A real
        list is returned (not a one-shot ``zip`` iterator) because callers
        iterate over the same result several times.
    """
    idx_list = numpy.arange(n, dtype="int32")

    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size

    if minibatch_start != n:
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return list(enumerate(minibatches))
def get_dataset(name):
    """Return the ``(load_data, prepare_data)`` functions registered for
    the dataset ``name`` in ``datasets``."""
    entry = datasets[name]
    return entry[0], entry[1]
def zipp(params, tparams):
    """
    When we reload the model. Needed for the GPU stuff.

    Copies every value of ``params`` into the entry of ``tparams`` with the
    same key by calling its ``set_value``.
    """
    for name in params:
        tparams[name].set_value(params[name])
def unzip(zipped):
    """
    When we pickle the model. Needed for the GPU stuff.

    Returns an OrderedDict with, for every key of ``zipped``, the result of
    calling ``get_value()`` on the corresponding entry.
    """
    return OrderedDict((name, shared.get_value())
                       for name, shared in zipped.items())
def dropout_layer(state_before, use_noise, trng):
    """Build the dropout expression for ``state_before``.

    When ``use_noise`` is non-zero, ``state_before`` is multiplied by a
    binary mask drawn from ``trng`` with probability 0.5; otherwise it is
    scaled by 0.5.
    """
    keep_mask = trng.binomial(state_before.shape,
                              p=0.5, n=1,
                              dtype=state_before.dtype)
    noisy = state_before * keep_mask
    scaled = state_before * 0.5
    return tensor.switch(use_noise, noisy, scaled)
def _p(pp, name):
return '%s_%s' % (pp, name)
def init_params(options):
    """
    Global (not LSTM) parameter. For the embedding and the classifier.

    Creates ``Wemb`` (n_words x dim_proj), lets the encoder layer named by
    ``options['encoder']`` add its own parameters, then adds the classifier
    weights ``U`` (dim_proj x ydim) and bias ``b`` (ydim).
    Returns the OrderedDict of numpy arrays.
    """
    params = OrderedDict()

    # embedding
    emb_init = numpy.random.rand(options['n_words'],
                                 options['dim_proj'])
    params['Wemb'] = (0.01 * emb_init).astype(config.floatX)

    # encoder-specific parameters
    encoder_name = options['encoder']
    layer_param_init = get_layer(encoder_name)[0]
    params = layer_param_init(options, params, prefix=encoder_name)

    # classifier
    clf_init = numpy.random.randn(options['dim_proj'], options['ydim'])
    params['U'] = 0.01 * clf_init.astype(config.floatX)
    params['b'] = numpy.zeros((options['ydim'],)).astype(config.floatX)

    return params
def load_params(path, params):
    """Replace the values of ``params`` with the arrays stored at ``path``.

    :param path: archive readable by ``numpy.load`` (as written by
        ``numpy.savez``).
    :param params: dict whose keys name the arrays to load; it is updated
        in place.
    :returns: the same ``params`` dict.
    :raises Warning: if a key of ``params`` is missing from the archive.
    """
    # Use the archive as a context manager so its file handle is closed
    # once all requested arrays have been read.
    with numpy.load(path) as pp:
        for kk in params:
            if kk not in pp:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = pp[kk]

    return params
def init_tparams(params):
    """Wrap every array of ``params`` in a ``theano.shared`` variable named
    after its key, returned as an OrderedDict in the same key order."""
    return OrderedDict((name, theano.shared(params[name], name=name))
                       for name in params)
def get_layer(name):
    """Return the entry registered under ``name`` in ``layers``."""
    return layers[name]
def ortho_weight(ndim):
    """Return an ``ndim x ndim`` matrix of dtype ``config.floatX``: the
    left singular vectors of a random Gaussian matrix."""
    gaussian = numpy.random.randn(ndim, ndim)
    left_vectors = numpy.linalg.svd(gaussian)[0]
    return left_vectors.astype(config.floatX)
def param_init_lstm(options, params, prefix='lstm'):
    """
    Init the LSTM parameter:

    Adds ``<prefix>_W`` and ``<prefix>_U`` (each four ``ortho_weight``
    blocks concatenated along axis 1) and the zero bias ``<prefix>_b`` of
    length ``4 * dim_proj`` to ``params``, and returns ``params``.

    :see: init_params
    """
    dim = options['dim_proj']

    def _four_blocks():
        # Four dim x dim blocks placed side by side.
        return numpy.concatenate([ortho_weight(dim) for _ in range(4)],
                                 axis=1)

    params[_p(prefix, 'W')] = _four_blocks()
    params[_p(prefix, 'U')] = _four_blocks()
    bias = numpy.zeros((4 * dim,))
    params[_p(prefix, 'b')] = bias.astype(config.floatX)

    return params
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    """Build the symbolic LSTM recurrence over ``state_below``.

    :param tparams: dict of shared variables; reads ``<prefix>_W``,
        ``<prefix>_U`` and ``<prefix>_b``.
    :param state_below: input sequence; its first axis is iterated over by
        ``theano.scan`` and, when it is 3-dimensional, its second axis is
        taken as the number of samples.
    :param options: model options; reads ``options['dim_proj']``.
    :param prefix: prefix of the parameter names.
    :param mask: sequence mask, scanned together with the input; required.
    :returns: the first output of the scan, i.e. the sequence of ``h``.
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        # Take the n-th block of width ``dim`` along the last axis.
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        # m_: mask at this step, x_: pre-computed input projection,
        # h_ / c_: previous output and cell state.
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        # Blocks 0..3 of the pre-activation: input gate, forget gate,
        # output gate and candidate cell state.
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

        c = f * c_ + i * c
        # Where the mask is 0, keep the previous cell state.
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        # Where the mask is 0, keep the previous output.
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    # Input projection for all steps at once, outside the scan.
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    # Both h and c start as zeros of shape (n_samples, dim_proj).
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
# Registry of available layers: name -> (parameter init function,
# layer building function). Only 'lstm' is registered here.
layers = {'lstm': (param_init_lstm, lstm_layer)}
def sgd(lr, tparams, grads, x, mask, y, cost):
    """ Stochastic Gradient Descent

    Returns ``(f_grad_shared, f_update)``: the first function computes the
    cost of a mini-batch and stores its gradients, the second one applies
    ``p - lr * g`` to every parameter using the stored gradients.

    :note: A more complicated version of sgd than needed.  This is
        done like that for adadelta and rmsprop.
    """
    # One shared buffer per parameter, holding the mini-batch gradient.
    grad_buffers = []
    for name, param in tparams.items():
        zeros = param.get_value() * 0.
        grad_buffers.append(theano.shared(zeros, name='%s_grad' % name))
    store_grads = list(zip(grad_buffers, grads))

    # Computes the cost and the gradients, but does not touch the weights.
    f_grad_shared = theano.function([x, mask, y], cost, updates=store_grads,
                                    name='sgd_f_grad_shared')

    apply_step = [(param, param - lr * grad)
                  for param, grad in zip(tparams.values(), grad_buffers)]

    # Updates the weights from the previously computed gradient.
    f_update = theano.function([lr], [], updates=apply_step,
                               name='sgd_f_update')

    return f_grad_shared, f_update
def adadelta(lr, tparams, grads, x, mask, y, cost):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano variable
        Learning rate; it is an input of ``f_update`` but does not appear
        in the update expressions below.
    tparams: dict of Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Returns
    -------
    f_grad_shared, f_update : Theano functions
        ``f_grad_shared(x, mask, y)`` returns the cost and stores the
        gradients; ``f_update(lr)`` applies the parameter update.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """

    # Per-parameter buffers: last gradient, running average of squared
    # updates and running average of squared gradients.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    # Update direction: gradient scaled by the ratio of the two running
    # root-mean-squares (1e-6 added inside each square root).
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    # lr is unused in the updates, hence on_unused_input='ignore'.
    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, x, mask, y, cost):
    """
    A variant of SGD that scales the step size by running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano variable
        Learning rate; it is an input of ``f_update`` but does not appear
        in the update expressions below (the step uses a fixed 1e-4).
    tparams: dict of Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. the parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Returns
    -------
    f_grad_shared, f_update : Theano functions
        ``f_grad_shared(x, mask, y)`` returns the cost and stores the
        gradients; ``f_update(lr)`` applies the parameter update.

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    # Per-parameter buffers: last gradient, running average of gradients
    # and running average of squared gradients.
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    # Update direction kept across calls (decayed by 0.9 at each update).
    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.items()]
    # The gradient is divided by sqrt(rg2 - rg ** 2 + 1e-4).
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    # udn[1] is the new update direction expression built just above.
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]
    # lr is unused in the updates, hence on_unused_input='ignore'.
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
def build_model(tparams, options):
    """Build the symbolic model: embedding -> encoder -> mean pooling ->
    optional dropout -> softmax classifier.

    :param tparams: dict of shared variables; reads ``Wemb``, ``U``, ``b``
        and whatever the encoder layer reads.
    :param options: model options; reads ``dim_proj``, ``encoder`` and
        ``use_dropout``.
    :returns: ``(use_noise, x, mask, y, f_pred_prob, f_pred, cost)`` where
        ``use_noise`` is the shared dropout switch, ``x``/``mask``/``y`` the
        symbolic inputs, ``f_pred_prob``/``f_pred`` compiled functions of
        ``(x, mask)`` and ``cost`` the symbolic mean negative log-likelihood.
    """
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int64')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Look up the embedding of every word index, then restore the
    # (n_timesteps, n_samples, dim_proj) layout.
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['dim_proj']])
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix=options['encoder'],
                                            mask=mask)
    if options['encoder'] == 'lstm':
        # Masked mean over the first axis: sum the masked outputs and
        # divide by the sum of the mask.
        proj = (proj * mask[:, :, None]).sum(axis=0)
        proj = proj / mask.sum(axis=0)[:, None]
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    # Small offset added inside the log; a larger one is used for float16.
    off = 1e-8
    if pred.dtype == 'float16':
        off = 1e-6

    cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean()

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
    """ If you want to use a trained model, this is useful to compute
    the probabilities of new examples.

    Returns an array of shape ``(len(data[0]), 2)`` filled, minibatch by
    minibatch, with the output of ``f_pred_prob``.
    """
    n_samples = len(data[0])
    probs = numpy.zeros((n_samples, 2)).astype(config.floatX)

    n_done = 0

    for _, batch_index in iterator:
        batch_seqs = [data[0][t] for t in batch_index]
        batch_labels = numpy.array(data[1])[batch_index]
        x, mask, y = prepare_data(batch_seqs, batch_labels, maxlen=None)
        batch_probs = f_pred_prob(x, mask)
        probs[batch_index, :] = batch_probs

        n_done += len(batch_index)
        if verbose:
            print('%d/%d samples classified' % (n_done, n_samples))

    return probs
def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
    """
    Just compute the error
    f_pred: Theano fct computing the prediction
    prepare_data: usual prepare_data for that dataset.

    Returns 1 minus the fraction of examples of ``data`` whose prediction
    equals the target, over the minibatches given by ``iterator``.
    """
    n_correct = 0
    for _, batch_index in iterator:
        batch_seqs = [data[0][t] for t in batch_index]
        batch_labels = numpy.array(data[1])[batch_index]
        x, mask, y = prepare_data(batch_seqs, batch_labels, maxlen=None)
        preds = f_pred(x, mask)
        targets = numpy.array(data[1])[batch_index]
        n_correct += (preds == targets).sum()

    error_rate = 1. - numpy_floatX(n_correct) / len(data[0])
    return error_rate
def train_lstm(
    dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
    patience=10,  # Number of epoch to wait before early stop if no progress
    max_epochs=5000,  # The maximum number of epoch to run
    dispFreq=10,  # Display to stdout the training progress every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    saveto='lstm_model.npz',  # The best model will be saved there
    validFreq=370,  # Compute the validation error after this number of update.
    saveFreq=1110,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequence longer then this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',

    # Parameter for extra option
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worst test error
                       # This frequently need a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test example.
):
    """Build, train and evaluate the LSTM sentiment model.

    Trains with the given ``optimizer`` for at most ``max_epochs`` epochs,
    validating every ``validFreq`` updates, saving every ``saveFreq``
    updates and stopping early according to ``patience``.

    :returns: ``(train_err, valid_err, test_err)`` computed with the best
        parameters found, or ``(1., 1., 1.)`` if the cost becomes NaN/inf.
    """

    # Model options
    # NOTE: must stay the first statement so that locals() holds exactly
    # the function arguments.
    model_options = locals().copy()
    print("model options", model_options)

    load_data, prepare_data = get_dataset(dataset)

    print('Loading data')
    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random
        # size example.  So we must select a random selection of the
        # examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print('Building model')
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        # Honour the path given by the caller; a bare ``True`` keeps the
        # historical behaviour of loading 'lstm_model.npz'.
        reload_path = 'lstm_model.npz' if reload_model is True else reload_model
        load_params(reload_path, params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print('Optimization')

    # Materialize the minibatch lists: they are iterated once per
    # validation run, which a one-shot iterator would not survive.
    kf_valid = list(get_minibatches_idx(len(valid[0]), valid_batch_size))
    kf_test = list(get_minibatches_idx(len(test[0]), valid_batch_size))

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0  # validations without improvement (early stopping)

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    eidx = -1  # keeps the final summary valid even if no epoch is run
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            # A list, because it is also passed to pred_error below while
            # this loop is still iterating over it.
            kf = list(get_minibatches_idx(len(train[0]), batch_size,
                                          shuffle=True))

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format
                # This swap the axis!
                # Return something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print('Saving...')

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    # ``with`` closes the options file deterministically.
                    with open('%s.pkl' % saveto, 'wb') as options_file:
                        pickle.dump(model_options, options_file, -1)
                    print('Done')

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= numpy.array(history_errs)[:,
                                                                   0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err,
                          'Test ', test_err)

                    if (len(history_errs) > patience and
                            valid_err >= numpy.array(history_errs)[:-patience,
                                                                   0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interupted")

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)
    if saveto:
        numpy.savez(saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    n_epochs = eidx + 1
    # max(..., 1) avoids a division by zero when no epoch was run.
    print('The code run for %d epochs, with %f sec/epochs' % (
        n_epochs, (end_time - start_time) / (1. * max(n_epochs, 1))))
    print(('Training took %.1fs' %
           (end_time - start_time)), file=sys.stderr)
    return train_err, valid_err, test_err
if __name__ == '__main__':
    # See function train_lstm for all possible parameters and their
    # definitions.
    train_lstm(
        max_epochs=100,
        test_size=500,
    )
imdb.py 副脚本,载入和预处理数据
from __future__ import print_function
from six.moves import xrange
import six.moves.cPickle as pickle
import gzip
import os
import numpy
import theano
def prepare_data(seqs, labels, maxlen=None):
    """Create the matrices from the datasets.

    This pads each sequence to the same length: the length of the
    longest sequence kept.

    If maxlen is set, every sequence whose length is not strictly below
    maxlen is dropped (together with its label); if nothing is left,
    ``(None, None, None)`` is returned.

    This swap the axis!  The returned ``x`` and ``x_mask`` have shape
    (longest length, number of samples); ``x_mask`` is 1. where ``x``
    holds a real token and 0. on padding.
    """
    # seqs: a list of sentences
    if maxlen is not None:
        kept = [(s, y) for s, y in zip(seqs, labels) if len(s) < maxlen]
        if not kept:
            return None, None, None
        seqs = [s for s, _ in kept]
        labels = [y for _, y in kept]

    lengths = [len(s) for s in seqs]
    n_samples = len(seqs)
    longest = numpy.max(lengths)

    x = numpy.zeros((longest, n_samples)).astype('int64')
    x_mask = numpy.zeros((longest, n_samples)).astype(theano.config.floatX)
    for col, (length, sentence) in enumerate(zip(lengths, seqs)):
        x[:length, col] = sentence
        x_mask[:length, col] = 1.

    return x, x_mask, labels
def get_dataset_file(dataset, default_dataset, origin):
    '''Look for it as if it was a full path, if not, try local file,
    if not try in the data directory.

    Download dataset if it is not present

    Returns the path that was settled on.  The download from ``origin``
    only happens when the file name equals ``default_dataset`` and the
    file does not exist yet.
    '''
    data_dir, data_file = os.path.split(dataset)
    is_default = data_file == default_dataset

    if not data_dir and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        candidate = os.path.join(
            os.path.dirname(__file__),
            "..",
            "data",
            dataset
        )
        if is_default or os.path.isfile(candidate):
            dataset = candidate

    if is_default and not os.path.isfile(dataset):
        from six.moves import urllib
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)

    return dataset
def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
              sort_by_len=True):
    '''Loads the dataset

    :type path: String
    :param path: The path to the dataset (here IMDB)
    :type n_words: int
    :param n_words: The number of word to keep in the vocabulary.
        All extra words are set to unknown (1).
    :type valid_portion: float
    :param valid_portion: The proportion of the full train set used for
        the validation set.
    :type maxlen: None or positive int
    :param maxlen: the max sequence length we use in the train/valid set.
        Training sequences whose length is not strictly below it are
        dropped before the train/valid split.
    :type sort_by_len: bool
    :name sort_by_len: Sort by the sequence length for the train,
        valid and test set. This allow faster execution as it cause
        less padding per minibatch. Another mechanism must be used to
        shuffle the train set at each epoch.

    :returns: ``(train, valid, test)``, each an ``(x, y)`` pair of lists.
    '''

    #############
    # LOAD DATA #
    #############

    # Load the dataset
    path = get_dataset_file(
        path, "imdb.pkl",
        "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")

    opener = gzip.open if path.endswith(".gz") else open

    # NOTE(review): unpickling can execute arbitrary code and this file may
    # have just been downloaded over plain HTTP -- only use trusted data.
    # ``with`` closes the file even if unpickling fails.
    with opener(path, 'rb') as f:
        train_set = pickle.load(f)
        test_set = pickle.load(f)

    if maxlen:
        new_train_set_x = []
        new_train_set_y = []
        for x, y in zip(train_set[0], train_set[1]):
            if len(x) < maxlen:
                new_train_set_x.append(x)
                new_train_set_y.append(y)
        train_set = (new_train_set_x, new_train_set_y)
        del new_train_set_x, new_train_set_y

    # split training set into validation set
    train_set_x, train_set_y = train_set
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    test_set_x, test_set_y = test_set

    def remove_unk(x):
        # Word indices outside the kept vocabulary become 1 (unknown).
        return [[1 if w >= n_words else w for w in sen] for sen in x]

    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        # Indices of ``seq`` ordered by increasing sequence length.
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]

        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train = (train_set_x, train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test
下载上述两个脚本并放入同一文件夹中,使用下述代码运行:
THEANO_FLAGS="floatX=float32" python lstm.py
该脚本会自动下载数据并解压缩。
注意:提供的代码支持随机梯度下降(SGD),AdaDelta和RMSProp优化方法。SGD在此模型任务中表现并不佳。
论文
如果使用此处代码,请引用以下论文:
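Addition of the forget gate to the LSTM model:
Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural Computation, 12(10), 2451-2471.
More recent LSTM paper:
Graves, A. (2012). Supervised sequence labelling with recurrent neural networks (Vol. 385). Springer.
Papers related to Theano:
Bastien, F., Lamblin, P., Pascanu, R., Bergstra, J., Goodfellow, I., Bergeron, A., Bouchard, N., & Bengio, Y. (2012). Theano: new features and speed improvements. NIPS Workshop on Deep Learning and Unsupervised Feature Learning.
Bergstra, J., Breuleux, O., Bastien, F., Lamblin, P., Pascanu, R., Desjardins, G., Turian, J., Warde-Farley, D., & Bengio, Y. (2010). Theano: a CPU and GPU math expression compiler. Proceedings of the Python for Scientific Computing Conference (SciPy).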
如有问题请联系 Pierre Luc Carrier 或 Kyunghyun Cho。
参考