由于本篇论文所实现的任务十分耗时,我的小笔记本根本无法承受其计算量,而且它不像之前实现的模型那样有明确的评价指标,所以并未亲自实现。我在GitHub上找到了一个简化版的实现代码,该代码在数据处理、模型评价等方面做了如下简化:
其使用的是Google 1 Billion Word数据集,该训练集中包含100个文件,每个文件中包含大概30万个句子,每个句子包含大概20个单词。所以总共有30301028个句子,one billion个tokens,800k个单词。可以说是最大的语言建模数据集。该代码做了简化,仅选取句长为18(即context_size-2)的句子作为训练集,并对句子进行padding:
# Excerpt from read_words: keep only sentences with exactly
# conf.context_size-2 tokens and append them, with padding, to `words`.
# NOTE(review): the three empty-string literals below look like distinct
# sentinel tokens (padding / sentence start / sentence end) whose
# angle-bracket text was lost when the article was extracted -- confirm
# against the original code.
if len(tokens) == conf.context_size-2:
words.extend((['' ]*(conf.filter_h/2)) + [''] + tokens + [''])
import numpy as np
import collections
import os
def read_words(conf):
    """Read the training corpus and return it as one flat list of tokens.

    Every regular file in ``conf.data_dir`` is read line by line and each
    line is split on whitespace.  Only lines with exactly
    ``conf.context_size - 2`` tokens are kept; each kept sentence is emitted
    as ``filter_h // 2`` padding tokens, a start marker, the sentence tokens
    and an end marker.

    NOTE(review): the pasted source had empty strings where the padding /
    start / end markers go, apparently because angle-bracket text was
    stripped during extraction.  They are restored here as the distinct
    tokens ``<pad>``, ``<s>`` and ``</s>`` -- confirm against the original.

    Args:
        conf: configuration object providing ``data_dir``, ``context_size``
            and ``filter_h``.

    Returns:
        A list of string tokens covering all selected sentences.
    """
    sentence_len = conf.context_size - 2
    # Floor division keeps this an int on both Python 2 and Python 3
    # (``filter_h / 2`` is a float on Python 3 and cannot multiply a list).
    prefix = ['<pad>'] * (conf.filter_h // 2) + ['<s>']
    suffix = ['</s>']

    words = []
    # Sorted so that the resulting token order is reproducible across runs.
    for file_name in sorted(os.listdir(conf.data_dir)):
        path = os.path.join(conf.data_dir, file_name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(path, 'r') as f:
            # Iterate lazily instead of loading the whole file at once.
            for line in f:
                tokens = line.split()
                # NOTE: only sentences with a fixed size are chosen, to
                # account for the fixed convolutional layer size.
                if len(tokens) == sentence_len:
                    words.extend(prefix + tokens + suffix)
    return words
def index_words(words, conf):
    """Build a vocabulary from ``words`` and convert the tokens to indices.

    The ``conf.vocab_size - 1`` most frequent tokens receive the indices
    ``1 .. vocab_size - 1`` in order of decreasing frequency; index 0 is
    reserved for the unknown-word token, and every token outside the
    vocabulary is mapped to it.

    NOTE(review): the pasted source used an empty string as the unknown-word
    token, apparently because angle-bracket text was stripped during
    extraction.  It is restored here as ``<unk>`` -- confirm against the
    original.

    Args:
        words: list of string tokens.
        conf: configuration object providing ``vocab_size``.

    Returns:
        A tuple ``(data, word_to_idx, idx_to_word)`` where ``data`` is a
        NumPy array holding the index of every token in ``words`` and the
        two dicts map tokens to indices and back.
    """
    unk_token, unk_idx = '<unk>', 0
    word_to_idx = {unk_token: unk_idx}
    idx_to_word = {unk_idx: unk_token}

    most_common = collections.Counter(words).most_common(conf.vocab_size - 1)
    for idx, (word, _count) in enumerate(most_common, start=1):
        word_to_idx[word] = idx
        idx_to_word[idx] = word

    # Convert every token to its index so the embedding layer can consume
    # it; out-of-vocabulary tokens fall back to the unknown-word index.
    data = np.array([word_to_idx.get(word, unk_idx) for word in words])
    return data, word_to_idx, idx_to_word
def create_batches(data, conf):
    """Split the indexed corpus into input / target batches.

    The targets are the inputs shifted left by one position (the last target
    wraps around to the first input).  ``conf.num_batches`` is set as a side
    effect to the number of complete batches that fit into ``data``.

    Args:
        data: 1-D NumPy array of token indices.
        conf: configuration object providing ``batch_size`` and
            ``context_size``; its ``num_batches`` attribute is overwritten.

    Returns:
        A tuple ``(x_batches, y_batches, conf)``.  Both batch lists contain
        ``conf.num_batches`` arrays of shape
        ``(batch_size, context_size - 1)``.

    Raises:
        ValueError: if ``data`` holds fewer than
            ``batch_size * context_size`` tokens, i.e. not even one batch.
    """
    tokens_per_batch = conf.batch_size * conf.context_size
    conf.num_batches = len(data) // tokens_per_batch
    if conf.num_batches == 0:
        raise ValueError(
            "Not enough data for a single batch: got %d tokens, need at "
            "least %d" % (len(data), tokens_per_batch))

    # Drop the trailing tokens that do not fill a complete batch.
    xdata = data[:conf.num_batches * tokens_per_batch]
    # Next-token targets: shift left by one, wrapping the first token round
    # to the end.
    ydata = np.roll(xdata, -1)

    x_batches = np.split(xdata.reshape(conf.batch_size, -1), conf.num_batches, 1)
    y_batches = np.split(ydata.reshape(conf.batch_size, -1), conf.num_batches, 1)

    # Drop the last column of every batch so each has context_size - 1
    # positions.
    x_batches = [batch[:, :-1] for batch in x_batches]
    y_batches = [batch[:, :-1] for batch in y_batches]
    return x_batches, y_batches, conf
def get_batch(x_batches, y_batches, batch_idx):
    """Fetch one batch and the index of the batch that follows it.

    Args:
        x_batches: sequence of input batches.
        y_batches: sequence of target batches, parallel to ``x_batches``.
        batch_idx: index of the batch to fetch.

    Returns:
        A tuple ``(x, y, next_idx)``: the input batch, the target batch
        reshaped into a single column, and the index to pass on the next
        call (reset to 0 once the end of ``x_batches`` is reached).
    """
    inputs = x_batches[batch_idx]
    targets = y_batches[batch_idx]
    following = batch_idx + 1
    # Start over from the first batch after the last one has been served.
    next_idx = following if following < len(x_batches) else 0
    return inputs, targets.reshape(-1, 1), next_idx
def prepare_data(conf):
    """Run the preprocessing steps in order and return the batched data.

    Reads the corpus, indexes it against the vocabulary and cuts it into
    batches.  ``create_batches`` is handed ``conf`` and returns it again.

    Args:
        conf: configuration object forwarded to ``read_words``,
            ``index_words`` and ``create_batches``.

    Returns:
        A tuple ``(x_batches, y_batches)``.
    """
    corpus = read_words(conf)
    indexed, word_to_idx, idx_to_word = index_words(corpus, conf)
    x_batches, y_batches, conf = create_batches(indexed, conf)

    # Drop the local references to the intermediate token list and index
    # array to reduce memory usage.
    del corpus, indexed
    return x_batches, y_batches
模型构建部分代码和相关注释如下所示:
import numpy as np
import tensorflow as tf
class GatedCNN(object):
    """Gated convolutional language model built as a TensorFlow graph.

    Constructing an instance resets the default graph and builds
    placeholders, an embedding layer, ``conf.num_layers`` gated convolution
    layers with residual connections, an NCE loss, a momentum optimizer with
    element-wise gradient clipping, and summary ops.

    Attributes set by ``__init__``:
        X: int32 placeholder of shape ``[batch_size, context_size - 1]``.
        y: the target placeholder reshaped to ``[batch_size *
            (context_size - 1), 1]`` (the attribute is rebound to the
            reshaped tensor).
        loss: mean NCE loss.
        optimizer: op applying the clipped gradients.
        perplexity: ``exp(loss)``.
        merged_summary_op: merged summaries (loss and perplexity).
    """

    def __init__(self, conf):
        """Build the whole graph from the settings in ``conf``."""
        tf.reset_default_graph()

        # Placeholders for the input token ids and the target token ids.
        self.X = tf.placeholder(shape=[conf.batch_size, conf.context_size-1], dtype=tf.int32, name="X")
        self.y = tf.placeholder(shape=[conf.batch_size, conf.context_size-1], dtype=tf.int32, name="y")

        # Embedding layer: token ids -> word vectors, shape
        # [batch_size, context_size-1, embedding_size, 1].
        embed = self.create_embeddings(self.X, conf)
        h, res_input = embed, embed

        # Stack num_layers gated convolution layers.
        for i in range(conf.num_layers):
            fanin_depth = h.get_shape()[-1]
            # The last layer has a single output channel so that its output
            # can be reshaped to [-1, embedding_size] below.
            filter_size = conf.filter_size if i < conf.num_layers-1 else 1
            shape = (conf.filter_h, conf.filter_w, fanin_depth, filter_size)

            with tf.variable_scope("layer_%d"%i):
                # Two parallel convolutions: the linear path and the gate.
                conv_w = self.conv_op(h, shape, "linear")
                conv_v = self.conv_op(h, shape, "gated")
                # Gated output: linear path scaled by the sigmoid gate.
                h = conv_w * tf.sigmoid(conv_v)
                # Residual connection on every layer index that is a
                # multiple of block_size.
                if i % conf.block_size == 0:
                    h += res_input
                    res_input = h

        # Flatten the model output and the targets to one row per position.
        h = tf.reshape(h, (-1, conf.embedding_size))
        y_shape = self.y.get_shape().as_list()
        self.y = tf.reshape(self.y, (y_shape[0] * y_shape[1], 1))

        softmax_w = tf.get_variable("softmax_w", [conf.vocab_size, conf.embedding_size], tf.float32,
                                    tf.random_normal_initializer(0.0, 0.1))
        softmax_b = tf.get_variable("softmax_b", [conf.vocab_size], tf.float32, tf.constant_initializer(1.0))

        # Preference: NCE loss, hierarchical softmax, adaptive softmax.
        # Keyword arguments are used on purpose: the positional order of
        # `inputs` and `labels` was swapped in TensorFlow 1.0, so a
        # positional call silently feeds them the wrong way round on one
        # side of that change.
        self.loss = tf.reduce_mean(tf.nn.nce_loss(
            weights=softmax_w,
            biases=softmax_b,
            labels=self.y,
            inputs=h,
            num_sampled=conf.num_sampled,
            num_classes=conf.vocab_size))

        # Train with the momentum optimizer.
        trainer = tf.train.MomentumOptimizer(conf.learning_rate, conf.momentum)
        gradients = trainer.compute_gradients(self.loss)
        # Clip every gradient element-wise to [-grad_clip, grad_clip];
        # variables without a gradient (None) are skipped because
        # clip_by_value cannot handle them.
        clipped_gradients = [
            (tf.clip_by_value(grad, -conf.grad_clip, conf.grad_clip), var)
            for grad, var in gradients
            if grad is not None
        ]
        self.optimizer = trainer.apply_gradients(clipped_gradients)
        self.perplexity = tf.exp(self.loss)

        # Record loss and perplexity so training can be followed in
        # TensorBoard.
        self.create_summaries()

    def create_embeddings(self, X, conf):
        """Look up randomly initialised embeddings for the token ids in ``X``.

        The embeddings of the first ``conf.filter_h // 2`` positions of every
        sequence are zeroed out (presumably the padding positions -- confirm
        against the data pipeline).

        Returns:
            A tensor of shape
            ``[batch_size, context_size - 1, embedding_size, 1]``.
        """
        # Word vectors are randomly initialised here.
        embeddings = tf.get_variable("embeds", (conf.vocab_size, conf.embedding_size), tf.float32,
                                     tf.random_uniform_initializer(-1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, X)

        mask_layer = np.ones((conf.batch_size, conf.context_size-1, conf.embedding_size))
        # Floor division: a float slice bound is a TypeError on Python 3.
        mask_layer[:, 0:conf.filter_h // 2, :] = 0
        embed *= mask_layer

        # Add a trailing channel dimension for conv2d.
        embed_shape = embed.get_shape().as_list()
        embed = tf.reshape(embed, (embed_shape[0], embed_shape[1], embed_shape[2], 1))
        return embed

    def conv_op(self, fan_in, shape, name):
        """Apply a stride-1, 'SAME'-padded 2-D convolution plus bias.

        Args:
            fan_in: input tensor.
            shape: filter shape; its last entry is the number of output
                channels and also the bias size.
            name: prefix for the weight and bias variable names.
        """
        W = tf.get_variable("%s_W"%name, shape, tf.float32, tf.random_normal_initializer(0.0, 0.1))
        b = tf.get_variable("%s_b"%name, shape[-1], tf.float32, tf.constant_initializer(1.0))
        # 'SAME' padding keeps the spatial dimensions unchanged.
        return tf.add(tf.nn.conv2d(fan_in, W, strides=[1, 1, 1, 1], padding='SAME'), b)

    def create_summaries(self):
        """Register scalar summaries for loss and perplexity and merge them."""
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("perplexity", self.perplexity)
        self.merged_summary_op = tf.summary.merge_all()
这些代码中比较值得学习和注意的有下面两点:一是每隔block_size(默认为5)层加入一次残差连接,把若干层Gated CNN组成一个block;二是对梯度进行截断(clip):
# Excerpt from GatedCNN.__init__: group the gated CNN layers into residual
# blocks. On every layer index i that is a multiple of conf.block_size, the
# saved block input is added to h and the sum becomes the next block's input.
if i % conf.block_size == 0:
h += res_input
res_input = h
# Excerpt from GatedCNN.__init__: compute the gradients explicitly so they
# can be clipped before they are applied.
gradients = trainer.compute_gradients(self.loss)
# Clip every gradient element-wise to [-conf.grad_clip, conf.grad_clip].
clipped_gradients = [(tf.clip_by_value(_[0], -conf.grad_clip, conf.grad_clip), _[1]) for _ in gradients]
self.optimizer = trainer.apply_gradients(clipped_gradients)
from model import *
from data_utils import *
from conf_utils import *
# Model and training hyper-parameters, exposed as command-line flags.
flags = tf.app.flags
flags.DEFINE_integer("vocab_size", 2000, "Maximum size of vocabulary")
flags.DEFINE_integer("embedding_size", 200, "Embedding size of each token")
flags.DEFINE_integer("filter_size", 64, "Depth of each CNN layer")
flags.DEFINE_integer("num_layers", 10, "Number of CNN layers")
flags.DEFINE_integer("block_size", 5, "Size of each residual block")
flags.DEFINE_integer("filter_h", 5, "Height of the CNN filter")
flags.DEFINE_integer("context_size", 20, "Length of sentence/context")
flags.DEFINE_integer("batch_size", 64, "Batch size of data while training")
flags.DEFINE_integer("epochs", 50, "Number of epochs")
flags.DEFINE_integer("num_sampled", 1, "Sampling value for NCE loss")
# These three have fractional defaults, so they must be float flags: as
# integer flags the defaults are rejected or command-line overrides such as
# 0.5 cannot be parsed.
flags.DEFINE_float("learning_rate", 1.0, "Learning rate for training")
flags.DEFINE_float("momentum", 0.99, "Nestrov Momentum value")
flags.DEFINE_float("grad_clip", 0.1, "Gradient Clipping limit")
flags.DEFINE_integer("num_batches", 0, "Predefined: to be calculated")
flags.DEFINE_string("ckpt_path", "ckpt", "Path to store checkpoints")
flags.DEFINE_string("summary_path", "logs", "Path to store summaries")
flags.DEFINE_string("data_dir", "data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled", "Path to store data")
def main(_):
    """Train the gated CNN language model.

    Builds the configuration from the command-line flags, prepares the
    batched data, constructs the model and runs ``conf.epochs`` training
    epochs.  Loss and time are printed after every epoch; every 10th epoch
    the perplexity is printed and a checkpoint is saved.  Summaries are
    written after every epoch.

    Args:
        _: unused positional argument passed in by ``tf.app.run``.
    """
    # Imported locally so this function does not depend on `os` and `time`
    # leaking in through the star imports at the top of the file.
    import os
    import time

    conf = prepare_conf(flags.FLAGS)
    # Load the data.
    x_batches, y_batches = prepare_data(conf)
    # Build the model.
    model = GatedCNN(conf)
    saver = tf.train.Saver(tf.trainable_variables())
    print("Started Model Training...")

    batch_idx = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        summary_writer = tf.summary.FileWriter(conf.summary_path, graph=sess.graph)

        # The V2 checkpoint format writes "<prefix>.index" and
        # "<prefix>.data-*" rather than a file named exactly "<prefix>", so
        # both forms are checked.
        # NOTE(review): assumes conf.ckpt_file is the checkpoint prefix set
        # by prepare_conf -- confirm.
        if os.path.exists(conf.ckpt_file) or os.path.exists(conf.ckpt_file + ".index"):
            saver.restore(sess, conf.ckpt_file)
            print("Model Restored")

        for i in range(conf.epochs):
            start = time.time()
            for j in range(conf.num_batches):
                inputs, labels, batch_idx = get_batch(x_batches, y_batches, batch_idx)
                _, loss = sess.run([model.optimizer, model.loss],
                                   feed_dict={model.X: inputs, model.y: labels})
            end = time.time()
            # The reported loss is that of the last batch of the epoch.
            print("Epoch: %.2f, Time: %.2f, Loss: %.2f" % (i, end - start, loss))

            if i % 10 == 0:
                perp = sess.run(model.perplexity, feed_dict={model.X: inputs, model.y: labels})
                print("Perplexity: %.2f" % perp)
                saver.save(sess, conf.ckpt_file)

            summaries = sess.run(model.merged_summary_op, feed_dict={model.X: inputs, model.y: labels})
            summary_writer.add_summary(summaries, i)

        # Flush pending summaries to disk and release the event file.
        summary_writer.close()


if __name__ == '__main__':
    tf.app.run()