Language Modeling with Gated Convolutional Networks(句子建模之门控CNN)--Tensorflow实现篇

由于本篇论文所实现的任务十分耗时,我的小笔记本根本无法承受其计算量,而且它不像之前实现的模型都有明确的评价指标,所以并未亲自实现。在github上面找到了一个简化版的实现代码,该代码在数据处理、模型评价等方面做了简化。从下文的代码可以看出,下面列出的几项是该实现尚未完成、留待改进的工作(代码中仍然使用NCE loss,并且只在固定长度的句子上训练):

  • Replace NCE loss with Adaptive Softmax.
  • Remove restricted training on fixed sized sentences (20, for now) and extend to account for all varied sentence lengths.
  • Implement Weight Normalisation for faster convergence.
  • Train extensively on deeper models to match the results with the paper.

其使用的是Google 1 Billion Word dataset数据集,该训练集中包含100个文件,每个文件中包含大概30万个句子,每个句子包含大概20个单词。所以总共有30301028个句子,one billion个tokens,800k个单词。可以说是最大的语言建模数据集。该代码做了简化,仅选取句长为18的句子作为训练集,并将句子进行padding:

# Excerpt from read_words: keep only sentences with exactly
# conf.context_size-2 tokens, then surround them with marker tokens.
# NOTE(review): every marker is an empty string here; presumably distinct
# padding/start/end markers were intended -- confirm against the upstream code.
if len(tokens) == conf.context_size-2:
                    words.extend((['']*(conf.filter_h/2)) + [''] + tokens + [''])

数据处理

import numpy as np
import collections
import os

def read_words(conf):
    """Collect the padded tokens of every fixed-length sentence in ``conf.data_dir``.

    Every entry of ``conf.data_dir`` is opened as a text file and read line
    by line; each line is split on whitespace.  Only lines with exactly
    ``conf.context_size - 2`` tokens are kept.  Each kept sentence is emitted
    as ``conf.filter_h // 2`` padding markers, one start marker, the tokens,
    and one end marker.

    Args:
        conf: Configuration object providing ``data_dir``, ``context_size``
            and ``filter_h``.

    Returns:
        A flat list of strings holding all kept sentences back to back.
    """
    # NOTE(review): the padding, start and end markers are all the empty
    # string, so they cannot be told apart downstream; presumably distinct
    # markers were intended -- confirm against the upstream project.
    # Floor division is required: "/" yields a float on Python 3 and a list
    # cannot be repeated a fractional number of times.
    prefix = [''] * (conf.filter_h // 2) + ['']
    suffix = ['']
    sentence_len = conf.context_size - 2

    words = []
    for filename in os.listdir(conf.data_dir):
        with open(os.path.join(conf.data_dir, filename), 'r') as f:
            # Stream the file instead of loading every line with readlines().
            for line in f:
                tokens = line.split()
                # NOTE Currently, only sentences with a fixed size are chosen
                # to account for fixed convolutional layer size.
                if len(tokens) == sentence_len:
                    words.extend(prefix + tokens + suffix)
    return words

def index_words(words, conf):
    """Encode a token list as integer ids using a frequency-capped vocabulary.

    The ``conf.vocab_size - 1`` most frequent tokens receive the ids
    ``1, 2, ...`` in order of decreasing frequency.  Id 0 is initially
    reserved for the empty-string token, which also serves as the fallback
    for tokens outside the vocabulary.  If the empty string itself is among
    the most frequent tokens, its entry in ``word_to_idx`` is overwritten
    with its frequency rank and the fallback becomes that rank.

    Args:
        words: Sequence of token strings.
        conf: Configuration object providing ``vocab_size``.

    Returns:
        A tuple ``(data, word_to_idx, idx_to_word)``: the encoded tokens as a
        NumPy array plus the two lookup dictionaries.
    """
    fallback_token = ''
    word_to_idx = {fallback_token: 0}
    idx_to_word = {0: fallback_token}

    # Keeping only the most frequent tokens is a drastic vocabulary reduction.
    most_frequent = collections.Counter(words).most_common(conf.vocab_size - 1)
    for rank, (token, _count) in enumerate(most_frequent, start=1):
        word_to_idx[token] = rank
        idx_to_word[rank] = token

    # Turn tokens into ids for the embedding lookup; a missing token (None)
    # and id 0 both resolve to the fallback id.
    fallback_idx = word_to_idx[fallback_token]
    data = [word_to_idx.get(token) or fallback_idx for token in words]
    return np.array(data), word_to_idx, idx_to_word

def create_batches(data, conf):
    """Cut the encoded corpus into input/target batches.

    The targets are the inputs shifted left by one position (the last target
    wraps around to the first input).  Both arrays are reshaped to
    ``conf.batch_size`` rows, split column-wise into ``conf.num_batches``
    chunks of ``conf.context_size`` columns, and the last column of every
    chunk is dropped, so each batch has shape
    ``(conf.batch_size, conf.context_size - 1)``.

    Side effect: ``conf.num_batches`` is overwritten with the number of
    complete batches that fit into ``data``.

    Args:
        data: 1-D NumPy array of token ids.
        conf: Configuration object providing ``batch_size`` and
            ``context_size``.

    Returns:
        A tuple ``(x_batches, y_batches, conf)`` where the first two items
        are lists of 2-D arrays.

    Raises:
        ValueError: If ``data`` is too short to fill a single batch.
    """
    tokens_per_batch = conf.batch_size * conf.context_size
    conf.num_batches = len(data) // tokens_per_batch
    if conf.num_batches == 0:
        # Fail with a clear message instead of an opaque reshape error.
        raise ValueError(
            "Not enough data for a single batch: got %d tokens, need at least %d"
            % (len(data), tokens_per_batch))

    # Drop the trailing tokens that do not fill a complete batch.
    xdata = data[:conf.num_batches * tokens_per_batch]
    # Next-token targets: shift left by one, wrapping the first token around.
    ydata = np.roll(xdata, -1, axis=0)

    x_chunks = np.split(xdata.reshape(conf.batch_size, -1), conf.num_batches, 1)
    y_chunks = np.split(ydata.reshape(conf.batch_size, -1), conf.num_batches, 1)

    # Every chunk loses its last column.
    x_batches = [chunk[:, :-1] for chunk in x_chunks]
    y_batches = [chunk[:, :-1] for chunk in y_chunks]
    return x_batches, y_batches, conf

def get_batch(x_batches, y_batches, batch_idx):
    """Return the batch at ``batch_idx`` together with the next index to use.

    Args:
        x_batches: List of input batches.
        y_batches: List of target batches, parallel to ``x_batches``.
        batch_idx: Index of the batch to fetch.

    Returns:
        A tuple ``(x, y, next_idx)``: the input batch unchanged, the target
        batch reshaped into a single column, and the following index, which
        restarts at 0 once it reaches the end of ``x_batches``.
    """
    inputs = x_batches[batch_idx]
    targets = y_batches[batch_idx].reshape(-1, 1)
    following = batch_idx + 1
    next_idx = 0 if following >= len(x_batches) else following
    return inputs, targets, next_idx


def prepare_data(conf):
    """Run the preprocessing steps in order and return the training batches.

    Reads the corpus, encodes it as token ids and cuts it into batches.
    ``create_batches`` also stores the batch count in ``conf.num_batches``.

    Args:
        conf: Configuration object passed through to every step.

    Returns:
        A tuple ``(x_batches, y_batches)``.
    """
    corpus = read_words(conf)
    encoded, _word_to_idx, _idx_to_word = index_words(corpus, conf)
    # Drop the local reference to the token list once it has been encoded.
    del corpus

    x_batches, y_batches, conf = create_batches(encoded, conf)
    # Likewise drop the local reference to the flat id array.
    del encoded

    return x_batches, y_batches

模型构建

模型构建部分代码和相关注释如下所示:

import numpy as np
import tensorflow as tf

class GatedCNN(object):
    """Gated convolutional language model built with TensorFlow's graph API.

    Constructing an instance resets the default graph and builds the input
    placeholders, the embedding layer, a stack of gated convolution layers,
    an NCE loss, a momentum optimizer with element-wise gradient clipping,
    and the summary ops.

    Attributes:
        X: int32 placeholder of shape [batch_size, context_size - 1].
        y: target tensor; after construction this is the placeholder
            reshaped to [batch_size * (context_size - 1), 1].
        loss: scalar mean NCE loss.
        optimizer: op that applies the clipped gradients.
        perplexity: ``exp(loss)``.
        merged_summary_op: merged summary op for loss and perplexity.
    """

    def __init__(self, conf):
        """Build the whole training graph.

        Args:
            conf: Configuration object providing batch_size, context_size,
                embedding_size, vocab_size, num_layers, filter_size,
                filter_h, filter_w, block_size, num_sampled, learning_rate,
                momentum and grad_clip.
        """
        tf.reset_default_graph()
        # Placeholders for the input token ids X and the target token ids y.
        self.X = tf.placeholder(shape=[conf.batch_size, conf.context_size-1], dtype=tf.int32, name="X")
        self.y = tf.placeholder(shape=[conf.batch_size, conf.context_size-1], dtype=tf.int32, name="y")

        # Embedding layer: token ids -> vectors, with a trailing channel axis,
        # shape=[conf.batch_size, conf.context_size-1, embed_size, 1]
        embed = self.create_embeddings(self.X, conf)
        h, res_input = embed, embed

        # Stack num_layers gated convolution layers.
        for i in range(conf.num_layers):
            fanin_depth = h.get_shape()[-1]
            # Every layer outputs filter_size channels except the last one,
            # which outputs a single channel so that the result can be
            # reshaped to rows of embedding_size values below.
            filter_size = conf.filter_size if i < conf.num_layers-1 else 1
            shape = (conf.filter_h, conf.filter_w, fanin_depth, filter_size)

            with tf.variable_scope("layer_%d"%i):
                # Two convolutions over the same input: linear path and gate.
                conv_w = self.conv_op(h, shape, "linear")
                conv_v = self.conv_op(h, shape, "gated")
                # Gated output: linear path scaled by the sigmoid of the gate.
                h = conv_w * tf.sigmoid(conv_v)
                # Residual connection on every block_size-th layer
                # (i = 0, block_size, 2 * block_size, ...): add the saved
                # block input and make the sum the next block's input.
                if i % conf.block_size == 0:
                    h += res_input
                    res_input = h
        # Flatten the model output to one row per position and the targets
        # to a single column. From here on self.y names the reshaped tensor.
        h = tf.reshape(h, (-1, conf.embedding_size))
        y_shape = self.y.get_shape().as_list()
        self.y = tf.reshape(self.y, (y_shape[0] * y_shape[1], 1))

        softmax_w = tf.get_variable("softmax_w", [conf.vocab_size, conf.embedding_size], tf.float32, 
                                    tf.random_normal_initializer(0.0, 0.1))
        softmax_b = tf.get_variable("softmax_b", [conf.vocab_size], tf.float32, tf.constant_initializer(1.0))

        # Preference: NCE loss, hierarchical softmax, adaptive softmax.
        # inputs/labels are passed by keyword because their positional order
        # differs between TensorFlow releases.
        self.loss = tf.reduce_mean(tf.nn.nce_loss(softmax_w, softmax_b, inputs=h, labels=self.y,
                                                  num_sampled=conf.num_sampled, num_classes=conf.vocab_size))
        # Train with the momentum optimizer.
        trainer = tf.train.MomentumOptimizer(conf.learning_rate, conf.momentum)
        gradients = trainer.compute_gradients(self.loss)
        # Clip every gradient element-wise to [-grad_clip, grad_clip].
        # compute_gradients may report None for a variable that has no
        # gradient; such entries are passed through unclipped.
        clipped_gradients = [
            (grad if grad is None else tf.clip_by_value(grad, -conf.grad_clip, conf.grad_clip), var)
            for grad, var in gradients
        ]
        self.optimizer = trainer.apply_gradients(clipped_gradients)
        # NOTE(review): this is exp() of the sampled NCE loss, not of a
        # full-softmax cross-entropy -- confirm how it should be interpreted.
        self.perplexity = tf.exp(self.loss)

        # Record loss and perplexity so training can be followed in TensorBoard.
        self.create_summaries()

    def create_embeddings(self, X, conf):
        """Look up randomly initialised embeddings for the token ids in X.

        Args:
            X: integer tensor of token ids with shape
                [batch_size, context_size - 1].
            conf: Configuration object providing vocab_size, embedding_size,
                batch_size, context_size and filter_h.

        Returns:
            Tensor of shape
            [batch_size, context_size - 1, embedding_size, 1].
        """
        # The embedding table is initialised uniformly at random in [-1, 1].
        embeddings = tf.get_variable("embeds",(conf.vocab_size, conf.embedding_size), tf.float32, tf.random_uniform_initializer(-1.0,1.0))
        embed = tf.nn.embedding_lookup(embeddings, X)
        # Zero the embeddings of the first filter_h // 2 positions of every
        # row (presumably the padding positions added during preprocessing).
        # Floor division is required: a slice bound must be an integer.
        mask_layer = np.ones((conf.batch_size, conf.context_size-1, conf.embedding_size))
        mask_layer[:,0:conf.filter_h//2,:] = 0
        embed *= mask_layer

        # Append a channel axis of size 1 for the 2-D convolutions.
        embed_shape = embed.get_shape().as_list()
        embed = tf.reshape(embed, (embed_shape[0], embed_shape[1], embed_shape[2], 1))
        return embed


    def conv_op(self, fan_in, shape, name):
        """Apply one biased 2-D convolution with freshly created variables.

        Args:
            fan_in: 4-D input tensor.
            shape: filter shape (height, width, in_channels, out_channels).
            name: prefix for the weight ("<name>_W") and bias ("<name>_b")
                variables created in the current variable scope.

        Returns:
            The convolution output plus bias.
        """
        W = tf.get_variable("%s_W"%name, shape, tf.float32, tf.random_normal_initializer(0.0, 0.1))
        b = tf.get_variable("%s_b"%name, shape[-1], tf.float32, tf.constant_initializer(1.0))
        # Stride 1 with 'SAME' padding keeps the spatial size of the input.
        # NOTE(review): 'SAME' pads on both sides, so each output position
        # also covers later input positions -- confirm this is intended for
        # next-token prediction.
        return tf.add(tf.nn.conv2d(fan_in, W, strides=[1,1,1,1], padding='SAME'), b)

    def create_summaries(self):
        """Register scalar summaries for loss and perplexity and merge them."""
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("perplexity", self.perplexity)
        self.merged_summary_op = tf.summary.merge_all()

这些代码中比较值得学习和注意的是下面两点:一是将每5层卷积组成一个block(残差连接),二是对梯度进行clip截断:

# Group every 5 Gated CNN layers (conf.block_size) into one block: on those
# layers the saved block input is added and the sum becomes the next input.
if i % conf.block_size == 0:
    h += res_input
    res_input = h

# Compute the gradients and clip them.
gradients = trainer.compute_gradients(self.loss)
# Clip every gradient element-wise to [-conf.grad_clip, conf.grad_clip].
clipped_gradients = [(tf.clip_by_value(_[0], -conf.grad_clip, conf.grad_clip), _[1]) for _ in gradients]
self.optimizer = trainer.apply_gradients(clipped_gradients)

模型训练

from model import *
from data_utils import *
from conf_utils import *
# Model and training hyper-parameters, exposed as command-line flags.
flags = tf.app.flags
flags.DEFINE_integer("vocab_size", 2000, "Maximum size of vocabulary")
flags.DEFINE_integer("embedding_size", 200, "Embedding size of each token")
flags.DEFINE_integer("filter_size", 64, "Depth of each CNN layer")
flags.DEFINE_integer("num_layers", 10, "Number of CNN layers")
flags.DEFINE_integer("block_size", 5, "Size of each residual block")
flags.DEFINE_integer("filter_h", 5, "Height of the CNN filter")
flags.DEFINE_integer("context_size", 20, "Length of sentence/context")
flags.DEFINE_integer("batch_size", 64, "Batch size of data while training")
flags.DEFINE_integer("epochs", 50, "Number of epochs")
flags.DEFINE_integer("num_sampled", 1, "Sampling value for NCE loss")
# These three are real-valued, so they are declared as float flags; an
# integer flag would reject or truncate values such as 0.99.
flags.DEFINE_float("learning_rate", 1.0, "Learning rate for training")
flags.DEFINE_float("momentum", 0.99, "Nestrov Momentum value")
flags.DEFINE_float("grad_clip", 0.1, "Gradient Clipping limit")
flags.DEFINE_integer("num_batches", 0, "Predefined: to be calculated")
flags.DEFINE_string("ckpt_path", "ckpt", "Path to store checkpoints")
flags.DEFINE_string("summary_path", "logs", "Path to store summaries")
flags.DEFINE_string("data_dir", "data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled", "Path to store data")


def main(_):
    """Train the gated CNN language model.

    Builds the configuration from the command-line flags, prepares the
    batches, builds the model and runs ``conf.epochs`` training epochs.
    After every epoch the time and last batch loss are printed and a summary
    is written; on every 10th epoch (starting with epoch 0) the perplexity of
    the last batch is printed and a checkpoint is saved.

    Args:
        _: Unused positional argument supplied by the caller.
    """
    # Imported locally so the function does not depend on these modules
    # leaking in through the star-imports at the top of the script.
    import os
    import time

    conf = prepare_conf(flags.FLAGS)
    # Load and batch the data.
    x_batches, y_batches = prepare_data(conf)
    # Build the model graph.
    model = GatedCNN(conf)

    saver = tf.train.Saver(tf.trainable_variables())
    print("Started Model Training...")

    batch_idx = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        summary_writer = tf.summary.FileWriter(conf.summary_path, graph=sess.graph)

        try:
            # NOTE(review): this only detects a file literally named
            # conf.ckpt_file; a saver that writes the checkpoint as several
            # suffixed files would never trigger the restore -- confirm.
            if os.path.exists(conf.ckpt_file):
                saver.restore(sess, conf.ckpt_file)
                print("Model Restored")

            for epoch in range(conf.epochs):
                start = time.time()
                for _step in range(conf.num_batches):
                    inputs, labels, batch_idx = get_batch(x_batches, y_batches, batch_idx)
                    _, loss = sess.run([model.optimizer, model.loss], feed_dict={model.X:inputs, model.y:labels})
                end = time.time()
                print("Epoch: %.2f, Time: %.2f,  Loss: %.2f"%(epoch, end-start, loss))

                if epoch % 10 == 0:
                    # Perplexity is evaluated on the last batch of the epoch.
                    perp = sess.run(model.perplexity, feed_dict={model.X:inputs, model.y:labels})
                    print("Perplexity: %.2f"%perp)
                    saver.save(sess, conf.ckpt_file)

                summaries = sess.run(model.merged_summary_op, feed_dict={model.X:inputs, model.y:labels})
                summary_writer.add_summary(summaries, epoch)
        finally:
            # Flush pending summaries and release the event file.
            summary_writer.close()




# Script entry point: tf.app.run() parses the command-line flags and then
# calls main().
if __name__ == '__main__':
    tf.app.run()

你可能感兴趣的:(卷积神经网络-CNN,nlp,深度学习,TensorFlow)