Quick Word2vec Implementation with TuneLayer (TensorLayer) and TensorFlow

Every training procedure can be abstracted as a cost. TensorLayer wraps the word2vec cost in a Layer class, which greatly simplifies the code.

The cost produced by Word2vecEmbeddingInputlayer is the NCE loss of the skip-gram model, determined by the number of negative samples, vocabulary size and embedding size you pass in.

In addition, parameter initialization can be customized with the initializers provided by TensorFlow. The core code is as follows.

# train_inputs is a row vector; each input is the integer id of a single word.
# train_labels is a column vector; each label is the integer id of a single word.
# valid_dataset is a column vector; each entry is the integer id of a single word.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Look up embeddings for inputs.
emb_net = tl.layers.Word2vecEmbeddingInputlayer(
        inputs = train_inputs,
        train_labels = train_labels,
        vocabulary_size = vocabulary_size,
        embedding_size = embedding_size,
        num_sampled = num_sampled,
        nce_loss_args = {},
        E_init = tf.random_uniform_initializer(minval=-1.0, maxval=1.0),
        E_init_args = {},
        nce_W_init = tf.truncated_normal_initializer(
                          stddev=float(1.0/np.sqrt(embedding_size))),
        nce_W_init_args = {},
        nce_b_init = tf.constant_initializer(value=0.0),
        nce_b_init_args = {},
        name ='word2vec_layer',
    )
cost = emb_net.nce_cost
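
For intuition, here is a minimal sketch of what the layer roughly wraps internally (an approximation for illustration, not TensorLayer's actual source): an embedding lookup on the input word ids followed by tf.nn.nce_loss, assuming the TensorFlow 1.x keyword-argument API.

# Hypothetical sketch of the layer's internals (an assumption, not library code).
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)   # [batch_size, embedding_size]
nce_cost = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                   labels=train_labels, inputs=embed,
                   num_sampled=num_sampled, num_classes=vocabulary_size))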


The complete code can be found in the TensorLayer word2vec tutorial; see:

[1]http://tensorlayer.readthedocs.io/en/latest/user/tutorial.html#run-the-word2vec-example

[2]https://github.com/zsdonghao/tensorlayer



import collections
import math
import os
import random
# import zipfile


import numpy as np
# from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import tensorlayer as tl
import time


flags = tf.flags
flags.DEFINE_string("model", "one", "A type of model.")
FLAGS = flags.FLAGS




def main_word2vec_basic():


    """ Step 1: Download the data, read the context into a list of strings.
    Set hyperparameters.
    """


    words = tl.files.load_matt_mahoney_text8_dataset()
    data_size = len(words)
    print('Data size', data_size) # print(words)    # b'their', b'families', b'who', b'were', b'expelled', b'from', b'jerusalem',


    resume = False  # load existing model, data and dictionaries


    if FLAGS.model == "one":
        # toy setting (tensorflow/examples/tutorials/word2vec/word2vec_basic.py)
        vocabulary_size = 50000 # maximum number of words in the vocabulary
        batch_size = 128
        embedding_size = 128  # Dimension of the embedding vector (hidden layer).
        skip_window = 1       # How many words to consider left and right.
        num_skips = 2         # How many times to reuse an input to generate a label.
                              #     (should be double 'skip_window' so that both
                              #     the left and right words are used)
        num_sampled = 64      # Number of negative examples to sample.
                              #     more negative samples, higher loss
        learning_rate = 1.0
        n_epoch = 20
        model_file_name = "model_word2vec_50k_128"
        # Eval 2084/15851 accuracy = 15.7%
    if FLAGS.model == "two":
        # better (tensorflow/models/embedding/word2vec.py)
        vocabulary_size = 80000
        batch_size = 20     # Note: a small batch_size needs more steps per epoch
        embedding_size = 200
        skip_window = 5
        num_skips = 10
        num_sampled = 100
        learning_rate = 0.2
        n_epoch = 15
        model_file_name = "model_word2vec_80k_200"
        # 7.9%
    if FLAGS.model == "three":
        # better (tensorflow/models/embedding/word2vec_optimized.py)
        vocabulary_size = 80000
        batch_size = 500
        embedding_size = 200
        skip_window = 5
        num_skips = 10
        num_sampled = 25
        learning_rate = 0.025
        n_epoch = 20
        model_file_name = "model_word2vec_80k_200_opt"
        # bad 0%
    if FLAGS.model == "four":
        # see: Learning word embeddings efficiently with noise-contrastive estimation
        vocabulary_size = 80000
        batch_size = 100
        embedding_size = 600
        skip_window = 5
        num_skips = 10
        num_sampled = 25
        learning_rate = 0.03
        n_epoch = 200 * 10
        model_file_name = "model_word2vec_80k_600"
        # poor results
    if FLAGS.model == "five":
        vocabulary_size = 50000
        batch_size = 128
        embedding_size = 200
        skip_window = 1
        num_skips = 2
        num_sampled = 64
        learning_rate = 0.001
        n_epoch = 20
        model_file_name = "model_word2vec_50k_200"


    num_steps = int((data_size/batch_size) * n_epoch)   # total number of iterations
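    # For example, assuming the text8 corpus (about 17 million words) and model "one"
    # (batch_size=128, n_epoch=20): 17,005,207 / 128 is about 132,853 steps per epoch,
    # i.e. roughly 2.7 million steps in total.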


    print('%d steps per epoch, %d epochs in total' % (int(data_size/batch_size), n_epoch))
    print('   learning_rate: %f' % learning_rate)
    print('   batch_size: %d' % batch_size)




    """ Step 2: Build the dictionary and replace rare words with 'UNK' token.
    """
    print()
    if resume:
        print("Load existing data and dictionaries" + "!"*10)
        all_var = tl.files.load_npy_to_any(name=model_file_name+'.npy')
        data = all_var['data']; count = all_var['count']
        dictionary = all_var['dictionary']
        reverse_dictionary = all_var['reverse_dictionary']
    else:
        data, count, dictionary, reverse_dictionary = \
                    tl.files.build_words_dataset(words, vocabulary_size, True)
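
    # What tl.files.build_words_dataset roughly does -- a sketch following the
    # standard word2vec_basic recipe (an assumption for illustration, not the
    # library's actual source): keep the (vocabulary_size - 1) most frequent
    # words, map every other word to 'UNK', and encode the corpus as integer ids.
    def _build_vocab_sketch(words, vocabulary_size):
        count = [['UNK', -1]] + collections.Counter(words).most_common(vocabulary_size - 1)
        dictionary = {w: i for i, (w, _) in enumerate(count)}
        data = [dictionary.get(w, 0) for w in words]       # id 0 stands for 'UNK'
        count[0][1] = data.count(0)                        # how many tokens became 'UNK'
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reverse_dictionary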


    print('5 most common words (+UNK)', count[:5]) # [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) # [5243, 3081, 12, 6, 195, 2, 3135, 46, 59, 156] [b'anarchism', b'originated', b'as', b'a', b'term', b'of', b'abuse', b'first', b'used', b'against']


    del words  # Hint to reduce memory.


    """ Step 3: Function to generate a training batch for the Skip-Gram model.
    """
    print()
    data_index = 0
    batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data,
                        batch_size=20, num_skips=4, skip_window=2, data_index=0)
    for i in range(20):
        print(batch[i], reverse_dictionary[batch[i]],
            '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
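
    # The idea behind tl.nlp.generate_skip_gram_batch, as a rough pure-Python
    # sketch (a hypothetical helper for illustration, not the library's code):
    # each centre word is paired with 'num_skips' context words drawn from a
    # window of 'skip_window' words on either side.
    def _skip_gram_pairs_sketch(seq, skip_window, num_skips):
        pairs = []
        for i in range(skip_window, len(seq) - skip_window):
            context = list(seq[i - skip_window:i]) + list(seq[i + 1:i + skip_window + 1])
            for c in random.sample(context, min(num_skips, len(context))):
                pairs.append((seq[i], c))   # (input word id, label word id)
        return pairs
    # e.g. _skip_gram_pairs_sketch(data[:12], skip_window=2, num_skips=2)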




    """ Step 4: Build a Skip-Gram model.
    """
    print()
    # We pick a random validation set to sample nearest neighbors. Here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.
    valid_size = 16     # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
        # a list of 'valid_size' integers smaller than 'valid_window'
        # print(valid_examples)   # [90 85 20 33 35 62 37 63 88 38 82 58 83 59 48 64]
    print_freq = 2000
        # n_epoch = int(num_steps / batch_size)


    # train_inputs is a row vector; each input is the integer id of a single word.
    # train_labels is a column vector; each label is the integer id of a single word.
    # valid_dataset is a column vector; each entry is the integer id of a single word.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)


    # Look up embeddings for inputs.
    emb_net = tl.layers.Word2vecEmbeddingInputlayer(
            inputs = train_inputs,
            train_labels = train_labels,
            vocabulary_size = vocabulary_size,
            embedding_size = embedding_size,
            num_sampled = num_sampled,
            nce_loss_args = {},
            E_init = tf.random_uniform_initializer(minval=-1.0, maxval=1.0),
            E_init_args = {},
            nce_W_init = tf.truncated_normal_initializer(stddev=float(1.0/np.sqrt(embedding_size))),
            nce_W_init_args = {},
            nce_b_init = tf.constant_initializer(value=0.0),
            nce_b_init_args = {},
            name ='word2vec_layer',
        )
    # Construct the optimizer. Note: AdamOptimizer is very slow in this case
    cost = emb_net.nce_cost
    train_params = emb_net.all_params
    # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost, var_list=train_params)
    train_op = tf.train.AdagradOptimizer(learning_rate, initial_accumulator_value=0.1,
        use_locking=False).minimize(cost, var_list=train_params)


    # Compute the cosine similarity between minibatch examples and all embeddings.
    # For simple visualization of validation set.
    normalized_embeddings = emb_net.normalized_embeddings
    valid_embed = tf.nn.embedding_lookup(
                                normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embed, normalized_embeddings, transpose_b=True)
        # multiply every valid word vector with every word vector.
        # with transpose_b=True, normalized_embeddings is transposed before multiplication.


    """ Step 5: Begin training.
    """
    print()
    # Note: sess is the global InteractiveSession created in __main__ below.
    sess.run(tf.initialize_all_variables())
    if resume:
        print("Load existing model" + "!"*10)
        # instead of using TensorFlow saver, we use TensorLayer to load a model
        # saver = tf.train.Saver()
        # saver.restore(sess, model_file_name+'.ckpt')
        load_params = tl.files.load_npz(name=model_file_name+'.npz')
        tl.files.assign_params(sess, load_params, emb_net)


    emb_net.print_params()
    emb_net.print_layers()


    # save vocabulary to txt
    tl.files.save_vocab(count, name='vocab_text8.txt')


    average_loss = 0


    # for step in xrange(num_steps):
    step = 0
    while (step < num_steps):
        start_time = time.time()
        batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch(
                        data=data, batch_size=batch_size, num_skips=num_skips,
                        skip_window=skip_window, data_index=data_index)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}
        # We perform one update step by evaluating the train_op (including it
        # in the list of returned values for sess.run()).
        _, loss_val = sess.run([train_op, cost], feed_dict=feed_dict)
        average_loss += loss_val


        if step % print_freq == 0:
            if step > 0:
                average_loss /= print_freq
            print("Average loss at step %d/%d. loss:%f took:%fs" %
                        (step, num_steps, average_loss, time.time() - start_time))
            average_loss = 0
        # Prints out nearby words given a list of words.
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % (print_freq * 5) == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors to print
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)


        if (step % (print_freq * 20) == 0) and (step != 0):
            print("Save model, data and dictionaries" + "!"*10);
            # instead of using TensorFlow saver, we use TensorLayer to save a model
            # saver = tf.train.Saver()
            # save_path = saver.save(sess, model_file_name+'.ckpt')
            tl.files.save_npz(emb_net.all_params, name=model_file_name+'.npz')
            tl.files.save_any_to_npy(save_dict={'data': data, 'count': count,
                'dictionary': dictionary, 'reverse_dictionary':
                reverse_dictionary}, name=model_file_name+'.npy')


        if step == num_steps-1:
            keeptrain = input("Training for %d steps finished. Enter 1 to keep training: " % num_steps)
            if keeptrain == '1':
                step = 0
                learning_rate = float(input("Input new learning rate: "))
                train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
        step += 1




    """ Step 6: Visualize the normalized embedding matrix by t-SNE.
    """
    print()
    final_embeddings = normalized_embeddings.eval()
    tl.visualize.tsne_embedding(final_embeddings, reverse_dictionary,
                plot_only=500, second=5, saveable=True, name='word2vec_basic')


    """ Step 7: Evaluate by analogy questions.
        see tensorflow/models/embedding/word2vec_optimized.py
    """
    print()
    #   from tensorflow/models/embedding/word2vec.py
    analogy_questions = tl.files.read_analogies_file( \
                eval_file='questions-words.txt', word2id=dictionary)
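    # Each line of questions-words.txt holds one analogy "a b c d",
    # e.g. "Athens Greece Baghdad Iraq"; questions containing words that are
    # not in the vocabulary are typically skipped by the reader.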
    # The eval feeds three vectors of word ids for a, b, c, each of
    # which is of size N, where N is the number of analogies we want to
    # evaluate in one batch.
    analogy_a = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_b = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_c = tf.placeholder(dtype=tf.int32)  # [N]
    # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
    # They all have the shape [N, emb_dim]
    a_emb = tf.gather(normalized_embeddings, analogy_a)  # a's embs
    b_emb = tf.gather(normalized_embeddings, analogy_b)  # b's embs
    c_emb = tf.gather(normalized_embeddings, analogy_c)  # c's embs
    # We expect that d's embedding vector on the unit hyper-sphere is
    # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
    #   Bangkok Thailand Tokyo Japan -> Thailand - Bangkok = Japan - Tokyo
    #   Japan = Tokyo + (Thailand - Bangkok)
    #   d = c + (b - a)
    target = c_emb + (b_emb - a_emb)
    # Compute cosine distance between each pair of target and vocab.
    # dist has shape [N, vocab_size].
    dist = tf.matmul(target, normalized_embeddings, transpose_b=True)
    # For each question (row in dist), find the top 'n_answer' words.
    n_answer = 4
    _, pred_idx = tf.nn.top_k(dist, n_answer)
    def predict(analogy):
        """Predict the top 4 answers for analogy questions."""
        idx, = sess.run([pred_idx], {
            analogy_a: analogy[:, 0],
            analogy_b: analogy[:, 1],
            analogy_c: analogy[:, 2]
        })
        return idx
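
    # The same scoring in plain NumPy, as a sketch for intuition (a hypothetical
    # helper, not part of the tutorial). E is the normalized embedding matrix,
    # e.g. E = normalized_embeddings.eval(), with shape [vocabulary_size, embedding_size].
    def _analogy_numpy_sketch(E, a_ids, b_ids, c_ids, n_answer=4):
        target = E[c_ids] + (E[b_ids] - E[a_ids])        # d ~ c + (b - a), [N, emb_dim]
        dist = np.dot(target, E.T)                       # cosine scores; rows of E are unit-norm
        return np.argsort(-dist, axis=1)[:, :n_answer]   # top 'n_answer' word ids per question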


    # Evaluate analogy questions and report accuracy.
    #  i.e. How many questions we get right at precision@1.
    correct = 0
    total = analogy_questions.shape[0]
    start = 0
    while start < total:
        limit = start + 2500
        sub = analogy_questions[start:limit, :] # question
        idx = predict(sub)      # 4 answers for each question
        # print('question:', tl.files.word_ids_to_words(sub[0], reverse_dictionary))
        # print('answers:', tl.files.word_ids_to_words(idx[0], reverse_dictionary))
        start = limit
        for question in xrange(sub.shape[0]):
            for j in xrange(n_answer):
                # if one of the top 4 answers is correct, count it as a hit
                if idx[question, j] == sub[question, 3]:
                    # Bingo! We predicted correctly. E.g., [italy, rome, france, paris].
                    print(j+1, tl.files.word_ids_to_words([idx[question, j]], reverse_dictionary) \
                        , ':', tl.files.word_ids_to_words(sub[question, :], reverse_dictionary))
                    correct += 1
                    break
                elif idx[question, j] in sub[question, :3]:
                    # We need to skip words already in the question.
                    continue
                else:
                    # The correct label is not the precision@1
                    break
    print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total,
                                             correct * 100.0 / total))




if __name__ == '__main__':
    sess = tf.InteractiveSession()
    main_word2vec_basic()

























