Review of skip-gram
CBOW
- In contrast to skip-gram, CBOW uses the context words context(word) to predict the center word word. Take the sentence
the dog barked at the mailman
- With CBOW_window = 1, the training examples are (see the plain-Python sketch after this list):
- batch:
[['the','barked'],['dog','at'],['barked','the'],['at','mailman']]
- labels:
['dog','barked','at','the']
- CBOW may perform better than skip-gram, because each training example uses information from several context words, whereas skip-gram uses only one context word at a time.
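A minimal plain-Python sketch (window hard-coded to 1; the variable names here are made up for illustration) that reproduces the batch/labels above:

sentence = 'the dog barked at the mailman'.split()
window = 1

batch, labels = [], []
for i in range(window, len(sentence) - window):
    # context = words to the left and right of position i
    context = sentence[i - window:i] + sentence[i + 1:i + 1 + window]
    batch.append(context)
    labels.append(sentence[i])

print(batch)   # [['the', 'barked'], ['dog', 'at'], ['barked', 'the'], ['at', 'mailman']]
print(labels)  # ['dog', 'barked', 'at', 'the']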
CBOW in detail
- The input consists of one batch of word ids per context position: (batch_size, word[t-2]), (batch_size, word[t-1]), (batch_size, word[t+1]), (batch_size, word[t+2])
- In essence, the CBOW input is simply the average of the context words' embedding vectors (see the toy example after this list)
- The main change is how batches are generated (the placeholders and everything downstream have to be adapted accordingly)
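As a toy illustration of the averaging step (made-up numbers, not the embedding matrix trained below):

import numpy as np

# toy embedding matrix: 6 words, 4-dimensional vectors (random values)
embeddings = np.random.rand(6, 4)

# ids of the two context words of one training example (window = 1)
context_ids = [2, 4]

# the CBOW input is the mean of the context word vectors
avg_embed = embeddings[context_ids].mean(axis=0)
print(avg_embed.shape)  # (4,)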
Generating batches for CBOW
import collections

import numpy as np

# `data` and `data_index` are the encoded corpus and the read cursor
# built in the preceding skip-gram preprocessing step.
def generate_batch(batch_size, cbow_window):
    global data_index
    assert cbow_window % 2 == 1
    span = 2 * cbow_window + 1
    # exclude the center word: span - 1 context columns
    batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        # cycle through data; wrap back to the start at the end
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size):
        # the target word sits at the center of the span
        target = cbow_window
        col_idx = 0
        for j in range(span):
            # skip the center element: we only need context(word), not the word itself
            if j == span // 2:
                continue
            batch[i, col_idx] = buffer[j]
            col_idx += 1
        labels[i, 0] = buffer[target]
        # slide the buffer forward by one word
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    assert batch.shape[0] == batch_size
    assert batch.shape[1] == span - 1
    return batch, labels
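A quick sanity check of the generator (assuming `data`, `data_index` and `reverse_dictionary` from the skip-gram preprocessing are in scope):

data_index = 0
batch, labels = generate_batch(batch_size=8, cbow_window=1)
for i in range(8):
    # map the ids back to words to eyeball one batch
    context = [reverse_dictionary[idx] for idx in batch[i]]
    target = reverse_dictionary[labels[i, 0]]
    print(context, '->', target)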
Building the model
- Change the shape of the placeholder to [batch_size, 2 * cbow_window]
- Average the word vectors of the context words
import math
import random

import numpy as np
import tensorflow as tf

# vocabulary_size, data, reverse_dictionary and generate_batch are assumed to be
# defined as in the preceding skip-gram notebook.

num_steps = 100001

if __name__ == '__main__':
    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    cbow_window = 1       # How many words to consider left and right.
    num_skips = 2         # (Unused here; left over from the skip-gram version.)

    # We pick a random validation set to sample nearest neighbors. Here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.
    valid_size = 16     # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    # pick 8 samples from the 100 most frequent words and 8 from ranks 1000-1099
    valid_examples = np.array(random.sample(range(valid_window), valid_size // 2))
    valid_examples = np.append(valid_examples, random.sample(range(1000, 1000 + valid_window), valid_size // 2))
    num_sampled = 64    # Number of negative examples to sample.

    graph = tf.Graph()

    with graph.as_default(), tf.device('/cpu:0'):
        # Input data.
        train_dataset = tf.placeholder(tf.int32, shape=[batch_size, 2 * cbow_window])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Variables.
        # embedding, vector for each word in the vocabulary
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                                      stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Model.
        # Look up embeddings for inputs.
        # embedding_lookup efficiently finds the embeddings for the given ids (train_dataset);
        # doing this manually would be inefficient given the 50000 rows in embeddings.
        embeds = None
        for i in range(2 * cbow_window):
            embedding_i = tf.nn.embedding_lookup(embeddings, train_dataset[:, i])
            print('embedding %d shape: %s' % (i, embedding_i.get_shape().as_list()))
            emb_x, emb_y = embedding_i.get_shape().as_list()
            if embeds is None:
                embeds = tf.reshape(embedding_i, [emb_x, emb_y, 1])
            else:
                embeds = tf.concat([embeds, tf.reshape(embedding_i, [emb_x, emb_y, 1])], 2)

        assert embeds.get_shape().as_list()[2] == 2 * cbow_window
        print("Concat embedding size: %s" % embeds.get_shape().as_list())
        avg_embed = tf.reduce_mean(embeds, 2, keep_dims=False)
        print("Avg embedding size: %s" % avg_embed.get_shape().as_list())

        loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases,
                                             labels=train_labels,
                                             inputs=avg_embed,
                                             num_sampled=num_sampled,
                                             num_classes=vocabulary_size))

        # Optimizer.
        # Note: The optimizer will optimize the softmax_weights AND the embeddings.
        # This is because the embeddings are defined as a variable quantity and the
        # optimizer's `minimize` method will by default modify all variable quantities
        # that contribute to the tensor it is passed.
        # See docs on `tf.train.Optimizer.minimize()` for more details.
        # Adagrad adapts the learning rate per parameter, which suits the sparse
        # embedding updates here.
        optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

        # Compute the similarity between minibatch examples and all embeddings.
        # We use the cosine distance:
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        print('Initialized')
        average_loss = 0
        for step in range(num_steps):
            batch_data, batch_labels = generate_batch(batch_size, cbow_window)
            feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
            _, l = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += l
            if step % 2000 == 0:
                if step > 0:
                    average_loss = average_loss / 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step %d: %f' % (step, average_loss))
                average_loss = 0
            # note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)
        final_embeddings = normalized_embeddings.eval()
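As a side note on the design: the per-position lookup loop above is mainly illustrative. Since tf.nn.embedding_lookup also accepts a 2-D id tensor, the same averaging could be written more compactly (a sketch, reusing the variable names from the script above):

# shape: (batch_size, 2 * cbow_window, embedding_size)
context_embeds = tf.nn.embedding_lookup(embeddings, train_dataset)
# average over the context dimension -> (batch_size, embedding_size)
avg_embed = tf.reduce_mean(context_embeds, axis=1)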
Results
Average loss at step 98000: 3.557815
Average loss at step 100000: 3.589172
Nearest to time: month, collaborating, moribund, period, mx, fayetteville, piece, reason,
Nearest to that: which, however, what, anand, furthermore, how, hearses, dakini,
Nearest to states: kingdom, nations, state, aba, mexico, factsheet, rpp, us,
Nearest to war: disturbances, wars, yucatan, charlemagne, midsummer, telomeres, autopsy, outcomes,
Nearest to five: six, seven, four, eight, three, nine, two, zero,
Nearest to into: through, woodruff, within, from, urartu, cranberries, across, under,
Nearest to for: pwn, volo, after, without, when, while, if, during,
Nearest to known: regarded, used, described, possible, defined, called, classified, recognized,
Nearest to issue: michele, epistle, weightings, episode, mesoamerican, equator, version, directv,
Nearest to experience: heir, widehat, difranco, value, arrival, lyricist, carolus, vila,
Nearest to arts: cetacean, maximin, heroines, lycaon, abatement, art, misunderstandings, historian,
Nearest to pre: uncountably, listing, psychoactive, publication, widespread, islamic, fantasyland, ephesians,
Nearest to marriage: centriole, buddhahood, nieve, mor, disobedience, tripoli, kolingba, urgell,
Nearest to mainly: mostly, primarily, largely, still, mood, halal, now, ardal,
Nearest to stage: satirists, honorius, catechism, filmed, sway, ursus, permanently, ifr,
Nearest to consists: means, because, instead, is, consisted, futurism, list, history,
[Finished in 405.1s]