First, load the various dependency libraries. Because the data has to be downloaded from the network, quite a few dependencies are needed.
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib.request
import tensorflow as tf
url='http://mattmahoney.net/dc/'
def maybe_download(filename, expected_bytes):
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename
filename = maybe_download('text8.zip', 31344016)
Next, decompress the downloaded archive and use tf.compat.as_str to convert the data into a list of words. From the program output we can see that the data ends up as a list containing 17,005,207 words.
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
words = read_data(filename)
print('Data size', len(words))
vocabulary_size=50000
def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
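To see what build_dataset produces, here is a quick toy check (illustration only; the tiny word list and the toy_* names are made up and not part of the original pipeline). Frequent words receive small indices, and data is simply the word list re-encoded with those indices:
# Illustration only: build_dataset on a tiny hand-made word list.
toy_data, toy_count, toy_dict, toy_rev = build_dataset(['the', 'cat', 'sat', 'the'])
print(toy_count)   # [['UNK', 0], ('the', 2), ('cat', 1), ('sat', 1)]
print(toy_dict)    # {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}
print(toy_data)    # [1, 2, 3, 1]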
data_index=0
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
Next, starting from data_index, we read span words into buffer in order as its initial contents. Because buffer is a deque with capacity span, it is now full, and subsequent data will replace the earliest entries. We then enter the first (outer) loop, which runs batch_size/num_skips times; each iteration generates samples for one target word. At this point buffer holds the target word and all of its context words, and we set target=skip_window, i.e. the element at position skip_window in buffer is the target word. We then define targets_to_avoid, the list of words to skip when generating samples; it initially contains the skip_window-th element (the target word itself), because we want to predict context words, not the target word. Next we enter the second (inner) loop, which runs num_skips times; each iteration generates one sample for a context word. We draw random integers until we get one that is not in targets_to_avoid, which identifies a usable context word, and then produce a sample whose feature is the target word buffer[skip_window] and whose label is buffer[target]. Since this context word has now been used, we add it to targets_to_avoid so it is filtered out afterwards. After all samples for one target word have been generated (num_skips of them), we read in the next word (which pushes the first word out of buffer), i.e. we slide the window one position forward; the target word moves forward by one as well, the context words shift with it, and we can start generating training samples for the next target word. Once both loops finish, we have obtained batch_size training samples and return batch and labels as the function's result.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
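Before moving on, the sliding-window behaviour is worth seeing in isolation. The following is a minimal sketch (illustration only, using made-up indices and the name demo_buffer) of the span-sized deque used inside generate_batch: once it is full, appending a new element silently drops the oldest one, which is exactly what moves the window one word to the right.
# Standalone illustration of the span-sized buffer (span = 2*skip_window + 1 = 3 here).
demo_buffer = collections.deque(maxlen=3)
for idx in [10, 20, 30, 40, 50]:
    demo_buffer.append(idx)
    print(list(demo_buffer))   # [10], [10, 20], [10, 20, 30], [20, 30, 40], [30, 40, 50]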
batch_size=128
embedding_size=128
skip_window=1
num_skips=2
valid_size=16
valid_window=100
valid_examples=np.random.choice(valid_window,valid_size,replace=False)
num_sampled=64
Now we define the network structure of the Skip-Gram Word2Vec model. We first create a tf.Graph and set it as the default graph. We then create placeholders for the training inputs and labels, and convert the previously generated valid_examples into a TensorFlow constant. Next, we use with tf.device('/cpu:0') to restrict all computation to the CPU, because some of the following operations may not yet be implemented on the GPU. We then use tf.random_uniform to randomly initialize the word vectors embeddings for all words (vocabulary size 50000, vector dimension 128), and use tf.nn.embedding_lookup to look up the vectors embed corresponding to the input train_inputs. Next we initialize the weight parameter nce_weights of the NCE loss introduced earlier, and initialize nce_biases to zero. Finally, we use tf.nn.nce_loss to compute the loss of the word embeddings on the training data, and aggregate it with tf.reduce_mean.
graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=train_labels, inputs=embed, num_sampled=num_sampled, num_classes=vocabulary_size))
We use SGD as the optimizer, with a learning rate of 1.0. We then compute the L2 norm of the embedding vectors, norm, and divide embeddings by this norm to obtain the standardized normalized_embeddings. We then use tf.nn.embedding_lookup to look up the embedding vectors of the validation words and compute their similarity to every word in the vocabulary. Finally, we use tf.global_variables_initializer to initialize all model parameters.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    init = tf.global_variables_initializer()
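The similarity node above is just a matrix multiplication because, once the embeddings are divided by their L2 norm, the dot product of two rows equals their cosine similarity. A quick NumPy sanity check of that identity (illustration only, unrelated to the TensorFlow graph):
# Cosine similarity via dot product of L2-normalised vectors.
a = np.array([3.0, 4.0])
b = np.array([1.0, 0.0])
a_n = a / np.linalg.norm(a)
b_n = b / np.linalg.norm(b)
print(np.dot(a_n, b_n))   # 0.6 == cos of the angle between a and b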
num_steps=100001
with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized")
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print('Average loss at step', step, ':', average_loss)
            average_loss = 0
Every 10000 steps, we compute the similarity between the validation words and all words, and display the 8 words most similar to each validation word.
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
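Because final_embeddings is an ordinary NumPy array whose rows are already L2-normalised, nearest-neighbour queries no longer need the TensorFlow graph. A minimal sketch (the query word 'three' is an arbitrary choice for illustration; it assumes dictionary and reverse_dictionary are still in scope):
# Post-training sanity check: nearest neighbours of a single word via NumPy.
query = 'three'                                  # arbitrary example word
if query in dictionary:
    vec = final_embeddings[dictionary[query]]    # rows are already L2-normalised
    sims = np.dot(final_embeddings, vec)         # cosine similarity to every word
    for idx in (-sims).argsort()[1:9]:           # skip the word itself
        print(reverse_dictionary[idx])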
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)
We use sklearn.manifold.TSNE for dimensionality reduction, directly reducing the original 128-dimensional embedding vectors to 2 dimensions, and then visualize them with the plot_with_labels function defined above. Only the 100 most frequent words are shown in the visualization.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only=100
low_dim_embs=tsne.fit_transform(final_embeddings[:plot_only,:])
labels=[reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs,labels)