1. Import modules
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os
import random
import zipfile
import numpy as np
from tempfile import gettempdir
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf
2. Fetch the data file, unzip it, and do initial processing
def zip_file(filename, expected_bytes):
    """Check the file size, then unzip it and return the corpus as a list of words."""
    local_filename = os.path.join('.', filename)
    statinfo = os.stat(local_filename)
    if statinfo.st_size == expected_bytes:
        print('good file', filename)
    else:
        raise Exception('bad file: ' + filename)
    with zipfile.ZipFile(local_filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
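The imports bring in six.moves.urllib, but the listing assumes text8.zip is already sitting in the working directory. A minimal download helper, assuming the corpus is still hosted at http://mattmahoney.net/dc/ as in the official word2vec tutorial (maybe_download is an illustrative name, not part of the original), could look like this:

def maybe_download(filename):
    # Download the corpus only if it is not already present locally (hypothetical helper).
    url = 'http://mattmahoney.net/dc/' + filename
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url, filename)
    return filename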
Take a look at the words in the corpus
vocabulary = zip_file('text8.zip', 31344016)
print('Data size', len(vocabulary))
vocabulary_size = 50000
3. Build the input dataset
def build_dataset(words, n_words):
    """Map each word to an integer id; all words outside the top n_words become 'UNK'."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = {}
    for i, (word, _) in enumerate(count):
        dictionary[word] = i
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:  # the word is rare, so it maps to 'UNK'
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary
Check the word statistics
data, count, dictionary, reversed_dictionary = build_dataset(vocabulary, vocabulary_size)
print('most common words:', count[:5])  # the five most frequent words
data_index = 0  # current position in the original corpus
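A quick sanity check of build_dataset on a toy corpus makes the return values concrete; the toy_* names below are illustrative and not part of the original script:

toy_words = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'cat']
toy_data, toy_count, toy_dict, toy_rev = build_dataset(toy_words, 4)
print(toy_count)   # e.g. [['UNK', 2], ('the', 3), ('cat', 2), ('sat', 1)]
print(toy_data)    # the toy corpus re-encoded as integer ids
print([toy_rev[i] for i in toy_data])   # ids mapped back to words ('UNK' for rare words)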
4. The skip-gram method: generate the samples for each training batch
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        random.shuffle(context_words)
        words_to_use = collections.deque(context_words)
        for j in range(num_skips):
            batch[i * num_skips + j] = buffer[skip_window]
            context_word = words_to_use.pop()
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack slightly so no words are skipped at the end of a batch.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
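A quick look at what generate_batch produces helps verify the pairing logic; the loop below simply prints eight (center, context) pairs, in the spirit of the standard TensorFlow word2vec tutorial, and is not part of the original listing:

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    # Each line shows a center word id/word and one of its context words.
    print(batch[i], reversed_dictionary[batch[i]],
          '->', labels[i, 0], reversed_dictionary[labels[i, 0]])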
5. Define the network's hyperparameters and initialization data
batch_size = 128           # training batch size
embedding_size = 300       # number of hidden-layer units, i.e. the dimensionality of the word vectors (word2vec)
skip_window = 2            # how many context words to consider on each side of the target
num_skips = 2              # how many (target, context) pairs to draw per window; kept small (empirical value)
num_sampled = 64           # number of negative samples drawn for the sampled (NCE) loss
valid_size = 16            # size of the validation word set
valid_window = 100         # validation words are drawn from the 100 most frequent ids
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
graph = tf.Graph()         # intuitively, all the tensors defined below form a single graph
6. Build the neural network
with graph.as_default():
    train_input = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    with tf.device('/cpu:0'):
        # Embedding matrix: one trainable vector per vocabulary word.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_input)
        # Weights and biases for the NCE (noise-contrastive estimation) loss.
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Cosine similarity between the validation words and every vocabulary word.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    init = tf.global_variables_initializer()
num_steps = 100000
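Because every row of normalized_embeddings has unit L2 norm, the matmul that defines similarity is exactly the cosine similarity between each validation word and every vocabulary word. A tiny NumPy check of that identity (the names a, a_norm and sims are illustrative, not part of the original listing):

a = np.random.rand(5, embedding_size).astype(np.float32)
a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)       # unit-length rows
sims = np.dot(a_norm, a_norm.T)                             # pairwise cosine similarities
assert np.allclose(np.diag(sims), 1.0, atol=1e-5)           # each unit vector has cosine 1 with itself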
7. Training
with tf.Session(graph=graph) as session:
    init.run()
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_input: batch_inputs, train_labels: batch_labels}
        # One run() call performs the gradient update and returns the current loss.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:  # report the average loss every 2000 steps
            if step > 0:
                average_loss /= 2000
            print('average loss at step', step, ':', average_loss)
            average_loss = 0
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()  # the normalized word vectors, i.e. the learned embeddings
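To get a feel for the learned vectors, the standard word2vec tutorial closes by projecting the most frequent word vectors to 2-D with t-SNE. A minimal sketch of that step, assuming scikit-learn and matplotlib are available (plot_only, low_dim and the other names here are illustrative and not part of the original listing):

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

plot_only = 200                                   # visualize the 200 most frequent words
tsne = TSNE(perplexity=30, n_components=2, init='pca')
low_dim = tsne.fit_transform(final_embeddings[:plot_only, :])
plot_labels = [reversed_dictionary[i] for i in xrange(plot_only)]
plt.figure(figsize=(15, 15))
for i, label in enumerate(plot_labels):
    x, y = low_dim[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points')
plt.savefig('tsne.png')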