A stripped-down version that drops all the t-SNE and vector visualisation clutter, so you can look directly at the word2vec training process.
The preprocessing step converts every word into an integer index and filters out low-frequency words, replacing all of them with a single UKN token.
import collections
import tensorflow as tf
def pre_process(input_file_path, vocabulary_size, output_file_path=None):
"""
对输入数据进行预处理。处理之后的结果存储到 output_file_path中
:param input_file_path: 输入的文件路径
:param output_file_path: 输出的文件路径
:return: True: 成功; False: 失败
"""
# 存储单词以及编号, key: word; value: 编号
most_common_dict = dict()
# 存储将每一个word转换成index之后的index序列。也就是相当于原先的word序列
data = list()
# 存储常见的单词是以元组的形式存储的,(word, word的数量)
most_common_words = list()
    # Read the whole corpus; text8 is a single line of space-separated words.
    with open(input_file_path) as input_file:
        words = input_file.read().split()
    # The (vocabulary_size - 1) most frequent words, stored as (word, count) tuples.
    most_common_words = collections.Counter(words).most_common(vocabulary_size - 1)
    # Reserve index 0 for UKN; use a list instead of a tuple so its count can be updated below.
    most_common_words.insert(0, ['UKN', 0])
    tf.logging.info(str(len(most_common_words)))
    tf.logging.info(str(most_common_words[:5]))
    tf.logging.info(str(most_common_words[vocabulary_size - 2:]))
    # Build most_common_dict, using the current length of the dict as the value:
    # every new word grows the dict by one, so each word ends up with a unique index.
    # A plain self-incrementing counter would of course work just as well.
    for common_word, _ in most_common_words:
        most_common_dict[common_word] = len(most_common_dict)
    # Convert the word sequence into an index sequence and count the UKN words.
    for word in words:
        if word in most_common_dict:
            index = most_common_dict[word]
        else:
            # An out-of-vocabulary word: map it to UKN (index 0).
            index = 0
            most_common_words[0][1] += 1
        data.append(index)
    # Invert most_common_dict into key: index; value: word.
    reverse_most_common_dict = dict(zip(most_common_dict.values(),
                                        most_common_dict.keys()))
    tf.logging.info('data: ' + str(data[:5]))
    tf.logging.info('most_common_words: ' + str(most_common_words[:5]))
    tf.logging.info('most_common_dict: ' + str(list(most_common_dict.items())[:5]))
    tf.logging.info('reverse_most_common_dict: ' + str(list(reverse_most_common_dict.items())[:5]))
    return data, most_common_words, most_common_dict, reverse_most_common_dict
if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    # Note the argument order: the vocabulary size comes before the optional output path.
    pre_process('./input/text8.txt', 5000, './input/pre_process_result.txt')
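To make the mapping concrete, here is a small illustration with a made-up toy corpus (not part of the real code; all toy_* names are just for this example). It mirrors what pre_process builds: the two most frequent words are kept and everything else is mapped to UKN.

import collections

toy_words = 'the cat sat on the mat the cat'.split()
# Keep the 2 most frequent words plus the UKN slot (i.e. a "vocabulary_size" of 3).
toy_common = collections.Counter(toy_words).most_common(2)
toy_common.insert(0, ['UKN', 0])
toy_dict = {word: i for i, (word, _) in enumerate(toy_common)}
toy_data = [toy_dict.get(word, 0) for word in toy_words]
# toy_dict -> {'UKN': 0, 'the': 1, 'cat': 2}
# toy_data -> [1, 2, 0, 0, 1, 0, 1, 2]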
import collections
import random
import numpy as np
class Word2VecInput(object):
    """
    Reads the training data (Skip-Gram batch generator).
    """
    def __init__(self, batch_size, num_skips, skip_window, seq_data_index):
        """
        Initialisation for Skip-Gram sampling.
        :param batch_size: batch size
        :param num_skips: number of samples generated per center word
        :param skip_window: reach of the window on each side of the center word
        :param seq_data_index: the input word sequence, with every word already converted to its index
        """
        # Position used while iterating over seq_data_index.
        self._data_index = 0
        # Every center word produces num_skips samples, so a batch must contain
        # only complete groups of samples for a center word.
        assert batch_size % num_skips == 0
        # skip_window is the reach from the center word to each side, so num_skips
        # can be at most 2 * skip_window.
        assert num_skips <= 2 * skip_window
        self._batch_size = batch_size
        self._num_skips = num_skips
        self._skip_window = skip_window
        self._seq_data_index = seq_data_index
    def read_data(self):
        batch = np.ndarray(shape=[self._batch_size], dtype=np.int32)
        labels = np.ndarray(shape=[self._batch_size, 1], dtype=np.int32)
        # The window span from which the skip-gram samples of one word are drawn.
        span = 2 * self._skip_window + 1
        # A double-ended queue that holds the current window of words.
        span_buffer = collections.deque(maxlen=span)
        for _ in range(span):
            # Fill the deque with the words of the first window.
            span_buffer.append(self._seq_data_index[self._data_index])
            # Wrap _data_index around when it reaches the end of the word sequence,
            # so the corpus can be cycled through and batches produced indefinitely.
            self._data_index = (self._data_index + 1) % len(self._seq_data_index)
        # Produce one batch.
        for i in range(self._batch_size // self._num_skips):
            target = self._skip_window  # the center word sits at index skip_window inside the window
            # The center word itself must never be emitted as a label.
            targets_to_avoid = {target}
            for j in range(self._num_skips):
                # Randomly pick a word in the window that is neither the center word
                # nor one that has already been sampled.
                while target in targets_to_avoid:
                    target = random.randint(0, span - 1)
                targets_to_avoid.add(target)
                # The sample input is always the center word (index _skip_window in the window).
                batch[i * self._num_skips + j] = span_buffer[self._skip_window]
                # The label is the randomly chosen context word.
                labels[i * self._num_skips + j][0] = span_buffer[target]
            # After finishing one center word, slide the window one word forward; the oldest
            # word drops out automatically because the deque was created with maxlen=span.
            span_buffer.append(self._seq_data_index[self._data_index])
            self._data_index = (self._data_index + 1) % len(self._seq_data_index)
        return batch, labels
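A quick sanity check of the generator on a toy index sequence (toy_input and the numbers below are only illustrative; the labels depend on the random choices, but the batch entries are deterministic):

toy_input = Word2VecInput(batch_size=8, num_skips=2, skip_window=1,
                          seq_data_index=list(range(10)))
batch, labels = toy_input.read_data()
# batch  -> [1 1 2 2 3 3 4 4]: each center word appears num_skips times.
# labels -> for center word 2 the two labels are its neighbours 1 and 3 (in random order), etc.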
The model uses NCE (noise-contrastive estimation) loss directly via tf.nn.nce_loss, so negative sampling does not have to be implemented by hand; a simplified sketch of what this loss computes follows the class.
import logging
import math
import tensorflow as tf
import common  # project module holding constants such as VOCABULARY_SIZE
class Word2VecInference(object):
    def __init__(self, batch_size, embedding_size, num_sampled):
        """
        :param batch_size: batch size
        :param embedding_size: dimensionality of the word vectors
        :param num_sampled: number of negative samples
        """
        super(Word2VecInference, self).__init__()
        self._batch_size = batch_size
        self._embedding_size = embedding_size
        self._num_sampled = num_sampled
        self._loss = None
        self._train_input_placeholder = None
        self._train_labels_placeholder = None
        self._normalized_embeddings = None
    @property
    def train_input_placeholder(self):
        return self._train_input_placeholder
    @property
    def train_labels_placeholder(self):
        return self._train_labels_placeholder
    @property
    def loss(self):
        return self._loss
    @property
    def normalized_embeddings(self):
        # The L2-normalised embedding matrix, filled in by inference().
        return self._normalized_embeddings
    def inference(self):
        train_input = tf.placeholder(tf.int32, shape=[self._batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[self._batch_size, 1])
        with tf.device('/cpu:0'):
            # Embeddings for the whole vocabulary; after initialisation the shape is
            # (VOCABULARY_SIZE, embedding_size), i.e. every word is an embedding_size-dimensional vector.
            embedding = tf.Variable(
                tf.random_uniform([common.VOCABULARY_SIZE, self._embedding_size],
                                  -1.0, 1.0)
            )
            logging.info('embedding shape: ' + str(embedding.shape))
            # The training inputs have to be looked up in the embedding matrix.
            # train_input has shape=(batch_size,); embedding_lookup maps each index in
            # train_input to its own row of the embedding matrix, so the returned embed
            # has shape=(batch_size, embedding_size).
            # This is also why the samples produced by the input pipeline have
            # shape=(batch_size,) while the labels have shape=(batch_size, 1): the samples
            # still need the lookup train_input:(batch_size,) => embed:(batch_size, embedding_size),
            # after which the shapes of embed and the labels line up.
            # It also shows that getting the vector of any single word is just an
            # embedding_lookup call.
            embed = tf.nn.embedding_lookup(embedding, train_input)
            logging.info('embed shape: ' + str(embed.shape))
            # nce_weights holds one weight per dimension of every word's embedding.
            nce_weights = tf.Variable(
                tf.truncated_normal([common.VOCABULARY_SIZE, self._embedding_size],
                                    stddev=1.0 / math.sqrt(self._embedding_size))
            )
            # One bias per word in the vocabulary.
            nce_bias = tf.Variable(tf.zeros([common.VOCABULARY_SIZE]))
            nce_loss = tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_bias,
                labels=train_labels,
                inputs=embed,
                num_sampled=self._num_sampled,
                num_classes=common.VOCABULARY_SIZE
            )
            loss = tf.reduce_mean(nce_loss)
            self._loss = loss
            self._train_input_placeholder = train_input
            self._train_labels_placeholder = train_labels
            # Before the embeddings are used downstream they are L2-normalised,
            # so that dot products between rows become cosine similarities.
            embedding_square = tf.square(embedding)
            embedding_square_sum = tf.reduce_sum(embedding_square, 1, keep_dims=True)
            norm = tf.sqrt(embedding_square_sum)
            self._normalized_embeddings = embedding / norm
            logging.info('embedding_square shape: ' + str(embedding_square.shape))
            logging.info('embedding_square_sum shape: ' + str(embedding_square_sum.shape))
            logging.info('norm shape: ' + str(norm.shape))
        return loss
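tf.nn.nce_loss hides the negative sampling. Roughly speaking, for each (input, label) pair it rewards a high score for the true context word and low scores for a handful of randomly drawn "noise" words, using a logistic loss. The sketch below is only a simplified NumPy illustration of that idea (it leaves out the correction terms TensorFlow adds for the sampling distribution, and all names in it are made up for the example):

import numpy as np

def simplified_nce_loss(embed, weights, biases, label, negative_ids):
    """embed: (embedding_size,) input vector; weights: (vocab, embedding_size);
    biases: (vocab,); label: true word id; negative_ids: sampled noise word ids."""
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    # Score of the true context word: push it towards 1.
    positive_logit = weights[label].dot(embed) + biases[label]
    loss = -np.log(sigmoid(positive_logit))
    # Scores of the sampled noise words: push them towards 0.
    for noise_id in negative_ids:
        noise_logit = weights[noise_id].dot(embed) + biases[noise_id]
        loss += -np.log(sigmoid(-noise_logit))
    return loss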
class Word2VecTrain(ITrain):
    def train(self):
        seq_data_index, most_common_words, most_common_dict, reverse_most_common_dict = \
            pre_process.pre_process('./input/text8.txt', common.VOCABULARY_SIZE)
        input_data = Word2VecInput(common.BATCH_SIZE,
                                   common.num_skips,
                                   common.skip_window,
                                   seq_data_index)
        inference = Word2VecInference(common.BATCH_SIZE,
                                      common.EMBEDDING_SIZE,
                                      common.NUM_SAMPLED)
        graph = tf.Graph()
        with graph.as_default():
            loss = inference.inference()
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
            with tf.Session(graph=graph) as session:
                session.run(tf.global_variables_initializer())
                session.run(tf.local_variables_initializer())
                total_loss = 0
                for step in range(common.num_steps):
                    batch_inputs, batch_labels = input_data.read_data()
                    _, loss_val = session.run([optimizer, loss],
                                              feed_dict={inference.train_input_placeholder: batch_inputs,
                                                         inference.train_labels_placeholder: batch_labels})
                    total_loss += loss_val
                    # Skip step 0 to avoid a division by zero in the running average.
                    if step > 0 and step % 1000 == 0:
                        tf.logging.info('step: ' + str(step) + ', average loss: ' + str(total_loss / step))
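Once the loop finishes, the normalised embeddings can be evaluated and used for cosine-similarity lookups. A minimal sketch, assuming it is placed inside the same tf.Session block right after the training loop and that it uses the normalized_embeddings property exposed above; the query word 'king' is only an example and must be a word that made it into the vocabulary:

                final_embeddings = session.run(inference.normalized_embeddings)
                query_index = most_common_dict['king']
                # Rows are unit length, so a dot product is the cosine similarity.
                similarity = final_embeddings.dot(final_embeddings[query_index])
                nearest = (-similarity).argsort()[1:6]  # skip the query word itself
                tf.logging.info('nearest to king: '
                                + str([reverse_most_common_dict[i] for i in nearest]))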