Step 1: import the packages.
#encoding=utf8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
Step 2: read the data and extract the text from the file.
# Implement word2vec (CBOW) by hand with TensorFlow
# 1: read the file and extract its text
filename = 'text8.zip'
def readdata(filename):
    with zipfile.ZipFile(filename) as f:  # open the zip archive
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()  # read the first file in the archive and split it into a list of words
    return data
words = readdata(filename)
print(len(words))
Remove high-frequency stop words to reduce noise in the model.
def remove_fre_stop_words(words):
    t = 1e-5          # subsampling threshold t
    threshold = 0.8   # removal threshold: a word is dropped once its drop probability reaches 0.8
    # Count word occurrences
    int_word_counts = collections.Counter(words)  # collections.Counter counts occurrences
    total_count = len(words)
    # Compute word frequencies
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    # Compute each word's drop probability with the subsampling formula 1 - sqrt(t / f)
    prob_drop = {w: 1 - np.sqrt(t / f) for w, f in word_freqs.items()}
    # Keep only the words whose drop probability is below the threshold
    train_words = [w for w in words if prob_drop[w] < threshold]
    return train_words
words = remove_fre_stop_words(words)
print(len(words))
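The drop probability 1 - sqrt(t / f) is the subsampling formula used in word2vec, applied here deterministically: with t = 1e-5 and a threshold of 0.8, every word whose frequency satisfies f >= t / 0.04 = 2.5e-4 is removed. A minimal sketch, with made-up frequencies, showing which ones cross the threshold (it reuses the numpy import from the top of the script):
# Illustration only (hypothetical frequencies): which frequencies cross the 0.8 drop threshold
t, threshold = 1e-5, 0.8
for f in [1e-2, 1e-3, 2.5e-4, 1e-4, 1e-5]:
    drop = 1 - np.sqrt(t / f)
    print('freq=%g  prob_drop=%.3f  removed=%s' % (f, drop, drop >= threshold))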
Build the vocabulary, replacing rare words with UNK.
vocabulary_size = len(set(words))  # number of distinct words in words
print("Data size", vocabulary_size)
def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))  # count every word and sort by frequency, descending, e.g.: [['UNK', -1], ('的', 99229), ('在', 25925), ('是', 20172), ('年', 17007), ('和', 16514), ('为', 15231), ('了', 13053), ('有', 11253), ('与', 11194)]
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)  # assign each word in count an index, e.g.:
        # [('UNK', 0), ('的', 1), ('在', 2), ('是', 3), ('年', 4), ('和', 5), ('为', 6), ('了', 7), ('有', 8), ('与', 9)]
        # The larger the index, the lower the frequency; dictionary maps word -> index
    # Replace every word in words by its index; words not in the dictionary map to 0 (UNK), e.g.: [14880, 4491, 483, 70, 1, 1009, 1850, 317, 14, 76]
    data = [dictionary[word] if word in dictionary else 0 for word in words]
    unk_count = data.count(0)  # number of words that were mapped to UNK
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))  # swap keys and values of dictionary: [(0, 'UNK'), (1, '的'), (2, '在'), (3, '是'), (4, '年'), (5, '和'), (6, '为'), (7, '了')]
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)  # data (2262896 ids): every word of the corpus replaced by its index; count (199247 entries): the frequency table, word -> count
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
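A small optional check, not part of the original code, that dictionary and reverse_dictionary really are inverses of each other:
# Optional sanity check: decoding an id and re-encoding the word should give the id back
for idx in data[:5]:
    w = reverse_dictionary[idx]
    assert dictionary[w] == idx
    print(idx, '->', w)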
data_index = 0
# generate_batch produces one batch of training data: `batch` holds the context words and `labels` the centre words.
# batch_size is the number of examples per batch; bag_windows is the number of words on each side of the centre word.
# With a total window length of 3 (bag_windows = 1), the first and third words are the context (train) and the middle word is the label.
# For example, in 萌萌 爱 赖赖, 萌萌 and 赖赖 are the context and 爱 is the label.
def generate_batch(batch_size, bag_windows):
    global data_index
    span = 2 * bag_windows + 1  # full window length: bag_windows words on each side plus the centre word
    batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)  # initialize batch
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)        # initialize labels
    buffer = collections.deque(maxlen=span)  # use a deque here (a plain list will not do); remember to set the maximum length to span
    for i in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size):
        buffer_list = list(buffer)
        labels[i, 0] = buffer_list.pop(bag_windows)  # the centre word is the label
        batch[i] = buffer_list                       # the remaining context words are the input
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
batch, labels = generate_batch(batch_size=4, bag_windows=1)
print(batch)
print(labels)
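For concreteness, here is a sketch (illustration only, operating on a local toy id sequence rather than the global data, and reusing the collections and numpy imports from the top of the script) of the same sliding-window logic together with the output it should produce:
def toy_generate_batch(toy_data, batch_size, bag_windows):
    # Same windowing as generate_batch, but over a local list instead of the global data
    span = 2 * bag_windows + 1
    batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    buffer = collections.deque(toy_data[:span], maxlen=span)
    idx = span
    for i in range(batch_size):
        window = list(buffer)
        labels[i, 0] = window.pop(bag_windows)   # centre word is the label
        batch[i] = window                        # remaining words are the context
        buffer.append(toy_data[idx % len(toy_data)])
        idx += 1
    return batch, labels
print(toy_generate_batch([0, 1, 2, 3, 4, 5], batch_size=4, bag_windows=1))
# Expected: batch [[0, 2], [1, 3], [2, 4], [3, 5]] and labels [[1], [2], [3], [4]]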
# Next, build the model
batch_size = 128
embedding_size = 128
bag_window = 2
valid_size = 16
valid_window = 100
valid_examples = np.array(random.sample(range(valid_window), valid_size))  # pick 16 ids at random from the 100 most frequent words to monitor during training
# random.sample:
# import random
# list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# for i in range(3):
#     slice = random.sample(list, 5)  # randomly pick 5 elements from list and return them as a slice
#     print(slice)
#     print(list, '\n')  # the original list is unchanged
num_sampled = 64
graph = tf.Graph()
# Build the TensorFlow graph
with graph.as_default():
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, bag_window * 2])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    # The word vectors are initialized from a uniform distribution, with shape vocabulary_size x embedding_size;
    # tf.nn.embedding_lookup() turns the integer inputs into their word-vector form by table lookup.
    embeddings = tf.Variable(tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embeds = tf.nn.embedding_lookup(embeddings, train_dataset)
    # The sampled-softmax (noise-contrastive style) loss is defined like a logistic-regression model,
    # so every word needs its own weight vector and bias, initialized from a truncated normal distribution
    # (values beyond two standard deviations are redrawn).
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    softmax_bias = tf.Variable(tf.zeros([vocabulary_size]))
    # Define the loss; tf.reduce_sum(embeds, 1) sums the context word vectors (CBOW)
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(softmax_weights, softmax_bias, train_labels, tf.reduce_sum(embeds, 1), num_sampled, vocabulary_size))
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    # L2-normalize each row of the embedding matrix
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))  # tf.transpose transposes normalized_embeddings; the product is the cosine similarity of the validation words to every word
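The similarity op above is a cosine similarity: once each embedding row is divided by its L2 norm, the dot product of two rows equals the cosine of the angle between the original vectors. A minimal NumPy sketch of the same normalize-then-dot-product computation, with made-up vectors:
# NumPy version of the same computation (made-up 5 x 4 embedding matrix)
emb = np.random.uniform(-1.0, 1.0, size=(5, 4))
norm = np.sqrt(np.sum(np.square(emb), axis=1, keepdims=True))
normalized = emb / norm                      # every row now has unit length
sim = normalized[[0, 2]].dot(normalized.T)   # cosine similarity of words 0 and 2 against all 5 words
print(sim.shape)  # (2, 5); entries lie in [-1, 1], larger means more similar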
num_steps = 1000001
with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()
    print("Initialized")
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, bag_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, l = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                print("---------------------------")
                print(valid_word)
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()
    print("*" * 10 + "final_embeddings:" + "*" * 10 + "\n", final_embeddings)

# Write the normalized word vectors to a text file: each line is a word followed by its vector components
fp = open('vector_cbow_mengmeng.txt', 'w', encoding='utf8')
for k, v in reverse_dictionary.items():
    t = tuple(final_embeddings[k])
    s = ''
    for i in t:
        i = str(i)
        s += i + " "
    fp.write(v + " " + s + "\n")
fp.close()
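The file written above has one word per line followed by its vector components, separated by spaces (there is no word2vec-style header line, so tools that expect one would need the vocabulary size and dimension added first). A small sketch, assuming that format, for reading the vectors back into a dict:
# Read the saved vectors back into a {word: np.ndarray} dict (assumes the format written above)
vectors = {}
with open('vector_cbow_mengmeng.txt', encoding='utf8') as f:
    for line in f:
        parts = line.rstrip().split(' ')
        vectors[parts[0]] = np.array(parts[1:], dtype=np.float32)
print(len(vectors))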
The data file text8.zip is Wikipedia text that has already been word-segmented. You can download it here if needed: https://download.csdn.net/download/wang263334857/11389707
The output looks like this:
Average loss at step 672000: 0.115959
Average loss at step 674000: 0.101580
Average loss at step 676000: 0.093495
Average loss at step 678000: 0.100869
Average loss at step 680000: 0.094914
---------------------------
university
Nearest to university: 中共中央组织部, 卡里莫夫, 布凡, 黄辅辰, 林永升, 从里, 伯承, 玄猫,
---------------------------
西
Nearest to 西: wtem, 夕阳, 阿当, 网书, 株洲县, 大里区, 夜访, 辽视,
---------------------------
佛教
Nearest to 佛教: 辽圣宗伐, 人工湖, 活动场所, 农业产业, 有寺, 领巾, 黄俊华, 杏仁,
---------------------------
古代
Nearest to 古代: 比较文学, 碳酸钾, 古老, 头等, 巴颜喀拉山, 最早, 匹妇, 华隆语,
---------------------------
像
Nearest to 像: 略偏, mutter, 出废, 压下去, 瓦尔特, 小鸭, 鸭绿江畔, 纳普,
---------------------------
n
Nearest to n: 异丁烯, 郑康公, 羲, 后前, 张梓琳, yoshi, 控制组, shaolin,
---------------------------
细胞
Nearest to 细胞: 媒介, 合众国, 鸟嘌呤, 林周县, 本流, 波得, 柳河县, 血,
---------------------------
地位
Nearest to 地位: 以林大, 陕西汉, 权力, 当大, 视听, 非典型, 寒带, treeempty,
---------------------------
俄罗斯
Nearest to 俄罗斯: 信部, 村道, 暂行规定, output, 内海, 诱惑, ronda, 之命立,
---------------------------
机构
Nearest to 机构: 油香, 同级, 国联, mythologie, 无母数, 结合能, 谢尔夫, xrtt,
---------------------------
委员会
Nearest to 委员会: 委, 陶努斯园, 降服, 金禧, 苹果日报, 苕, 首都区, 紫荆,
---------------------------
西班牙
Nearest to 西班牙: 男著, 北约组织, 纳瓦拉, 萨族, 易七世, 售电, 邱庆枫, 比利时,
---------------------------
控制
Nearest to 控制: 卡梅伦, 集团, 排斥性, 正交关系, 啤酒花, 大出风头, staples, 一万倍,
---------------------------
公元前
Nearest to 公元前: mavericks, 顺帝, 螺旋运动, 水坝, 破坏活动, 双辽市, 东别, 莫维奇,
---------------------------
功能
Nearest to 功能: 听觉, 现用, 成人, 亲唐, 自愧, 有线电, filarmonica, 触气,
---------------------------
生物
Nearest to 生物: 朱椿为, mecha, 求证, 舞鞋, 核工程, 国际交流, eat, 展览馆,
Average loss at step 682000: 0.097374