CBOW word vectors implemented with TensorFlow

1: First, import the packages

#encoding=utf8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

Step 2: Read the data and extract the text from the file.



#Implement word2vec (CBOW) by hand with TensorFlow

#1: Read the file and extract the text


filename = 'text8.zip'
def readdata(filename):
    with zipfile.ZipFile(filename) as f:  # open the zip archive
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()  # read the first file in the archive and split the text into a list of words
    return data
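Before calling readdata it is worth confirming that text8.zip is actually on disk, so a missing file fails with a clear message. A small optional guard, using the os module imported above:

# Optional: fail early with a clear message if the corpus file is missing.
if not os.path.exists(filename):
    raise IOError('text8.zip not found; download it first and place it next to this script.')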


words = readdata(filename)

print(len(words))

Remove high-frequency stop words to reduce noise in the model


def remove_fre_stop_words(words):
    t = 1e-5          # subsampling constant t
    threshold = 0.8   # drop-probability threshold: words whose drop probability exceeds 0.8 are removed

    # count how often each word occurs (collections.Counter does the counting)
    int_word_counts = collections.Counter(words)

    total_count = len(words)

    # word frequencies
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}

    # drop probability of each word: P(w) = 1 - sqrt(t / f(w))
    prob_drop = {w: 1 - np.sqrt(t / f) for w, f in word_freqs.items()}

    # keep only the words whose drop probability is below the threshold
    train_words = [w for w in words if prob_drop[w] < threshold]

    return train_words

words = remove_fre_stop_words(words)

print(len(words))
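Note that the filter above is deterministic: every occurrence of any word whose drop probability exceeds 0.8 is removed. The subsampling described in the original word2vec paper instead drops each occurrence at random with probability 1 - sqrt(t / f(w)). A minimal standalone sketch of that randomized variant (same inputs, not what this post trains with):

import collections
import random
import numpy as np

def subsample_randomly(words, t=1e-5):
    # Randomized subsampling: each occurrence of w is dropped with probability 1 - sqrt(t / f(w)).
    counts = collections.Counter(words)
    total = len(words)
    prob_drop = {w: 1 - np.sqrt(t * total / c) for w, c in counts.items()}
    return [w for w in words if random.random() > prob_drop[w]]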


Build the vocabulary, replacing rare words with UNK

vocabulary_size = len(set(words))  # number of distinct words in words
print("Data size", vocabulary_size)

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))  # count every word and keep the (vocabulary_size - 1) most common, in descending frequency, e.g. [['UNK', -1], ('的', 99229), ('在', 25925), ('是', 20172), ('年', 17007), ('和', 16514), ('为', 15231), ('了', 13053), ('有', 11253), ('与', 11194)]
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)  # assign each word in count an id, e.g.
        # [('UNK', 0), ('的', 1), ('在', 2), ('是', 3), ('年', 4), ('和', 5), ('为', 6), ('了', 7), ('有', 8), ('与', 9)]
        # the larger the id, the rarer the word; dictionary maps word -> id

    data = [dictionary[word] if word in dictionary else 0 for word in words]  # replace each word with its id; out-of-vocabulary words map to 0 (UNK), e.g. [14880, 4491, 483, 70, 1, 1009, 1850, 317, 14, 76]
    unk_count = data.count(0)  # number of words that were mapped to UNK

    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))  # invert dictionary so ids map back to words, e.g. [(0, 'UNK'), (1, '的'), (2, '在'), (3, '是'), (4, '年'), (5, '和'), (6, '为'), (7, '了')]

    return data, count, dictionary, reverse_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(words)  # data (length 2262896): each word of the corpus replaced by its id; count (length 199247): the frequency table of (word, count) pairs
del words  # Hint to reduce memory.

print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
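A quick sanity check on the two lookup tables, using only the variables built above (count[1] holds the most frequent real word, since index 0 is reserved for UNK):

# Round-trip a word: word -> id -> word.
most_common_word = count[1][0]
idx = dictionary[most_common_word]
assert reverse_dictionary[idx] == most_common_word
print(most_common_word, '->', idx)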


data_index = 0

# generate_batch builds one batch of training data: batch holds the context words and labels holds the center word,
# with bag_windows words taken from each side of the center word. For a span of three words such as "萌萌 爱 赖赖",
# 萌萌 and 赖赖 form the training context and the middle word 爱 is the label.
def generate_batch(batch_size, bag_windows):
    global data_index
    span = 2 * bag_windows + 1  # each span covers 2 * bag_windows context words plus the center word
    batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)   # initialize batch (context words)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)         # initialize labels (center words)
    buffer = collections.deque(maxlen=span)  # use a deque here, not a plain list, and remember to cap its length at span so old words fall out automatically
    for i in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size):
        buffer_list = list(buffer)
        labels[i, 0] = buffer_list.pop(bag_windows)  # the middle word becomes the label
        batch[i] = buffer_list                       # the remaining words are the context
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels




batch, labels = generate_batch(batch_size=4, bag_windows=1)

print(batch)
print(labels)
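To confirm that the pairing really is context words -> center word, the sampled batch can be decoded back into words (this sketch only uses batch, labels and reverse_dictionary from above):

# Decode the ids so the context/label pairing is human readable.
for i in range(len(batch)):
    context_words = [reverse_dictionary[idx] for idx in batch[i]]
    center_word = reverse_dictionary[labels[i, 0]]
    print(context_words, '->', center_word)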



#Next, build the model


batch_size = 128

embedding_size = 128   # dimensionality of the word vectors

bag_window = 2         # number of context words on each side of the center word

valid_size = 16        # number of words used for the periodic similarity check
valid_window = 100     # validation words are drawn from the 100 most frequent words
valid_examples = np.array(random.sample(range(valid_window), valid_size))  # random.sample draws valid_size distinct ids

# How random.sample works:
# import random
#
# lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# for i in range(3):
#     slice = random.sample(lst, 5)  # draw 5 random elements from lst and return them as a new list
#     print(slice)
#     print(lst, '\n')  # the original sequence is not modified
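Since reverse_dictionary already exists at this point, a one-line check shows which words were picked for validation and makes the similarity logs printed during training easier to read:

# Show which words will be tracked during training.
print([reverse_dictionary[i] for i in valid_examples])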

num_sampled = 64  # number of negative classes sampled by the sampled softmax loss
graph = tf.Graph()

#Build the TensorFlow graph
with graph.as_default():
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, bag_window * 2])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # The embeddings are initialized from a uniform distribution, with shape (vocabulary size, embedding size);
    # tf.nn.embedding_lookup() turns the integer inputs into their word vectors.
    embeddings = tf.Variable(tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embeds = tf.nn.embedding_lookup(embeddings, train_dataset)


    # The sampled-softmax loss treats every word as its own output class, so each word needs an output
    # weight vector and a bias; the weights are initialized from a truncated normal distribution
    # (values beyond two standard deviations are re-drawn).

    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    softmax_bias = tf.Variable(tf.zeros([vocabulary_size]))

    # Define the loss: the context embeddings are summed along axis 1 to form the CBOW input.

    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                                     biases=softmax_bias,
                                                     labels=train_labels,
                                                     inputs=tf.reduce_sum(embeds, 1),
                                                     num_sampled=num_sampled,
                                                     num_classes=vocabulary_size))

    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # L2 norm of each embedding row, used to normalize the vectors to unit length.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm

    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
    # tf.transpose transposes normalized_embeddings, so similarity holds the cosine similarity
    # between every validation word and every word in the vocabulary.
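One design note: the context embeddings are combined by summation (tf.reduce_sum(embeds, 1)) before being fed to the sampled softmax. Classic CBOW averages the context vectors instead, which keeps the input scale independent of the window size. If you prefer that variant, the only change inside the graph would be the combiner (a sketch, not what this post trains with):

# inputs=tf.reduce_mean(embeds, 1)   # average the context embeddings instead of summing them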


num_steps = 1000001

with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()  # initialize all variables in the graph
    print("Initialized")
    average_loss = 0

    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, bag_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, l = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
            # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                print("---------------------------")
                print(valid_word)
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    final_embeddings = normalized_embeddings.eval()

    print("*" * 10 + "final_embeddings:" + "*" * 10 + "\n", final_embeddings)
    # Save the word vectors: one word per line, followed by its embedding components.
    fp = open('vector_cbow_mengmeng.txt', 'w', encoding='utf8')
    for k, v in reverse_dictionary.items():
        vector_str = ' '.join(str(x) for x in final_embeddings[k])
        fp.write(v + ' ' + vector_str + '\n')
    fp.close()
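Once the vectors are written out they can be reused without TensorFlow. A minimal sketch (illustration only; the file name and neighbour count mirror the script above) for loading them with numpy and querying nearest neighbours by cosine similarity:

import numpy as np

def load_vectors(path='vector_cbow_mengmeng.txt'):
    # Read "word x1 x2 ... xn" lines back into a word list and an embedding matrix.
    words, vecs = [], []
    with open(path, encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            words.append(parts[0])
            vecs.append([float(x) for x in parts[1:]])
    vecs = np.array(vecs)
    vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)  # re-normalize to unit length
    return words, vecs

def nearest_words(query, words, vecs, top_k=8):
    # Cosine similarity against every word; skip the query word itself.
    idx = words.index(query)
    sims = vecs @ vecs[idx]
    order = np.argsort(-sims)[1:top_k + 1]
    return [words[i] for i in order]

For example, words, vecs = load_vectors() followed by print(nearest_words('西', words, vecs)) should give a neighbour list in the spirit of the training log below.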


The data file text8.zip is Wikipedia text that has already been word-segmented. You can download it here if you need it: https://download.csdn.net/download/wang263334857/11389707

 

The training output looks like this:

Average loss at step 672000: 0.115959
Average loss at step 674000: 0.101580
Average loss at step 676000: 0.093495
Average loss at step 678000: 0.100869
Average loss at step 680000: 0.094914
---------------------------
university
Nearest to university: 中共中央组织部, 卡里莫夫, 布凡, 黄辅辰, 林永升, 从里, 伯承, 玄猫,
---------------------------
西
Nearest to 西: wtem, 夕阳, 阿当, 网书, 株洲县, 大里区, 夜访, 辽视,
---------------------------
佛教
Nearest to 佛教: 辽圣宗伐, 人工湖, 活动场所, 农业产业, 有寺, 领巾, 黄俊华, 杏仁,
---------------------------
古代
Nearest to 古代: 比较文学, 碳酸钾, 古老, 头等, 巴颜喀拉山, 最早, 匹妇, 华隆语,
---------------------------
像
Nearest to 像: 略偏, mutter, 出废, 压下去, 瓦尔特, 小鸭, 鸭绿江畔, 纳普,
---------------------------
n
Nearest to n: 异丁烯, 郑康公, 羲, 后前, 张梓琳, yoshi, 控制组, shaolin,
---------------------------
细胞
Nearest to 细胞: 媒介, 合众国, 鸟嘌呤, 林周县, 本流, 波得, 柳河县, 血,
---------------------------
地位
Nearest to 地位: 以林大, 陕西汉, 权力, 当大, 视听, 非典型, 寒带, treeempty,
---------------------------
俄罗斯
Nearest to 俄罗斯: 信部, 村道, 暂行规定, output, 内海, 诱惑, ronda, 之命立,
---------------------------
机构
Nearest to 机构: 油香, 同级, 国联, mythologie, 无母数, 结合能, 谢尔夫, xrtt,
---------------------------
委员会
Nearest to 委员会: 委, 陶努斯园, 降服, 金禧, 苹果日报, 苕, 首都区, 紫荆,
---------------------------
西班牙
Nearest to 西班牙: 男著, 北约组织, 纳瓦拉, 萨族, 易七世, 售电, 邱庆枫, 比利时,
---------------------------
控制
Nearest to 控制: 卡梅伦, 集团, 排斥性, 正交关系, 啤酒花, 大出风头, staples, 一万倍,
---------------------------
公元前
Nearest to 公元前: mavericks, 顺帝, 螺旋运动, 水坝, 破坏活动, 双辽市, 东别, 莫维奇,
---------------------------
功能
Nearest to 功能: 听觉, 现用, 成人, 亲唐, 自愧, 有线电, filarmonica, 触气,
---------------------------
生物
Nearest to 生物: 朱椿为, mecha, 求证, 舞鞋, 核工程, 国际交流, eat, 展览馆,
Average loss at step 682000: 0.097374
