# 接下来是我的详细的推导过程
# 接下来是代码的实现部分:
import argparse
import math
import struct
import sys
import time
import warnings
import os
import numpy as np
#from multiprocessing import Pool,Value,Array
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
## multiprocessing需要在linux环境下使用!!!!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
import multiprocessing
class VocabItem:
    """Per-token record: the surface form plus frequency and Huffman data."""

    def __init__(self, word):
        self.word = word   # the token string itself
        self.count = 0     # corpus frequency, accumulated while scanning
        self.path = None   # Huffman tree: inner-node indices from root to this leaf
        self.code = None   # Huffman tree: binary code for this token
class Vocab:
    """Vocabulary built from a whitespace-tokenized training file.

    One VocabItem per distinct token; tokens occurring fewer than
    `min_count` times are merged into a single '<unk>' entry, and the
    vocabulary is sorted by descending frequency.  Special '<bol>'/'<eol>'
    markers are counted once per input line.

    Fixes vs. the original: the special tokens had been stripped to empty
    strings '' (HTML-like corruption), which created duplicate entries and
    broke the '<unk>' merge; `__getitem` was missing its trailing dunder
    underscores; the file handle was never closed; `encode_huffman` was
    garbled and is restored to the standard word2vec construction.
    """

    def __init__(self, fi, min_count):
        """Read training file at path `fi` and build the sorted vocabulary.

        fi        -- path to the training text file
        min_count -- tokens rarer than this are merged into '<unk>'
        """
        vocab_items = []
        vocab_hash = {}
        word_count = 0

        # `with` guarantees the handle is closed (the original leaked it).
        with open(fi, 'r', encoding='utf-8') as fin:
            # Special tokens: beginning of line and end of line.
            for token in ['<bol>', '<eol>']:
                vocab_hash[token] = len(vocab_items)
                vocab_items.append(VocabItem(token))

            for line in fin:
                for token in line.split():
                    if token not in vocab_hash:
                        # First sighting: register the token.
                        vocab_hash[token] = len(vocab_items)
                        vocab_items.append(VocabItem(token))
                    vocab_items[vocab_hash[token]].count += 1
                    word_count += 1
                # Each line implicitly contributes one <bol> and one <eol>.
                vocab_items[vocab_hash['<bol>']].count += 1
                vocab_items[vocab_hash['<eol>']].count += 1
                word_count += 2
            # Total number of bytes read from the training file.
            self.bytes = fin.tell()

        self.vocab_items = vocab_items  # list of VocabItem objects
        self.vocab_hash = vocab_hash    # token -> index into vocab_items
        self.word_count = word_count    # total tokens incl. repeats and markers

        # Merge rare words into '<unk>' and sort by descending frequency.
        self.__sort(min_count)

        print("Total words in training file: %d" % self.word_count)
        print("Total bytes in training file: %d" % self.bytes)
        print("Vocab size: %d" % len(self))

    def __getitem__(self, i):
        # Fixed: original defined `__getitem` (no trailing underscores),
        # so `vocab[i]` raised TypeError.
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        """Merge tokens with count < min_count into '<unk>', then sort desc."""
        tmp = [VocabItem('<unk>')]
        unk_hash = 0   # index of the '<unk>' entry in tmp
        count_unk = 0  # number of distinct tokens merged away
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
            else:
                tmp.append(token)
        tmp.sort(key=lambda token: token.count, reverse=True)

        # Rebuild the token -> index mapping for the sorted list.
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i
        self.vocab_hash = vocab_hash
        self.vocab_items = tmp

    def indices(self, tokens):
        """Map tokens to vocab indices; unknown tokens map to '<unk>'."""
        return [self.vocab_hash[token] if token in self
                else self.vocab_hash['<unk>'] for token in tokens]

    def encode_huffman(self):
        """Build a Huffman tree over the frequency-sorted vocab and store each
        token's root-to-leaf inner-node path and binary code on its VocabItem.

        NOTE(review): the original body was garbled/truncated; restored to
        the standard word2vec construction, which the surviving fragments
        match.  Leaves are the vocab_size words; the vocab_size-1 inner
        nodes start with 1e15 sentinel counts so they never compare smaller
        than an unfilled slot.
        """
        vocab_size = len(self)
        count = [t.count for t in self] + [1e15] * (vocab_size - 1)
        parent = [0] * (2 * vocab_size - 2)  # parent index of every non-root node
        binary = [0] * (2 * vocab_size - 2)  # bit assigned on the edge to the parent

        # Vocab is sorted by descending count: pos1 walks the words from
        # rarest to most frequent, pos2 walks the freshly created inner nodes.
        pos1 = vocab_size - 1
        pos2 = vocab_size
        for i in range(vocab_size - 1):
            # Find min1: node with the smallest remaining count.
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min1 = pos1
                    pos1 -= 1
                else:
                    min1 = pos2
                    pos2 += 1
            else:
                min1 = pos2
                pos2 += 1
            # Find min2: node with the second-smallest remaining count.
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min2 = pos1
                    pos1 -= 1
                else:
                    min2 = pos2
                    pos2 += 1
            else:
                min2 = pos2
                pos2 += 1

            # Merge the two minima under a new inner node.
            count[vocab_size + i] = count[min1] + count[min2]
            parent[min1] = vocab_size + i
            parent[min2] = vocab_size + i
            binary[min2] = 1  # the second child carries bit 1

        # Assign each word its code (bits) and path (inner-node indices).
        root_idx = 2 * vocab_size - 2
        for i, token in enumerate(self):
            path = []  # inner nodes on the way up (leaf -> root)
            code = []  # bits collected on the way up
            node_idx = i
            while node_idx < root_idx:
                if node_idx >= vocab_size:
                    path.append(node_idx)
                code.append(binary[node_idx])
                node_idx = parent[node_idx]
            path.append(root_idx)

            # Reverse so both run root -> leaf; shift inner-node indices to
            # start at 0 so they can index the syn1 matrix directly.
            token.path = [j - vocab_size for j in path[::-1]]
            token.code = code[::-1]
class UnigramTale:
    """Placeholder for the negative-sampling table.

    Intended to hold vocab indices drawn with probability proportional to
    count**0.75 (a power-law / unigram distribution); the sampling logic
    is not implemented yet.

    NOTE(review): the name looks like a typo for `UnigramTable`, but it is
    kept unchanged so existing references keep working.
    """

    def __init__(self, vocab):
        table_size = len(vocab)  # number of vocabulary entries
        smoothing = 0.75         # power-law exponent for unigram counts
        # TODO: build the actual sampling table (negative sampling unfinished)
def sigmoid(z):
    """Clipped logistic function 1 / (1 + e^-z).

    For |z| > 6 the true value is within ~0.0025 of 0 or 1, so it is
    clamped — a cheap approximation for the binary classifications in the
    training inner loop.
    """
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    # Bug fix: the original `1/1+math.exp(-z)` evaluated as 1 + e^-z
    # because of operator precedence; parenthesize the denominator.
    return 1.0 / (1.0 + math.exp(-z))
def init_net(dim, vocab_size):
    """Allocate the two weight matrices for training.

    dim        -- dimensionality of the word embeddings
    vocab_size -- number of rows (one per vocabulary entry)

    Returns (syn0, syn1), each of shape (vocab_size, dim):
      syn0 -- input-side word embeddings, uniform on [-0.5/dim, 0.5/dim)
      syn1 -- output-side weights (Huffman inner nodes / negative-sampling
              targets), initialized to zeros

    Fix: removed leftover "hello" debug prints and pointless temporaries.
    """
    syn0 = np.random.uniform(low=-0.5 / dim, high=0.5 / dim,
                             size=(vocab_size, dim))
    syn1 = np.zeros(shape=(vocab_size, dim))
    return (syn0, syn1)
def train_process(vocab, syn0, syn1, table, cbow, neg, dim, starting_alpha,
                  win, num_processes, global_word_count, fi, file_size):
    """Run one single-process training pass over the open file handle `fi`.

    NOTE(review): the original body was garbled and truncated mid-function;
    this is a reconstruction consistent with the surviving fragments
    (progress print every 1000 words, linear alpha decay, <bol>/<eol>
    sentence markers).  Confirm against the intended reference.

    Reads lines until `file_size` bytes have been consumed and, for every
    token, updates syn0/syn1 in place with CBOW (`cbow` truthy) or
    skip-gram, using either negative sampling (neg > 0, draws from `table`)
    or hierarchical softmax (walks the token's Huffman path/code —
    vocab.encode_huffman() must have been called).

    Returns the updated global word count (the caller may ignore it).
    NOTE(review): UnigramTale does not implement sample() yet, so neg > 0
    will fail at runtime until the table is finished.
    """
    alpha = starting_alpha
    word_count = 0
    last_word_count = 0

    while fi.tell() < file_size:
        line = fi.readline().strip()
        if not line:
            continue  # skip blank lines
        # Map tokens to vocab indices, adding line-boundary markers.
        sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])

        for sent_pos, token in enumerate(sent):
            if word_count % 1000 == 0:
                print("已完成的词语训练个数:%d ----------------" % word_count)
                global_word_count += (word_count - last_word_count)
                last_word_count = word_count
                # Linearly decay the learning rate, with a small floor.
                alpha = starting_alpha * (1 - float(global_word_count) / vocab.word_count)
                if alpha < starting_alpha * 0.0001:
                    alpha = starting_alpha * 0.0001

            # Random window size in [1, win]; context excludes the center.
            current_win = np.random.randint(low=1, high=win + 1)
            context_start = max(sent_pos - current_win, 0)
            context_end = min(sent_pos + current_win + 1, len(sent))
            context = sent[context_start:sent_pos] + sent[sent_pos + 1:context_end]

            if cbow:
                # CBOW: predict the center token from the mean context vector.
                neu1 = np.mean(np.array([syn0[c] for c in context]), axis=0)
                neu1e = np.zeros(dim)  # accumulated gradient for the context
                if neg > 0:
                    classifiers = [(token, 1)] + [(t, 0) for t in table.sample(neg)]
                else:
                    classifiers = zip(vocab.vocab_items[token].path,
                                      vocab.vocab_items[token].code)
                for target, label in classifiers:
                    z = np.dot(neu1, syn1[target])
                    p = sigmoid(z)
                    g = alpha * (label - p)  # learning rate * gradient
                    neu1e += g * syn1[target]
                    syn1[target] += g * neu1
                for context_word in context:
                    syn0[context_word] += neu1e
            else:
                # Skip-gram: pair the center token with each context word.
                for context_word in context:
                    neu1e = np.zeros(dim)
                    if neg > 0:
                        classifiers = [(token, 1)] + [(t, 0) for t in table.sample(neg)]
                    else:
                        classifiers = zip(vocab.vocab_items[token].path,
                                          vocab.vocab_items[token].code)
                    for target, label in classifiers:
                        z = np.dot(syn0[context_word], syn1[target])
                        p = sigmoid(z)
                        g = alpha * (label - p)
                        neu1e += g * syn1[target]
                        syn1[target] += g * syn0[context_word]
                    syn0[context_word] += neu1e

            word_count += 1

    # Fold in the words processed since the last progress checkpoint.
    global_word_count += (word_count - last_word_count)
    return global_word_count
def train(fi, fo, cbow, neg, dim, alpha, win, min_count, num_processes, binary):
    """End-to-end driver: build vocab, init weights, train, save the model.

    NOTE(review): the original `def train(...)` header and its opening lines
    were lost in a corrupted region of the file; the signature is taken
    from the call site at the bottom of the script and the body from the
    surviving tail.  Confirm against the intended reference.

    fi/fo      -- training-file path / output model path
    cbow       -- True for CBOW, False for skip-gram
    neg        -- >0: negative sampling count; 0: hierarchical softmax
    dim        -- embedding dimensionality
    alpha      -- starting learning rate
    win        -- maximum context window size
    min_count  -- rare-word threshold for the vocabulary
    num_processes -- kept for interface compatibility (training is
                     single-process here; see commented multiprocessing note)
    binary     -- save format flag passed through to `save`
    """
    # Build the vocabulary (also records the file size in vocab.bytes).
    vocab = Vocab(fi, min_count)

    # Initialize the input (syn0) and output (syn1) weight matrices.
    syn0, syn1 = init_net(dim, len(vocab))

    global_word_count = 0
    table = None
    if neg > 0:
        print('Initializing unigram table')
        # NOTE(review): UnigramTale is an unfinished stub (no sample()).
        table = UnigramTale(vocab)
    else:
        print('Initializing Huffman tree')
        vocab.encode_huffman()

    # Begin training (single process).
    t0 = time.time()
    file_size = vocab.bytes  # presumably train to EOF — confirm
    with open(fi, 'r', encoding='utf-8') as fin:
        train_process(vocab, syn0, syn1, table, cbow, neg, dim, alpha, win,
                      num_processes, global_word_count, fin, file_size)
    t1 = time.time()
    print('Completed training. Training took', (t1 - t0) / 60, 'minutes')

    # Persist the embeddings.  NOTE(review): `save` is not defined in the
    # visible part of this file — confirm it exists elsewhere.
    save(vocab, syn0, fo, binary)
if __name__ == '__main__':
    # Command-line entry point: collect hyper-parameters, then train.
    cli = argparse.ArgumentParser()
    cli.add_argument('-train', help='Training file', dest='fi', required=True)
    cli.add_argument('-model', help='Output model file', dest='fo', required=True)
    cli.add_argument('-cbow', help='1 for CBOW, 0 for skip-gram', dest='cbow', default=1, type=int)
    cli.add_argument('-negative',
                     help='Number of negative examples (>0) for negative sampling, 0 for hierarchical softmax',
                     dest='neg', default=5, type=int)
    cli.add_argument('-dim', help='Dimensionality of word embeddings', dest='dim', default=100, type=int)
    cli.add_argument('-alpha', help='Starting alpha', dest='alpha', default=0.025, type=float)
    cli.add_argument('-window', help='Max window length', dest='win', default=5, type=int)
    cli.add_argument('-min-count', help='Min count for words used to learn ', dest='min_count', default=5,
                     type=int)
    cli.add_argument('-processes', help='Number of processes', dest='num_processes', default=1, type=int)
    cli.add_argument('-binary', help='1 for output model in binary format, 0 otherwise', dest='binary', default=0,
                     type=int)
    # TO DO: cli.add_argument('-epoch', help='Number of training epochs', dest='epoch', default=1, type=int)
    opts = cli.parse_args()
    train(opts.fi, opts.fo, bool(opts.cbow), opts.neg, opts.dim, opts.alpha, opts.win,
          opts.min_count, opts.num_processes, bool(opts.binary))
# Example invocation:
# python cbow_hierachical_softmax_single_process.py -train="hello_cbow.txt" -model=cbow_save_file -cbow=1 -negative=0 -dim=100 -alpha=0.025 -window=5 -min-count=5 -processes=4 -binary=1