I won't go into the theory behind skip-gram or the derivation of its formulas here. This post mainly records the first neural network for which I wrote both the forward pass and the backward pass myself, and where I finally got to experience the astonishing speedup that negative sampling brings to word-vector training. Quite moving! The overall time complexity is still fairly high, though, and I'm currently looking into why gensim, which is also written in Python, is so much faster.
(If I find time tomorrow) the data and code will go up on my GitHub. The code is pretty rough and still needs improvement...
Python: 3.6
Machine: run locally on my Mac
Dataset: the English text8 corpus
import random
import time
from collections import Counter

import numpy as np


def preprocess(text, freq=5):
    '''
    Preprocess the raw text.
    Parameters
    ---
    text: the text data
    freq: word-frequency threshold
    '''
    # strip punctuation and special symbols from the text
    text = text.lower()
    for ch in ['.', ',', '"', ';', '!', '?', '(', ')', '--', ':']:
        text = text.replace(ch, ' ')
    words = text.split()
    # drop low-frequency words to reduce noise
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > freq]
    return trimmed_words
Here $f(w_i)$ is the relative frequency of word $w_i$ in the corpus and $t$ is a threshold, usually somewhere between 1e-5 and 1e-3. Following the subsampling trick from the word2vec paper, each word gets a drop probability $P(w_i) = 1 - \sqrt{t / f(w_i)}$, and a word is removed whenever its drop probability exceeds a threshold.
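As a quick sanity check of the formula (a standalone sketch with made-up frequencies, not values measured on text8), the drop probability rises quickly once a word's frequency is well above t:

import numpy as np

t = 1e-5
# hypothetical relative frequencies, from very common to rare
for f in [1e-2, 1e-3, 1e-4, 1e-5]:
    p_drop = 1 - np.sqrt(t / f)
    print('f(w) = {:g} -> drop probability = {:.3f}'.format(f, p_drop))
# a word with frequency 1e-2 is dropped about 97% of the time, one at 1e-5 is never dropped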
def get_train_words(path, t, threshold, freq):
    with open(path) as f:
        text = f.read()
    words = preprocess(text, freq)
    vocab = set(words)
    vocab_to_int = {w: c for c, w in enumerate(vocab)}
    int_to_vocab = {c: w for c, w in enumerate(vocab)}
    # convert the original text from words to integer ids
    int_words = [vocab_to_int[w] for w in words]
    # count the occurrences of each word
    int_word_counts = Counter(int_words)
    total_count = len(int_words)
    # relative frequency of each word
    word_freqs = {w: c / total_count for w, c in int_word_counts.items()}
    # drop probability of each word (subsampling)
    prob_drop = {w: 1 - np.sqrt(t / word_freqs[w]) for w in int_word_counts}
    # subsample the words
    train_words = [w for w in int_words if prob_drop[w] < threshold]
    return int_to_vocab, train_words
The size of the context window here is sampled randomly. This is done to sample words close to the center word more often; after all, the closer a word is to the center word, the more closely it is related to it. (A quick empirical check of this follows the function below.)
def get_targets(words, idx, window_size):
    '''
    Get the list of context words around a center word.
    Parameters
    ---
    words: list of word ids
    idx: index of the input (center) word
    window_size: maximum window size
    '''
    target_window = np.random.randint(1, window_size + 1)
    # handle the case where there are not enough words before the input word
    start_point = idx - target_window if (idx - target_window) > 0 else 0
    end_point = idx + target_window
    # output words (the context words inside the window)
    targets = set(words[start_point: idx] + words[idx + 1: end_point + 1])
    return list(targets)
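To see the effect concretely, here is a quick Monte Carlo check (my own sketch, on a toy list of word ids with window_size = 5): a word at offset 1 from the center is always inside the sampled window, while a word at offset 5 only makes it in about one run out of five.

from collections import Counter

toy_words = list(range(11))   # toy "sentence": word ids 0..10, center word at index 5
offset_counts = Counter()
runs = 10000
for _ in range(runs):
    for target in get_targets(toy_words, 5, 5):
        offset_counts[abs(target - 5)] += 1
# expected inclusion rates for offsets 1..5 are roughly 5/5, 4/5, 3/5, 2/5, 1/5
print({d: round(c / runs, 2) for d, c in sorted(offset_counts.items())})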
def get_batches(words, window_size):
    '''
    Pair each center word with every word in its (randomly sized) context window.
    '''
    for idx in range(0, len(words)):
        targets = get_targets(words, idx, window_size)
        for y in targets:
            yield words[idx], y
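A tiny usage example (again a sketch on a toy id list, not on text8), just to show the (center, context) pairs the generator yields:

for center, context in get_batches([0, 1, 2, 3, 4], 2):
    print(center, context)
# prints pairs like '0 1', '1 0', '1 2', ...; the exact set varies between
# runs because the window size is sampled randomly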
Here sigmoid_grad computes the gradient of the sigmoid function.
def softmax(vector):
    # subtract the max for numerical stability
    res = np.exp(vector - np.max(vector))
    res /= np.sum(res)
    return res


def sigmoid(inp):
    return 1.0 / (1.0 + np.exp(-inp))


def sigmoid_grad(inp):
    # inp is the *output* of the sigmoid, not the raw input
    return inp * (1 - inp)
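One thing to keep in mind: sigmoid_grad expects the output of the sigmoid rather than the raw input, since sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)). A quick finite-difference check (my own sketch):

x = 0.3
h = 1e-6
numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2 * h)
analytic = sigmoid_grad(sigmoid(x))   # pass the activated value, not x itself
print(numeric, analytic)              # the two agree to about six decimal places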
def forward_backward(input_vectors, output_vectors, in_idx, out_idx, sigma, vector_dimension, vocabulary_size):
    # forward pass: the hidden layer is simply the center word's input vector
    hidden = input_vectors[in_idx]
    output = np.dot(output_vectors, hidden)
    output_p = softmax(output)
    loss = -np.log(output_p[out_idx])
    # backward pass: softmax cross-entropy gradient with respect to the scores
    output_grad = output_p.copy()
    output_grad[out_idx] -= 1.0
    hidden_grad = np.dot(output_vectors.T, output_grad)
    hidden = hidden.reshape(vector_dimension, 1)
    output_grad = output_grad.reshape(vocabulary_size, 1)
    output_vectors_grad = np.dot(output_grad, hidden.T)
    # SGD update with learning rate sigma
    output_vectors -= sigma * output_vectors_grad
    input_vectors[in_idx] -= sigma * hidden_grad
    return loss
Note, however, that this is the forward and backward pass of the most basic skip-gram model, and it is far too slow to be usable: every single (center, context) pair requires a softmax and a weight update over the entire vocabulary. The negative-sampling version below replaces it.
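For reference, here is the per-pair objective that the code below optimizes (the standard skip-gram negative-sampling loss, written in my own notation: $v_c$ is the input vector of the center word, $u_o$ the output vector of the context word, and $u_k$ the output vectors of the $K$ sampled negative words):

$$L = -\log \sigma(u_o^\top v_c) - \sum_{k=1}^{K} \log \sigma(-u_k^\top v_c)$$

Its gradients, which are exactly what neg_forward_backward accumulates, are

$$\frac{\partial L}{\partial v_c} = (\sigma(u_o^\top v_c) - 1)\,u_o + \sum_{k=1}^{K} \sigma(u_k^\top v_c)\,u_k, \quad \frac{\partial L}{\partial u_o} = (\sigma(u_o^\top v_c) - 1)\,v_c, \quad \frac{\partial L}{\partial u_k} = \sigma(u_k^\top v_c)\,v_c$$

so each training pair only touches $K + 1$ output vectors instead of all of them, which is where the speedup comes from.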
def neg_forward_backward(input_vectors, output_vectors, in_idx, out_idx, sigma, vocabulary_size, K=10):
    epsilon = 1e-5
    hidden = input_vectors[in_idx]
    neg_idxs = neg_sample(vocabulary_size, out_idx, K)
    # positive sample: push sigmoid(u_o . v_c) towards 1
    tmp = sigmoid(np.dot(output_vectors[out_idx], hidden))
    hidden_grad = (tmp - 1.0) * output_vectors[out_idx]
    output_vectors[out_idx] -= sigma * (tmp - 1.0) * hidden
    loss = -np.log(tmp + epsilon)
    # negative samples: push sigmoid(u_k . v_c) towards 0
    for idx in neg_idxs:
        tmp = sigmoid(np.dot(output_vectors[idx], hidden))
        loss -= np.log(1.0 - tmp + epsilon)
        hidden_grad += tmp * output_vectors[idx]
        output_vectors[idx] -= sigma * tmp * hidden
    # update the center word's input vector
    input_vectors[in_idx] -= sigma * hidden_grad
    return loss
def neg_sample(vocabulary_size, out_idx, K):
    # draw K negative word ids uniformly, avoiding the positive word itself
    res = [None] * K
    for i in range(K):
        tmp = np.random.randint(0, vocabulary_size)
        while tmp == out_idx:
            tmp = np.random.randint(0, vocabulary_size)
        res[i] = tmp
    return np.array(res)
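One simplification worth pointing out: neg_sample draws negatives uniformly over the vocabulary, whereas the original word2vec (and gensim) sample them from the unigram distribution raised to the 3/4 power, so frequent words are picked as negatives more often. A hedged sketch of that variant (build_neg_probs and neg_sample_unigram are my own names, and int_word_counts would have to be returned from get_train_words; none of this exists in the code above):

def build_neg_probs(int_word_counts, vocabulary_size):
    # sampling distribution proportional to count^0.75, as in the word2vec paper
    counts = np.array([int_word_counts[i] for i in range(vocabulary_size)], dtype=np.float64)
    probs = counts ** 0.75
    return probs / probs.sum()

def neg_sample_unigram(probs, out_idx, K):
    # redraw whenever the positive word sneaks into the negatives
    res = np.random.choice(len(probs), size=K, p=probs)
    while out_idx in res:
        res = np.random.choice(len(probs), size=K, p=probs)
    return res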
To verify how well the word vectors are trained, we check whether the K words most similar to a given word really do look related to it. The function below randomly picks some high-frequency words and finds the K most similar words for each of them.
def get_simi(input_vectors):
    valid_size = 16
    valid_window = 100
    # pick 8 words from each of two id ranges
    valid_examples = np.array(random.sample(range(valid_window), valid_size // 2))
    valid_examples = np.append(valid_examples,
                               random.sample(range(1000, 1000 + valid_window), valid_size // 2))
    valid_size = len(valid_examples)
    # normalize every word vector to unit length
    norm = np.sqrt(np.square(input_vectors).sum(axis=1)).reshape(len(input_vectors), 1)
    normalized_embedding = input_vectors / norm
    # look up the vectors of the validation words
    valid_embedding = normalized_embedding[valid_examples]
    # cosine similarity against the whole vocabulary
    similarity = np.dot(valid_embedding, normalized_embedding.T)
    return similarity, valid_size, valid_examples
if __name__ == "__main__":
    path = './text8.txt'
    t = 1e-5
    threshold = 0.8  # drop-probability threshold for subsampling
    freq = 5
    windows = 10
    int_to_vocab, train_words = get_train_words(path, t, threshold, freq)
    np.save('int_to_vocab', int_to_vocab)
    vocabulary_size = len(int_to_vocab)
    vector_dimension = 200
    input_vectors = np.random.random([vocabulary_size, vector_dimension])
    output_vectors = np.random.random([vocabulary_size, vector_dimension])
    epochs = 10  # number of training epochs
    sigma = 0.01
    K = 10
    iter = 1
    for e in range(1, epochs + 1):
        # decay the learning rate after the first and the third epoch
        if e > 3:
            sigma = 0.0001
        elif e > 1:
            sigma = 0.001
        loss = 0
        batches = get_batches(train_words, windows)
        start = time.time()
        for x, y in batches:
            loss += neg_forward_backward(input_vectors, output_vectors, x, y, sigma, vocabulary_size, K)
            if iter % 100000 == 0:
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iter),
                      "Avg. Training loss: {:.4f}".format(loss / 100000),
                      "{:.4f} sec/100000".format((end - start)))
                loss = 0
                start = time.time()
            if iter % 4000000 == 0:
                np.save('input_vectors', input_vectors)
                similarity, valid_size, valid_examples = get_simi(input_vectors)
                for i in range(valid_size):
                    valid_word = int_to_vocab[valid_examples[i]]
                    top_k = 8  # number of most similar words to show
                    nearest = (-similarity[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to [%s]:' % valid_word
                    for k in range(top_k):
                        close_word = int_to_vocab[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)
            iter += 1
You can see that it does learn something, but because the time complexity is high I didn't tune the hyperparameters, the number of epochs was too small, and the dataset is fairly small too, so the results are not great. Still, it is more than enough for getting familiar with the inner workings of skip-gram and with negative sampling! I'm still looking into why gensim, which is also written in Python, is so fast; I plan to borrow some ideas from it and then implement hierarchical softmax myself as well.
References:
https://www.leiphone.com/news/201706/QprrvzsrZCl4S2lw.html
https://zhuanlan.zhihu.com/p/33625794