import numpy as np

# the bias b is omitted here
c = np.array([[1, 0, 0, 0, 0, 0, 0]])  # input (1 x 7); the extra row dimension allows mini-batch processing
W = np.random.randn(7, 3)  # weights
h = np.dot(c, W)  # intermediate node
print(h)
# [[-0.70012195 0.25204755 -0.79774592]]
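Multiplying a one-hot vector by W simply selects the corresponding row of W, which is why the Embedding layer introduced later can replace this matrix product. A quick check, reusing c and W from above (0 is the index of the 1 in c):

print(np.allclose(h, W[0]))  # True: np.dot(c, W) just extracts row 0 of W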
Bag-of-words literally means "a bag of words", which implies that the order of the words inside the bag is ignored.
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul

# sample context data
c0 = np.array([[1, 0, 0, 0, 0, 0, 0]])
c1 = np.array([[0, 0, 1, 0, 0, 0, 0]])

# initial values of the weights
W_in = np.random.randn(7, 3)
W_out = np.random.randn(3, 7)

# create the layers
in_layer0 = MatMul(W_in)
in_layer1 = MatMul(W_in)
out_layer = MatMul(W_out)

# forward propagation
h0 = in_layer0.forward(c0)
h1 = in_layer1.forward(c1)
h = 0.5 * (h0 + h1)
s = out_layer.forward(h)
print(s)
# [[ 0.30916255 0.45060817 -0.77308656 0.22054131 0.15037278
# -0.93659277 -0.59612048]]
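Because the hidden layer is just the average of the two context projections, swapping the two context vectors leaves h (and hence s) unchanged; this is the bag-of-words property mentioned above. A quick check, reusing the variables defined above:

h_swapped = 0.5 * (in_layer0.forward(c1) + in_layer1.forward(c0))
print(np.allclose(h, h_swapped))  # True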
The Softmax layer and the Cross Entropy Error layer are combined into a single Softmax with Loss layer.
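A minimal sketch of what such a combined layer computes, assuming one-hot teacher labels (the actual common.layers.SoftmaxWithLoss also accepts labels given as class indices): the forward pass is softmax followed by cross-entropy error, and the backward pass collapses to the simple gradient (y - t) / batch_size.

import numpy as np

class SimpleSoftmaxWithLoss:
    def __init__(self):
        self.y = None  # softmax output
        self.t = None  # one-hot teacher labels

    def forward(self, x, t):
        x = x - x.max(axis=1, keepdims=True)  # for numerical stability
        self.y = np.exp(x) / np.exp(x).sum(axis=1, keepdims=True)
        self.t = t
        eps = 1e-7
        return -np.sum(t * np.log(self.y + eps)) / x.shape[0]

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        return dout * (self.y - self.t) / batch_size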
Here we again use "You say goodbye and I say hello.", a corpus consisting of a single sentence, as the running example.
# corpus is the corpus given as a list of word IDs
def create_contexts_target(corpus, window_size=1):
    target = corpus[window_size:-window_size]
    contexts = []

    for idx in range(window_size, len(corpus) - window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue
            cs.append(corpus[idx + t])
        contexts.append(cs)

    return np.array(contexts), np.array(target)
from common.util import preprocess, create_contexts_target, convert_one_hot
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
contexts, target = create_contexts_target(corpus, window_size=1)
print(contexts)
# [[0 2]
# [1 3]
# [2 4]
# [3 1]
# [4 5]
# [1 6]]
print(target)
# [1 2 3 4 1 5]
# convert the contexts and the target words to one-hot representations
vocab_size = len(word_to_id)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)
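For this corpus (8 tokens, vocabulary size 7, window size 1) the shapes after the conversion are, as a quick sanity check:

print(contexts.shape)  # (6, 2, 7): 6 samples, 2 context words each, one-hot of length 7
print(target.shape)    # (6, 7)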
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul, SoftmaxWithLoss


class SimpleCBOW:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize the weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')   # 32-bit floats
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create the layers
        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [self.in_layer, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # expose the distributed representation of the words as a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward((contexts[:, 0] + contexts[:, 1]) * 0.5)
        score = self.out_layer.forward(h)
        loss = self.loss_layer.forward(score, target)
        return loss

    def backward(self, dout=1):
        ds = self.loss_layer.backward(dout)
        da = self.out_layer.backward(ds)
        self.in_layer.backward(da)
        return None
Implementation of the training loop
# coding: utf-8
import sys
sys.path.append('..')  # so that files in the parent directory can be imported
from common.trainer import Trainer
from common.optimizer import Adam
from simple_cbow import SimpleCBOW
from common.util import preprocess, create_contexts_target, convert_one_hot
window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)
model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()
word_vecs = model.word_vecs
for word_id, word in id_to_word.items():
    print(word, word_vecs[word_id])
import sys
sys.path.append('..')
import numpy as np
from common.layers import MatMul, SoftmaxWithLoss


class SimpleSkipGram:
    def __init__(self, vocab_size, hidden_size):
        V, H = vocab_size, hidden_size

        # initialize the weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        # create the layers
        self.in_layer = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer1 = SoftmaxWithLoss()
        self.loss_layer2 = SoftmaxWithLoss()

        # collect all weights and gradients into lists
        layers = [self.in_layer, self.out_layer]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # expose the distributed representation of the words as a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)
        s = self.out_layer.forward(h)
        l1 = self.loss_layer1.forward(s, contexts[:, 0])
        l2 = self.loss_layer2.forward(s, contexts[:, 1])
        loss = l1 + l2
        return loss

    def backward(self, dout=1):
        dl1 = self.loss_layer1.backward(dout)
        dl2 = self.loss_layer2.backward(dout)
        ds = dl1 + dl2
        dh = self.out_layer.backward(ds)
        self.in_layer.backward(dh)
        return None
A dense vector representation of a word is called a word embedding, or a distributed representation of the word.
class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params
        self.idx = idx
        out = W[idx]
        return out

    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        np.add.at(dW, self.idx, dout)  # add dout into dW; the rows to add to are given by self.idx
        # equivalent to:
        # for i, word_id in enumerate(self.idx):
        #     dW[word_id] += dout[i]
        return None
Note that writing the backward pass as
dW[self.idx] = dout
is problematic: when idx contains duplicate entries (e.g. idx = [0, 2, 0, 4]), the gradients flowing back to the same word overwrite one another, and only one of them survives. To avoid this, the gradients belonging to the same word must be summed, as done with np.add.at in the code above.
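A small demonstration of the difference, using a toy 5 x 3 gradient matrix with made-up values:

import numpy as np

idx = np.array([0, 2, 0, 4])
dout = np.ones((4, 3))          # pretend upstream gradient: one row per occurrence

dW_assign = np.zeros((5, 3))
dW_assign[idx] = dout           # row 0 is written twice; only the last write survives

dW_addat = np.zeros((5, 3))
np.add.at(dW_addat, idx, dout)  # row 0 correctly accumulates both gradients

print(dW_assign[0])  # [1. 1. 1.]
print(dW_addat[0])   # [2. 2. 2.]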
After the network is converted to binary classification, given the inputs "you" and "goodbye" it only has to output the probability that the word in between is "say" (a yes/no question). Accordingly, the output side uses a Sigmoid with Loss layer (sigmoid followed by cross-entropy error) instead of Softmax with Loss.
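As a minimal sketch (the actual common.layers.SigmoidWithLoss differs in implementation details), the combined layer applies the sigmoid to the score and evaluates the binary cross-entropy; its backward pass again reduces to (y - t) / batch_size.

import numpy as np

class SimpleSigmoidWithLoss:
    def __init__(self):
        self.y = None  # sigmoid output
        self.t = None  # labels (1 = positive example, 0 = negative example)

    def forward(self, x, t):
        self.y = 1 / (1 + np.exp(-x))
        self.t = t
        eps = 1e-7
        loss = -np.sum(t * np.log(self.y + eps) + (1 - t) * np.log(1 - self.y + eps))
        return loss / x.shape[0]

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        return dout * (self.y - self.t) / batch_size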
class EmbeddingDot:
    def __init__(self, W):
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None  # stores intermediate results of the forward pass

    def forward(self, h, idx):
        target_W = self.embed.forward(idx)
        out = np.sum(target_W * h, axis=1)
        self.cache = (h, target_W)
        return out

    def backward(self, dout):
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)
        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        dh = dout * target_W
        return dh
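A quick usage sketch with made-up shapes, reusing the EmbeddingDot class defined above: for a mini-batch of 4 hidden vectors and 4 target word IDs, forward() returns one dot-product score per sample.

import numpy as np

W = np.random.randn(7, 3).astype('f')   # hypothetical output-side weights (V=7, H=3)
h = np.random.randn(4, 3).astype('f')   # hidden vectors for a mini-batch of 4
idx = np.array([0, 3, 1, 0])            # target word IDs for the 4 samples

layer = EmbeddingDot(W)
score = layer.forward(h, idx)
print(score.shape)                      # (4,)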
The sampling method used by negative sampling
# negative sampling
import collections
from common.np import *  # provides np and the GPU flag used below (numpy or cupy)
class UnigramSampler:
    def __init__(self, corpus, power, sample_size):  # corpus is given as a list of word IDs
        self.sample_size = sample_size  # number of negative samples per target word
        self.vocab_size = None
        self.word_p = None

        counts = collections.Counter()
        for word_id in corpus:
            counts[word_id] += 1

        vocab_size = len(counts)
        self.vocab_size = vocab_size

        self.word_p = np.zeros(vocab_size)
        for i in range(vocab_size):
            self.word_p[i] = counts[i]

        self.word_p = np.power(self.word_p, power)
        self.word_p /= np.sum(self.word_p)

    def get_negative_sample(self, target):
        batch_size = target.shape[0]

        if not GPU:
            # for each target (context pair), draw sample_size negative examples
            negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)
            for i in range(batch_size):
                p = self.word_p.copy()
                target_idx = target[i]
                p[target_idx] = 0  # exclude the target itself
                p /= p.sum()
                negative_sample[i, :] = np.random.choice(self.vocab_size, size=self.sample_size,
                                                         replace=False, p=p)
        else:
            # on GPU (cupy) we favor speed; the target word may occasionally appear among the negatives
            negative_sample = np.random.choice(self.vocab_size, size=(batch_size, self.sample_size),
                                               replace=True, p=self.word_p)

        return negative_sample
A unigram is a single word; likewise, a bigram is a sequence of two consecutive words and a trigram a sequence of three. The class is called UnigramSampler because the probability distribution is built over single words; a bigram version would build the distribution over two-word combinations such as ('you', 'say'), ('you', 'goodbye'), and so on.
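A quick usage sketch on a made-up toy corpus (the word IDs and sizes below are arbitrary):

corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])   # toy corpus of word IDs
sampler = UnigramSampler(corpus, power=0.75, sample_size=2)

target = np.array([1, 3, 0])                  # three positive targets (a mini-batch)
negative_sample = sampler.get_negative_sample(target)
print(negative_sample.shape)                  # (3, 2): 2 negative word IDs per target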
# combines the Embedding Dot + Sigmoid with Loss layers used in the improved CBOW
from common.layers import SigmoidWithLoss


class NegativeSamplingLoss:
    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        # one layer for the positive example and sample_size layers for the negatives;
        # the only weights to be learned are still just W. sample_size + 1 layers are
        # created so that each can keep its own forward result for the backward pass.
        self.loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]

        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, h, target):
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # forward pass for the positive example
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # forward pass for the negative examples
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[1 + i].forward(h, negative_target)
            loss += self.loss_layers[1 + i].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        dh = 0
        # the backward pass simply calls each layer's backward() in turn
        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            dscore = l0.backward(dout)
            dh += l1.backward(dscore)
        return dh
Compared with the SimpleCBOW class, the improved CBOW class uses the Embedding layer and the Negative Sampling Loss layer. In addition, the context handling is generalized so that any window size can be used.
class CBOW:
    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        # initialize the weights (because Embedding layers are used,
        # both the input-side and the output-side weights have shape V x H)
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # create the layers
        self.in_layers = []
        for i in range(2 * window_size):
            layer = Embedding(W_in)  # all Embedding layers share the same weights
            self.in_layers.append(layer)
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # collect all weights and gradients into lists
        layers = self.in_layers + [self.ns_loss]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # expose the distributed representation of the words as a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = 0
        # because Embedding layers are used, contexts holds word IDs, not one-hot vectors
        for i, layer in enumerate(self.in_layers):
            h += layer.forward(contexts[:, i])
        h *= 1 / len(self.in_layers)
        loss = self.ns_loss.forward(h, target)
        return loss

    def backward(self, dout=1):
        dout = self.ns_loss.backward(dout)
        dout *= 1 / len(self.in_layers)
        for layer in self.in_layers:
            layer.backward(dout)
        return None
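A quick forward/backward sketch on a toy corpus (sizes are arbitrary); note that contexts and target are plain word-ID arrays here, not one-hot vectors:

corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3, 5, 6])   # toy corpus, vocabulary of 7 word IDs
window_size = 1
model = CBOW(vocab_size=7, hidden_size=5, window_size=window_size, corpus=corpus)

contexts, target = create_contexts_target(corpus, window_size)  # word IDs, shapes (8, 2) and (8,)
loss = model.forward(contexts, target)
model.backward()
print(loss)  # scalar loss for this mini-batch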
class SkipGram:
    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size
        rn = np.random.randn

        # initialize the weights
        W_in = 0.01 * rn(V, H).astype('f')
        W_out = 0.01 * rn(V, H).astype('f')

        # create the layers
        self.in_layer = Embedding(W_in)
        self.loss_layers = []
        for i in range(2 * window_size):
            layer = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)
            self.loss_layers.append(layer)

        # collect all weights and gradients into lists
        layers = [self.in_layer] + self.loss_layers
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # expose the distributed representation of the words as a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = self.in_layer.forward(target)

        loss = 0
        for i, layer in enumerate(self.loss_layers):
            loss += layer.forward(h, contexts[:, i])
        return loss

    def backward(self, dout=1):
        dh = 0
        for i, layer in enumerate(self.loss_layers):
            dh += layer.backward(dout)
        self.in_layer.backward(dh)
        return None
# coding: utf-8
import sys
sys.path.append('..')
from common import config
# To run on GPU, uncomment the line below (requires cupy)
# ===============================================
# config.GPU = True
# ===============================================
from common.np import *
import pickle
from common.trainer import Trainer
from common.optimizer import Adam
from cbow import CBOW
from skip_gram import SkipGram
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb
# hyperparameters
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10
# load the data
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)
# build the model, optimizer, and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
# model = SkipGram(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)
# start training
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()
# save the data needed for later use
word_vecs = model.word_vecs
if config.GPU:
    word_vecs = to_cpu(word_vecs)
params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
params['id_to_word'] = id_to_word
pkl_file = 'cbow_params.pkl' # or 'skipgram_params.pkl'
with open(pkl_file, 'wb') as f:
    pickle.dump(params, f, -1)
# coding: utf-8
import sys
sys.path.append('..')
from common.util import most_similar, analogy
import pickle
pkl_file = 'cbow_params.pkl'
# pkl_file = 'skipgram_params.pkl'
with open(pkl_file, 'rb') as f:
    params = pickle.load(f)
word_vecs = params['word_vecs']
word_to_id = params['word_to_id']
id_to_word = params['id_to_word']
Below we simply query the top-5 most similar words for a few query words.
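most_similar comes from common.util; roughly, it ranks every word in the vocabulary by cosine similarity to the query vector and prints the top entries. A minimal sketch of that idea (not the book's exact implementation; the name most_similar_sketch is just for illustration):

import numpy as np

def most_similar_sketch(query, word_to_id, id_to_word, word_matrix, top=5):
    if query not in word_to_id:
        print('%s is not found' % query)
        return
    query_vec = word_matrix[word_to_id[query]]

    # cosine similarity between the query vector and every word vector
    eps = 1e-8
    norms = np.linalg.norm(word_matrix, axis=1) * (np.linalg.norm(query_vec) + eps) + eps
    similarity = np.dot(word_matrix, query_vec) / norms

    print('\n[query] ' + query)
    count = 0
    for i in (-similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        count += 1
        if count >= top:
            return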
# most similar task
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
[query] you
we: 0.6103515625
someone: 0.59130859375
i: 0.55419921875
something: 0.48974609375
anyone: 0.47314453125
[query] year
month: 0.71875
week: 0.65234375
spring: 0.62744140625
summer: 0.6259765625
decade: 0.603515625
[query] car
luxury: 0.497314453125
arabia: 0.47802734375
auto: 0.47119140625
disk-drive: 0.450927734375
travel: 0.4091796875
[query] toyota
ford: 0.55078125
instrumentation: 0.509765625
mazda: 0.49365234375
bethlehem: 0.47509765625
nissan: 0.474853515625
The analogy task: find the word whose vector is closest to vec('king') + vec('woman') − vec('man'); the closeness can be measured with cosine similarity.
def analogy(a, b, c, word_to_id, id_to_word, word_matrix, top=5, answer=None):
    for word in (a, b, c):
        if word not in word_to_id:
            print('%s is not found' % word)
            return

    print('\n[analogy] ' + a + ':' + b + ' = ' + c + ':?')
    a_vec, b_vec, c_vec = word_matrix[word_to_id[a]], word_matrix[word_to_id[b]], word_matrix[word_to_id[c]]
    query_vec = b_vec - a_vec + c_vec
    query_vec = normalize(query_vec)

    similarity = np.dot(word_matrix, query_vec)

    if answer is not None:
        print("==>" + answer + ":" + str(np.dot(word_matrix[word_to_id[answer]], query_vec)))

    count = 0
    for i in (-1 * similarity).argsort():
        if np.isnan(similarity[i]):
            continue
        if id_to_word[i] in (a, b, c):
            continue
        print(' {0}: {1}'.format(id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return
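The normalize helper used above is also provided by common.util; in essence it rescales a vector (or each row of a matrix) to unit L2 norm. A minimal single-vector sketch (the name normalize_sketch is just for illustration):

import numpy as np

def normalize_sketch(x):
    # divide a 1-D vector by its L2 norm (the library version also handles 2-D matrices row-wise)
    return x / np.sqrt(np.sum(x ** 2))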
Evaluation based on analogy tasks
Figure 4-23 shows that the larger the corpus, the better the results (big data always helps), and that the dimensionality of the word vectors must be moderate (too many dimensions degrades accuracy).
Below we again simply query the top-5 answers for a few analogy questions.
# analogy task
print('-'*50)
analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs)
analogy('take', 'took', 'go', word_to_id, id_to_word, word_vecs)
analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs)
analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs)
[analogy] king:man = queen:?
woman: 5.16015625
veto: 4.9296875
ounce: 4.69140625
earthquake: 4.6328125
successor: 4.609375
[analogy] take:took = go:?
went: 4.55078125
points: 4.25
comes: 3.98046875
oct.: 3.90625
[analogy] car:cars = child:?
children: 5.21875
average: 4.7265625
yield: 4.20703125
cattle: 4.1875
priced: 4.1796875
[analogy] good:better = bad:?
more: 6.6484375
less: 6.0625
rather: 5.21875
slower: 4.734375
greater: 4.671875