For the theory behind skip-gram, see: NLP 笔记:Skip-gram_刘文巾的博客-CSDN博客
import numpy as np
import torch
from torch import nn, optim
import random
from collections import Counter
import matplotlib.pyplot as plt
# Training data
text ='I like dog i like cat i like \
animal dog cat animal apple \
cat dog like dog fish milk like dog \
cat eyes like i like apple apple \
i hate apple i movie book music like \
cat dog hate cat dog like he is man she \
is woman king is man queen is woman'
# Hyperparameters
EMBEDDING_DIM = 2     # dimensionality of the word vectors
PRINT_EVERY = 1000    # how often to print the loss
EPOCHS = 1000         # number of training epochs
BATCH_SIZE = 5        # number of words per batch
N_SAMPLES = 3         # number of negative samples per positive pair
WINDOW_SIZE = 5       # maximum context window size
FREQ = 0              # frequency threshold: words occurring this many times or fewer are removed
DELETE_WORDS = False  # whether to drop some high-frequency words (subsampling)
# Text preprocessing
def preprocess(text, FREQ):
    text = text.lower()
    words = text.split()
    word_counts = Counter(words)
    # count how many times each word occurs
    trimmed_words = []
    for word in words:
        if word_counts[word] > FREQ:
            trimmed_words.append(word)
    return trimmed_words
words = preprocess(text, FREQ)
# keep only the words that occur more than FREQ times
'''
['i',
'like',
'dog',
'i',
'like',
'cat',
'i',
'like',
'animal',
'dog',
'cat',
'animal',
'apple',
'cat',
'dog',
...
'''
# Build the dictionaries: word <-> index and index <-> word
vocab = set(words)
vocab2int = {}
int2vocab = {}
for c, w in enumerate(vocab):
    vocab2int[w] = c
    int2vocab[c] = w
vocab2int, int2vocab
'''
({'music': 0,
'movie': 1,
'book': 2,
'he': 3,
...
{0: 'music',
1: 'movie',
2: 'book',
...
'''
# Convert the text into integer indices
int_words = []
for w in words:
    int_words.append(vocab2int[w])
int_words
'''
[6,
4,
12,
6,
4,
13,
6,
4,
9,
12,
...
'''
# Compute word frequencies
int_word_counts = Counter(int_words)
total_count = len(int_words)
word_freqs = {}
for w, c in int_word_counts.items():
    word_freqs[w] = c / total_count
word_freqs
'''
{6: 0.11320754716981132,
4: 0.16981132075471697,
12: 0.1320754716981132,
13: 0.11320754716981132,
...
'''
# Subsampling: randomly drop very frequent words
if DELETE_WORDS:
    t = 1e-5
    prob_drop = {}
    for w in int_word_counts:
        prob_drop[w] = 1 - np.sqrt(t / word_freqs[w])
        # drop probability of each word
    train_words = []
    for w in int_words:
        if random.random() < (1 - prob_drop[w]):
            train_words.append(w)
else:
    train_words = int_words
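As a quick numeric illustration of the drop probability above (just a sketch, not part of the pipeline; the frequency value is made up):

import numpy as np

t = 1e-5
freq = 0.17                        # hypothetical relative frequency of a very common word
prob_drop = 1 - np.sqrt(t / freq)
print(round(prob_drop, 4))         # ~0.9923: such a word would be dropped most of the time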
word_freqs = np.array(list(word_freqs.values()))
noise_dist = torch.from_numpy(word_freqs ** (0.75) / np.sum(word_freqs ** (0.75)))
# negative-sampling noise distribution: unigram frequencies raised to the 0.75 power, then normalized
noise_dist
'''
tensor([0.0969, 0.1314, 0.1088, 0.0969, 0.0425, 0.0715, 0.0253, 0.0253, 0.0253,
0.0425, 0.0253, 0.0253, 0.0253, 0.0253, 0.0715, 0.0425, 0.0253, 0.0425,
0.0253, 0.0253], dtype=torch.float64)
'''
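To see the effect of the 0.75 exponent (the value used in the original word2vec paper): it flattens the distribution, so rare words get a relatively larger share of the negative samples. A minimal check with made-up frequencies:

import numpy as np

freqs = np.array([0.40, 0.05])              # hypothetical counts-based frequencies: one common, one rare word
smoothed = freqs ** 0.75 / np.sum(freqs ** 0.75)
print((freqs / freqs.sum()).round(3))       # [0.889 0.111]  raw share
print(smoothed.round(3))                    # ~[0.826 0.174] the rare word's share grows noticeably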
# Get the target words (the context words of a center word)
def get_target(words, idx, WINDOW_SIZE):
    target_window = np.random.randint(1, WINDOW_SIZE + 1)
    # size of the current window; sampling the window size randomly
    # (instead of always using the maximum) tends to give better results
    if (idx - target_window) > 0:
        start_point = idx - target_window
        # first index of the context window
    else:
        start_point = 0
    if (idx + target_window) < len(words):
        end_point = idx + target_window
        # last index of the context window
    else:
        end_point = len(words) - 1
    targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
    # context words on both sides of the center word (the center word itself is excluded)
    return list(targets)
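A quick sanity check of get_target (the word list below is made up; the output varies because the window size is drawn randomly, and the order may differ since a set is used internally):

words_demo = [10, 11, 12, 13, 14, 15, 16]
# with idx=3 and WINDOW_SIZE=2, the context is drawn from at most two words on each side of words[3]
print(get_target(words_demo, 3, 2))   # e.g. [12, 14] or [11, 12, 14, 15], depending on the draw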
# Batch the data
def get_batch(words, BATCH_SIZE, WINDOW_SIZE):
    n_batches = len(words) // BATCH_SIZE
    # how many full batches the words can be split into
    words = words[:n_batches * BATCH_SIZE]
    # drop the few leftover words at the end
    for idx in range(0, len(words), BATCH_SIZE):
        batch_x, batch_y = [], []
        batch = words[idx:idx + BATCH_SIZE]
        # the words involved in the current batch
        for i in range(len(batch)):
            x = batch[i]
            y = get_target(batch, i, WINDOW_SIZE)
            # denote the window size returned for position i as k_i
            # (different i within the same batch may have different window sizes)
            batch_x.extend([x] * len(y))
            # batch_x grows by k_i entries
            batch_y.extend(y)
            # batch_y grows by k_i entries
        yield batch_x, batch_y
        # yield the current batch_x and batch_y; the next call resumes from here
        # batch_x: [sigma{k_i}]
        # batch_y: [sigma{k_i}]
class SkipGramNeg(nn.Module):
    def __init__(self, n_vocab, n_embed, noise_dist):
        super().__init__()
        self.n_vocab = n_vocab
        # vocabulary size (len(vocab2int))
        self.n_embed = n_embed
        # dimensionality of the word vectors (EMBEDDING_DIM)
        self.noise_dist = noise_dist
        # probability of each word being drawn as a negative sample
        # noise_dist: [n_vocab]

        # define the embedding layers
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        # weight matrix for the center words
        # maps n_vocab -> n_embed
        self.out_embed = nn.Embedding(n_vocab, n_embed)
        # weight matrix for the context words
        # maps n_vocab -> n_embed

        # initialize the embedding weights
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)
        # constrain the initial parameters to (-1, 1)

    # forward pass for the input words (input -> hidden),
    # i.e. the vector representation of the center word
    def forward_input(self, input_words):
        # k_i is the window size for each position i in a batch
        # input_words: [sigma{k_i}]
        input_vectors = self.in_embed(input_words)
        return input_vectors
        # input_vectors: [sigma{k_i}, n_embed]

    # forward pass for the target words (the context words inside the center word's window),
    # i.e. the vector representation of the context words
    def forward_output(self, output_words):
        # output_words: [sigma{k_i}]
        output_vectors = self.out_embed(output_words)
        return output_vectors
        # output_vectors: [sigma{k_i}, n_embed]

    # forward pass for the negative-sample words
    def forward_noise(self, size, N_SAMPLES):
        noise_dist = self.noise_dist
        # sample negative words from the noise distribution
        # (the probability of each word being chosen as a negative sample)
        noise_words = torch.multinomial(noise_dist,
                                        size * N_SAMPLES,
                                        replacement=True)
        # draw size * N_SAMPLES samples with replacement
        # (N_SAMPLES negatives for every (center, context) pair)
        noise_vectors = self.out_embed(noise_words).view(size, N_SAMPLES, self.n_embed)
        return noise_vectors
        # noise_vectors: [sigma{k_i}, N_SAMPLES, n_embed]
For negative sampling, see NLP 笔记:Skip-gram_刘文巾的博客-CSDN博客.
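For reference, the per-pair objective that the NegativeSamplingLoss class below implements, where v_c is the center-word vector, u_o the context-word vector, u_k the vectors of the N_SAMPLES negative words, and \sigma the sigmoid:

-\log\sigma(u_o^\top v_c) \;-\; \sum_{k=1}^{N\_SAMPLES} \log\sigma(-u_k^\top v_c)

The code computes this term for every (center, context) pair in the batch and returns the mean.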
class NegativeSamplingLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):
        # k_i is the window size for each position i in a batch
        # input_vectors:  [sigma{k_i}, n_embed]
        # output_vectors: [sigma{k_i}, n_embed]
        # noise_vectors:  [sigma{k_i}, N_SAMPLES, n_embed]
        BATCH_SIZE, embed_size = input_vectors.shape

        # reshape the input and output vectors for batched matrix multiplication
        input_vectors = input_vectors.view(BATCH_SIZE, embed_size, 1)
        # input_vectors: [sigma{k_i}, n_embed, 1]
        output_vectors = output_vectors.view(BATCH_SIZE, 1, embed_size)
        # output_vectors: [sigma{k_i}, 1, n_embed]

        # loss for the positive (center, context) pairs
        out_loss = torch.bmm(output_vectors, input_vectors).sigmoid().log()
        # torch.bmm(output_vectors, input_vectors) has shape [sigma{k_i}, 1, 1];
        # it is the dot product u_o^T v_c between each context word and its center word
        out_loss = out_loss.squeeze()
        # out_loss: [sigma{k_i}]

        # loss for the negative samples
        noise_loss = torch.bmm(noise_vectors.neg(), input_vectors).sigmoid().log()
        # noise_loss: [sigma{k_i}, N_SAMPLES, 1]
        noise_loss = noise_loss.squeeze().sum(1)
        # noise_loss: [sigma{k_i}]

        # combine the two losses:
        # we want the positive pairs to score high and the negatives low,
        # i.e. the sum of the two log terms should be as large as possible;
        # since the loss is minimized, the sign is flipped
        return -(out_loss + noise_loss).mean()
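A minimal shape check of the loss with random tensors (purely illustrative; the value itself is meaningless):

_in = torch.randn(7, EMBEDDING_DIM)                  # pretend sigma{k_i} = 7
_out = torch.randn(7, EMBEDDING_DIM)
_noise = torch.randn(7, N_SAMPLES, EMBEDDING_DIM)
print(NegativeSamplingLoss()(_in, _out, _noise))     # a scalar tensor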
model = SkipGramNeg(len(vocab2int),
EMBEDDING_DIM,
noise_dist=noise_dist)
criterion = NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
# Training
steps = 0
for e in range(EPOCHS):
    # get the input (center) words and the target (context) words
    for input_words, target_words in get_batch(train_words, BATCH_SIZE, WINDOW_SIZE):
        # input_words:  [sigma{k_i}]
        # target_words: [sigma{k_i}]
        steps += 1
        inputs, targets = torch.LongTensor(input_words), torch.LongTensor(target_words)

        # center, context, and negative-sample vectors
        input_vectors = model.forward_input(inputs)
        # k_i is the window size for each position i in a batch
        # input_vectors: [sigma{k_i}, n_embed]
        output_vectors = model.forward_output(targets)
        # output_vectors: [sigma{k_i}, n_embed]
        size, _ = input_vectors.shape
        # size: sigma{k_i}
        noise_vectors = model.forward_noise(size, N_SAMPLES)
        # noise_vectors: [sigma{k_i}, N_SAMPLES, n_embed]

        # compute the loss
        loss = criterion(input_vectors, output_vectors, noise_vectors)

        # print the loss
        if steps % PRINT_EVERY == 0:
            print("loss:", loss)

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
'''
loss: tensor(2.3455, grad_fn=)
loss: tensor(1.6729, grad_fn=)
loss: tensor(1.6398, grad_fn=)
loss: tensor(1.5920, grad_fn=)
loss: tensor(1.4348, grad_fn=)
loss: tensor(1.5463, grad_fn=)
loss: tensor(1.4360, grad_fn=)
loss: tensor(1.6348, grad_fn=)
loss: tensor(1.4676, grad_fn=)
loss: tensor(1.6141, grad_fn=)
'''
As noted in NLP 笔记:Skip-gram_刘文巾的博客-CSDN博客 (section 4, item 3), each row of the weight matrix is the encoding of one word as a point in the dense embedding space.
plt.figure(figsize=(20, 10))
vectors = model.state_dict()["in_embed.weight"]
for i, w in int2vocab.items():
    x, y = float(vectors[i][0]), float(vectors[i][1])
    plt.scatter(x, y)
    plt.annotate(w, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()
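As an additional quick check (not in the original post, just a sketch), you can also look at cosine similarities between the learned center-word vectors, e.g. which words end up closest to 'king':

emb = model.in_embed.weight.data                                   # [n_vocab, EMBEDDING_DIM]
query = emb[vocab2int['king']]
sims = torch.cosine_similarity(query.unsqueeze(0).expand_as(emb), emb)   # [n_vocab]
for idx in sims.argsort(descending=True)[:5]:
    print(int2vocab[int(idx)], float(sims[idx]))                   # the word itself comes first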