Example: consider the following corpus:
i love you but you love him i am sad
With a window size of 5, each statistical window consists of the center word plus up to two words on each side. Windows 0 and 1 are shorter than 5 because the center word has fewer than two words to its left; likewise, windows 8 and 9 are shorter on the right. The windows are:
Window | Center word | Window content |
---|---|---|
0 | i | i love you |
1 | love | i love you but |
2 | you | i love you but you |
3 | but | love you but you love |
4 | you | you but you love him |
5 | love | but you love him i |
6 | him | you love him i am |
7 | i | love him i am sad |
8 | am | him i am sad |
9 | sad | i am sad |
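To make the windowing and the distance-decayed counting concrete, here is a minimal standalone sketch over the toy corpus (plain Python; it omits the BOS/EOS markers that the GloveDataset class below adds, and the variable names are illustrative only):

```python
from collections import defaultdict

corpus = "i love you but you love him i am sad".split()
context_size = 2  # two words on each side, i.e., a window of size 5

cooccur = defaultdict(float)
for i, w in enumerate(corpus):
    left = corpus[max(0, i - context_size):i]
    right = corpus[i + 1:i + 1 + context_size]
    print(i, w, " ".join(left + [w] + right))  # reproduces the table above
    # each (word, context) pair is weighted by 1/d, where d is their distance
    for d, c in enumerate(reversed(left), start=1):
        cooccur[(w, c)] += 1.0 / d
    for d, c in enumerate(right, start=1):
        cooccur[(w, c)] += 1.0 / d

print(cooccur[("love", "you")])  # 2.0: "you" is adjacent to "love" twice
```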
We need to define a GloveDataset class that inherits from Dataset; it processes the training corpus and vocabulary, and handles constructing and accessing the co-occurrence matrix.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights
from collections import defaultdict
class GloveDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        # Co-occurrence counts of (word, context) pairs in the given corpus
        self.cooccur_counts = defaultdict(float)
        self.bos = vocab[BOS_TOKEN]  # beginning-of-sentence token
        self.eos = vocab[EOS_TOKEN]  # end-of-sentence token
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence) - 1):
                w = sentence[i]
                left_contexts = sentence[max(0, i - context_size):i]
                right_contexts = sentence[i + 1:min(len(sentence), i + context_size) + 1]
                # Co-occurrence counts decay with distance: 1/d(w, c)
                for k, c in enumerate(left_contexts[::-1]):
                    self.cooccur_counts[(w, c)] += 1 / (k + 1)
                for k, c in enumerate(right_contexts):
                    self.cooccur_counts[(w, c)] += 1 / (k + 1)
        self.data = [(w, c, count) for (w, c), count in self.cooccur_counts.items()]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        words = torch.tensor([ex[0] for ex in examples])
        contexts = torch.tensor([ex[1] for ex in examples])
        counts = torch.tensor([ex[2] for ex in examples])
        return (words, contexts, counts)
Compared with the skip-gram model with negative sampling, GloVe adds two bias terms, w_biases and c_biases.
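For reference, the training loop below minimizes the standard GloVe regression objective, where x_max corresponds to m_max and α to alpha in the code:

$$
\mathcal{L} = \sum_{(w,c)} f(X_{wc})\left(\mathbf{v}_w^{\top}\mathbf{v}_c + b_w + b_c - \log X_{wc}\right)^2,
\qquad
f(x) = \min\!\left(\left(\frac{x}{x_{\max}}\right)^{\alpha},\, 1\right)
$$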
class GloveModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(GloveModel, self).__init__()
        # Word embeddings and bias vectors
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.w_biases = nn.Embedding(vocab_size, 1)
        # Context embeddings and bias vectors
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.c_biases = nn.Embedding(vocab_size, 1)

    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        w_biases = self.w_biases(words)
        return w_embeds, w_biases

    def forward_c(self, contexts):
        # Context embeddings and biases
        c_embeds = self.c_embeddings(contexts)
        c_biases = self.c_biases(contexts)
        return c_embeds, c_biases
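A quick shape check of the two lookup paths (illustrative sizes only): the biases come back with shape (batch, 1), which is why they are squeezed to (batch,) in the loss computation below.

```python
toy_model = GloveModel(vocab_size=1000, embedding_dim=64)
words = torch.tensor([2, 5, 7])
w_embeds, w_biases = toy_model.forward_w(words)  # shapes: (3, 64) and (3, 1)
c_embeds, c_biases = toy_model.forward_c(words)  # shapes: (3, 64) and (3, 1)
```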
# Hyperparameters controlling the sample weights
m_max = 100
alpha = 0.75
# Other hyperparameters (example values; adjust as needed)
embedding_dim = 64
context_size = 2
batch_size = 1024
num_epoch = 10

# Build the GloVe training dataset from the text corpus
corpus, vocab = load_reuters()
dataset = GloveDataset(
    corpus,
    vocab,
    context_size=context_size
)

def get_loader(dataset, batch_size, shuffle=True):
    # DataLoader wrapper that uses the dataset's collate_fn
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate_fn,
        shuffle=shuffle
    )
    return data_loader

data_loader = get_loader(dataset, batch_size)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = GloveModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, counts = [x.to(device) for x in batch]
        # Look up word/context embeddings and biases for the batch
        word_embeds, word_biases = model.forward_w(words)
        context_embeds, context_biases = model.forward_c(contexts)
        # Regression target; log(counts + 1) can be used for smoothing if needed
        log_counts = torch.log(counts)
        # Sample weights
        weight_factor = torch.clamp(torch.pow(counts / m_max, alpha), max=1.0)
        optimizer.zero_grad()
        # Per-sample squared (L2) loss; the (batch, 1) biases are squeezed to (batch,)
        loss = (torch.sum(word_embeds * context_embeds, dim=1)
                + word_biases.squeeze(-1) + context_biases.squeeze(-1)
                - log_counts) ** 2
        # Sample-weighted average loss
        wavg_loss = (weight_factor * loss).mean()
        wavg_loss.backward()
        optimizer.step()
        total_loss += wavg_loss.item()
    print(f"Loss: {total_loss:.2f}")
# Combine (sum) the word and context embedding matrices as the final pretrained vectors
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "glove.vec")
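The exact behavior of save_pretrained comes from the accompanying utils module; judging from how load_pretrained reads the file back further below, glove.vec is assumed to be a plain-text file with a `vocab_size dim` header line followed by one `token v1 v2 ... vd` line per word. A hypothetical writer with that layout, in case the utils module is not at hand:

```python
def save_pretrained_sketch(tokens, embeds, save_path):
    # tokens: list of token strings in index order; embeds: a (len(tokens), dim) tensor
    # Assumed layout: a "vocab_size dim" header, then one "token v1 v2 ... vd" line per word
    with open(save_path, "w", encoding="utf-8") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for token, vec in zip(tokens, embeds.tolist()):
            writer.write(token + " " + " ".join(f"{x:.4f}" for x in vec) + "\n")
```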
Next, read glove.vec into a dictionary with words as keys and embedding vectors as values, select a few words, reduce their vectors to two dimensions, convert the result into a DataFrame, and draw a scatter plot. For TSNE from sklearn.manifold: perplexity controls the perplexity of the t-SNE algorithm, n_components specifies the number of output dimensions, init specifies the initialization method, n_iter sets the number of iterations, and random_state fixes the random seed. ax.annotate(word, pos, fontsize=40) labels each point with the corresponding word (its dictionary key).
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt

# Load the trained word vectors
embeddings_index = {}
with open('glove.vec', encoding='utf-8') as f:
    for line in f:
        values = line.rstrip().split()
        if len(values) == 2:  # skip the optional "vocab_size dim" header line
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# Select a few word vectors for dimensionality reduction and visualization
# words = ['apple', 'banana', 'orange', 'pear', 'grape', 'watermelon']
words = ['good', 'well', 'many', 'fears', 'american', 'analyst', 'china']
# embeddings_index_selected = {word: embeddings_index[word] for word in words}
embeddings_index_selected = np.array([embeddings_index[word] for word in words])

# Reduce the selected vectors to two dimensions with t-SNE
tsne_model = TSNE(perplexity=5, n_components=2, init='pca', n_iter=2500, random_state=23)
# tsne_values = tsne_model.fit_transform(list(embeddings_index_selected.values()))
tsne_values = tsne_model.fit_transform(embeddings_index_selected)

# Convert the reduced data into a DataFrame
df = pd.DataFrame(tsne_values, index=words, columns=['x', 'y'])

# Draw the scatter plot, labeling each point with its word
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
    ax.annotate(word, pos, fontsize=40)
plt.savefig("glove_embedding_tsne.jpg")
plt.show()
The result is shown in the figure below: 'good' and 'well' end up close together, which suggests the GloVe-trained word vectors are reasonable.
To use the word vectors as fixed features, freeze the embedding parameters (requires_grad=False); a minimal sketch of this is given after the retrieval code below. First, load the pretrained vectors and search for similar (nearest-neighbor) words in the vector space:

import torch
from vocab import Vocab  # assumed: the Vocab class from the accompanying code

def load_pretrained(load_path):
    with open(load_path, "r") as fin:
        # Optional: depends on the specific format of the pretrained vector file
        n, d = map(int, fin.readline().split())
        tokens = []
        embeds = []
        for line in fin:
            line = line.rstrip().split(' ')
            token, embed = line[0], list(map(float, line[1:]))
            tokens.append(token)
            embeds.append(embed)
        vocab = Vocab(tokens)
        embeds = torch.tensor(embeds, dtype=torch.float)
    return vocab, embeds

def knn(W, x, k):
    # Cosine similarity between the query vector x and every row of W
    similarities = torch.matmul(x, W.transpose(1, 0)) / (torch.norm(W, dim=1) * torch.norm(x) + 1e-9)
    knn = similarities.topk(k=k)
    return knn.values.tolist(), knn.indices.tolist()

# Retrieve similar words in the embedding space
# (using glove.vec, the pretrained vectors produced by the training script above)
def find_similar_words(embeds, vocab, query, k=10):
    # k + 1 because the most similar "neighbor" is the query word itself
    knn_values, knn_indices = knn(embeds, embeds[vocab[query]], k + 1)
    knn_words = vocab.convert_ids_to_tokens(knn_indices)
    print(f">>> Query word: {query}")
    for i in range(k):
        print(f"cosine similarity={knn_values[i + 1]:.4f}: {knn_words[i + 1]}")

word_sim_queries = ['good', 'well', 'many', 'fears', 'american',
                    'analyst', 'china', 'apple', 'banana', 'orange', 'grape', 'watermelon']
vocab, embeds = load_pretrained("glove.vec")
for w in word_sim_queries:
    find_similar_words(embeds, vocab, w)
    print("=======test========")