Static Word Vectors: word2vec CBOW

Introduction

If you understand the difference between skip-gram and CBOW, the implementation is straightforward: skip-gram predicts the surrounding context words from the center word, while CBOW predicts the center word from its surrounding context words. In other words, only the dataset construction changes, with the input and target swapped.

The details are not spelled out here; a minimal sketch of the input/target swap follows, and the full source code is given below.
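To make the swap concrete, here is a toy sketch (not part of the original code; the token ids, window size, and variable names are made up for illustration) of the samples the two models build from the same position in a sentence:

# Toy illustration of the sample-construction difference.
# The sentence is already mapped to token ids; window size is 2.
sentence = [0, 1, 2, 3, 4]
window = 2
i = 2  # current (center) position

# Skip-gram: (input = center word, target = each context word)
skipgram_samples = [(sentence[i], sentence[j])
                    for j in range(i - window, i + window + 1) if j != i]
# -> [(2, 0), (2, 1), (2, 3), (2, 4)]

# CBOW: (input = all context words, target = center word)
cbow_sample = (sentence[i - window:i] + sentence[i + 1:i + window + 1], sentence[i])
# -> ([0, 1, 3, 4], 2)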

Implementation


# Defined in Section 5.2.3.1

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights

class CbowDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            if len(sentence) < context_size * 2 + 1:
                continue
            for i in range(context_size, len(sentence) - context_size):
                # Model input: context_size tokens on each side of the current position
                context = sentence[i-context_size:i] + sentence[i+1:i+context_size+1]
                # Model output: the current (center) word
                target = sentence[i]
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        inputs = torch.tensor([ex[0] for ex in examples])
        targets = torch.tensor([ex[1] for ex in examples])
        return (inputs, targets)

class CbowModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CbowModel, self).__init__()
        # Word embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Linear transformation: hidden layer -> output layer
        self.output = nn.Linear(embedding_dim, vocab_size)
        init_weights(self)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        # Hidden layer: average of the context word vectors
        hidden = embeds.mean(dim=1)
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10

# Load the text data and build the CBOW training dataset
corpus, vocab = load_reuters()
dataset = CbowDataset(corpus, vocab, context_size=context_size)
data_loader = get_loader(dataset, batch_size)

nll_loss = nn.NLLLoss()
# Build the CBOW model and move it to the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CbowModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Save the word vectors (model.embeddings)
save_pretrained(vocab, model.embeddings.weight.data, "cbow.vec")
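
Once training finishes, the vectors in cbow.vec can be inspected. The sketch below is not part of the original code; it assumes save_pretrained writes the standard word2vec text format (a header line with the vocabulary size and dimension, followed by one token and its vector per line), in which case gensim can load it directly. If the file uses a different layout, adjust the loader accordingly.

# Hypothetical usage sketch: load the saved vectors and query nearest neighbors.
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format("cbow.vec", binary=False)
print(vectors.most_similar("bank", topn=5))  # nearest neighbors by cosine similarity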
