This post is a detailed walkthrough of one of the PyTorch tutorials, Advanced: Making Dynamic Decisions and the Bi-LSTM CRF. It first explains the helper functions at the top of the script, then follows the order in which the training code actually runs. (Most of the original code comments are omitted below; see the tutorial for the full version.)
```python
def argmax(vec):
    # return the argmax along dim 1 as a Python int
    _, idx = torch.max(vec, 1)
    return idx.item()

def prepare_sequence(seq, to_ix):
    # map a list of tokens to a LongTensor of their indices
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def log_sum_exp(vec):
    # numerically stable log-sum-exp over a 1 x N row vector
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))
```
log_sum_exp subtracts the row maximum $x_m$ before exponentiating, which avoids overflow while returning exactly the same value:

$$x_m + \log \sum_{i=1}^n e^{x_i-x_m} = x_m + \log \Big(e^{-x_m}\sum_{i=1}^n e^{x_i}\Big) = x_m - x_m + \log \sum_{i=1}^n e^{x_i} = \log \sum_{i=1}^n e^{x_i}$$
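As a quick sanity check (my own addition, not part of the tutorial), the helper should agree with PyTorch's built-in torch.logsumexp on a 1×N row vector:

```python
# Sketch: verify log_sum_exp against torch.logsumexp (assumes the helpers above).
vec = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0]])
print(log_sum_exp(vec))             # tensor(5.4519)
print(torch.logsumexp(vec, dim=1))  # tensor([5.4519])
```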
```python
# Author: Robert Guthrie
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)
```
```python
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
```
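For illustration (my addition, not in the tutorial): prepare_sequence simply looks tokens up in the vocabulary built above, so the first few words of sentence one map to the first few indices:

```python
print(word_to_ix["the"], word_to_ix["wall"], word_to_ix["street"])  # 0 1 2
print(prepare_sequence("the wall street".split(), word_to_ix))      # tensor([0, 1, 2])
```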
Running the script first constructs the model, so we start with BiLSTM_CRF's `__init__`:

```python
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # transitions[i][j] is the score of transitioning *to* tag i *from* tag j
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
        # forbid transitioning into START and out of STOP
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
```
```python
# class BiLSTM_CRF
def init_hidden(self):
    return (torch.randn(2, 1, self.hidden_dim // 2),
            torch.randn(2, 1, self.hidden_dim // 2))
```
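The shape (2, 1, hidden_dim // 2) follows nn.LSTM's documented interface: (num_layers * num_directions, batch, hidden_size), here 1 layer × 2 directions with batch size 1. A standalone shape sketch with toy numbers (my own addition):

```python
lstm = nn.LSTM(input_size=5, hidden_size=2, num_layers=1, bidirectional=True)
h0, c0 = torch.randn(2, 1, 2), torch.randn(2, 1, 2)  # (layers*directions, batch, hidden)
out, (hn, cn) = lstm(torch.randn(3, 1, 5), (h0, c0))  # 3 time steps, batch 1
print(out.shape)  # torch.Size([3, 1, 4]): the two directions concatenated on the last dim
```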
Back in the run order: before training, the model's predictions are checked once.

```python
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent))
```
model(precheck_sent) calls forward, which extracts LSTM features and then Viterbi-decodes the best tag sequence:

```python
# class BiLSTM_CRF
def forward(self, sentence):
    lstm_feats = self._get_lstm_features(sentence)
    score, tag_seq = self._viterbi_decode(lstm_feats)
    return score, tag_seq
```
```python
# class BiLSTM_CRF
def _get_lstm_features(self, sentence):
    self.hidden = self.init_hidden()
    # (seq_len, batch=1, embedding_dim)
    embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
    lstm_out, self.hidden = self.lstm(embeds, self.hidden)
    lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
    # project each token's hidden state to a row of tag scores
    lstm_feats = self.hidden2tag(lstm_out)
    return lstm_feats
```
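A quick check of the output shape (my addition): lstm_feats holds one row of tagset_size scores per token, e.g. for the 11-word first training sentence:

```python
with torch.no_grad():
    feats = model._get_lstm_features(prepare_sequence(training_data[0][0], word_to_ix))
print(feats.shape)  # torch.Size([11, 5]): (sentence length, tagset size)
```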
```python
# class BiLSTM_CRF
def _viterbi_decode(self, feats):
    backpointers = []

    # everything is impossible at step 0 except START
    init_vvars = torch.full((1, self.tagset_size), -10000.)
    init_vvars[0][self.tag_to_ix[START_TAG]] = 0

    forward_var = init_vvars
    for feat in feats:
        bptrs_t = []        # backpointers for this step
        viterbivars_t = []  # viterbi variables for this step
        for next_tag in range(self.tagset_size):
            # the emission score is the same for every predecessor,
            # so it can be added after taking the max
            next_tag_var = forward_var + self.transitions[next_tag]
            best_tag_id = argmax(next_tag_var)
            bptrs_t.append(best_tag_id)
            viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
        forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
        backpointers.append(bptrs_t)

    # transition to STOP_TAG
    terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
    best_tag_id = argmax(terminal_var)
    path_score = terminal_var[0][best_tag_id]

    # follow the backpointers to decode the best path
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()  # pop off the START tag
    assert start == self.tag_to_ix[START_TAG]
    best_path.reverse()
    return path_score, best_path
```
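To convince yourself the dynamic program is correct, here is a brute-force cross-check of my own (not in the tutorial, and brute_force_viterbi is a hypothetical name): enumerate every tag path, score each with _score_sentence (defined later in this post), and take the max; it should match _viterbi_decode up to floating-point error.

```python
import itertools

def brute_force_viterbi(model, feats):
    # enumerates all tagset_size ** seq_len paths; only feasible for toy sentences
    best_score, best_path = float("-inf"), None
    for path in itertools.product(range(model.tagset_size), repeat=feats.shape[0]):
        s = model._score_sentence(feats, torch.tensor(path, dtype=torch.long)).item()
        if s > best_score:
            best_score, best_path = s, list(path)
    return best_score, best_path
```

Paths that pass through START or STOP mid-sequence pick up -10000 transition scores, so they never win the max, matching the decoder's behavior.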
The training loop then runs for 300 epochs, using the negative log-likelihood as the loss:

```python
for epoch in range(300):
    for sentence, tags in training_data:
        model.zero_grad()
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
        loss = model.neg_log_likelihood(sentence_in, targets)
        loss.backward()
        optimizer.step()
```
```python
# class BiLSTM_CRF
def neg_log_likelihood(self, sentence, tags):
    feats = self._get_lstm_features(sentence)
    forward_score = self._forward_alg(feats)
    gold_score = self._score_sentence(feats, tags)
    return forward_score - gold_score
```
$$loss = -\log P(y|x) = \log \Big(\sum_{y'} e^{Score(x, y')}\Big) - Score(x, y)$$
The first term is forward_score and the second is gold_score; the two are computed separately below.
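One consequence worth noting (my observation, not from the tutorial): the gold path is one of the paths summed inside the log, so forward_score >= gold_score and the loss is never negative. A tiny check sketch:

```python
with torch.no_grad():
    s = prepare_sequence(training_data[0][0], word_to_ix)
    t = torch.tensor([tag_to_ix[tag] for tag in training_data[0][1]], dtype=torch.long)
    assert model.neg_log_likelihood(s, t).item() >= 0
```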
```python
# class BiLSTM_CRF
def _forward_alg(self, feats):
    # START_TAG has all of the score at the beginning
    init_alphas = torch.full((1, self.tagset_size), -10000.)
    init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

    forward_var = init_alphas
    for feat in feats:
        alphas_t = []  # the forward variables at this timestep
        for next_tag in range(self.tagset_size):
            # the emission score is the same regardless of the previous tag
            emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
            # row next_tag of transitions: scores of transitioning to next_tag
            trans_score = self.transitions[next_tag].view(1, -1)
            next_tag_var = forward_var + trans_score + emit_score
            alphas_t.append(log_sum_exp(next_tag_var).view(1))
        forward_var = torch.cat(alphas_t).view(1, -1)
    terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
    alpha = log_sum_exp(terminal_var)
    return alpha
```
To make the recursion concrete, take a two-word sentence 机器 / 学习 ("machine learning") with the five tags B, I, O, START, STOP. Let $x_i^t$ be the forward variable for tag $i$ after step $t$, $y_i^j$ the transition score from tag $i$ to tag $j$ (i.e. self.transitions[j][i]), and $z_j^t$ the emission score of tag $j$ at word $t$.

Forward variables $x$:

| State \ Tag | B | I | O | START | STOP |
|---|---|---|---|---|---|
| START | $x_0^0$ | $x_1^0$ | $x_2^0$ | $x_3^0$ | $x_4^0$ |
| 机器 (machine) | $x_0^1$ | $x_1^1$ | $x_2^1$ | $x_3^1$ | $x_4^1$ |
| 学习 (learning) | $x_0^2$ | $x_1^2$ | $x_2^2$ | $x_3^2$ | $x_4^2$ |
| STOP | $x_0^3$ | $x_1^3$ | $x_2^3$ | $x_3^3$ | $x_4^3$ |

Transition scores $y$:

| Current \ Previous | B | I | O | START | STOP |
|---|---|---|---|---|---|
| B | $y_0^0$ | $y_1^0$ | $y_2^0$ | $y_3^0$ | $y_4^0$ |
| I | $y_0^1$ | $y_1^1$ | $y_2^1$ | $y_3^1$ | $y_4^1$ |
| O | $y_0^2$ | $y_1^2$ | $y_2^2$ | $y_3^2$ | $y_4^2$ |
| START | $y_0^3$ | $y_1^3$ | $y_2^3$ | $y_3^3$ | $y_4^3$ |
| STOP | $y_0^4$ | $y_1^4$ | $y_2^4$ | $y_3^4$ | $y_4^4$ |

Emission scores $z$:

| Word \ Tag | B | I | O | START | STOP |
|---|---|---|---|---|---|
| 机器 (machine) | $z_0^0$ | $z_1^0$ | $z_2^0$ | $z_3^0$ | $z_4^0$ |
| 学习 (learning) | $z_0^1$ | $z_1^1$ | $z_2^1$ | $z_3^1$ | $z_4^1$ |
The final state is then computed as follows:

Initial state: $(x_0^0, x_1^0, x_2^0, x_3^0, x_4^0)$

State after the first word (机器): $x_j^1 = \log \sum_{i=0}^4 e^{x_i^0+y_i^j+z_j^0},\quad j = 0,1,2,3,4$

State after the second word (学习): $x_k^2 = \log \sum_{j=0}^4 e^{x_j^1+y_j^k+z_k^1},\quad k = 0,1,2,3,4$

Substituting the expression for $x_j^1$ into $x_k^2$ gives: $x_k^2 = \log \sum_{j=0}^4 \big(\sum_{i=0}^4 e^{x_i^0+y_i^j+z_j^0}\big)e^{y_j^k+z_k^1},\quad k = 0,1,2,3,4$

that is: $x_k^2 = \log \sum_{j=0}^4 \sum_{i=0}^4 e^{x_i^0+y_i^j+z_j^0+y_j^k+z_k^1},\quad k = 0,1,2,3,4$

Final state: $x_l^3 = \log \sum_{k=0}^4 e^{x_k^2+y_k^l},\quad l = 0,1,2,3,4$

Substituting again: $x_l^3 = \log \sum_{k=0}^4 \sum_{j=0}^4 \sum_{i=0}^4 e^{x_i^0+y_i^j+z_j^0+y_j^k+z_k^1+y_k^l},\quad l = 0,1,2,3,4$

One last log_sum_exp then yields alpha:

$$alpha = \log \sum_{l=0}^4 \sum_{k=0}^4 \sum_{j=0}^4 \sum_{i=0}^4 e^{x_i^0+y_i^j+z_j^0+y_j^k+z_k^1+y_k^l}$$

Each tuple $(i,j,k,l)$ corresponds to one possible path, so $alpha = \log \big(\sum_{y'} e^{Score(x, y')}\big)$.
By the way, as the tutorial also notes, this part can be computed with matrix operations (see the worked example in Li Hang's Statistical Learning Methods); the explicit loops above are only there to spell out the process. With the hard part done, we return to neg_log_likelihood and compute gold_score next.
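A possible vectorized version, sketched under the same conventions (this is my rewrite, and _forward_alg_vectorized is a hypothetical name, not the tutorial's code): broadcast forward_var against the transition matrix and reduce each column with torch.logsumexp, removing the inner Python loop.

```python
# class BiLSTM_CRF (alternative sketch)
def _forward_alg_vectorized(self, feats):
    forward_var = torch.full((self.tagset_size,), -10000.)
    forward_var[self.tag_to_ix[START_TAG]] = 0.
    for feat in feats:
        # scores[i, j] = forward_var[i] + transitions[j, i] + feat[j]
        scores = forward_var.unsqueeze(1) + self.transitions.t() + feat.unsqueeze(0)
        forward_var = torch.logsumexp(scores, dim=0)  # new forward variable per tag j
    terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
    return torch.logsumexp(terminal_var, dim=0)
```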
```python
# class BiLSTM_CRF
def _score_sentence(self, feats, tags):
    # gives the score of the provided (gold) tag sequence
    score = torch.zeros(1)
    tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
    for i, feat in enumerate(feats):
        # transition into tags[i + 1] plus its emission score
        score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
    score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
    return score
```
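Closing the loop on the derivation above, a brute-force cross-check of my own (not in the tutorial): log-sum-exp over _score_sentence of every possible path should equal what _forward_alg returns. The shorter second training sentence (7 words, 5^7 paths) keeps the enumeration feasible:

```python
import itertools

with torch.no_grad():
    feats = model._get_lstm_features(prepare_sequence(training_data[1][0], word_to_ix))
    all_scores = torch.cat([
        model._score_sentence(feats, torch.tensor(p, dtype=torch.long))
        for p in itertools.product(range(model.tagset_size), repeat=feats.shape[0])
    ])
    # forbidden paths carry ~e^-10000 mass on both sides, so the values agree in float
    print(torch.allclose(torch.logsumexp(all_scores, dim=0), model._forward_alg(feats)))  # True
```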
Finally, the predictions are checked again after training:

```python
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print(model(precheck_sent))
```
That wraps up the whole walkthrough. Although the model is called a Bi-LSTM CRF, the neural-network part relies entirely on off-the-shelf PyTorch modules; the bulk of the code is really implementing the CRF. Reading it alongside Li Hang's Statistical Learning Methods will give you a deeper understanding of CRFs.

If you spot mistakes in the code explanation or the writing, corrections are very welcome.

OVER!