用马尔科夫公式计算, 设 $\mathbf{w}=\{w_1,w_2,\dots,w_n\}$ 是单词序列, $\mathbf{z}=\{z_1,z_2,\dots,z_n\}$ 是词性标注序列
则 $\hat{\mathbf{z}}=\mathop{\text{argmax}}\limits_{\mathbf{z}}\left[\sum\limits_{i=1}^n\log p(w_i|z_i)+\sum\limits_{t=1}^n\log p(z_t|z_{t-1})\right]$, 其中 $z_0$ 表示句首的起始标签
数据集位于: 自然语言处理训练营\资料\Lesson9-CaseStudy-Viterbi, csdn链接
# %% Load the training set (words and tags)
# The empty string is seeded into both sets: the rest of the script looks up
# tag_to_id[''] as the tag that precedes the first word of a sentence.
tags = {''}
words = {''}
PATH_TO_TRAIN_DATA = r'xxx\traindata.txt'  # TODO upload the download link to CSDN
# Every line is parsed as "word/tag". The `with` block guarantees the file
# handle is closed even if a malformed line raises.
with open(PATH_TO_TRAIN_DATA, 'r') as train_file:
    for line in train_file:
        items = line.split('/')
        word, tag = items[0], items[1].rstrip()
        words.add(word)
        tags.add(tag)
# Sort before numbering: set iteration order for strings can differ from one
# interpreter run to the next, which would make the ids irreproducible.
id_to_tag = sorted(tags)
id_to_word = sorted(words)
tag_to_id = {tag: i for i, tag in enumerate(id_to_tag)}
word_to_id = {word: i for i, word in enumerate(id_to_word)}
vocab_size = len(word_to_id)
tag_size = len(tag_to_id)
# The raw sets are no longer needed once the lookup tables exist.
del tags
del words
# %% 构建转移概率矩阵
import numpy as np
# Count matrices, normalised row-wise into probabilities at the end.
#   tag_to_tag_prob[a, b]  -- tag a is immediately followed by tag b
#   tag_to_word_prob[t, w] -- tag t is attached to word w
tag_to_tag_prob = np.zeros((tag_size, tag_size))
tag_to_word_prob = np.zeros((tag_size, vocab_size))
start_tag_id = tag_to_id['']
# Looked up once instead of on every line; None (no "." in the vocabulary)
# never equals a word id, so the sentence reset below is simply skipped.
period_word_id = word_to_id.get(".")
prev_tag_id = start_tag_id
with open(PATH_TO_TRAIN_DATA) as train_file:
    for line in train_file:
        items = line.split('/')
        word_id, tag_id = word_to_id[items[0]], tag_to_id[items[1].rstrip()]
        tag_to_tag_prob[prev_tag_id, tag_id] += 1
        tag_to_word_prob[tag_id, word_id] += 1
        prev_tag_id = tag_id
        if word_id == period_word_id:
            # "." ends a sentence: the next word follows the start tag.
            prev_tag_id = start_tag_id
# Give the start tag one emission so its row does not sum to zero.
tag_to_word_prob[start_tag_id, word_to_id['']] = 1
tag_to_word_prob = tag_to_word_prob / np.sum(tag_to_word_prob, axis=1, keepdims=True)
# A tag that never precedes another tag has an all-zero row; dividing by 1
# keeps that row at zero probability instead of producing 0/0 = NaN.
transition_totals = np.sum(tag_to_tag_prob, axis=1, keepdims=True)
transition_totals[transition_totals == 0] = 1
tag_to_tag_prob = tag_to_tag_prob / transition_totals
定义一个函数 $\text{dp}[i][\text{tag}_j]$ 表示从开始到节点 $i$, 以 $\text{tag}_j$ 结尾最好的路径的分数
$\text{dp}[i][\text{tag}_j]=\max\limits_{\text{tag}_k\in\text{tags}}\left[\text{dp}[i-1][\text{tag}_k]+\log p(\text{tag}_j|\text{tag}_k)+\log p(\text{word}_i|\text{tag}_j)\right]$
# %% 维特比算法
def viterbi(x):
    """Print the most likely tag sequence for a sentence, one tag per line.

    Decodes with the Viterbi algorithm in log space, reading the module-level
    tables ``word_to_id``, ``tag_to_id``, ``id_to_tag``, ``tag_size``,
    ``tag_to_tag_prob`` (transitions) and ``tag_to_word_prob`` (emissions).

    Zero probabilities become ``log(0) = -inf`` and make NumPy emit a
    ``RuntimeWarning``; such paths can never win the maximisation.

    Args:
        x: Sentence whose tokens are separated by single spaces.

    Raises:
        KeyError: If a token is not a key of ``word_to_id``.
    """
    word_ids = [word_to_id[word] for word in x.split(' ')]
    seq_len = len(word_ids)
    start_tag_id = tag_to_id['']

    # dp[i][j]: best log-score of a tag path over words 0..i that ends in tag j.
    dp = np.zeros((seq_len, tag_size))
    # First word: transition out of the start tag plus the emission.
    for j in range(tag_size):
        dp[0][j] = np.log(tag_to_tag_prob[start_tag_id, j]) \
            + np.log(tag_to_word_prob[j, word_ids[0]])

    # ptr[i][j]: tag at position i-1 on the best path that ends in tag j at i.
    # The `np.int` alias was removed in NumPy 1.24; builtin `int` is the same dtype.
    ptr = np.zeros((seq_len, tag_size), dtype=int)
    for i in range(1, seq_len):
        for j in range(tag_size):
            dp[i][j] = -np.inf
            # The emission term does not depend on the previous tag k.
            log_emission = np.log(tag_to_word_prob[j][word_ids[i]])
            for k in range(tag_size):
                score = dp[i - 1][k] + np.log(tag_to_tag_prob[k][j]) + log_emission
                if score > dp[i][j]:
                    dp[i][j] = score
                    ptr[i][j] = k

    # Backtrack from the best final tag through the stored pointers.
    best_seq = [0] * seq_len
    best_seq[seq_len - 1] = np.argmax(dp[seq_len - 1])
    for i in range(seq_len - 2, -1, -1):
        best_seq[i] = ptr[i + 1, best_seq[i + 1]]
    for tag_id in best_seq:
        print(id_to_tag[tag_id])
# %%
# Demo: tag one sample sentence (tokens are separated by single spaces).
x = (
    "Social Security number , passport number and details "
    "about the services provided for the payment"
)
viterbi(x)
运行过程中会出现 `RuntimeWarning: divide by zero encountered in log`, 因为其中有 $\log 0$. 虽然有警告, 但是会返回正确的值 `-inf`, 所以能凑合用. 也可以单独设置在概率为 0 时返回一个极小的数.
运行结果
NNP
NNP
NN
,
NN
NN
CC
NNS
IN
DT
NNS
VBN
IN
DT
NN