# %load hmm_segment.py
# (1) Build a hidden Markov model from corpus statistics
#     (initial, transition and emission probabilities).
# (2) Use the Viterbi algorithm (dynamic programming) to find, for a concrete
#     sentence, the most likely hidden state of each character in order.
# (3) Once every character has a state, segment on the pattern ('BM*E|S'):
#     B = word begin, M = word middle, E = word end, S = single-character word;
#     M* means zero or more middle characters.
# Hidden Markov Model, HMM
import pickle
import copy
import re
import sys
sys.path.append("..")
# Path of the serialized model file
# model_path = 'model/hmm.model'
model_path = 'model/hmm20190920.model'
default_probability = 0.000000001 # floor probability: with a small corpus some character may never occur under some state; this avoids multiplying by zero
# Transition probability matrix: {prev_state: {state: p}}
trans_mat = {}
# Emission (observation) probability matrix: {state: {char: p}}
emit_mat = {}
# Initial probability vector: {state: p}
init_vec = {}
# Set of hidden states seen in the corpus
state_set = set()
# Set of observed characters seen in the corpus
observation_set = set()
data_path = 'data/199801人民日报.data'
def _normalize_rows(mat):
    # Turn each row of raw counts {key: {k: count}} into probabilities in place.
    for counts in mat.values():
        total = sum(counts.values())
        for k in counts:
            counts[k] = 1.0 * counts[k] / total

def train():
    """Estimate the HMM parameters from the labelled corpus at ``data_path``.

    Fills the module-level ``init_vec`` (initial state counts), ``trans_mat``
    (state-transition counts) and ``emit_mat`` (state->character emission
    counts), normalizes every count table into probabilities, then persists
    everything with ``save_model()``.
    """
    print('begin training......')
    for sentence in read_data(data_path):
        prev = -1  # -1 marks "no previous label yet", i.e. sentence start
        for word, label in sentence:
            # Emission count: how often `word` is observed under `label`.
            emissions = emit_mat.setdefault(label, {})
            emissions[word] = emissions.get(word, 0) + 1
            if prev == -1:
                # First label of the sentence feeds the initial distribution.
                init_vec[label] = init_vec.get(label, 0) + 1
            else:
                # Transition count: prev -> label.
                transitions = trans_mat.setdefault(prev, {})
                transitions[label] = transitions.get(label, 0) + 1
            prev = label
    # Convert the raw counts into probabilities (float division).
    _normalize_rows(trans_mat)
    _normalize_rows(emit_mat)
    total = sum(init_vec.values())
    for state in init_vec:
        init_vec[state] = 1.0 * init_vec[state] / total
    print('finish training.....')
    save_model()
def predict(text, v_states, start_p, trans_p, obs_p, default_probability=1e-9): # input, states, initial probs, transition probs, emission probs
    """Viterbi-decode `text` and segment it into words.

    Args:
        text: the sentence to segment (a string of characters).
        v_states: iterable of hidden states, e.g. {'B', 'M', 'E', 'S'}.
        start_p: initial probabilities {state: p}.
        trans_p: transition probabilities {prev_state: {state: p}}.
        obs_p: emission probabilities {state: {char: p}}.
        default_probability: floor used for any probability missing from the
            tables (same value as the module-level constant; now a parameter
            so callers can tune the smoothing — backward compatible).

    Returns:
        List of words (substrings of `text`) in order; [] for empty input.
    """
    obs = list(text)
    if not obs:
        # Fix: the original indexed obs[0] unconditionally and raised
        # IndexError on an empty string.
        return []
    V = [{}]   # V[t][y] = probability of the best path ending in state y at time t
    path = {}  # best state sequence ending in each state
    # t = 0: initialize with start probability times emission of the first char.
    for y in v_states:
        V[0][y] = start_p.get(y, default_probability) * obs_p.get(y, {}).get(obs[0], default_probability)
        path[y] = [y]
    # t > 0: standard Viterbi recursion.
    for t in range(1, len(obs)):
        V.append({})
        new_path = {}
        for y in v_states:
            # Best predecessor y0 for landing in state y at time t.
            (prob, state) = max([(V[t - 1].get(y0, default_probability) *
                                  trans_p.get(y0, {}).get(y, default_probability) *
                                  obs_p.get(y, {}).get(obs[t], default_probability), y0) for y0 in v_states])
            V[t][y] = prob
            new_path[y] = path[state] + [y]
        path = new_path
    # Most probable final state selects the winning path.
    (prob, max_state) = max([(V[len(obs) - 1][y], y) for y in v_states])
    result = []
    # B = word begin, M = word middle, E = word end, S = single-character word.
    p = re.compile('BM*E|S')
    for m in p.finditer(''.join(path[max_state])):
        start, end = m.span()  # .span() gives the (start, end) character range
        result.append(text[start:end])
    return result
def load_model():
    """Deserialize the pickled model from ``model_path``.

    The file stores five components; only the first four are returned:
    (trans_mat, emit_mat, init_vec, state_set).
    """
    print('loading model... from: {}'.format(model_path))
    with open(model_path, 'rb') as f:
        trans, emit, init, states = pickle.load(f)[:4]
    return trans, emit, init, states
def save_model():
    """Pickle the five model components (transition matrix, emission matrix,
    initial vector, state set, observation set) to ``model_path``."""
    print('saving model...')
    components = [trans_mat, emit_mat, init_vec, state_set, observation_set]
    with open(model_path, 'wb') as f:
        pickle.dump(components, f)
def read_data(filename):
    """Read a labelled corpus: one "word<TAB>label" line per token, sentences
    separated by blank lines.

    Side effect: registers every word in the module-level ``observation_set``
    and every label in ``state_set``.

    Returns:
        A list of sentences, each a list of [word, label] pairs.
    """
    sentences = []
    sentence = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:  # iterate lazily instead of materializing readlines()
            word_label = line.strip().split('\t')
            if len(word_label) == 2:
                observation_set.add(word_label[0])
                state_set.add(word_label[1])
                sentence.append(word_label)
            elif sentence:
                # A blank line ends the sentence. The `elif sentence` guard
                # also skips runs of blank lines, which the original code
                # turned into spurious empty sentences.
                sentences.append(sentence)
                sentence = []
    if sentence:
        # Fix: the original silently dropped the final sentence when the
        # file did not end with a blank line.
        sentences.append(sentence)
    return sentences
# Train the model (runs at import time; populates the global matrices and saves them to disk)
train()
# Load the trained model back from disk
trans_mat, emit_mat, init_vec, state_set = load_model()
# Segment a sample sentence with the trained model
res = predict('你在图书馆', state_set, init_vec, trans_mat, emit_mat)
print('//'.join(res))
输出得到(每条 print 各占一行):
begin training......
finish training.....
saving model...
loading model... from: model/hmm20190920.model
你//在//图书馆
for y in v_states:
(prob, state) = max([(V[t - 1].get(y0, default_probability) *
trans_p.get(y0, {}).get(y, default_probability) *
obs_p.get(y, {}).get(obs[t], default_probability), y0) for y0 in v_states])
V[t][y] = prob
new_path[y] = path[state] + [y]
这段代码的含义,就是维特比算法中确定最优路径的方式:对每个时刻 t 的每个状态 y,在所有前驱状态 y0 中取"前一时刻概率 × 转移概率 × 观测概率"最大的一项,记录该最大概率 V[t][y],并把对应前驱的路径扩展一步保存到 new_path[y] 中。