Chinese Word Segmentation with an HMM

I. Code

# %load hmm_segment.py
# (1) Build a hidden Markov model from a labelled corpus (initial, transition, and emission probabilities).
# (2) Use the Viterbi algorithm (dynamic programming) to find, for a concrete sentence, the most likely state of each character in order.
# (3) Once every character's state is known, segment with the pattern ('BM*E|S'):
#     B = beginning of a word, M = middle of a word, E = end of a word, S = single-character word; M* means zero or more middle characters.
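# As a concrete illustration of the labelling scheme (built from the test sentence used at the bottom of
# this script, not taken from the corpus): "你在图书馆" is labelled 你/S 在/S 图/B 书/M 馆/E, so matching
# 'BM*E|S' against the state string "SSBME" recovers the words 你 / 在 / 图书馆.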

# Hidden Markov Model (HMM)
import pickle
import copy
import re
import sys

sys.path.append("..")

# Path where the trained model is pickled
# model_path = 'model/hmm.model'
model_path = 'model/hmm20190920.model'
default_probability = 0.000000001  # Smoothing floor: if a character never occurred under some state in the corpus, fall back to this tiny value instead of a zero probability.
# Transition probability matrix
trans_mat = {}
# Emission (observation) probability matrix
emit_mat = {}
# Initial state probability vector
init_vec = {}
# Set of states
state_set = set()
# Set of observations (characters)
observation_set = set()
data_path = 'data/199801人民日报.data'


def train():
    print('begin training......')
    sentences = read_data(data_path)
    for sentence in sentences:
        pre_label = -1
        for word, label in sentence:
            emit_mat[label][word] = emit_mat.setdefault(label, {}).setdefault(word, 0) + 1   # emission counts: how often each character is observed under each state
            if pre_label == -1:
                init_vec[label] = init_vec.setdefault(label, 0) + 1  # count which label starts each sentence
            else:
                trans_mat[pre_label][label] = trans_mat.setdefault(pre_label, {}).setdefault(label, 0) + 1   # transition counts from the previous label to the current one
            pre_label = label

    # Normalize transition counts into probabilities.
    for key, value in trans_mat.items():
        number_total = 0
        for k, v in value.items():    # { key(pre_label): {k(label): v(count)} }
            number_total += v
        for k, v in value.items():
            trans_mat[key][k] = 1.0 * v / number_total  # multiply by 1.0 to force floating-point division

    # Normalize emission counts into probabilities.
    for key, value in emit_mat.items():
        number_total = 0
        for k, v in value.items():
            number_total += v
        for k, v in value.items():
            emit_mat[key][k] = 1.0 * v / number_total

    # Normalize initial-state counts into probabilities.
    number_total = sum(init_vec.values())
    for k, v in init_vec.items():
        init_vec[k] = 1.0 * v / number_total

    print('finish training.....')
    save_model()


def predict(text, v_states, start_p, trans_p, obs_p):  # text, state set, initial probs, transition probs, emission probs
    V = [{}]   # V[t][y] = probability of the best path ending in state y at time t
    path = {}
    # Observation sequence: the characters of the input text
    obs = list(text)
    # Initialization at t = 0
    for y in v_states:  # probability that each state emits the first character obs[0]
        V[0][y] = start_p.get(y, default_probability) * obs_p.get(y, {}).get(obs[0], default_probability)
        path[y] = [y]

    # Recursion for t > 0
    for t in range(1, len(obs)):  # start at 1 because t = 0 was handled by the initialization above
        V.append({})
        new_path = {}

        for y in v_states:
            (prob, state) = max([(V[t - 1].get(y0, default_probability) *
                                  trans_p.get(y0, {}).get(y, default_probability) *
                                  obs_p.get(y, {}).get(obs[t], default_probability), y0) for y0 in v_states])
            V[t][y] = prob
            new_path[y] = path[state] + [y]
        path = new_path

    (prob, max_state) = max([(V[len(obs) - 1][y], y) for y in v_states])
    result = []
    p = re.compile('BM*E|S')   # B = word beginning, M = word middle, E = word end, S = single-character word; M* allows zero or more middle characters
    for i in p.finditer(''.join(path[max_state])):
        start, end = i.span()   # span() returns the (start, end) positions of the match
        word = text[start:end]
        result.append(word)
    return result


def load_model():
    print('loading model... from: {}'.format(model_path))
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model[0], model[1], model[2], model[3]


def save_model():
    print('saving model...')
    model = [trans_mat, emit_mat, init_vec, state_set, observation_set]
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)


def read_data(filename):
    sentences = []
    sentence = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            word_label = line.strip().split('\t')
            if len(word_label) == 2:
                observation_set.add(word_label[0])
                state_set.add(word_label[1])
                sentence.append(word_label)
            else:
                # A blank line marks the end of a sentence.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
    if sentence:
        sentences.append(sentence)  # keep the last sentence if the file does not end with a blank line
    return sentences

 

# Train the model
train()
# Load the model
trans_mat, emit_mat, init_vec, state_set = load_model()
# Segment a sentence with the trained model
res = predict('你在图书馆', state_set, init_vec, trans_mat, emit_mat)
print('//'.join(res))

 

The output is:

begin training......
finish training.....
saving model...
loading model... from: model/hmm20190920.model
你//在//图书馆
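
A side note on the expected training data: read_data() reads one character and its label per line, separated by a tab, and treats a blank line as the end of a sentence. A minimal hand-made sample (illustrative only, not an excerpt of the 199801人民日报 corpus; the two columns are separated by a tab character) would look like:

    你	S
    在	S
    图	B
    书	M
    馆	E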

II. Notes

  1. Why default_probability = 0.000000001 is initialized: it is a smoothing floor, so a character or transition that never appeared in the corpus gets a tiny probability instead of zero (a single zero factor would otherwise eliminate every path passing through it).
  2. Make sure you understand the following loop in predict() (the explanation and a short sketch follow the excerpt):

        for y in v_states:
            (prob, state) = max([(V[t - 1].get(y0, default_probability) *
                                  trans_p.get(y0, {}).get(y, default_probability) *
                                  obs_p.get(y, {}).get(obs[t], default_probability), y0) for y0 in v_states])
            V[t][y] = prob
            new_path[y] = path[state] + [y]

This is the core of the Viterbi recursion for finding the optimal path: for each state y at time t, it takes the maximum over every previous state y0 of (best probability of reaching y0 at time t-1) × (transition probability from y0 to y) × (probability that y emits obs[t]), and records which y0 achieved that maximum so the best path can be reconstructed at the end. The sketch below isolates the max-over-tuples trick used here.
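
A minimal sketch of that step in isolation, with a two-state set and toy numbers invented purely for illustration: Python compares tuples element by element, so max() over (probability, previous_state) pairs returns both the best probability and the previous state that produced it, while dict.get(..., default_probability) supplies the smoothing floor for anything unseen.

    default_probability = 0.000000001
    V_prev  = {'B': 0.03, 'S': 0.01}               # best path probability per state at time t-1 (made up)
    trans_p = {'B': {'E': 0.6}, 'S': {'B': 0.5}}   # toy transition probabilities
    obs_p   = {'E': {'馆': 0.02}}                   # toy emission probabilities
    y, char = 'E', '馆'                             # candidate state and observed character at time t

    prob, state = max([(V_prev.get(y0, default_probability) *
                        trans_p.get(y0, {}).get(y, default_probability) *
                        obs_p.get(y, {}).get(char, default_probability), y0)
                       for y0 in ['B', 'S']])

    print(prob, state)   # ≈ 0.00036 B : the best way to reach 'E' here is to have been in 'B' before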

  3. dict.setdefault(key, default=None) vs. dict.get(key, default=None): get() only reads and never modifies the dictionary, while setdefault() inserts the default under a missing key and then returns the stored value; this is what lets train() build the nested count dictionaries on the fly. A quick comparison is sketched below.
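
A minimal comparison on a toy dictionary invented here, mirroring the counting pattern used in train():

    counts = {}

    # get(): read-only lookup with a fallback; the dictionary is left unchanged.
    print(counts.get('B', 0))   # 0
    print(counts)               # {}

    # setdefault(): inserts the default under a missing key, then returns the stored value,
    # which is what allows the nested counters to be created on first use.
    counts['B']['你'] = counts.setdefault('B', {}).setdefault('你', 0) + 1
    print(counts)               # {'B': {'你': 1}}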
