隐马尔科夫实现中文分词(Python3实现)

前言:隐马尔科夫的相关概念就不多说了,就是一个三元组(A,B,Pi),分别表示转移概率,发射概率和初始状态概率。

首先是语料库的训练部分:

#!/usr/bin/python
#-*-coding:utf-8
import sys
import math
import pdb
state_M = 4
word_N = 0

A_dic = {}
B_dic = {}
Count_dic = {}
Pi_dic = {}
word_set = set()
state_list = ['B','M','E','S']
line_num = -1

# 语料库 这里用的是人民日报已经人工分词的语料库
INPUT_DATA = "RenMinData.txt"
# 初始状态概率
PROB_START = "prob_start.py"
# 发射状态概率
PROB_EMIT = "prob_emit.py"
# 转移状态概率
PROB_TRANS = "prob_trans.py"


def init():
    global state_M
    global word_N
    for state in state_list:
        A_dic[state] = {}
        for state1 in state_list:
            A_dic[state][state1] = 0.0
    for state in state_list:
        Pi_dic[state] = 0.0
        B_dic[state] = {}
        Count_dic[state] = 0

# 输入词语  输出状态 B代表其实字,M代表中间字,E代表结束字,S代表单字成词
def getList(input_str):
    outpout_str = []
    if len(input_str) == 1:
        outpout_str.append('S')
    elif len(input_str) == 2:
        outpout_str = ['B','E']
    else:
        M_num = len(input_str) -2
        M_list = ['M'] * M_num
        outpout_str.append('B')
        outpout_str.extend(M_list)
        outpout_str.append('S')
    return outpout_str

# 输出模型的三个参数:初始概率+转移概率+发射概率
def Output():
    start_fp = open(PROB_START,'w')
    emit_fp = open(PROB_EMIT,'w')
    trans_fp = open(PROB_TRANS,'w')
    print("11111111")
    print ("len(word_set) = %s " % (len(word_set)))


    for key in Pi_dic:
        Pi_dic[key] = Pi_dic[key] * 1.0 / line_num
    # start_fp.write(str(start_fp)+str(Pi_dic))
    print(start_fp,Pi_dic)

    for key in A_dic:
        for key1 in A_dic[key]:
            A_dic[key][key1] = A_dic[key][key1] / Count_dic[key]
    # trans_fp.write(str(trans_fp)+str(A_dic))
    print(trans_fp,A_dic)

    for key in B_dic:
        for word in B_dic[key]:
            B_dic[key][word] = B_dic[key][word] / Count_dic[key]
    emit_fp.write(str(emit_fp)+str(B_dic))
    print(emit_fp,B_dic)

    start_fp.close()
    emit_fp.close()
    trans_fp.close()


def main():
    ifp = open(INPUT_DATA,encoding='utf-8')
    init()
    global word_set
    global line_num
    for line in ifp:
        line_num += 1
        if line_num % 10000 == 0:
            print (line_num)

        line = line.strip()
        if not line:
            continue
        # line = line.decode("utf-8","ignore")

        word_list = []
        for i in range(len(line)):
            if line[i] == " ":continue
            word_list.append(line[i])
        word_set = word_set | set(word_list)


        lineArr = line.split(" ")
        line_state = []
        for item in lineArr:
            # 一句话对应一行连续的状态
            line_state.extend(getList(item))

        if len(word_list) != len(line_state):
            print (sys.stderr,"[line_num = %d][line = %s]" % (line_num, line.encode("utf-8",'ignore')))
        else:
            for i in range(len(line_state)):
                if i == 0:
                    # 计算转移概率
                    Pi_dic[line_state[0]] += 1
                    Count_dic[line_state[0]] += 1
                else:
                    # 计算发射概率
                    A_dic[line_state[i-1]][line_state[i]] += 1
                    Count_dic[line_state[i]] += 1
                    # if not B_dic[line_state[i]].has_key(word_list[i]):
                    if word_list[i] not in B_dic[line_state[i]]:
                        B_dic[line_state[i]][word_list[i]] = 0.0
                    else:
                        B_dic[line_state[i]][word_list[i]] += 1
    Output()
    ifp.close()

if __name__ == "__main__":
    main()
然后在训练的基础上,进行测试,用的是维特比方法,该方法是用于降低计算复杂度而产生的,使用了动态规划的思想,笔记中有描述。

#!/usr/bin/python
#-*-coding:utf-8
import os
import sys
import pdb



def load_model(f_name):
    ifp = open(f_name, 'rb')
    return eval(ifp.read())

prob_start = load_model("prob_start.py")
prob_trans = load_model("prob_trans.py")
prob_emit = load_model("prob_emit.py")


def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}] #tabular
    path = {}
    for y in states: #init
        V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
        path[y] = [y]
    for t in range(1,len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            (prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states if V[t-1][y0]>0])
            V[t][y] =prob
            newpath[y] = path[state] + [y]
        path = newpath
    (prob, state) = max([(V[len(obs) - 1][y], y) for y in states])
    return (prob, path[state])

def cut(sentence):
    #pdb.set_trace()
    prob, pos_list =  viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit)
    return (prob,pos_list)

if __name__ == "__main__":
    test_str = u"长春市长春节讲话。"
    prob,pos_list = cut(test_str)
    print  (test_str)
    print  (pos_list)
    test_str = u"他说的确实在理."
    prob,pos_list = cut(test_str)
    print (test_str)
    print (pos_list)

    test_str = u"毛主席万岁。"
    prob,pos_list = cut(test_str)
    print (test_str)
    print (pos_list)

    test_str = u"我有一台电脑。"
    prob,pos_list = cut(test_str)
    print (test_str)
    print (pos_list)

    test_str = u"我不喜欢你。"
    prob, pos_list = cut(test_str)
    print(test_str)
    print(pos_list)
结果为:

隐马尔科夫实现中文分词(Python3实现)_第1张图片

你可能感兴趣的:(NLP)