The difference between MEMM and HMM is that MEMM has no separate transition probability; it only computes the probability of the state S(t+1) conditioned on the previous state and the current observation.
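In the usual textbook notation (my summary, not from the original post), the HMM factorizes the joint probability with separate transition and emission terms, while the MEMM models one conditional factor directly:

$$P_{\text{HMM}}(S, O) = \prod_t P(s_t \mid s_{t-1})\, P(o_t \mid s_t), \qquad P_{\text{MEMM}}(S \mid O) = \prod_t P(s_t \mid s_{t-1}, o_t)$$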
Reference:
The training code is as follows:
# -*- coding: utf-8 -*-
PROB_START = "data/prob_start.py"  # initial state probabilities
PROB_EMIT = "data/prob_emit.py"    # observation probabilities conditioned on both the current tag and the previous tag, unlike HMM's P(o_t | s_t)
# PROB_TRANS = "data/prob_trans.py"  # no transition probabilities are needed
start_fp = open(PROB_START, 'w', encoding='utf8')
emit_fp = open(PROB_EMIT, 'w', encoding='utf8')
# trans_fp = open(PROB_TRANS, 'w', encoding='utf8')  # no transition probabilities to compute
def getList(input_str):
    """
    Convert one word into its tag sequence in the B, M, E, S scheme
    (Begin, Middle, End, Single).
    :param input_str: one word from the corpus
    :return: list of tags, one per character
    """
    output_str = []
    if len(input_str) == 1:
        output_str.append('S')
    elif len(input_str) == 2:
        output_str = ['B', 'E']
    else:
        M_num = len(input_str) - 2
        M_list = ['M'] * M_num
        output_str.append('B')
        output_str.extend(M_list)  # append an 'M' for each interior character
        output_str.append('E')
    return output_str
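A quick sanity check of getList; the outputs follow directly from the three branches above:
# getList(u"我")     -> ['S']
# getList(u"我们")   -> ['B', 'E']
# getList(u"中国人") -> ['B', 'M', 'E']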
def init():
    """
    Initialization: build the nested dictionary of conditional emission
    counts emit_C[tag][previous_tag][word], every count set to 0.
    The tags are [B, M, E, S]; "start" is a pseudo previous tag marking
    the beginning of a line; the innermost keys are the characters.
    count_tag = {} counts the occurrences of each of B, M, E, S.
    Unlike HMM, no nested dictionary of transition counts is needed.
    """
    for A in tags:
        word_dict = {}
        for B in o_tags:
            prob = {}
            for wordA in words_set:
                prob[wordA] = 0
            word_dict[B] = prob
        emit_C[A] = word_dict
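For orientation, the resulting nested structure looks like this (illustrative entries only):
# emit_C = {
#   "B": {"B": {u"我": 0, u"们": 0, ...}, "M": {...}, "E": {...}, "S": {...}, "start": {...}},
#   "M": {...},
#   "E": {...},
#   "S": {...},
# }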
tags = ["B", "M", "E", "S"]
o_tags = ["B", "M", "E", "S", "start"]  # possible previous tags; "start" marks the line beginning
corpus = open("trainCorpus.txt_utf8", "r", encoding="utf8")
words_set = set()
pre_a = {}        # initial probabilities
line_number = 0   # number of lines
all_words = []    # characters, one list per line
all_tags = []     # B/M/E/S tags, one list per line
# trina_B = {}    # transition counts are not needed
emit_C = {}       # conditional emission probabilities
count_tag = {}    # total number of characters under each of B, M, E, S
# Read the corpus: collect the character and tag sequences for every line
for line in corpus:
    line = line.strip()
    if not line:
        continue  # skip blank lines; otherwise line_tags[0] below would fail
    word_list = []
    line_number = line_number + 1
    for i in range(len(line)):
        if line[i] == " ":
            continue
        word_list.append(line[i])
        words_set.add(line[i])  # build the set of all observed characters
    words = line.split()
    all_words.append(word_list)
    line_tags = []
    for word in words:
        words_tags = getList(word)
        for tag in words_tags:
            line_tags.append(tag)
    all_tags.append(line_tags)
corpus.close()
for tag in tags:
    count_tag[tag] = 0
init()  # build the zero-filled emit_C[tag][previous_tag][word] table
# Initial probabilities: relative frequency of each tag at the start of a line
for line_tags in all_tags:
    if line_tags[0] in pre_a.keys():
        pre_a[line_tags[0]] = pre_a[line_tags[0]] + 1.0 / line_number
    else:
        pre_a[line_tags[0]] = 1.0 / line_number
    count_tag[line_tags[0]] = count_tag[line_tags[0]] + 1
start_fp.write(str(pre_a))
start_fp.close()
# No transition probabilities to compute; only the per-tag totals are accumulated
for line_tags in all_tags:
    length_tags = len(line_tags)
    for i in range(1, length_tags):
        count_tag[line_tags[i]] = count_tag[line_tags[i]] + 1
        # trina_B[line_tags[i-1]][line_tags[i]] += 1  # (HMM transition count, not needed here)
# Conditional emission counts emit_C[current_tag][previous_tag][character];
# unlike HMM, the observation is conditioned on the previous tag as well
for i in range(line_number):
    length = len(all_tags[i])
    for j in range(length):
        if j == 0:
            # the first character of a line is conditioned on the pseudo-tag "start"
            emit_C[all_tags[i][j]]["start"][all_words[i][j]] += 1.0
        else:
            emit_C[all_tags[i][j]][all_tags[i][j - 1]][all_words[i][j]] += 1.0
# (the HMM version simply counted emit_C[tag][word] += 1.0 here)
# Normalize the counts by the total count of the current tag
for tag in tags:
    for key_pre in emit_C[tag].keys():
        for key_word in emit_C[tag][key_pre].keys():
            emit_C[tag][key_pre][key_word] = 1.0 * emit_C[tag][key_pre][key_word] / count_tag[tag]
emit_fp.write(str(emit_C))
emit_fp.close()
# (an HMM version would also normalize and save the transition counts trina_B here)
for tag in count_tag.keys():
    print(tag, count_tag[tag])
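To make the counting concrete, here is a self-contained toy version of the same loop on a two-line corpus (names like counts and tag_totals are mine, not the script's):

# Toy MEMM-style emission table: counts[tag][prev_tag][char] / tag_totals[tag]
corpus_lines = [u"我们 是 中国 人", u"我 是 人"]

def tags_of(word):
    if len(word) == 1:
        return ['S']
    return ['B'] + ['M'] * (len(word) - 2) + ['E']

counts = {}      # counts[tag][prev_tag][char]
tag_totals = {}  # total characters seen under each tag
for line in corpus_lines:
    chars = [c for w in line.split() for c in w]
    char_tags = [t for w in line.split() for t in tags_of(w)]
    for j, (t, c) in enumerate(zip(char_tags, chars)):
        prev = "start" if j == 0 else char_tags[j - 1]
        counts.setdefault(t, {}).setdefault(prev, {}).setdefault(c, 0)
        counts[t][prev][c] += 1
        tag_totals[t] = tag_totals.get(t, 0) + 1

# The script's normalization: count(tag, prev, char) / count(tag)
print(counts["S"]["E"][u"是"] / tag_totals["S"])  # 1/5 = 0.2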
The test code is as follows:
# -*- coding: utf-8 -*-
def load_model(f_name):
    # eval() takes the string saved by the training script and evaluates
    # it back into a Python dict
    with open(f_name, 'r', encoding='utf8') as ifp:
        return eval(ifp.read())
prob_start = load_model("data/prob_start.py")
prob_emit = load_model("data/prob_emit.py")
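One caveat on this design choice: eval() will execute whatever code is in the file. Since the training script saves plain dict literals via str(), ast.literal_eval is a safer drop-in (a sketch; load_model_safe is my name, not the post's):

import ast

def load_model_safe(f_name):
    # literal_eval only evaluates Python literals (dicts, lists, strings, numbers)
    with open(f_name, 'r', encoding='utf8') as ifp:
        return ast.literal_eval(ifp.read())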
def viterbi(obs, states, start_p, emit_p):  # Viterbi algorithm (dynamic programming)
    """
    :param obs: the sentence to segment
    :param states: the set of tags [B, M, E, S]
    :param start_p: initial tag probabilities
    :param emit_p: conditional emission matrix indexed by current tag,
                   previous tag, and character; there is no separate
                   transition matrix in the MEMM version
    :return: the most probable tag sequence
    """
    alllines = []  # one dict per position: tag -> best probability so far
    start = {}
    for tag in states:
        if tag in start_p.keys():
            # .get() guards against characters never seen in training
            start[tag] = start_p[tag] * emit_p[tag]["start"].get(obs[0], 0.0)
    alllines.append(start)
    length = len(obs)
    path = []  # one dict per position: tag -> best previous tag
    for i in range(1, length):
        next_dict = {}
        new_path = {}
        for state in states:
            best_prob = 0
            best_prev = ""
            for key in alllines[i - 1].keys():
                # MEMM update: previous best probability times one conditional
                # factor; the HMM transition and emission terms are merged
                value = alllines[i - 1][key] * emit_p[state][key].get(obs[i], 0.0)
                if value > best_prob:
                    best_prob = value
                    best_prev = key
            next_dict[state] = best_prob
            new_path[state] = best_prev
        path.append(new_path)
        alllines.append(next_dict)
    # Trace back the best path from the most probable final tag
    best_prob = 0
    end = ""
    for key in alllines[-1].keys():
        if alllines[-1][key] > best_prob:
            best_prob = alllines[-1][key]
            end = key
    result = [end]
    for i in range(len(alllines) - 2, -1, -1):
        result.append(path[i][result[-1]])
    result.reverse()
    print(result)
    return result
def cut(sentence):
    viterbi(sentence, ['B', 'M', 'E', 'S'], prob_start, prob_emit)

if __name__ == "__main__":
    test_str = u"我们是中国人。"
    cut(test_str)
Compared with HMM, the Viterbi algorithm is modified: the separate transition and emission factors are replaced by a single conditional factor, so each step multiplies the previous best probability by one table lookup instead of two.
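Side by side, the inner update of the two versions (the HMM line is reconstructed from the standard formulation, not taken from this post):

# HMM: separate transition and emission tables
value = alllines[i - 1][key] * trans_p[key][state] * emit_p[state][obs[i]]

# MEMM (this post): one table conditioned on the previous tag
value = alllines[i - 1][key] * emit_p[state][key][obs[i]]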