Python Implementation of the MEMM Algorithm

The difference between MEMM and HMM is that MEMM has no separate transition probability; each step directly models the conditional probability of the next state S(t+1) given the previous state and the current observation.
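Concretely, an HMM Viterbi step scores P(s_t | s_(t-1)) * P(o_t | s_t), while a first-order MEMM step scores the single conditional P(s_t | s_(t-1), o_t); the training code below estimates this conditional by counting (previous tag, character) pairs for each tag and normalizing.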


The training code is as follows:

#-*-coding:utf8-*-
PROB_START = "data/prob_start.py"  # initial state probabilities
PROB_EMIT = "data/prob_emit.py"  # conditional observation matrix P(tag | previous tag, character), unlike HMM's P(character | tag)
#PROB_TRANS = "data/prob_trans.py"  # no transition probabilities needed in MEMM
start_fp = open(PROB_START, 'w', encoding='utf8')
emit_fp = open(PROB_EMIT, 'w', encoding='utf8')
#trans_fp = open(PROB_TRANS, 'w', encoding='utf8')  # no transition probabilities to compute
def getList(input_str):  # input: a word; output: its tag sequence
	"""
	Convert a word into its B, M, E, S tag sequence
	(B = begin, M = middle, E = end, S = single-character word).
	:param input_str: a word from the segmented corpus
	:return: list of tags, one per character
	"""
	output_str = []
	if len(input_str) == 1:
		output_str.append('S')
	elif len(input_str) == 2:
		output_str = ['B', 'E']
	else:
		M_num = len(input_str) - 2
		M_list = ['M'] * M_num
		output_str.append('B')
		output_str.extend(M_list)  # one 'M' per interior character
		output_str.append('E')
	return output_str
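# e.g. getList("中") -> ['S'], getList("中国") -> ['B', 'E'], getList("中国人") -> ['B', 'M', 'E']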
def init():
	"""
	Initialization helper (note: never actually called; the module-level loop
	below builds emit_C over o_tags, which includes the extra "start" pseudo-tag).
	count_tag = {} counts occurrences of each of the tags B, M, E, S
	emit_C holds the conditional P(tag | previous tag, character); S in [B, M, E, S], O is a character
	:return:
	"""
	for A in tags:
		word_dict = {}
		for B in tags:
			prob = {}
			for wordA in words_set:
				prob[wordA] = 0
			word_dict[B] = prob
		emit_C[A] = word_dict
tags = ["B", "M", "E", "S"]
o_tags = ["B", "M", "E", "S", "start"]  # previous-tag set; "start" marks sentence-initial positions
initial = {}
file = open("trainCorpus.txt_utf8", "r", encoding="utf8")
words_set = set()
pre_a = {}  # initial probabilities
line_number = 0  # number of lines
all_words = []  # all characters, one list per line
all_tags = []  # all B/M/E/S tags, one list per line
#trina_B = {}  # no transition probabilities to compute
emit_C = {}  # conditional observation matrix
count_word = {}
count_tag = {}  # total number of characters carrying each of the tags B, E, M, S
#read the corpus: collect characters and their B/M/E/S tags
for line in file:
	line = line.strip()
	# build the character list and the set of distinct characters
	word_list = []
	line_words = []
	line_number = line_number + 1
	for i in range(len(line)):
		if line[i] == " ":
			continue
		word_list.append(line[i])
		words_set.add(line[i])
	words = line.split()  # split() rather than split(" "), so runs of spaces do not yield empty words
	# print(word_list)
	all_words.append(word_list)
	line_tags = []
	for word in words:
		words_tags = getList(word)
		for tag in words_tags:
			line_tags.append(tag)
	# print(line_tags)
	all_tags.append(line_tags)
for tag in tags:
	count_tag[tag] = 0
# zero-initialize emit_C[tag][previous tag][character];
# o_tags adds the "start" pseudo-tag for sentence-initial characters
for A in tags:
	word_dict = {}
	for B in o_tags:
		prob = {}
		for wordA in words_set:
			prob[wordA] = 0
		word_dict[B] = prob
	emit_C[A] = word_dict
#compute the initial probabilities (line_number already holds the total line count here)
for line_tags in all_tags:
	if line_tags[0] in pre_a.keys():
		pre_a[line_tags[0]] = pre_a[line_tags[0]] + 1.0 / line_number
	else:
		pre_a[line_tags[0]] = 1.0 / line_number
	count_tag[line_tags[0]] = count_tag[line_tags[0]] + 1
start_fp.write(str(pre_a))
start_fp.close()
#no transition probabilities needed; this pass only tallies tag counts
for line_tags in all_tags:
	length_tags = len(line_tags)
	for i in range(1, length_tags):
		count_tag[line_tags[i]] = count_tag[line_tags[i]] + 1
		# trina_B[line_tags[i-1]][line_tags[i]] = trina_B[line_tags[i-1]][line_tags[i]] + 1
#count (previous tag, character) pairs for each current tag: emit_C[tag][previous tag][character].
#This is the MEMM conditional table, not HMM's per-tag emission counts.
for i in range(line_number):
	length = len(all_tags[i])
	for j in range(0, length):
		if j == 0:
			emit_C[all_tags[i][j]]["start"][all_words[i][j]] = emit_C[all_tags[i][j]]["start"][all_words[i][j]] + 1.0
		else:
			emit_C[all_tags[i][j]][all_tags[i][j - 1]][all_words[i][j]] = emit_C[all_tags[i][j]][all_tags[i][j - 1]][
																			  all_words[i][j]] + 1.0
# for linex,liney in zip(all_tags,all_words):
# 	for x,y in zip(linex,liney):
# 		emit_C[x][y]=emit_C[x][y]+1.0
# normalize each count by the total count of its tag
for tag in tags:
	for key_pre in emit_C[tag].keys():
		for key_word in emit_C[tag][key_pre].keys():
			emit_C[tag][key_pre][key_word] = 1.0 * emit_C[tag][key_pre][key_word] / count_tag[tag]
emit_fp.write(str(emit_C))
emit_fp.close()
# for key in trina_B.keys():
# 	for other_key in trina_B[key].keys():
# 		trina_B[key][other_key]=1.0*trina_B[key][other_key]/count_tag[key]
for tag in count_tag.keys():
	print(tag,count_tag[tag])
# trans_fp.write(str(trina_B))
# trans_fp.close()
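
After training, data/prob_emit.py holds a nested dict of the form emit_C[tag][previous tag][character]. A minimal sketch of querying it (assuming the training script above has already been run; the sample character is only present if it occurs in the corpus):

# -*- coding: utf-8 -*-
import ast

# load the trained conditional matrix written by the training script above
with open("data/prob_emit.py", "r", encoding="utf8") as f:
	emit_C = ast.literal_eval(f.read())

char = "中"  # sample character
for tag in ["B", "M", "E", "S"]:
	# probability mass for each tag at a sentence-initial position (the "start" pseudo-tag)
	print(tag, emit_C[tag]["start"].get(char, 0.0))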

The testing code is as follows:

# -*- coding: utf-8 -*-
import ast


def load_model(f_name):
	with open(f_name, 'r', encoding='utf8') as ifp:
		return ast.literal_eval(ifp.read())  # the file holds a Python dict literal; literal_eval parses it safely


prob_start = load_model("data/prob_start.py")
prob_emit = load_model("data/prob_emit.py")


def viterbi(obs, states, start_p, emit_p):  # Viterbi algorithm (dynamic programming)
	"""
	:param obs: the sentence to segment
	:param states: the tag set [B, M, E, S]
	:param start_p: initial tag probabilities
	:param emit_p: conditional observation matrix P(tag | previous tag, character)
	:return: the decoded tag list (also printed)
	"""
	alllines = []  # one dict per position: tag -> probability of the best path ending in that tag
	start = {}
	for tag in states:
		if tag in start_p.keys():
			start[tag] = start_p[tag] * emit_p[tag]["start"].get(obs[0], 0.0)
	alllines.append(start)
	length = len(obs)
	path = []  # one dict per position: tag -> the previous tag on the best path
	for i in range(1, length):
		next_dict = {}
		new_path = {}
		for state in states:
			best_value = 0
			best_prev = ""
			for key in alllines[i - 1].keys():
				# characters unseen in training get probability 0
				value = alllines[i - 1][key] * emit_p[state][key].get(obs[i], 0.0)
				if value > best_value:
					best_value = value
					best_prev = key
			next_dict[state] = best_value
			new_path[state] = best_prev
		path.append(new_path)
		alllines.append(next_dict)
	# pick the best final tag, then walk the stored back-pointers
	best_value = 0
	end = ""
	for key in alllines[-1].keys():
		if alllines[-1][key] > best_value:
			best_value = alllines[-1][key]
			end = key
	result = [end]
	for i in range(len(path) - 1, -1, -1):
		result.append(path[i][result[-1]])
	result.reverse()
	print(result)
	return result

def cut(sentence):
	return viterbi(sentence, ['B', 'M', 'E', 'S'], prob_start, prob_emit)


if __name__ == "__main__":
	test_str = u"我们是中国人。"
	cut(test_str)

Compared with the HMM version, only the Viterbi algorithm changes: each step is scored with the single conditional emit_p[tag][previous tag][character] rather than a transition probability multiplied by an emission probability.
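
For contrast, here is a minimal sketch of the per-step score in both models (hmm_step, memm_step, trans_p, and emit_p_hmm are illustrative names, not objects defined in the code above):

def hmm_step(prev_score, trans_p, emit_p_hmm, prev_tag, cur_tag, char):
	# HMM: separate transition and emission factors
	return prev_score * trans_p[prev_tag][cur_tag] * emit_p_hmm[cur_tag][char]

def memm_step(prev_score, emit_p, prev_tag, cur_tag, char):
	# MEMM, as in viterbi() above: a single conditional factor
	return prev_score * emit_p[cur_tag][prev_tag][char]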
