jieba supports three segmentation modes: precise mode, full mode, and search-engine mode.
jieba's segmentation rests on three algorithms: a prefix-dictionary scan that builds a DAG of all possible word combinations in a sentence, dynamic programming to find the maximum-probability path through that DAG, and an HMM (decoded with the Viterbi algorithm) for words not in the dictionary.
Part-of-speech tagging is likewise done with the Viterbi algorithm;
keywords are extracted with the TF-IDF and TextRank models. A short usage sketch of these features follows below.
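For orientation, here is a minimal sketch of these features using jieba's public API; the sample sentence and the topK value are arbitrary examples, not taken from the original article.

# -*- coding: utf-8 -*-
# Minimal jieba usage sketch; sentence and topK are arbitrary illustrative values.
import jieba
import jieba.posseg as pseg
import jieba.analyse

sentence = "我来到北京清华大学"
print("/".join(jieba.cut(sentence, cut_all=False)))    # precise mode
print("/".join(jieba.cut(sentence, cut_all=True)))     # full mode
print("/".join(jieba.cut_for_search(sentence)))        # search-engine mode
print([(w.word, w.flag) for w in pseg.cut(sentence)])  # POS tagging (Viterbi)
print(jieba.analyse.extract_tags(sentence, topK=3))    # TF-IDF keywords
print(jieba.analyse.textrank(sentence, topK=3))        # TextRank keywords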
The first part below is the training code, with detailed comments. The training corpus required is the People's Daily Chinese corpus.
#encoding:utf-8
import sys
import math

A_dic = {}          # state transition counts / probabilities
B_dic = {}          # emission (observation) counts / probabilities
Count_dic = {}      # total number of occurrences of each state B, M, E, S
Pi_dic = {}         # initial state counts / probabilities, i.e. how often a line starts in each state
word_set = set()    # set of all characters seen in the corpus
state_list = ['B', 'M', 'E', 'S']
line_num = 0

INPUT_DATA = "trainCorpus.txt_utf8"
PROB_START = "prob_start.py"   # initial state probabilities
PROB_EMIT = "prob_emit.py"     # emission probabilities
PROB_TRANS = "prob_trans.py"   # transition probabilities

#initialize the dictionaries; the transition matrix A starts out as all zeros
def init():
    for state in state_list:
        A_dic[state] = {}
        for state1 in state_list:
            A_dic[state][state1] = 0.0
    for state in state_list:
        Pi_dic[state] = 0.0
        B_dic[state] = {}
        Count_dic[state] = 0
#map a word to its character-level state sequence (B/M/E/S)
def getList(input_str):
    output_str = []
    if len(input_str) == 1:
        output_str.append('S')
    elif len(input_str) == 2:
        output_str = ['B', 'E']
    else:
        M_num = len(input_str) - 2
        M_list = ['M'] * M_num
        output_str.append('B')
        output_str.extend(M_list)   # one 'M' for every character between the B and the E
        output_str.append('E')
    return output_str
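# For example, getList("你") -> ['S'], getList("北京") -> ['B', 'E'],
# and getList("天安门") -> ['B', 'M', 'E'].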
#normalize the counts into probabilities and dump the three matrices of the model
def Output():
    start_fp = open(PROB_START, 'w')
    emit_fp = open(PROB_EMIT, 'w')
    trans_fp = open(PROB_TRANS, 'w')
    for key in Pi_dic:                 # initial state probabilities
        Pi_dic[key] = float(Pi_dic[key]) / line_num
    print(Pi_dic, file=start_fp)
    for key in A_dic:                  # state transition probabilities
        for key1 in A_dic[key]:
            A_dic[key][key1] = A_dic[key][key1] / Count_dic[key]
    print(A_dic, file=trans_fp)
    for key in B_dic:                  # emission probabilities
        for word in B_dic[key]:
            B_dic[key][word] = B_dic[key][word] / Count_dic[key]
    print(B_dic, file=emit_fp)
    start_fp.close()
    emit_fp.close()
    trans_fp.close()
def main():
    ifp = open(INPUT_DATA, 'rb')
    init()
    global word_set
    global line_num
    for line in ifp.readlines():
        line = line.strip()
        if not line: continue
        line_num += 1                      # count only non-empty lines
        line = line.decode("utf-8", "ignore")
        word_list = []                     # every character of the line, spaces removed
        for i in range(len(line)):
            if line[i] == " ": continue
            word_list.append(line[i])
        word_set = word_set | set(word_list)   # all characters seen so far in the corpus
        lineArr = line.split(" ")          # the words of this pre-segmented line
        line_state = []                    # B/M/E/S label of every character of the line
        for item in lineArr:
            line_state.extend(getList(item))
        for i in range(len(line_state)):   # walk over the labels of the line
            if i == 0:
                Pi_dic[line_state[0]] += 1        # the line starts in this state
                Count_dic[line_state[0]] += 1     # count every occurrence of each state
            else:
                A_dic[line_state[i-1]][line_state[i]] += 1   # transition counts
                Count_dic[line_state[i]] += 1
                # emission counts: how often this state emits this character
                B_dic[line_state[i]][word_list[i]] = B_dic[line_state[i]].get(word_list[i], 0.0) + 1
    Output()
    ifp.close()

if __name__ == "__main__":
    main()
The code above uses the training corpus to estimate the three components of the HMM: the initial probability vector (the probability that a sentence starts in state B/M/E/S), the transition matrix (the probability of moving from one state to another, e.g. from B to E), and the emission matrix (the probability that a given state produces a given character). The next part uses these three matrices to segment test sentences.
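Each of the dumped files holds a single Python dict literal. Their shapes look roughly like the sketch below; the numbers are invented for illustration, not values learned from the corpus.

# Illustrative shapes only; the probabilities here are made up, not learned values.
# prob_start.py : {'B': 0.6, 'M': 0.0, 'E': 0.0, 'S': 0.4}
# prob_trans.py : {'B': {'B': 0.0, 'M': 0.15, 'E': 0.85, 'S': 0.0}, 'M': {...}, ...}
# prob_emit.py  : {'B': {'中': 0.002, '北': 0.001, ...}, 'S': {'的': 0.05, ...}, ...}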
# -*- coding: utf-8 -*-
def load_model(f_name):
    # each dumped file contains a single dict literal; read it back with eval
    ifp = open(f_name, 'rb').read()
    ifp = ifp.decode('GB2312', "ignore")
    return eval(ifp)

import chardet
# check which encoding the training script actually used when writing the dumps
file = open("prob_emit.py", 'rb').read()
f_charInfo = chardet.detect(file)
print(f_charInfo)

prob_start = load_model("prob_start.py")
prob_trans = load_model("prob_trans.py")
prob_emit = load_model("prob_emit.py")
def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]     # V[t][y] = max probability of a path that ends in state y at position t
    path = {}
    for y in states:    # initialization with the first character
        V[0][y] = start_p[y] * emit_p[y].get(obs[0], 0)
        path[y] = [y]
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:    # recursion: pick the best predecessor y0 for state y
            (prob, state) = max([(V[t-1][y0] * trans_p[y0].get(y, 0) * emit_p[y].get(obs[t], 0), y0) for y0 in states if V[t-1][y0] > 0])
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath      # keep only the best path ending in each state
    (prob, state) = max([(V[len(obs) - 1][y], y) for y in states])   # best state at the last position
    return (prob, path[state])   # maximum probability and its state sequence
def cut(sentence):
    # run Viterbi over the whole sentence and return the best B/M/E/S sequence
    prob, pos_list = viterbi(sentence, ('B', 'M', 'E', 'S'), prob_start, prob_trans, prob_emit)
    return (prob, pos_list)
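cut() only returns the B/M/E/S label of each character. One possible way to assemble those labels back into words is sketched below; the test sentence is an arbitrary example, not taken from the original article.

# Hypothetical test drive: group characters into words according to the B/M/E/S labels.
test_str = "这是一个非常棒的方案"   # arbitrary example sentence
prob, pos_list = cut(test_str)
words = []
begin = 0
for i, tag in enumerate(pos_list):
    if tag == 'B':        # start of a multi-character word
        begin = i
    elif tag == 'E':      # end of a multi-character word
        words.append(test_str[begin:i + 1])
    elif tag == 'S':      # single-character word
        words.append(test_str[i])
print(prob, words)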