隐马尔可夫模型(HMM)与维特比算法:用 Python 实现中文分词(训练 + 预测)

原理参考链接

import numpy as np

def hmm_vtb(A, B, pi, O):
    """Return the most likely hidden-state path for ``O`` (Viterbi decoding).

    Args:
        A: (N, N) transition weights; ``A[i][j]`` scores moving from state
            ``i`` to state ``j``.
        B: (N, M) emission weights; ``B[i][k]`` scores state ``i`` emitting
            observation symbol ``k``.
        pi: length-N initial state weights.
        O: sequence of observation symbol indices (column indices into ``B``).

    Returns:
        A list of plain ``int`` state indices, one per observation. An empty
        ``O`` yields an empty list.
    """
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    pi = np.asarray(pi, dtype=float)

    n_obs = len(O)
    if n_obs == 0:
        return []

    def rescale(scores):
        # Dividing by a positive constant never changes which entry is the
        # largest, but it stops the running product from underflowing to 0.0
        # on long sequences (which would make every argmax collapse to 0).
        total = scores.sum()
        return scores / total if total > 0 else scores

    emit = B.T  # emit[k] is the per-state weight of emitting symbol k
    backptr = np.zeros((n_obs, len(A)), dtype=int)

    # 1. Initialise with the first observation.
    delta = rescale(pi * emit[O[0]])

    # 2. Recurrence: candidates[j, i] = delta[i] * A[i, j]; keep the best
    #    predecessor i for every state j.
    for t in range(1, n_obs):
        candidates = delta * A.T
        backptr[t] = candidates.argmax(axis=1)
        delta = rescale(candidates.max(axis=1) * emit[O[t]])

    # 3. Best final state, then 4. follow the back-pointers to the start.
    path = [int(delta.argmax())]
    for t in range(n_obs - 1, 0, -1):
        path.append(int(backptr[t, path[-1]]))
    path.reverse()
    return path

# Toy model: 5 hidden states, 2 observation symbols.
# NOTE(review): the rows of A and the entries of pi do not sum to 1, so these
# are unnormalised weights rather than true probabilities.
A = np.array([
    [0.5, 0.2, 0.3, 0.2, 0.4],
    [0.3, 0.5, 0.2, 0.2, 0.4],
    [0.2, 0.3, 0.5, 0.2, 0.4],
    [0.5, 0.2, 0.3, 0.2, 0.4],
    [0.5, 0.2, 0.3, 0.2, 0.4],
])
B = np.array([
    [0.5, 0.5],
    [0.4, 0.6],
    [0.7, 0.3],
    [0.7, 0.3],
    [0.7, 0.3],
])
pi = [0.2, 0.4, 0.4, 0.2, 0.8]
O = [1, 1, 1, 0, 1]

# Decode the observation sequence and show the resulting state path.
a = hmm_vtb(A, B, pi, O)
print(a)
# Expected output: [1, 1, 1, 4, 0]

基于 HMM 的中文分词(训练、预测)

import numpy as np
import re

class hmm(object):
    """First-order hidden Markov model tagger estimated from a tagged corpus.

    The corpus file holds one ``token tag`` pair per line (separated by a
    single space); blank lines separate sentences.

    Attributes:
        path: corpus file path given to the constructor.
        data: list of sentences, each a list of ``[token, tag]`` pairs.
        Q: sorted list of distinct tags (the hidden states).
        V: sorted list of distinct tokens (the observation vocabulary).
        pi, A, B: initial, transition and emission probabilities, set by
            :meth:`train`. Rows/entries follow the order of ``Q`` (and ``V``
            for the columns of ``B``).
    """

    def __init__(self, path):
        """Remember ``path`` and immediately load the corpus from it."""
        self.path = path
        self.clean_data()

    def clean_data(self):
        """Parse the corpus file into ``self.data``, ``self.Q`` and ``self.V``.

        Any run of blank (or whitespace-only) lines ends the current sentence,
        so trailing newlines and repeated blank lines are harmless. Fields
        after the second one on a line are ignored.

        Raises:
            ValueError: if a non-blank line does not contain a token and a tag.
        """
        with open(self.path, encoding="utf-8") as f:
            text = f.read()

        self.data = []
        sentence = []
        for line_no, line in enumerate(text.split("\n"), start=1):
            if not line.strip():
                # Sentence boundary; ignore it when no sentence is open.
                if sentence:
                    self.data.append(sentence)
                    sentence = []
                continue
            fields = line.split(" ")
            if len(fields) < 2:
                raise ValueError(
                    "{}:{}: expected 'token tag', got {!r}".format(
                        self.path, line_no, line
                    )
                )
            sentence.append(fields[:2])
        if sentence:
            self.data.append(sentence)

        self.Q = sorted({pair[1] for sent in self.data for pair in sent})
        self.V = sorted({pair[0] for sent in self.data for pair in sent})

    @staticmethod
    def _row_normalize(counts):
        """Divide each row by its sum; rows that sum to zero stay all-zero."""
        totals = counts.sum(axis=1, keepdims=True)
        totals[totals == 0] = 1.0  # avoids 0/0 for tags that were never a source
        return counts / totals

    @staticmethod
    def _rescale(scores):
        """Scale ``scores`` to sum to 1 (argmax-preserving underflow guard)."""
        total = scores.sum()
        return scores / total if total > 0 else scores

    def train(self):
        """Estimate ``pi``, ``A`` and ``B`` by relative frequency in one pass.

        * ``pi[q]``   - share of sentences whose first tag is ``q``.
        * ``A[p][q]`` - share of tag bigrams starting with ``p`` that continue
          with ``q``; a tag that never starts a bigram gets an all-zero row.
        * ``B[q][w]`` - share of the occurrences of tag ``q`` that emit ``w``.

        Probabilities are kept at full precision: rounding them would turn
        rare events into exact zeros and make seen tokens undecodable.

        Raises:
            ValueError: if the corpus contains no sentences.
        """
        if not self.data:
            raise ValueError("cannot train on an empty corpus")

        tag_id = {tag: i for i, tag in enumerate(self.Q)}
        token_id = {token: i for i, token in enumerate(self.V)}

        start = np.zeros(len(self.Q))
        trans = np.zeros((len(self.Q), len(self.Q)))
        emit = np.zeros((len(self.Q), len(self.V)))

        for sent in self.data:
            tags = [tag_id[pair[1]] for pair in sent]
            start[tags[0]] += 1
            for pair, tag in zip(sent, tags):
                emit[tag, token_id[pair[0]]] += 1
            for prev_tag, next_tag in zip(tags, tags[1:]):
                trans[prev_tag, next_tag] += 1

        self.pi = start / start.sum()
        self.A = self._row_normalize(trans)
        self.B = self._row_normalize(emit)

    def predict(self, sent):
        """Tag every element of ``sent`` with its most likely hidden state.

        Uses Viterbi decoding with per-step rescaling so that long inputs do
        not underflow. Tokens that are not in the vocabulary get a uniform
        emission score, i.e. their tag is decided by the transitions alone.

        Args:
            sent: a string (tagged character by character) or any iterable
                of tokens.

        Returns:
            A list with one tag from ``self.Q`` per token; ``[]`` for empty
            input.
        """
        tokens = list(sent)
        if not tokens:
            return []

        token_id = {token: i for i, token in enumerate(self.V)}
        n_obs, n_states = len(tokens), len(self.A)

        # emissions[t] holds the per-state score of observing tokens[t].
        emissions = np.ones((n_obs, n_states))
        for t, token in enumerate(tokens):
            column = token_id.get(token)
            if column is not None:
                emissions[t] = self.B[:, column]

        backptr = np.zeros((n_obs, n_states), dtype=int)

        # 1. Initialise with the first observation.
        delta = self._rescale(self.pi * emissions[0])

        # 2. Recurrence: candidates[j, i] = delta[i] * A[i, j].
        for t in range(1, n_obs):
            candidates = delta * self.A.T
            backptr[t] = candidates.argmax(axis=1)
            delta = self._rescale(candidates.max(axis=1) * emissions[t])

        # 3. Best final state, then 4. follow the back-pointers to the start.
        path = [int(delta.argmax())]
        for t in range(n_obs - 1, 0, -1):
            path.append(int(backptr[t, path[-1]]))
        path.reverse()
        return [self.Q[state] for state in path]

path = "./nlp.txt"
sentence = "你想吃麻辣烫吗"

# Build the model from the tagged corpus and estimate its parameters.
h = hmm(path)
h.train()

# predict() returns one tag per character. Joining the tags into a string lets
# a regex carve out words: "b" followed by one or more "i", or any lone tag.
# NOTE(review): assumes every tag is a single character, so offsets in the tag
# string line up with offsets in the sentence - confirm against the corpus.
result = h.predict(sentence)
result = [
    sentence[slice(*match.span())]
    for match in re.finditer("bi+|o|b|i", "".join(result))
]
print(result)
# Expected output: ['你', '想吃', '麻辣烫', '吗']

你可能感兴趣的:(人工智能,python,算法,概率论)