Hands-On Part-of-Speech Tagging with an HMM

A hidden Markov model has three classic problems. The third is decoding: given a known model and an observation sequence, predict the hidden state sequence I = (i_1, i_2, ..., i_n). This is solved with the Viterbi algorithm. Because it is easy to slip in bugs or logic errors when writing Viterbi, every step of the code below is commented.
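Stated in the standard HMM notation: given the model \lambda = (A, B, \pi) and an observation sequence O = (o_1, o_2, \dots, o_n), decoding asks for the state sequence

I^* = \arg\max_{I} P(I \mid O, \lambda),

which the Viterbi algorithm computes exactly with dynamic programming.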

First, initialize the dictionaries and parameters. This mainly means building the mappings between ids and tags/words, and defining the transition matrix A, the initial probability vector Π, and the emission (observation) probability matrix B.
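For reference, the parsing code below expects one word/tag pair per line, separated by a slash, with a standalone './.' line marking the end of a sentence. A few hypothetical lines of traindata.txt (the actual file may differ) would look like:

The/DT
dog/NN
barks/VBZ
./.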

Next, count the frequencies and finally convert them to probabilities; this is the train function.
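The conversion is a plain maximum-likelihood estimate; in formulas, what train computes is

\pi_i = \frac{C(\text{sentence starts with tag } i)}{\sum_k C(\text{sentence starts with tag } k)}, \qquad
a_{ij} = \frac{C(i \to j)}{\sum_k C(i \to k)}, \qquad
b_j(o) = \frac{C(\text{tag } j \text{ emits } o)}{\sum_{o'} C(\text{tag } j \text{ emits } o')}

where C(\cdot) counts occurrences in the training file.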

Finally, predict with the Viterbi algorithm. This uses dynamic programming (the dp table) and records the best-scoring previous state at each step in ptr.
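The recurrence that dp and ptr implement is the standard Viterbi recursion, written here in log space to match the code:

\delta_1(j) = \log \pi_j + \log b_j(o_1)
\delta_t(j) = \max_k \big[ \delta_{t-1}(k) + \log a_{kj} \big] + \log b_j(o_t), \quad 2 \le t \le n
\psi_t(j) = \arg\max_k \big[ \delta_{t-1}(k) + \log a_{kj} \big]

Here \delta corresponds to dp and \psi to ptr; the best path is recovered by backtracking from \arg\max_j \delta_n(j).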

import numpy as np

def log(v):
    # Guard against log(0): shift zero probabilities by a tiny epsilon
    # so they become a large negative number instead of -inf
    if v == 0:
        return np.log(v + 0.000001)
    return np.log(v)

class HMM_PosTagger():
    def __init__(self):
        # Initialize the id<->tag and id<->word dictionaries
        self.id2tag, self.tag2id = {}, {}
        self.word2id, self.id2word = {}, {}

        # Initialize the parameters
        self.implicit_num = 0   # number of hidden states (tags)
        self.explicit_num = 0   # number of observed symbols (words)

        self.pi = None
        self.A = None
        self.B = None

    # Build the dictionaries from the data
    def init_stat(self, textfile):
        for line in open(textfile, 'r', encoding='utf-8'):
            items = line.split("/")
            # Skip malformed lines, mirroring the check in train()
            if len(items) != 2:
                continue
            word, tag = items[0], items[1].rstrip()

            if word not in self.word2id:
                self.word2id[word] = len(self.word2id)
                self.id2word[len(self.id2word)] = word

            if tag not in self.tag2id:
                self.tag2id[tag] = len(self.tag2id)
                self.id2tag[len(self.id2tag)] = tag

        print("tag size: {}".format(len(self.tag2id)))
        print("tag2id:{}".format(self.tag2id))

        self.implicit_num = len(self.tag2id)
        self.explicit_num = len(self.word2id)

        self.pi = np.zeros(self.implicit_num)                      # initial tag counts
        self.A = np.zeros((self.implicit_num, self.implicit_num))  # tag-to-tag transition counts
        self.B = np.zeros((self.implicit_num, self.explicit_num))  # tag-to-word emission counts

    def train(self, train_file):

        # Accumulate counts for pi, A, and B
        pre_tag = ""
        for line in open(train_file, 'r', encoding='utf-8'):
            items = line.split('/')
            if len(items) != 2:
                continue

            word, tag = items[0], items[1].rstrip()

            word_id, tag_id = self.word2id[word], self.tag2id[tag]

            if pre_tag == "":
                self.pi[tag_id] += 1
                self.B[tag_id][word_id] += 1
            else:
                self.A[self.tag2id[pre_tag]][tag_id] += 1
                self.B[tag_id][word_id] += 1
            if items[0] == '.':
                pre_tag = ""
            else:
                pre_tag = tag

        # Convert counts to probabilities (maximum-likelihood estimates)
        self.pi = self.pi / sum(self.pi)
        for i in range(self.implicit_num):
            # Guard against all-zero rows (e.g. the '.' tag never precedes
            # another tag here), which would otherwise divide by zero
            if self.A[i].sum() > 0:
                self.A[i] /= self.A[i].sum()
            if self.B[i].sum() > 0:
                self.B[i] /= self.B[i].sum()


    # Time complexity: implicit_num * implicit_num * seq_len
    def viterbi_decode(self, in_seq):
        # Any word not seen during training will raise a KeyError here
        tokens = [self.word2id[word] for word in in_seq.split(" ")]
        seq_len = len(tokens)

        dp = np.zeros((seq_len, self.implicit_num))              # best log-probability ending in each state
        ptr = np.zeros((seq_len, self.implicit_num), dtype=int)  # best previous state, for backtracking

        # Initialization: start in state j and emit the first token
        for j in range(self.implicit_num):
            dp[0][j] = log(self.pi[j]) + log(self.B[j][tokens[0]])

        for i in range(1, seq_len):
            # j indexes the state at the current position
            for j in range(self.implicit_num):
                dp[i][j] = -np.inf
                # k indexes the state at the previous position
                for k in range(self.implicit_num):
                    # dp is already in log space, so log A and log B are added directly
                    score = dp[i-1][k] + log(self.A[k][j]) + log(self.B[j][tokens[i]])
                    if score > dp[i][j]:
                        dp[i][j] = score
                        ptr[i][j] = k

        # Backtrace: pick the best final state, then follow ptr backwards
        best_seq = [0] * seq_len
        best_seq[seq_len - 1] = int(np.argmax(dp[seq_len - 1]))

        for i in range(seq_len - 2, -1, -1):
            best_seq[i] = int(ptr[i + 1][best_seq[i + 1]])

        return [self.id2tag[i] for i in best_seq]

if __name__ == '__main__' :
    tagger = HMM_PosTagger()

    tagger.init_stat('traindata.txt')

    tagger.train('traindata.txt')

    sentence = "U.K. will be die"
    print(sentence)
    tags = tagger.viterbi_decode(sentence)
    print(tags)
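One caveat: viterbi_decode raises a KeyError for any word outside the training vocabulary. A minimal guard, assuming unknown words are simply dropped (safe_decode is a hypothetical helper, not part of the original code; a real tagger would smooth B instead):

def safe_decode(tagger, sentence):
    # Drop out-of-vocabulary words so the word2id lookup cannot fail
    known = [w for w in sentence.split(" ") if w in tagger.word2id]
    return tagger.viterbi_decode(" ".join(known)) if known else []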

The training text is available at the link below for anyone who needs it. Writing this up took real effort, so leave a like if it helped. Thanks!

Link: https://pan.baidu.com/s/10M1R5qfbU4P36Cd5SaP4Vw  Extraction code: azzn
