拼音转汉字

  拼音转汉字主要针对连续的拼音序列(如 woaizhongguorenmingya),将其转化为对应的汉字。主要思路分为两步:第一步是对拼音串进行切分,以得到正确的拼音音节;第二步是利用HMM(隐马尔可夫模型)方法,把拼音视为观测序列、汉字视为隐状态,求解概率最大的汉字状态序列,从而得到最大概率的中文字链。

算法描述:

  第一步:

  1、首先构建拼音的TrieNode树;

  2、利用TrieNode树查找出字符串中所有匹配词;

  3、选取最大匹配词,并将字符串的位置后移;

  4、重复步骤2、3,直到字符串末尾;

  5、输出分割的拼音;

  第二步:

  1、利用汉字转拼音工具,对语料进行拼音转换;

  2、分别统计并计算出拼音到汉字的发射概率、汉字到汉字的转移概率,如p(欢|喜)、p(喜|B)、p(E|欢),其中B表示词条的开始,E表示词条的末尾。

  3、利用维特比算法求解使拼音观测序列生成概率最大的汉字状态序列,从而得到最终的结果。

代码如下:

  

# 拼音转为汉字
import pickle
# 第一步,拼音进行拆分
# 定义TrieNode树的节点
class TrieNode:
    """One node of the pinyin trie: outgoing edges plus an optional stored key."""

    def __init__(self):
        # Outgoing edges, keyed by the next character.
        self.children = {}
        # Key stored at this node; stays None until something assigns it.
        self.value = None
# Search-state record used while cutting: position reached, matched pinyin, and a back-pointer
class SearchIndex:
    """Plain record holding a position, an optional matched string and a parent link.

    Args:
        index: Position value stored on the record.
        char: Value stored as ``char``; defaults to None.
        parent: Value stored as ``parent`` (another record or None).
    """

    def __init__(self, index, char=None, parent=None):
        self.index, self.char, self.parent = index, char, parent
# 定义trie树字典
class Trie:
    """Character trie of pinyin syllables supporting prefix matching.

    Attributes:
        root: Root ``TrieNode``; it never stores a key itself.
        trie_path: File the pickled trie is written to by ``build_trie``.
        pinyin_path: UTF-8 text file with one pinyin syllable per line.
    """

    def __init__(self):
        self.root = TrieNode()
        # Backslashes are escaped explicitly. The previous literals depended on
        # the invalid escapes '\w' and '\p' being passed through unchanged,
        # which newer Python versions warn about. The string values are the same.
        self.trie_path = 'D:\\workspace\\project\\NLPcase\\ping2han\\data\\pinyin_trie.model'
        self.pinyin_path = 'D:\\workspace\\project\\NLPcase\\ping2han\\data\\pinyin.txt'

    def inser(self, key):
        """Insert ``key`` into the trie, creating nodes as needed.

        Args:
            key: String to store; the node reached by its last character
                records the full key in ``value``.
        """
        node = self.root
        for char in key:
            child = node.children.get(char)
            if child is None:
                child = TrieNode()
                node.children[char] = child
            node = child
        # The terminal node stores the whole key so search() can return it.
        node.value = key

    def search(self, key):
        """Return every stored key that is a prefix of ``key``.

        Args:
            key: String whose leading characters are matched against the trie.

        Returns:
            list: Matching stored keys, shortest first. Empty when nothing
            matches.
        """
        node = self.root
        matches = []
        for char in key:
            if char not in node.children:
                break
            node = node.children[char]
            if node.value:
                matches.append(node.value)
        return matches

    def build_trie(self):
        """Build a trie from ``self.pinyin_path`` and pickle it to ``self.trie_path``.

        A fresh ``Trie`` is populated and saved; ``self.root`` is left untouched.

        Raises:
            OSError: If either file cannot be opened.
        """
        trie = Trie()
        with open(self.pinyin_path, encoding='utf-8') as pinyin_file:
            for line in pinyin_file:
                word = line.strip().lower()
                # Skip blank lines so no empty key is stored on the root.
                if word:
                    trie.inser(word)
        # Binary mode must not be given an encoding; open() raises ValueError.
        with open(self.trie_path, 'wb') as f:
            pickle.dump(trie, f)
# 拼音切分
class PinyinCut:
    """Split an unspaced pinyin string into syllables using a pickled trie.

    Attributes:
        trie_path: Location of the pickled trie.
        trie: The object loaded from ``trie_path``; ``cut`` calls its
            ``search`` method.
    """

    def __init__(self):
        # Backslashes are escaped explicitly; the string value is unchanged.
        self.trie_path = 'D:\\workspace\\project\\NLPcase\\ping2han\\data/pinyin_trie.model'
        self.trie = self.load_trie(self.trie_path)

    def load_trie(self, trie_path):
        """Unpickle and return the object stored at ``trie_path``.

        Args:
            trie_path: Path of the pickle file.

        Returns:
            The unpickled object.
        """
        # SECURITY: pickle.load can execute arbitrary code. Only load model
        # files that come from a trusted source.
        with open(trie_path, 'rb') as f:
            return pickle.load(f)

    def cut(self, sent):
        """Segment ``sent`` into pinyin syllables.

        Runs a depth-first search that tries the longest trie match first and
        backtracks to shorter matches when a branch cannot reach the end.

        Args:
            sent: Unspaced pinyin string.

        Returns:
            list: Syllables in order, or an empty list when ``sent`` is empty
            or cannot be fully segmented.
        """
        len_sent = len(sent)
        # Stack of search states. search() yields matches shortest first, so
        # pop() always explores the longest pending match next.
        candidate_index = [SearchIndex(0)]
        # State that reached the end of the input, if any.
        last_index = None
        while candidate_index:
            p = candidate_index.pop()
            if p.index == len_sent:
                # Must be an assignment. The earlier comparison left last_index
                # as None, so every call returned an empty list.
                last_index = p
                break
            for m in self.trie.search(sent[p.index:]):
                candidate_index.append(SearchIndex(p.index + len(m), m, p))

        # Follow the parent links back to the root (which has no syllable),
        # then reverse to restore left-to-right order.
        chars = []
        index = last_index
        while index is not None and index.parent is not None:
            chars.append(index.char)
            index = index.parent
        chars.reverse()
        return chars
# 第二部分,进行拼音转中文
import math
from ping2han.pinyincut import PinyinCut
class PinyinWordTrans:
    """Convert a pinyin string to Chinese characters with a bigram HMM.

    Attributes:
        bigram_dict: ``{previous: {next: probability}}``. The key ``'B'`` is
            looked up for the first character and ``'E'`` is the final state.
        pinyin2word_dict: ``{pinyin: {character: probability}}``.
        wordfreq_dict: Loaded from ``wordfreq_path``; not used in this class.
        pinyincuter: ``PinyinCut`` instance used to split the input.
        min_trans: Fallback probability for an unseen transition.
        min_emit: Fallback used for an unseen start transition in ``trans``.
    """

    def __init__(self):
        # Backslashes are escaped explicitly; the string values are unchanged.
        self.bigram_path = 'D:\\workspace\\project\\NLPcase\\ping2han\\data\\bigram.model'
        self.pinyin2word_path = 'D:\\workspace\\project\\NLPcase\\ping2han\\data/pinyin2word.model'
        self.wordfreq_path = 'D:\\workspace\\project\\NLPcase\\ping2han\\data/wordfreq.model'
        self.bigram_dict = self.load_model(self.bigram_path)
        self.pinyin2word_dict = self.load_model(self.pinyin2word_path)
        self.wordfreq_dict = self.load_model(self.wordfreq_path)
        self.pinyincuter = PinyinCut()
        self.min_trans = 1e-10
        self.min_emit = 1e-10

    def load_model(self, model_path):
        """Read ``model_path`` and evaluate its text as a Python expression.

        Args:
            model_path: Path of a UTF-8 text file holding a Python literal.

        Returns:
            The evaluated object.
        """
        with open(model_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # SECURITY: eval executes whatever the file contains. Only load model
        # files from a trusted source.
        return eval(content)

    def trans(self, sent):
        """Convert the pinyin string ``sent`` into a list of characters.

        Args:
            sent: Unspaced pinyin string.

        Returns:
            list: The decoded characters, one per pinyin syllable.

        Raises:
            KeyError: If a syllable is missing from ``pinyin2word_dict`` or
                ``bigram_dict`` has no ``'B'`` entry.
        """
        pinyin_list = self.pinyincuter.cut(sent)
        # The extra final layer holds only the end marker 'E'.
        route_dict = {len(pinyin_list): {'E': 1.0}}
        for index, pinyin in enumerate(pinyin_list):
            candidates = self.pinyin2word_dict[pinyin]
            if index == 0:
                # The first layer also includes the start transition p(word | 'B').
                # NOTE(review): the fallback here is min_emit, not min_trans;
                # both are 1e-10 by default. Confirm which was intended.
                start_probs = self.bigram_dict['B']
                layer = {}
                for word, p_word in candidates.items():
                    p0 = p_word * start_probs.get(word, self.min_emit)
                    if p0 > 0:
                        layer[word] = p0
                route_dict[index] = layer
            else:
                route_dict[index] = dict(candidates)

        return self.viterbi(route_dict)

    @staticmethod
    def _safe_log(prob):
        """Return ``log(prob)``, or negative infinity when ``prob`` is not positive."""
        return math.log(prob) if prob > 0 else float('-inf')

    def viterbi(self, route_dict):
        """Decode the most probable state sequence with the Viterbi algorithm.

        Scores are accumulated in log space so long inputs do not underflow.
        A back-pointer is kept for every state and the result is read by
        tracing back from the best final state, so the returned sequence is a
        single connected path.

        Args:
            route_dict: ``{t: {state: probability}}`` for ``t = 0..n``. The
                last layer is expected to be the end marker.

        Returns:
            list: Best state for every layer except the last. Empty when
            ``route_dict`` is empty or any layer has no states.
        """
        steps = len(route_dict)
        if steps == 0:
            return []

        # scores[t][state]: log-probability of the best path ending in state at t.
        scores = [{state: self._safe_log(prob) for state, prob in route_dict[0].items()}]
        # back[t][state]: predecessor of state on that best path.
        back = [{}]

        for t in range(1, steps):
            prev_scores = scores[t - 1]
            if not prev_scores:
                # No path can be extended from an empty layer.
                return []
            cur_scores = {}
            cur_back = {}
            for word, word_prob in route_dict[t].items():
                emit_lp = self._safe_log(word_prob)
                best_prev = None
                best_lp = None
                for pre_state, last_lp in prev_scores.items():
                    # Unseen predecessors and unseen successors both fall back
                    # to min_trans, so one unknown state cannot zero a path.
                    trans_p = self.bigram_dict.get(pre_state, {}).get(word, self.min_trans)
                    lp = last_lp + emit_lp + self._safe_log(trans_p)
                    if best_lp is None or lp > best_lp:
                        best_prev, best_lp = pre_state, lp
                cur_scores[word] = best_lp
                cur_back[word] = best_prev
            scores.append(cur_scores)
            back.append(cur_back)

        final_scores = scores[-1]
        if not final_scores:
            return []

        # Trace back from the best final state.
        state = max(final_scores, key=final_scores.get)
        path = [state]
        for t in range(steps - 1, 0, -1):
            state = back[t][state]
            path.append(state)
        path.reverse()
        # Drop the trailing end marker.
        return path[:-1]

总结

主要利用hmm模型求解序列问题,用于学习笔记。

参考资料:

https://blog.csdn.net/jiangzhenkang/article/details/84555947

https://github.com/liuhuanyong/Pinyin2Chinese/blob/master/pinyincut.py

你可能感兴趣的:(信息抽取)