最大正向匹配 mm

分词最基础的方法是最大正向匹配(Maximum Matching,MM),它主要基于词典完成;但存在的问题也很明显,比如无法很好地处理歧义和未登录词。下面是我实现的代码,其实最主要的贡献是提供了一份词典,即人民日报的那份。

def max_forward(dict_file, input_sentence, max_len=4):
    """Segment text with forward maximum matching.

    Scans ``input_sentence`` from left to right. At each position the longest
    candidate (at most ``max_len`` characters) is tried first and shortened
    from the right until it is found in the dictionary. If no candidate of
    two or more characters matches, the single character at that position is
    emitted as its own segment, whether or not it is in the dictionary.

    The dictionary file is re-read on every call and a "dictionary loaded"
    message is printed to stdout once it has been read.

    Args:
        dict_file: Path to a UTF-8 text file (optional BOM tolerated) holding
            one dictionary word per line.
        input_sentence: Text to segment; must be a unicode string when run
            under Python 2 so that slicing works on characters, not bytes.
        max_len: Maximum candidate length in characters; must be >= 1.

    Returns:
        The segments joined by single spaces; an empty string for empty input.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    import io  # local import: io.open decodes text on both Python 2 and 3

    if max_len < 1:
        # A non-positive window would never advance the scan position.
        raise ValueError('max_len must be at least 1')

    # A set makes each membership test O(1) instead of scanning a list.
    with io.open(dict_file, encoding='utf-8-sig') as fi:
        words = {line.strip() for line in fi}

    print('字典加载完成')
    total = len(input_sentence)
    segments = []
    pos = 0
    while pos < total:
        # Start with the widest window that still fits in the remaining text.
        size = min(max_len, total - pos)
        # Shrink from the right until a dictionary word is found; a window of
        # one character is accepted unconditionally as the fallback.
        while size > 1 and input_sentence[pos:pos + size] not in words:
            size -= 1
        segments.append(input_sentence[pos:pos + size])
        pos += size
    return ' '.join(segments)


def max_forward_reverse(dict_file, input_sentence, max_len=4):
    """Segment text with reverse (backward) maximum matching.

    Scans ``input_sentence`` from right to left. At each position the longest
    candidate ending there (at most ``max_len`` characters) is tried first
    and shortened from the left until it is found in the dictionary. If no
    candidate of two or more characters matches, the single character at that
    position is emitted as its own segment, whether or not it is in the
    dictionary.

    The dictionary file is re-read on every call and a "dictionary loaded"
    message is printed to stdout once it has been read.

    Args:
        dict_file: Path to a UTF-8 text file (optional BOM tolerated) holding
            one dictionary word per line.
        input_sentence: Text to segment; must be a unicode string when run
            under Python 2 so that slicing works on characters, not bytes.
        max_len: Maximum candidate length in characters; must be >= 1.

    Returns:
        The segments, in reading order, joined by single spaces; an empty
        string for empty input.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    import io  # local import: io.open decodes text on both Python 2 and 3

    if max_len < 1:
        # A non-positive window would never advance the scan position.
        raise ValueError('max_len must be at least 1')

    # A set makes each membership test O(1) instead of scanning a list.
    with io.open(dict_file, encoding='utf-8-sig') as fi:
        words = {line.strip() for line in fi}

    print('字典加载完成')
    end = len(input_sentence)
    segments = []
    while end > 0:
        # Start with the widest window that still fits before ``end``.
        size = min(max_len, end)
        # Shrink from the left until a dictionary word is found; a window of
        # one character is accepted unconditionally as the fallback.
        while size > 1 and input_sentence[end - size:end] not in words:
            size -= 1
        segments.append(input_sentence[end - size:end])
        end -= size
    # Segments were collected right to left; restore reading order.
    segments.reverse()
    return ' '.join(segments)


# Run both segmenters on the same sentence so their outputs can be compared.
print(max_forward('chinese_dict.txt', u'今后三年中将翻两番'))
print(max_forward_reverse('chinese_dict.txt', u'今后三年中将翻两番'))

词典下载:https://pan.baidu.com/s/1jWkS0X9qHkrgZq_hz2Uq7A (提取码:vwn2)

你可能感兴趣的:(最大正向匹配 mm)