# 自然语言处理NLP_中文分词_逆向最大匹配算法

"""
    逆向最大匹配算法
"""
#Module-level storage for the dictionary entries (filled by init)
dict_words = []

#Initialization function: load the dictionary
def init(dict_path="dict/dict.txt"):
    """Load the dictionary file into the module-level ``dict_words`` list.

    The file is read as UTF-8 text with one word per line. Surrounding
    whitespace is stripped from every line and blank lines are skipped.
    Words are appended to ``dict_words``, so calling this function again
    adds the entries a second time.

    Args:
        dict_path: Path of the dictionary file. Defaults to the
            ``dict/dict.txt`` location relative to the working directory.

    Raises:
        OSError: If the dictionary file cannot be opened.
    """
    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM
    # that would otherwise become part of the first dictionary word.
    with open(dict_path, "r", encoding="utf-8-sig") as dict_input:
        for line in dict_input:
            word = line.strip()
            # Skip blank lines so no empty-string "word" is stored.
            if word:
                dict_words.append(word)

#Segmentation function
def cut_words(words_input,dict_words):
    """Segment text with the reverse (backward) maximum matching algorithm.

    Scanning starts at the end of the input. At each position the longest
    suffix that is a dictionary word is cut off; when no suffix matches,
    a single character is cut off instead. The collected segments are
    returned in their original left-to-right order.

    Args:
        words_input: The string to segment. Leading and trailing
            whitespace is stripped before segmentation.
        dict_words: Iterable of dictionary words (strings).

    Returns:
        A list of segments. An empty (or whitespace-only) input yields
        an empty list. With an empty dictionary every character becomes
        its own segment.
    """
    text = words_input.strip()

    # Build the lookup set once: membership tests are O(1) instead of a
    # linear scan of the dictionary list for every candidate substring.
    vocabulary = set(dict_words)

    # Longest dictionary entry bounds the matching window. Clamp to at
    # least 1 so an empty dictionary (or one holding only empty strings)
    # neither raises nor stalls the scan.
    longest_word = max(1, max((len(word) for word in vocabulary), default=1))

    segments = []
    end = len(text)
    while end > 0:
        # Widest window that fits in the remaining, unsegmented prefix.
        window = min(end, longest_word)

        # Shrink the window until it matches a dictionary word; a window
        # of one character is always accepted as a fallback.
        while window > 1 and text[end - window:end] not in vocabulary:
            window -= 1

        segments.append(text[end - window:end])
        end -= window

    # Segments were collected from right to left; restore reading order.
    segments.reverse()
    return segments

#Main function
def main():
    """Run the interactive segmentation loop.

    Loads the dictionary via ``init()``, then repeatedly prompts for a
    line of text, segments it with ``cut_words`` and prints the segments
    joined by "/". The loop ends on an empty input line or when standard
    input is exhausted.
    """
    init()
    while True:
        print("请输入需要切分的序列:")
        try:
            words_input = input()
        except EOFError:
            # stdin was closed (Ctrl-D or piped input ran out): stop
            # cleanly, the same way an empty line does.
            break
        if not words_input:
            break
        result = "/".join(cut_words(words_input, dict_words))
        print("分词结果:")
        print(result)

#Script entry point: start the interactive loop only when run directly
if __name__ == "__main__":
    main()

# 你可能感兴趣的:(自然语言处理NLP_中文分词_逆向最大匹配算法)