python实现正向最大匹配算法和反向最大匹配算法

正向最大匹配算法:从左到右将待分词文本中的几个连续字符与词表匹配,如果匹配上,则切分出一个词,并且要做到最大匹配。
反向最大匹配算法:从右到左将待分词文本中的几个连续字符与词表匹配,如果匹配上,则切分出一个词,并且要做到最大匹配。
这份代码对正向最大匹配算法和反向最大匹配算法进行了封装。使用时需要在代码所在的目录下存放一份词典文件,文件名为 chineseDic.txt。
样例如下:

,nr
劼人,nr
勍,nr
喆,nr
揳入,v

也就是说,词典可以是不带表头(header)的 CSV 格式文件,每行按“词,词性”的格式存放一个词条。
代码如下:

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import time
import os


class CSegment(object):
    """Dictionary-based Chinese word segmenter using maximum matching.

    Two strategies are offered: forward maximum matching (``MM``), which scans
    the sentence left to right, and reverse maximum matching (``RMM``), which
    scans right to left.  Both look words up in a ``{word: part_of_speech}``
    dictionary loaded either from a user supplied file (``read_user_dict``)
    or from ``./chineseDic.txt`` (``read_own_dict``).

    Word spans are stored as strings of the form ``'(start, end)'`` (``end``
    exclusive) so that segmentation output and gold-standard spans can be
    compared directly in ``evaluate``.
    """

    # POS tag attached to single characters that are not in the dictionary
    # (only visible when a segmentation is requested with ``pos=True``).
    UNKNOWN_POS = 'x'

    def __init__(self):
        self.question = None  # sentence given to the most recent MM/RMM call
        self.true_index = []
        self.with_user_dict = False  # True once read_user_dict() succeeded
        self.reverse = False  # True when the most recent run was RMM
        self.result_reverse = False  # same flag, consulted by get_result()
        self.MM_result_index = []
        self.RMM_result_index = []
        self.MM_result_list = []
        self.RMM_result_list = []
        self.word_pos_dict = {}

    @staticmethod
    def _load_dict(dict_path):
        """Parse a header-less ``word,pos`` CSV file into a dictionary.

        Blank lines and entries whose word field is empty are skipped; an
        empty key would make the matchers accept an empty slice.  A line
        without a comma is stored with an empty POS tag.

        :param dict_path: path of the dictionary file (read as UTF-8)
        :return: ``{word: pos}`` mapping
        :raises FileNotFoundError: if ``dict_path`` does not exist
        """
        if not os.path.exists(dict_path):
            raise FileNotFoundError('该文件不存在: ' + str(dict_path))
        word_pos = {}
        with open(dict_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                fields = line.strip().split(',')
                word = fields[0]
                if not word:
                    continue
                word_pos[word] = fields[1] if len(fields) > 1 else ''
        return word_pos

    def read_user_dict(self, dict_path):
        """Load a user dictionary and use it for all later segmentations.

        Prints a short coloured report with the path and the load time.

        :param dict_path: path of the user defined dictionary file
        :return: None
        :raises FileNotFoundError: if ``dict_path`` does not exist
        """
        # time.clock() was removed in Python 3.8; perf_counter() replaces it.
        tic = time.perf_counter()
        self.word_pos_dict = self._load_dict(dict_path)
        self.with_user_dict = True
        time_clock = time.perf_counter() - tic
        print('\033[1;31;47m')
        print('*' * 50)
        print('*Load user dict:\t', dict_path)
        print('*Load time:\t', time_clock)
        print('*' * 50)
        print('\033[0m')

    def read_true_sentence(self, true_result):
        """Convert a '/'-separated gold segmentation into span strings.

        Each token is located in the most recently segmented sentence,
        searching forward from the end of the previous token, so whitespace
        around tokens and a trailing '/' are tolerated.  Tokens that cannot
        be found are skipped.

        :param true_result: gold segmentation, e.g. ``'研究/ 生命/ '``
        :return: list of ``'(start, end)'`` strings, one per located token
        :raises ValueError: if no sentence has been segmented yet
        """
        if not true_result:
            return []
        if self.question is None:
            raise ValueError('no sentence has been segmented yet; '
                             'call MM() or RMM() first')
        true_index = []
        cursor = 0
        for token in true_result.split('/'):
            token = token.strip()
            if not token:
                continue
            start = self.question.find(token, cursor)
            if start == -1:
                continue
            end = start + len(token)
            true_index.append(str((start, end)))
            cursor = end
        return true_index

    def get_true_index(self, result_list):
        """Return the span list that belongs to a segmentation result.

        ``result_list`` is compared with the stored RMM and MM results; when
        it matches neither, the spans of the most recent run are returned.
        The call does not modify any state, so it can be repeated safely.

        :param result_list: result list obtained from ``get_result``
        :return: list of ``'(start, end)'`` strings
        """
        if result_list:
            if result_list == self.RMM_result_list:
                return self.RMM_result_index
            if result_list == self.MM_result_list:
                return self.MM_result_index
        if self.result_reverse:
            return self.RMM_result_index
        return self.MM_result_index

    def evaluate(self, true_list, result_list):
        """Score a segmentation against the gold segmentation.

        A predicted word counts as correct only when its exact span occurs in
        the gold segmentation.

        :param true_list: gold segmentation as a '/'-separated string
        :param result_list: result list produced by ``MM`` or ``RMM``
        :return: dict with the keys ``'Precision'``, ``'Recall'`` and ``'F1'``
        :raises ValueError: if no gold span could be derived from ``true_list``
        """
        gold = set(self.read_true_sentence(true_list))
        if not gold:
            raise ValueError('未导入正确结果,不能进行评估')
        predicted = set(self.get_true_index(result_list))

        tp = len(gold & predicted)
        fp = len(predicted - gold)
        fn = len(gold - predicted)
        # Guard every ratio so a result without any hit scores 0 instead of
        # raising ZeroDivisionError.
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        if precision + recall:
            F1 = 2 * precision * recall / (precision + recall)
        else:
            F1 = 0.0
        evaluate_result = {'Precision': precision, 'Recall': recall, 'F1': F1}

        return evaluate_result

    @staticmethod
    def read_own_dict():
        """Load the default dictionary ``./chineseDic.txt``.

        :return: ``{word: pos}`` mapping
        :raises FileNotFoundError: if the default dictionary does not exist
        """
        return CSegment._load_dict('./chineseDic.txt')

    def _prepare(self, sentence, lth):
        """Validate the arguments, remember the sentence, pick a dictionary.

        :raises ValueError: if ``lth`` is below 2 or ``sentence`` is empty
        """
        if lth <= 1:
            raise ValueError('max_len 不能小于2')
        if len(sentence) == 0:
            raise ValueError('原句子不能为空')
        self.question = sentence
        if self.with_user_dict:
            return self.word_pos_dict
        return self.read_own_dict()

    def _render(self, word, word_pos_dict, pos):
        """Return ``word`` alone, or ``word/tag`` when ``pos`` is true."""
        if not pos:
            return word
        return word + '/' + word_pos_dict.get(word, self.UNKNOWN_POS)

    def MM(self, sentence, lth, pos=False):
        """Segment ``sentence`` with forward maximum matching.

        At every position the longest dictionary word of at most ``lth``
        characters is taken; a character that starts no dictionary word is
        emitted on its own.  The outcome is stored in ``MM_result_list`` and
        ``MM_result_index`` and can be fetched with ``get_result``.

        :param sentence: sentence to segment
        :param lth: maximum word length to try (must be at least 2)
        :param pos: append ``/tag`` to every word when true
        :raises ValueError: if ``lth`` is below 2 or ``sentence`` is empty
        """
        self.reverse = False
        self.result_reverse = False
        word_pos_dict = self._prepare(sentence, lth)

        result_list = []
        result_index = []
        total = len(sentence)
        start = 0
        while start < total:
            # Clamp the window to the sentence so recorded spans stay valid.
            end = min(start + lth, total)
            # Shrink from the right; a one character window is always taken,
            # which guarantees progress on out-of-dictionary characters.
            while end - start > 1 and sentence[start:end] not in word_pos_dict:
                end -= 1
            word = sentence[start:end]
            result_list.append(self._render(word, word_pos_dict, pos))
            result_index.append(str((start, end)))
            start = end

        self.MM_result_index = result_index
        self.MM_result_list = result_list

    def RMM(self, sentence, lth, pos=False):
        """Segment ``sentence`` with reverse maximum matching.

        Works like ``MM`` but scans from the end of the sentence towards the
        start.  The outcome is stored, in reading order, in
        ``RMM_result_list`` and ``RMM_result_index``.

        :param sentence: sentence to segment
        :param lth: maximum word length to try (must be at least 2)
        :param pos: append ``/tag`` to every word when true
        :return: None
        :raises ValueError: if ``lth`` is below 2 or ``sentence`` is empty
        """
        self.reverse = True
        self.result_reverse = True
        word_pos_dict = self._prepare(sentence, lth)

        result_list = []
        result_index = []
        end = len(sentence)
        while end > 0:
            # Never let the window start before the sentence: a negative
            # index would silently address characters from the end.
            start = max(end - lth, 0)
            # Shrink from the left; a one character window is always taken.
            while end - start > 1 and sentence[start:end] not in word_pos_dict:
                start += 1
            word = sentence[start:end]
            result_list.append(self._render(word, word_pos_dict, pos))
            result_index.append(str((start, end)))
            end = start
        # Words were collected right to left; restore reading order.
        result_list.reverse()
        result_index.reverse()
        self.RMM_result_index = result_index
        self.RMM_result_list = result_list

    def get_result(self):
        """Return the word list produced by the most recent MM/RMM call.

        :return: list of words (``word/tag`` strings when ``pos`` was true)
        """
        if self.result_reverse:
            return self.RMM_result_list
        else:
            return self.MM_result_list

由于是课程作业,对方法的调用和定义的变量有要求,所以显得比较笨拙,但是代码简单,可修改性强,没有很复杂的函数调用和第三方模块调用。
下面是测试方法:

# Usage example: segment one sentence with both strategies and score each
# result against a hand-made gold segmentation.
# Sentence to segment.
question = '命名时应考虑的因素:直观、时髦用语、暗示创业模型、有说服力、能吸引顾客的注意力。'
# Gold segmentation of the same sentence; words are separated by '/'.
true = '命名/  时/  应/  考虑/  的/  因素/  :/  直观/  、/  时髦/  用语/  、/  暗示/  创业/  模型/  、/  有/  说服力/  、/  能/  吸引/  顾客/  的/  注意力/  。/  '
# Path of the user dictionary file.
chinese_dict = 'chineseDic.txt'
# Maximum word length, passed to MM/RMM as lth.
max_len = 3
# Create a segmenter instance named cut.
cut = CSegment()
# Load the user dictionary.
cut.read_user_dict(chinese_dict)
# Forward maximum matching.
cut.MM(sentence=question, lth=max_len, pos=True)
# Fetch the result of the run that was just performed.
MM_result = cut.get_result()
print(MM_result)
# Score the segmentation; this requires true (the gold answer).
evaluate = cut.evaluate(true, MM_result)
print(evaluate)
# Reverse maximum matching.
cut.RMM(sentence=question, lth=max_len, pos=True)
# get_result() now returns the RMM output because RMM ran last.
RMM_result = cut.get_result()
print(RMM_result)
evaluate = cut.evaluate(true, RMM_result)
print(evaluate)

你可能感兴趣的:(python)