正向最大匹配算法:从左到右将待分词文本中的几个连续字符与词表匹配,如果匹配上,则切分出一个词,并且要做到最大匹配。
反向最大匹配算法:从右到左将待分词文本中的几个连续字符与词表匹配,如果匹配上,则切分出一个词,并且要做到最大匹配。
这份代码对正向最大匹配算法和反向最大匹配算法进行封装,需要在代码的目录下存放一份词典,词典取名为:chineseDic.txt。
样例如下:
冮,nr
劼人,nr
勍,nr
喆,nr
揳入,v
也就是说,词典可以是不带表头(headers)的csv格式文件,每行按“词,词性”的格式存放。
代码如下:
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# Author:ChenYuan
import time
import os
class CSegment(object):
    """Dictionary-based Chinese word segmenter.

    Implements forward maximum matching (``MM``) and reverse maximum
    matching (``RMM``) against a ``word -> part-of-speech`` dictionary, and
    scores a segmentation against a reference with precision/recall/F1.

    Word spans are stored as ``str((start, end))`` strings (half-open
    character offsets into the segmented sentence).
    """

    # Tag attached to a single character that is not in the dictionary when
    # part-of-speech output is requested.
    UNKNOWN_POS = 'x'

    def __init__(self):
        self.question = None          # sentence most recently segmented
        self.true_index = []
        self.with_user_dict = False   # True once read_user_dict() succeeded
        self.reverse = False          # True while the latest run is RMM
        self.result_reverse = False   # selects which result get_result() returns
        self.MM_result_index = []
        self.RMM_result_index = []
        self.MM_result_list = []
        self.RMM_result_list = []
        self.word_pos_dict = {}

    @staticmethod
    def _require(condition, message):
        """Print ``message`` and raise ``AssertionError`` unless ``condition``.

        Raised explicitly (instead of using ``assert``) so the check still
        runs under ``python -O`` while keeping the exception type unchanged.
        """
        if not condition:
            print(message)
            raise AssertionError(message)

    @staticmethod
    def _load_dict(dict_path):
        """Read a header-less ``word,pos`` file into a dict.

        Blank lines and lines with an empty word are skipped; a line without
        a comma is stored with an empty tag.

        :param dict_path: path of the dictionary file
        :return: dict mapping word -> part-of-speech tag
        :raises AssertionError: if ``dict_path`` does not exist
        """
        CSegment._require(os.path.exists(dict_path), '该文件不存在')
        word_pos = {}
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise become part of the first word.
        with open(dict_path, 'r', encoding='utf-8-sig') as fp:
            for line in fp:
                fields = line.strip().split(',')
                word = fields[0]
                if not word:
                    continue
                word_pos[word] = fields[1] if len(fields) > 1 else ''
        return word_pos

    def read_user_dict(self, dict_path):
        """Load a user dictionary and use it for subsequent segmentation.

        Prints a short coloured report with the path and the load time.

        :param dict_path: path of the user dictionary file
        :return: None
        :raises AssertionError: if ``dict_path`` does not exist
        """
        # time.clock() was removed in Python 3.8; perf_counter() replaces it.
        tic = time.perf_counter()
        self.word_pos_dict = self._load_dict(dict_path)
        self.with_user_dict = True
        time_clock = time.perf_counter() - tic
        print('\033[1;31;47m')
        print('*' * 50)
        print('*Load user dict:\t', dict_path)
        print('*Load time:\t', time_clock)
        print('*' * 50)
        print('\033[0m')

    def read_true_sentence(self, true_result):
        """Convert a reference segmentation into a list of span strings.

        :param true_result: reference segmentation, words separated by ``/``
            (surrounding whitespace of each word is ignored)
        :return: list of ``str((start, end))`` for every reference word that
            matches ``self.question`` at its expected offset
        """
        if not true_result:
            return []
        question = self.question or ''
        true_index = []
        index = 0
        for token in true_result.split('/'):
            token = token.strip()
            if not token:
                # A trailing '/' yields an empty token; it is not a word.
                continue
            end = index + len(token)
            if question[index:end] == token:
                true_index.append(str((index, end)))
            index = end
        return true_index

    def get_true_index(self, result_list):
        """Return the span list that belongs to ``result_list``.

        The list is matched against the stored MM and RMM results; if it
        matches exactly one of them, that one's spans are returned.
        Otherwise the spans of the most recent run are returned.

        :param result_list: segmentation result as returned by get_result()
        :return: list of ``str((start, end))`` spans
        """
        matches_rmm = result_list == self.RMM_result_list
        matches_mm = result_list == self.MM_result_list
        if matches_rmm != matches_mm:
            use_rmm = matches_rmm
        else:
            use_rmm = self.result_reverse
        return self.RMM_result_index if use_rmm else self.MM_result_index

    def evaluate(self, true_list, result_list):
        """Score a segmentation against the reference segmentation.

        :param true_list: reference segmentation string (``/``-separated)
        :param result_list: segmentation produced by MM or RMM
        :return: dict with keys ``'Precision'``, ``'Recall'`` and ``'F1'``
        :raises AssertionError: if no reference span could be derived
        """
        true_index = self.read_true_sentence(true_list)
        result_index = self.get_true_index(result_list)
        self._require(len(true_index) > 0, '未导入正确结果,不能进行评估')
        true_spans = set(true_index)
        result_spans = set(result_index)
        tp = len(result_spans & true_spans)
        fp = len(result_spans - true_spans)
        fn = len(true_spans - result_spans)
        # Guard every ratio so a segmentation with no correct word scores 0
        # instead of raising ZeroDivisionError.
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        if precision + recall:
            F1 = 2 * precision * recall / (precision + recall)
        else:
            F1 = 0.0
        evaluate_result = {'Precision': precision, 'Recall': recall, 'F1': F1}
        return evaluate_result

    @staticmethod
    def read_own_dict():
        """Load the default dictionary ``./chineseDic.txt``.

        :return: dict mapping word -> part-of-speech tag
        :raises AssertionError: if the file does not exist
        """
        dict_path = './chineseDic.txt'
        return CSegment._load_dict(dict_path)

    def _prepare(self, sentence, lth):
        """Validate the arguments, remember the sentence, return the dictionary."""
        self._require(lth > 1, 'max_len 不能小于2')
        self._require(len(sentence) > 0, '原句子不能为空')
        self.question = sentence
        if self.with_user_dict:
            return self.word_pos_dict
        return self.read_own_dict()

    def _format_token(self, word, word_pos_dict, pos):
        """Return ``word`` or ``word/tag`` depending on ``pos``."""
        if not pos:
            return word
        return word + '/' + word_pos_dict.get(word, self.UNKNOWN_POS)

    def MM(self, sentence, lth, pos=False):
        """Segment ``sentence`` with forward maximum matching.

        The result is stored on the instance; fetch it with get_result().
        A character with no dictionary match is emitted as a one-character
        word.

        :param sentence: sentence to segment
        :param lth: maximum word length tried at each position (must be > 1)
        :param pos: append ``/tag`` to every word when True
        :return: None
        :raises AssertionError: if ``lth <= 1`` or ``sentence`` is empty
        """
        self.reverse = False
        self.result_reverse = False
        word_pos_dict = self._prepare(sentence, lth)
        result_list = []
        result_index = []
        total = len(sentence)
        index = 0
        while index < total:
            index_last = min(index + lth, total)
            # Shrink the window from the right until it is a known word or
            # only one character is left.
            while index_last - index > 1 and sentence[index:index_last] not in word_pos_dict:
                index_last -= 1
            word = sentence[index:index_last]
            result_list.append(self._format_token(word, word_pos_dict, pos))
            result_index.append(str((index, index_last)))
            index = index_last
        self.MM_result_index = result_index
        self.MM_result_list = result_list

    def RMM(self, sentence, lth, pos=False):
        """Segment ``sentence`` with reverse maximum matching.

        The result is stored on the instance in left-to-right order; fetch it
        with get_result(). A character with no dictionary match is emitted as
        a one-character word.

        :param sentence: sentence to segment
        :param lth: maximum word length tried at each position (must be > 1)
        :param pos: append ``/tag`` to every word when True
        :return: None
        :raises AssertionError: if ``lth <= 1`` or ``sentence`` is empty
        """
        self.reverse = True
        self.result_reverse = True
        word_pos_dict = self._prepare(sentence, lth)
        result_list = []
        result_index = []
        index_last = len(sentence)
        while index_last > 0:
            # Clamp at 0: a negative start would slice from the end.
            index = max(index_last - lth, 0)
            # Shrink the window from the left until it is a known word or
            # only one character is left.
            while index_last - index > 1 and sentence[index:index_last] not in word_pos_dict:
                index += 1
            word = sentence[index:index_last]
            result_list.append(self._format_token(word, word_pos_dict, pos))
            result_index.append(str((index, index_last)))
            index_last = index
        # Words were collected right-to-left; restore reading order.
        result_list.reverse()
        result_index.reverse()
        self.RMM_result_index = result_index
        self.RMM_result_list = result_list

    def get_result(self):
        """Return the word list of the most recent MM or RMM run.

        :return: list of words (with ``/tag`` suffixes if requested)
        """
        if self.result_reverse:
            return self.RMM_result_list
        else:
            return self.MM_result_list
由于是课程作业,对方法的调用和定义的变量有要求,所以显得比较笨拙,但是代码简单,可修改性强,没有很复杂的函数调用和第三方模块调用。
下面是测试方法:
# Sentence to segment, and its reference segmentation (words separated by '/').
question = '命名时应考虑的因素:直观、时髦用语、暗示创业模型、有说服力、能吸引顾客的注意力。'
true = '命名/ 时/ 应/ 考虑/ 的/ 因素/ :/ 直观/ 、/ 时髦/ 用语/ 、/ 暗示/ 创业/ 模型/ 、/ 有/ 说服力/ 、/ 能/ 吸引/ 顾客/ 的/ 注意力/ 。/ '

# Path of the user dictionary and the longest word length to try.
chinese_dict = 'chineseDic.txt'
max_len = 3

# Create the segmenter and load the user dictionary into it.
cut = CSegment()
cut.read_user_dict(chinese_dict)

# Forward maximum matching: segment, show the words, then score them
# against the reference segmentation in `true`.
cut.MM(question, max_len, pos=True)
MM_result = cut.get_result()
print(MM_result)
evaluate = cut.evaluate(true_list=true, result_list=MM_result)
print(evaluate)

# Reverse maximum matching: same steps with the right-to-left algorithm.
cut.RMM(question, max_len, pos=True)
RMM_result = cut.get_result()
print(RMM_result)
evaluate = cut.evaluate(true_list=true, result_list=RMM_result)
print(evaluate)