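# NLP 中文分词-双向匹配算法(理论+Python实现)
# (NLP Chinese word segmentation: bidirectional maximum matching, theory + Python implementation)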

import time
import re
class Segment:
    """Dictionary-based Chinese word segmenter using bidirectional matching.

    The sentence is segmented twice: forward maximum matching (``MM``) scans
    left to right, backward maximum matching (``RMM``) scans right to left.
    ``printFinalResult`` compares the two segmentations and stores the
    preferred one in ``final_res``.

    Both result strings use the same format: words in sentence order, each
    followed by ``"/ "`` (for example ``"中国/ 的/ "``).
    """

    # Class-level defaults kept for backward compatibility; __init__ gives
    # every instance its own values.  The word dictionary is deliberately NOT
    # defined here: a mutable class attribute would be shared (and grow)
    # across all instances.
    sentence = ""
    MaxLen = 0
    pos = 0
    len = 0
    result_MM = ""  # forward maximum matching result
    result_RMM = ""  # backward maximum matching result
    final_res = ""

    def __init__(self, sentence, MaxLen, dict_path="chineseDic.txt"):
        """Store the sentence and load the word dictionary.

        Args:
            sentence: Text to segment.
            MaxLen: Maximum word length (in characters) tried at each position.
            dict_path: Path of the UTF-8 dictionary file read by ``readDict``.
        """
        self.sentence = sentence
        self.MaxLen = MaxLen
        self.pos = 0
        self.len = self.MaxLen
        self.result_MM = ""
        self.result_RMM = ""
        self.final_res = ""
        # A set gives O(1) membership tests; one lookup is done per candidate
        # substring, so this dominates the running time.
        self.dict = set()
        self.readDict(dict_path)

    def readDict(self, path="chineseDic.txt"):
        """Add the words listed in the dictionary file to ``self.dict``.

        Each line holds one entry; the word is the text before the first
        comma.  Surrounding whitespace (including the line break of lines
        without a comma) is removed and empty entries are skipped.

        Raises:
            OSError: If the file cannot be opened.
        """
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                word = line.split(",")[0].strip()
                if word:
                    self.dict.add(word)

    @staticmethod
    def _format(words):
        """Render a list of words in the ``"word/ "`` result format."""
        return "".join(word + "/ " for word in words)

    @staticmethod
    def _tokens(result):
        """Split a ``"word/ "`` formatted result string back into words."""
        # The string ends with the delimiter, so the last piece is always "".
        return result.split("/ ")[:-1]

    def MM(self, nLen, nPos):
        """Run forward maximum matching and store the result in ``result_MM``.

        Args:
            nLen: Window length tried for the first word; later words start
                from ``self.MaxLen``.
            nPos: Index in the sentence at which segmentation starts.
        """
        sentence = self.sentence
        total = len(sentence)
        start = max(nPos, 0)
        window = nLen
        words = []
        # Iterative on purpose: one recursive call per attempt exhausts the
        # interpreter's recursion limit on long sentences.
        while start < total:
            # Never look past the end of the sentence; always consume >= 1 char.
            window = max(1, min(window, total - start))
            while window > 1 and sentence[start:start + window] not in self.dict:
                window -= 1
            # A single character is emitted even when it is not in the dictionary.
            words.append(sentence[start:start + window])
            start += window
            window = self.MaxLen
        self.result_MM = self._format(words)

    def RMM(self, nLen, nPos):
        """Run backward maximum matching and store the result in ``result_RMM``.

        Args:
            nLen: Window length tried for the first (right-most) word; later
                words start from ``self.MaxLen``.
            nPos: Exclusive end index at which the right-to-left scan starts
                (``len(sentence)`` for the whole sentence).
        """
        sentence = self.sentence
        end = min(nPos, len(sentence))
        window = nLen
        words = []
        while end > 0:
            # Clamp so that end - window never goes negative: a negative slice
            # start would wrap around to the end of the sentence.
            window = max(1, min(window, end))
            while window > 1 and sentence[end - window:end] not in self.dict:
                window -= 1
            words.append(sentence[end - window:end])
            end -= window
            window = self.MaxLen
        # Words were collected right to left; store them in sentence order.
        words.reverse()
        self.result_RMM = self._format(words)

    def getMMResult(self):
        """Return the forward maximum matching result string."""
        return self.result_MM

    def getRMMResult(self):
        """Return the backward maximum matching result string."""
        return self.result_RMM

    def getFinalResult(self):
        """Return the result chosen by the last ``printFinalResult`` call."""
        return self.final_res

    def _pick_better(self, words_MM, words_RMM):
        """Choose between two differing segmentations.

        Heuristic: fewer words wins; on a tie, fewer single-character words
        wins; if still tied, the backward result is used.
        """
        def score(words):
            return (len(words), sum(1 for word in words if len(word) == 1))

        if score(words_MM) < score(words_RMM):
            return self.result_MM
        return self.result_RMM

    def printFinalResult(self):
        """Print both segmentations, then pick and print the final one.

        Sets ``final_res`` in every case, so ``getFinalResult`` is usable even
        when the two directions disagree.
        """
        seg_list_MM = self._tokens(self.result_MM)
        seg_list_RMM = self._tokens(self.result_RMM)

        print("正向最大匹配结果:")
        print(self.result_MM.replace(" ", ""))
        print(seg_list_MM)
        print("逆向最大匹配结果:")
        print(self.result_RMM.replace(" ", ""))
        print(seg_list_RMM)

        # Compare the complete lists, not just their common prefix.
        if seg_list_MM == seg_list_RMM:
            print("两次分词结果一致。")
            self.final_res = self.result_MM
        else:
            print("两次分词结果不一致。")
            self.final_res = self._pick_better(seg_list_MM, seg_list_RMM)
        print("最终的分词结果为:")
        print(self.final_res)

def to_region(segmentation):
    """Convert a segmentation string into a list of word intervals.

    The input is a whitespace-separated sequence of tokens, each a word
    followed by ``"/"`` (for example ``"中国/ 的/ 改革/"``).  Every word is
    mapped to the closed, 1-based character interval ``(start, end)`` it
    occupies in the unsegmented text.

    Args:
        segmentation: Segmentation result string.

    Returns:
        A list of ``(start, end)`` tuples, one per token; an empty list when
        the input contains no tokens.
    """
    stripped = segmentation.strip()
    if not stripped:
        # Splitting an empty string would yield one empty token and a bogus
        # (1, -1) interval.
        return []

    region = []
    start = 1
    for token in re.split(r"\s+", stripped):  # spaces, tabs, line breaks, ...
        # The trailing "/" is a delimiter, not part of the word.
        word_len = len(token) - 1 if token.endswith("/") else len(token)
        end = start + word_len - 1
        region.append((start, end))
        start = end + 1
    return region

def PRF(target, pred):
    """Print and return precision, recall and F1 of a segmentation.

    Both arguments are collections of hashable items (here: word intervals);
    duplicates are ignored because the items are compared as sets.

    Args:
        target: Items of the gold-standard segmentation.
        pred: Items of the predicted segmentation.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats.  A metric whose
        denominator would be zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    t_set, p_set = set(target), set(pred)
    cap_num = len(t_set & p_set)  # items present in both segmentations

    p = cap_num / len(p_set) if p_set else 0.0
    r = cap_num / len(t_set) if t_set else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0

    print("P =", p)
    print("R =", r)
    print("F1 =", f)
    return p, r, f

# Demo: segment a sample paragraph in both directions, time it, and score the
# chosen segmentation against a hand-made gold standard.
if __name__ == '__main__':
    # The sample text must match target_str character for character, otherwise
    # the intervals compared below are shifted and the scores are meaningless.
    test_str = '在这一年中,中国的改革开放和现代化建设继续向前迈进。国民经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。'
    seg = Segment(test_str, 3)

    # Time only the two matching passes, not dictionary loading or printing.
    time_start = time.time()
    seg.MM(3, 0)
    seg.RMM(3, len(test_str))
    time_end = time.time()

    seg.printFinalResult()
    print('分词时间:', time_end - time_start, 's')

    target_str = "在/  这/  一/  年/  中/  ,/  中国/  的/  改革/  开放/  和/  现代化/  建设/  继续/  向前/  迈进/  。/  国民经济/  保持/  了/  “/  高/  增长/  、/  低/  通胀/  ”/  的/  良好/  发展/  态势/  。/  农业/  生产/  再次/  获得/  好/  的/  收成/  ,/  企业/  改革/  继续/  深化/  ,/  人民/  生活/  进一步/  改善/  。/  对外/  经济/  技术/  合作/  与/  交流/  不断/  扩大/  。/"
    # Each word is represented by the closed interval [i, j] it occupies in
    # the text, so two segmentations can be compared as sets of intervals.
    re_pred = to_region(seg.getFinalResult())
    re_target = to_region(target_str)
    print("分词结果:", re_pred)
    print("标准答案:", re_target)
    PRF(re_target, re_pred)

# 你可能感兴趣的:(python)