1. 先将引号及其中的内容整体替换为占位符 SPE_TOKEN(即 NNNN)
2. 然后按照句号、问号、感叹号、省略号对替换后的文本分句
3. 最后将各句中的占位符依次还原为原始的引号内容
具体代码如下:
# coding:utf-8
import re
# Placeholder substituted for every quoted passage before sentence splitting
# and swapped back afterwards. normal_cut_sentence hard-codes the same "NNNN"
# literal in its first regex, so the two must be kept in sync.
SPE_TOKEN = "NNNN"
def normal_cut_sentence(text):
    """Split ``text`` into sentences at sentence-ending punctuation.

    Quoted passages are expected to have been replaced by the ``NNNN``
    placeholder already. A newline is inserted at every boundary found by
    the rules below and the result is split on newlines:

    1. Between two directly adjacent ``NNNN`` placeholders (back-to-back
       quotes), for any number of placeholders in a row.
    2. After a whole run of terminators -- ``。``, ``!``, ``?`` (ASCII or
       fullwidth), six ASCII dots, or two ``…`` -- unless the next character
       is a closing quote (``’`` / ``”``), a parenthesis (ASCII or
       fullwidth) or a newline. A run is never split in the middle, so
       ``?!`` stays attached to the sentence it ends.
    3. After a terminator character (including a single ASCII ``.``)
       followed by one or more closing quotes, unless the next character is
       another closing quote or a newline.

    Nothing is inserted at the very end of the text, and nothing is inserted
    in front of an existing newline, so no empty sentences are produced by
    the rules themselves.

    :param text: text to split (placeholders already substituted).
    :return: list of sentence strings; ``[""]`` for empty input.
    """
    # Rule 1. The literal must stay identical to the module-level SPE_TOKEN.
    # The look-ahead leaves the second placeholder unconsumed so that it can
    # also be the left-hand side of the next boundary.
    text = re.sub(r"NNNN(?=NNNN)", "NNNN\n", text)

    # Rule 2. \uff01 / \uff1f are the fullwidth "!" / "?", \uff08 / \uff09
    # the fullwidth parentheses. Terminators are also excluded from the
    # look-ahead so that a failed match on a full run cannot backtrack and
    # split inside the run.
    text = re.sub(
        r"((?:\.{6}|…{2}|[。!?\uff01\uff1f])+)"
        r"(?!\.{6})(?=[^’”()\uff08\uff09\n。!?\uff01\uff1f…])",
        r"\1\n",
        text,
    )

    # Rule 3. Only real terminator characters are listed here; quantifier
    # syntax such as {6} has no meaning inside a character class and would
    # turn "{", "}", "6" and "2" into terminators.
    text = re.sub(
        r"([.。!?\uff01\uff1f…][’”]+)(?=[^’”\n])",
        r"\1\n",
        text,
    )
    return text.split("\n")
def cut_sentence_with_quotation_marks(text):
    """Split ``text`` into sentences without breaking inside quotations.

    Each double-quoted passage (ASCII or curly double quotes, in any
    pairing, confined to one line because ``.`` does not match a newline)
    is replaced by the ``SPE_TOKEN`` placeholder. The masked text is split
    with ``normal_cut_sentence`` and the placeholders in every resulting
    sentence are then replaced, left to right, by the quotations they stand
    for.

    Limitation: input that already contains the ``SPE_TOKEN`` text outside
    of a quotation is indistinguishable from a placeholder and will receive
    a quotation that belongs elsewhere.

    :param text: text to split.
    :return: list of sentence strings with the original quotations restored.
    """
    quote_pattern = re.compile(r'(".*?")|(“.*?")|(".*?”)|(“.*?”)')
    quotes = []

    def _mask(match):
        # Remember the quotation and put the placeholder in its place.
        quotes.append(match.group(0))
        return SPE_TOKEN

    masked_text = quote_pattern.sub(_mask, text)

    # One shared iterator across all sentences keeps quotations in their
    # original order. Once it is exhausted, any remaining placeholder text
    # is left untouched.
    pending = iter(quotes)

    def _unmask(match):
        return next(pending, match.group(0))

    # Function replacements are used on purpose: the quotation is inserted
    # verbatim (no backslash processing) and is never rescanned, so restoring
    # cannot stop early or drift when a quotation is shorter or longer than
    # the placeholder.
    placeholder_pattern = re.compile(re.escape(SPE_TOKEN))
    return [
        placeholder_pattern.sub(_unmask, sentence)
        for sentence in normal_cut_sentence(masked_text)
    ]
if __name__ == '__main__':
    # Demo: split a dialogue-heavy passage and print one sentence per line.
    text = "“喂!路明非!你给我站住!”叔叔追了出来,在走廊尽头冲他低吼。路明非实在没时间让他兴师问罪了,只好说:“叔叔我真有事得先走,什么事以后再说!”叔叔可不听他说,跑过来一把抓住他的手:“你小子给我说老实话?是不是在外面惹事了?我看外面都是警车还有流氓,他们都是冲你来的?”“没……没有……”路明非想辩解。“你小子真不是骗我们说上学其实跑日本来混黑道了吧?”叔叔瞪着他。“真不是,这事儿一时没法解释……”叔叔从屁股后面摸出金利来的钱包,打开来夹层里有几张日圆钞票,大概一万多的样子。他把那张万圆大钞塞进路明非手里:“叔叔不知道你惹了什么麻烦,你们年轻人见的世面大,有些事不愿告诉我们大人,我问也没用。我以前也惹过事跑过路,跑路身上千万得有现金!银行卡信用卡跑车都没用!”"
    sents = cut_sentence_with_quotation_marks(text)
    for sentence in sents:
        print(sentence)