import re
def remove_space(text):
    """
    Return ``text`` with every ASCII space character (``' '``) deleted.

    Only the plain space (U+0020) is removed; tabs, newlines and other
    whitespace characters are left in place.
    """
    return text.replace(' ', '')
def sentence_split(text):
    """
    Split a paragraph into sentences at semicolons and Chinese full stops.

    ASCII spaces are stripped first (via ``remove_space``). The text is then
    cut immediately after every delimiter, so each sentence keeps its own
    terminating punctuation. Any text left after the last delimiter is
    returned as a final, unterminated piece.

    Args:
        text: The paragraph to split.

    Returns:
        A list of non-overlapping substrings, in order. The list is empty
        when nothing remains after removing spaces, and it never ends with
        an empty string when the text finishes on a delimiter.
    """
    text = remove_space(text)
    pieces = []
    start = 0
    # Delimiters: ASCII semicolon, full-width semicolon (U+FF1B, written as
    # an escape so the intent survives Unicode normalisation of this file)
    # and the ideographic full stop.
    for match in re.finditer('[;\uff1b。]', text):
        end = match.end()
        pieces.append(text[start:end])
        start = end
    # Keep the unterminated tail only if there is one; appending it
    # unconditionally would add a spurious '' after a final delimiter.
    if start < len(text):
        pieces.append(text[start:])
    return pieces
def main(path_in, path_out):
    """
    Split the ``'text'`` field of every record in a JSON-lines file.

    Each non-blank line of ``path_in`` is parsed into a record, its
    ``'text'`` value is passed to ``sentence_split``, and every returned
    piece is written to ``path_out`` on its own line.

    Args:
        path_in: Path of the input file, one record per line (UTF-8).
        path_out: Path of the output file; created or overwritten (UTF-8).

    Raises:
        OSError: If either file cannot be opened.
        ValueError, SyntaxError: If a non-blank line is neither valid JSON
            nor a Python literal.
        KeyError: If a record has no ``'text'`` key.
    """
    import ast
    import json

    # Open the input first so a missing input file does not truncate an
    # existing output file; the context managers close both handles even
    # when a record fails to parse.
    with open(path_in, encoding='utf8') as fin, \
            open(path_out, 'w', encoding='utf8') as fout:
        # Stream line by line instead of loading the whole file.
        for line in fin:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            # NOTE(security): this used to be eval(line), which executes
            # arbitrary code found in the input file. Records are now parsed
            # as data only: JSON first, then a Python literal as a fallback
            # for files written with repr()/str() of a dict.
            try:
                record = json.loads(line)
            except ValueError:
                record = ast.literal_eval(line)
            for sentence in sentence_split(record['text']):
                fout.write('{}\n'.format(sentence))
# Demo: split a sample paragraph and print the resulting list of sentences.
text = "1.具备电子产品知识; 2. 工单缺料情况。跟催物控或采购"
data = sentence_split(text)
print(data)
# To process a whole file instead, uncomment the lines below and adjust
# the paths:
# path_in = 'text.jsonl'
# path_out = 'text'
# main(path_in, path_out)
## Output printed by the demo above:
## ['1.具备电子产品知识;', '2.工单缺料情况。', '跟催物控或采购']