Python 切分句子

import ast
import json
import re


def remove_space(text):
    """Return ``text`` with every ASCII space character (U+0020) removed.

    Only the plain space is dropped; tabs, newlines and full-width spaces
    are left untouched.
    """
    # Splitting on the literal space and re-joining the pieces discards
    # each space while keeping everything else in order.
    pieces = text.split(' ')
    return ''.join(pieces)


def sentence_split(text):
    """Split a paragraph into sentences at semicolons and Chinese full stops.

    ASCII spaces are removed first (via ``remove_space``). The text is then
    cut right after every ``;``, ``;`` or ``。``, so each delimiter stays
    attached to the end of the sentence it terminates.

    Args:
        text: The paragraph to split.

    Returns:
        A list of sentence strings in their original order. Text following
        the last delimiter is returned as a final sentence. No empty
        trailing entry is produced when the text is empty or already ends
        with a delimiter.
    """
    text = remove_space(text)

    sentences = []
    start = 0
    # The full-width semicolon is matched alongside the ASCII one because
    # Chinese text commonly uses it as the clause separator.
    for match in re.finditer('[;;。]', text):
        end = match.end()
        sentences.append(text[start:end])
        start = end

    # Keep the remainder only when something is left; otherwise the result
    # would end with a meaningless empty "sentence".
    if start < len(text):
        sentences.append(text[start:])

    return sentences


def main(path_in, path_out):
    """Split the ``text`` field of every record in a JSONL file into sentences.

    Each non-blank line of ``path_in`` is parsed as one record, its
    ``'text'`` value is passed through ``sentence_split``, and every
    resulting sentence is written to ``path_out`` on its own line.

    Args:
        path_in: Path of the input file (read as UTF-8), one record per line.
        path_out: Path of the output file; created or overwritten as UTF-8.

    Raises:
        ValueError, SyntaxError: If a line is neither valid JSON nor a
            Python literal.
        KeyError: If a record has no ``'text'`` key.
    """
    # Both files are managed by ``with`` so the handles are closed even when
    # a malformed line raises part-way through.
    with open(path_in, encoding='utf8') as fin:
        with open(path_out, 'w', encoding='utf8') as fout:
            # Stream line by line instead of loading the whole file.
            for line in fin:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines, e.g. a trailing newline
                # SECURITY: records used to be parsed with eval(), which
                # executes arbitrary code found in the input file. Parse as
                # JSON first; fall back to literal_eval so lines written as
                # Python dict literals (single quotes) are still accepted.
                try:
                    record = json.loads(line)
                except ValueError:
                    record = ast.literal_eval(line)
                for sentence in sentence_split(record['text']):
                    fout.write('{}\n'.format(sentence))


# Demo: split a sample paragraph containing a semicolon, extra spaces and a
# Chinese full stop, then print the resulting list of sentences.
text = "1.具备电子产品知识;   2. 工单缺料情况。跟催物控或采购"
data = sentence_split(text)
print(data)
# Batch usage (disabled): process a JSONL file with main() instead.
# path_in = 'text.jsonl'
# path_out = 'text'
# main(path_in, path_out)

# Expected output of the demo above:
# ['1.具备电子产品知识;', '2.工单缺料情况。', '跟催物控或采购']

你可能感兴趣的:(python)