Removing stopwords and segmenting a Chinese txt document in Python with the jieba package

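The script below reads a Chinese text document line by line, segments each line with jieba, filters out any tokens that appear in a stopword list, and writes the space-separated result to a new file. jieba is a third-party package, so if it is not already available, install it first with pip install jieba.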

import jieba

# Build the stopword list as a set for fast membership tests
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f)

# Load the stopwords once up front instead of re-reading the file
# for every sentence; a raw string keeps backslashes in the Windows
# path from being treated as escape sequences
stopwords = stopwordslist(r'E:\stopwords.txt')  # path to your stopword file

# Segment one sentence and drop stopwords
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords and word != '\t':
            outstr += word
            outstr += " "
    return outstr


# Input: the document to be segmented; output: where the segmented
# text is written (adjust both paths to your own files)
with open(r'E:\input.txt', 'r', encoding='utf-8') as inputs, \
        open(r'E:\output.txt', 'w', encoding='utf-8') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # the return value is a string
        outputs.write(line_seg + '\n')
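Before running the script over a whole document, it can help to sanity-check seg_sentence on a single sentence. A minimal sketch, assuming the script above has already defined seg_sentence and loaded your stopword file; the sample sentence is illustrative, and the exact tokens depend on the jieba version, its dictionary, and your stopword list:

print(seg_sentence('我来到北京清华大学'))
# typically prints something like: 我 来到 北京 清华大学

Note that jieba.cut returns a generator of tokens; jieba.lcut is a convenience wrapper that returns a list directly.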

Everyone is welcome to study and exchange ideas together!
