jieba分词及词性标注

jieba分词及词性标注
思路是先对句子分词,再给分好的词标注词性。
这个任务本身很简单,但我还是折腾了很久,
写出来的代码也不够简洁,
还需要继续学习啊。
鸡汤:脚踏实地,眼看前方。

import jieba
import jieba.posseg as pseg
# Register the custom dictionary file with jieba before any segmentation runs.
jieba.load_userdict('userdict1.txt')


# Build the stop-word list
def stopwordslist(filepath):
    """Read a UTF-8 stop-word file and return its entries as a list.

    Each line of the file is one entry; surrounding whitespace (including
    the trailing newline) is stripped. Blank lines are kept as empty
    strings so the result has exactly one element per line of the file.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list of stripped lines, in file order.
    """
    # The context manager guarantees the handle is closed even if decoding
    # fails part-way through the file.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]

# Segment a sentence and tag every kept word with its part of speech
_STOPWORDS_PATH = 'stop_words.txt'  # path of the stop-word file to load
_stopwords_cache = None  # filled on first use by _get_stopwords()


def _get_stopwords():
    """Return the stop words as a frozenset, reading the file only once."""
    global _stopwords_cache
    if _stopwords_cache is None:
        _stopwords_cache = frozenset(stopwordslist(_STOPWORDS_PATH))
    return _stopwords_cache


def seg_sentence(sentence):
    """Segment *sentence* with jieba and annotate each word with its POS tag.

    Words that appear in the stop-word file, and bare tab tokens, are
    dropped together with their tag. Every remaining token is rendered as
    ``word/flag/,`` and the pieces are concatenated in order.

    Args:
        sentence: The raw text of one line; leading and trailing
            whitespace is stripped before segmentation.

    Returns:
        A string such as ``"我/r/,爱/v/,"``; empty if nothing is kept.
    """
    stopwords = _get_stopwords()
    pieces = []
    for pair in jieba.posseg.cut(sentence.strip()):
        word, flag = pair.word, pair.flag
        # Filter on the word only. Checking the flag against the stop words
        # too would strip tags such as 'a' or 'd' whenever those letters are
        # listed as stop words, and would leave an orphaned "flag/," behind
        # for every removed word.
        if word in stopwords or word == '\t':
            continue
        pieces.append(word + '/' + flag + '/,')
    return ''.join(pieces)


# Tag input.txt line by line and write one annotated line per input line.
# The with-statement closes both files even if segmentation raises.
with open('input.txt', 'r', encoding='utf-8') as inputs, \
        open('output.txt', 'w', encoding='utf-8') as outputs:
    for line in inputs:
        line_seg = seg_sentence(line)  # the return value is a string
        outputs.write(line_seg + '\n')

你可能感兴趣的:(jieba分词及词性标注)