jieba 去除停用词、提取关键词、词性标注

#-*- coding:utf-8 -*-

import sys
import jieba
import os
import jieba.analyse
import string
import math
import json

# Python 2 hack: restore sys.setdefaultencoding (hidden by site.py) so
# implicit str<->unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): Python-2 only and fragile; a Python 3 port drops this.
reload(sys)
sys.setdefaultencoding('utf-8')


# Number of keywords to extract per document.
topk = 10

# Corpus of article bodies ('\001'-separated fields per line); kept open
# at module level because testJieBaGuanJianCi() reads it later.
testGuanJian = open(r'D:\PythonFiles\CINS\201722.news_zhengwen', 'r')

# Load the stopword list once.  A set gives O(1) membership tests in
# jiebaData() (the original list was O(n) per token), and `with` closes
# the file handle promptly instead of leaking it.
with open(r'D:\PythonFiles\files\stopword.txt', 'r') as fenci:
    stopkey = set(line.strip().decode('utf-8') for line in fenci)

# Remove stopwords from a text after jieba segmentation.
def jiebaData(rtitleContentComment):
    """Segment *rtitleContentComment* with jieba and drop stopwords.

    Returns the surviving tokens each followed by a single space
    (i.e. with a trailing space), matching the format the original
    code fed to jieba.analyse.extract_tags.
    """
    # Precise mode (cut_all=False) avoids overlapping n-gram output.
    words = jieba.cut(rtitleContentComment, cut_all=False)
    # Build a set once so each membership test is O(1) instead of
    # scanning the stopword list for every token.
    stopset = set(stopkey)
    # Joining "word " pieces reproduces the original trailing space
    # while avoiding quadratic `+=` string concatenation.
    return "".join(word + " " for word in words if word not in stopset)

#用jieba提取关键词
def testJieBaGuanJianCi():
    print 'start'
    f=open(r'D:\PythonFiles\CINS\test\cungj.news_zhengwen','w')
    s_zhengwen = testGuanJian.readlines()
    for i in s_zhengwen:
        try:
            print 'EEEEEEEEEEEEEE'
            m = i.strip().split('\001')
            print m[2]
            str=m[2]
            stayed_line=jiebaData(m[2])
            print stayed_line
            print 'gggggggggggggggggggg'
            gWord =jieba.analyse.extract_tags(stayed_line,10)
            #print type(gWord)
            str=''
            for j in gWord:
                str +=j
                str +=' '

            # if u'体育新闻' == str:
            #     print type(m[2])
            f.write(m[1]+ '\001')
            f.write(str+'\n')
        except Exception, e:
            print e, 'er'
    f.close()

#testJieBaGuanJianCi()

关于词典的加载:

# -*- coding:utf-8 -*-



from __future__ import print_function, unicode_literals




#sys.path.append("../")
import jieba

# Load a user dictionary so domain-specific words are kept as single
# tokens during segmentation (one "word freq tag" entry per line).
jieba.load_userdict(u'D:\\PythonFiles\\17secondYear\\importJieBaDic\\userDict.txt')
import jieba.posseg as pseg

# Runtime tweaks to jieba's in-memory dictionary: force these words to
# be recognized as units, and delete one so it segments apart again.
jieba.add_word('石墨烯')    # "graphene"
jieba.add_word('凱特琳')    # "Kaitlin" (traditional characters)
jieba.del_word('自定义词')  # "custom word" — removed from the dictionary

# Demo text: deliberately contains the words added above (石墨烯,
# 凱特琳) and phrases that exercise the user dictionary.
test_sent = (
"李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
"例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
"「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
)
# Default accurate-mode segmentation of the demo text.
words = jieba.cut(test_sent)
print('/'.join(words))

print("="*40)

# Part-of-speech tagging: each yielded pair has .word and .flag.
result = pseg.cut(test_sent)

for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')

print("\n" + "="*40)

# Mixed Chinese/English input: ASCII tokens pass through segmentation.
terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))

print("="*40)
# Frequency-tuning demo: each entry pairs a sentence with the word pair
# whose joined form should (or should not) segment as one token.
testlist = [
('今天天气不错', ('今天', '天气')),
('如果放到post中将出错。', ('中', '将')),
('我们中出了一个叛徒', ('中', '出')),
]

for sentence, pair in testlist:
    # Segmentation before tuning (HMM disabled so only dictionary
    # frequencies decide the split).
    print('/'.join(jieba.cut(sentence, HMM=False)))
    print('完成======')
    merged = ''.join(pair)
    # Query the current frequency first, then let suggest_freq adjust
    # it (tune=True writes the suggestion back into the dictionary).
    before = jieba.get_FREQ(merged)
    after = jieba.suggest_freq(pair, True)
    print('%s Before: %s, After: %s' % (merged, before, after))
    # Segmentation after tuning, for comparison.
    print('/'.join(jieba.cut(sentence, HMM=False)))
    print('-' * 40)

 

你可能感兴趣的:(python基础,提取关键词)