Python: word segmentation and keyword weighting with jieba

import jieba
import xlrd
import jieba.analyse

def stopwordslist(filepath):
    # Load the stopword list: one word per line, UTF-8 encoded.
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return stopwords
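
# Optional (a minor variant, not in the original): wrapping the list in set()
# gives O(1) membership tests when the stopword file is large, e.g.:
# stopwords = set(stopwordslist(filepath))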

def fenci(content):
    # Segment the first column of the first sheet (skipping the header row) and
    # join the surviving tokens into one space-separated string, one line per row.
    # Relies on the module-level stopwords list defined below.
    table = content.sheets()[0]
    nrows = table.nrows  # number of rows
    final = ""
    for row1 in range(1, nrows):  # row 0 is the header
        cell = table.cell(row1, 0).value
        for seg in jieba.cut(cell):
            # Drop stopwords and whitespace-only tokens.
            if seg not in stopwords and seg.strip():
                final += seg + " "
        final += '\n'
        # print(row1, final)
    return final
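
# Sketch of an openpyxl-based variant (an assumption, not part of the original
# script): useful because xlrd 2.0+ no longer reads .xlsx files.
def fenci_openpyxl(path):
    from openpyxl import load_workbook
    ws = load_workbook(path, read_only=True).active  # first worksheet
    final = ""
    for row in ws.iter_rows(min_row=2, max_col=1, values_only=True):  # skip header
        cell = row[0] or ""
        for seg in jieba.cut(str(cell)):
            if seg not in stopwords and seg.strip():
                final += seg + " "
        final += '\n'
    return final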

jieba.load_userdict("C:\\Users\\Administrator\\Desktop\\userdic.txt")  # load a custom user dictionary (UTF-8 encoded)
stopwords = stopwordslist("C:\\Users\\Administrator\\Desktop\\stop.txt")  # load the stopword list
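# Optional: jieba.analyse also has its own stopword hook; pointing it at the same
# file makes extract_tags() below skip these words as well (uncomment to enable):
# jieba.analyse.set_stop_words("C:\\Users\\Administrator\\Desktop\\stop.txt")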

# Open the Excel workbook. Note: xlrd 2.0+ dropped .xlsx support, so this line
# needs xlrd 1.2.x (or convert the file to .xls, or use the openpyxl variant above).
content = xlrd.open_workbook("C:\\Users\\Administrator\\Desktop\\zhaopin_data.xlsx")
final = fenci(content)
# print(final)

# TF-IDF keyword extraction over the whole corpus; allowPOS=() applies no part-of-speech filter.
keywords = jieba.analyse.extract_tags(final, topK=200, withWeight=True, allowPOS=())
# print(keywords)
for item in keywords:
    # To restrict output to specific terms, filter here, e.g.:
    # if item[0] in ("SQL", "Python", "SAS"):
    print(item[0], item[1])  # print each keyword and its weight


# Based on the topK output, you can pick out more words to add to the stopword list.
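
As a variant, jieba also ships a TextRank-based extractor alongside the TF-IDF one used above. A minimal sketch, continuing from the final string built above (the topK=50 value here is arbitrary):

# TextRank weighting as an alternative to TF-IDF; by default it keeps only
# nouns and verbs (its allowPOS default), which often yields cleaner lists.
tr_keywords = jieba.analyse.textrank(final, topK=50, withWeight=True)
for word, weight in tr_keywords:
    print(word, weight)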
