Python——对每条评论(每个单元格文本)进行词频统计

C列为统计结果
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 31 08:57:45 2018

@author: Shirley
"""

import xlrd
import jieba
from collections import defaultdict
from openpyxl import load_workbook

# Load the stopword list: one stopword per line.
# NOTE(review): the encoding was previously unspecified, so Windows decoded the
# file with the locale codepage (e.g. cp936) — utf-8 is assumed here; confirm
# against the actual stopwords.txt.
stopwords = []
with open("D:/anaconda/shirleylearn/cipintongji/stopwords.txt", "r", encoding="utf-8") as f:
    for stopword in f:
        # Drop only the trailing newline. .strip() is deliberately avoided:
        # a stopword may contain significant spaces that must be preserved.
        stopwords.append(stopword.replace("\n", ""))
    

path = "D:/anaconda/shirleylearn/cipintongji/wordexample2.xlsx"

# Read the comments with xlrd; openpyxl is used later only for writing back.
workbook = xlrd.open_workbook(path)
worksheet = workbook.sheet_by_name("Sheet1")

row_count = worksheet.nrows
cutlist = []

# One pass per comment row (row 0 is the header row).
for row_idx in range(1, row_count):
    # Column B (index 1) holds the comment text.
    comment = worksheet.row(row_idx)[1].value
    # A fresh counter per comment, so frequencies never accumulate across rows.
    freq = defaultdict(int)
    for word in jieba.lcut(comment):
        if word not in stopwords:
            freq[word] += 1
    # Most frequent words first (stable sort keeps equal counts in cut order).
    ranked = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
    # Render as " word(count)" tokens; each token is preceded by a space, which
    # reproduces the original output format (including its leading space).
    cutlist.append("".join(" " + word + "(" + str(count) + ")" for word, count in ranked))

#print(cutlist)

#打开excel,把词频统计结果放入
loadfile = load_workbook(path)
sheet = loadfile["Sheet1"]#激活sheet名为“Sheet1”的表格
sheet["C1"] = "result"
for k in range(2,len(cutlist)+2):
    sheet.cell(k,3,cutlist[k-2])
loadfile.save(path)

这样的统计结果方便回到原文中挑新词

你可能感兴趣的:(Python——对每条评论(每个单元格文本)进行词频统计)