python词频统计(word ——> excel,含去重)

 word资料处理 -------> 存入excel

从 Word 文档(.docx)中读取文本,统计词频后写入 Excel 文档。
输入不是 txt!因为我的电脑是 Mac,读取 txt 时的乱码问题一直没能解决。

主要操作的思维导图:

 源代码:

from docx import Document
import openpyxl

# Build a brand-new in-memory workbook to hold the results; nothing is
# loaded from an existing Excel file here.
wb = openpyxl.Workbook()
sheet1 = wb.active
sheet1.title = 'teaching_example'

# Header row: running id, the word, and its occurrence count.
sheet1.cell(row=1, column=1, value='id')
sheet1.cell(row=1, column=2, value='word')
sheet1.cell(row=1, column=3, value='counting')

# Characters removed from the text before it is split into words.
list_delete = [
    ',', '.', '_', '-', '?', '“', '"', '”', '(', ')', ':', ';', '\'',
]

# Word -> occurrence count; filled in by count_words().
d = {}

# Open the Word document to analyse.
doc = Document('teach_test.docx')

def count_words(sen_list):
    """Add every word in ``sen_list`` to the module-level tally ``d``.

    Each occurrence raises the word's count by one; a word seen for the
    first time starts at one.
    """
    for token in sen_list:
        d[token] = d.get(token, 0) + 1


def save_d():
    """Write the word tally ``d`` into ``sheet1`` and save ``test.xlsx``.

    Words are listed by count, highest first; because ``sorted`` is stable,
    words with equal counts keep the order in which they were first added
    to ``d``. Output starts on row 2 with a 1-based running id in column 1,
    the word in column 2 and its count in column 3.
    """
    print(d)
    # reverse=True puts the most frequent word first (descending order).
    ranked = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
    print(ranked)

    # Data starts on row 2, so the n-th word (1-based) lands on row n + 1.
    for position, (word, count) in enumerate(ranked, start=1):
        target_row = position + 1
        sheet1.cell(column=1, row=target_row, value=position)
        sheet1.cell(column=2, row=target_row, value=word)
        sheet1.cell(column=3, row=target_row, value=count)

    wb.save('test.xlsx')  # persist the workbook to disk

def read_docx(doc):
    """Tally the words of every paragraph in ``doc``.

    Each paragraph's text is lower-cased, stripped of the characters listed
    in ``list_delete`` and split on whitespace; the resulting words are
    handed to ``count_words``. Paragraphs that contain no words are skipped.

    Args:
        doc: Object exposing ``paragraphs``, an iterable whose items carry
            the paragraph text in a ``text`` attribute.
    """
    for paragraph in doc.paragraphs:
        text = paragraph.text.lower()
        for char in list_delete:
            text = text.replace(char, '')  # drop punctuation characters

        # split() without an argument splits on any run of whitespace
        # (spaces, tabs, line breaks) and never yields empty strings.
        # Deleting '\n' and splitting on ' ' alone would glue together the
        # words on either side of a line break and leave tab-separated
        # words as a single token.
        words = text.split()
        if words:
            count_words(words)

# Run only when executed as a script: tally the words of the loaded
# document, then write the tally to the spreadsheet.
if __name__ =='__main__':
    read_docx(doc)
    save_d()

参考文章:

词频统计:python 词频统计_词频统计python代码_小艾菜菜菜的博客-CSDN博客


补充:去重代码

import openpyxl


# Source workbook: one word list per sheet.
wb_old = openpyxl.load_workbook('all_result.xlsx')
s1, s2, s3, s4, s5, s6 = (
    wb_old[name]
    for name in ('Sheet1', 'Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6')
)

# Destination workbook: one result sheet per source sheet, plus
# 'result_fold', which collects the words found to be duplicates.
wb_new = openpyxl.Workbook()
sheet1 = wb_new.active
sheet1.title = 'result_A1'
sheet2, sheet3, sheet4, sheet5, sheet6, sheet7 = (
    wb_new.create_sheet(title)
    for title in (
        'result_A2', 'result_B1', 'result_B2',
        'result_C1', 'result_C2', 'result_fold',
    )
)

# Header row of the duplicates sheet.
sheet7.cell(row=1, column=1, value='id')
sheet7.cell(row=1, column=2, value='word')
sheet7.cell(row=1, column=3, value='counting')

# Words already seen across all source sheets (only the keys matter).
d = {}

# Next row to write on sheet7; row 1 holds the header.
fold_num = 2



def read_excel(sheet, new_sheet):
    """Copy first-seen words from ``sheet`` to ``new_sheet``; log repeats on ``sheet7``.

    Reads column B (word) and column C (count) of ``sheet`` from row 2
    through ``sheet.max_row``. A word not yet in the module-level ``d`` is
    recorded there and written to ``new_sheet`` with a fresh running id,
    starting at row 2. A word already in ``d`` (from this call or an
    earlier one) is printed and appended to ``sheet7`` below a row that
    carries ``new_sheet``'s title. Rows whose word cell is empty are
    skipped.

    Advances the module-level ``fold_num``, the next row to write on
    ``sheet7``.

    Args:
        sheet: Source worksheet to read.
        new_sheet: Destination worksheet that receives the unique words.
    """
    global fold_num

    # Header row of the destination sheet.
    new_sheet['A1'] = 'id'
    new_sheet['B1'] = 'word'
    new_sheet['C1'] = 'counting'

    # Section marker on the duplicates sheet: the title of the sheet the
    # following duplicates belong to.
    sheet7.cell(column=1, row=fold_num, value=new_sheet.title)
    fold_num += 1

    row_num = 2        # next row to write on new_sheet
    word_num = 0       # running id of unique words on new_sheet
    fold_word_num = 0  # running id of duplicates found in this sheet

    for row in range(2, sheet.max_row + 1):
        w = sheet.cell(row=row, column=2).value
        if w is None:
            # An empty cell reads back as None. Treating it as a word would
            # add a blank entry and then report every further blank row as
            # a duplicate.
            continue
        count = sheet.cell(row=row, column=3).value

        if w in d:
            print(w)
            fold_word_num += 1
            sheet7.cell(column=1, row=fold_num, value=fold_word_num)
            sheet7.cell(column=2, row=fold_num, value=w)
            sheet7.cell(column=3, row=fold_num, value=count)
            fold_num += 1
        else:
            d[w] = 1  # mark as seen; the stored value itself is not used
            word_num += 1
            new_sheet.cell(column=1, row=row_num, value=word_num)
            new_sheet.cell(column=2, row=row_num, value=w)
            new_sheet.cell(column=3, row=row_num, value=count)
            row_num += 1


# De-duplicate each source sheet into its matching result sheet, in order,
# so a word is kept only on the first sheet it appears in.
for old_sheet, result_sheet in (
    (s1, sheet1),
    (s2, sheet2),
    (s3, sheet3),
    (s4, sheet4),
    (s5, sheet5),
    (s6, sheet6),
):
    read_excel(old_sheet, result_sheet)


# Write the result workbook to disk.
wb_new.save('test_pick.xlsx')

你可能感兴趣的:(word,excel,python)