本文简要介绍如何用 Python 从 Word(.docx)文档中读取文本,统计词频后写入 Excel 文档。
注意:读取的不是 txt 文件!因为我的电脑是 Mac,读取 txt 时的乱码问题一直没能解决。
主要操作的思维导图:
源代码:
from docx import Document
import openpyxl
# Build a fresh in-memory workbook for the results; no existing Excel file
# is opened here.
wb = openpyxl.Workbook()
sheet1 = wb.active
sheet1.title = 'teaching_example'

# Header row of the result sheet.
for header_cell, header_text in (('A1', 'id'), ('B1', 'word'), ('C1', 'counting')):
    sheet1[header_cell] = header_text

# Open the Word document whose words will be counted.
doc = Document('teach_test.docx')

# Characters that are stripped from the text before it is split into words.
list_delete = [
    ',', '.', '_', '-', '?', '“', '"', '”', '(', ')', ':', ';', '\'',
]

# Maps each word to the number of times it has been seen so far.
d = {}
def count_words(sen_list):
    """Add every word in ``sen_list`` to the module-level tally ``d``.

    ``d`` maps a word to its occurrence count; a word seen for the first
    time starts at 1, otherwise its count is incremented.

    Args:
        sen_list: Iterable of words to count.
    """
    for token in sen_list:
        d[token] = d.get(token, 0) + 1
def save_d():
    """Write the tally ``d`` into ``sheet1`` and save the workbook.

    Words are written from the most frequent to the least frequent, one per
    row starting at row 2 (row 1 holds the header). Column 1 receives a
    1-based running id, column 2 the word and column 3 its count. The
    workbook ``wb`` is then saved as ``test.xlsx``.
    """
    print(d)
    # Descending by count: the most frequent word comes first.
    ranked = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
    print(ranked)
    for offset, (word, count) in enumerate(ranked):
        target_row = offset + 2  # data starts right below the header row
        sheet1.cell(column=1, row=target_row, value=offset + 1)
        sheet1.cell(column=2, row=target_row, value=word)
        sheet1.cell(column=3, row=target_row, value=count)
    wb.save('test.xlsx')
def read_docx(doc):
    """Count the words of every paragraph in ``doc``.

    Each paragraph's text is lower-cased, the characters listed in
    ``list_delete`` are removed, and the remaining text is split into words
    that are passed to ``count_words``. Paragraphs that yield no words are
    skipped.

    Args:
        doc: Document object exposing a ``paragraphs`` iterable whose items
            have a ``text`` attribute.
    """
    for paragraph in doc.paragraphs:
        text = paragraph.text.lower()
        for mark in list_delete:
            text = text.replace(mark, '')  # strip punctuation
        # Split on any run of whitespace. Deleting '\n' outright and then
        # splitting on a single space would glue together the two words on
        # either side of an in-paragraph line break and would leave
        # tab-separated words as one token. split() with no argument also
        # never produces empty strings, so no extra filtering is needed.
        words = text.split()
        if words:
            count_words(words)
if __name__ == '__main__':
    # Tally the words of the document first, then write the tally to Excel.
    read_docx(doc)
    save_d()
参考文章:
词频统计:python 词频统计_词频统计python代码_小艾菜菜菜的博客-CSDN博客
补充:去重代码(依次读取 all_result.xlsx 中的六个工作表,去掉已经出现过的单词,重复的单词记录到 result_fold 工作表,结果保存为 test_pick.xlsx)
import openpyxl
# Source workbook with the six sheets that will be processed.
wb_old = openpyxl.load_workbook('all_result.xlsx')
s1, s2, s3, s4, s5, s6 = (
    wb_old[sheet_name]
    for sheet_name in (
        'Sheet1', 'Sheet2', 'Sheet3', 'Sheet4', 'Sheet5', 'Sheet6',
    )
)

# Destination workbook: one result sheet per source sheet, plus
# 'result_fold', which collects the words that were found more than once.
wb_new = openpyxl.Workbook()
sheet1 = wb_new.active
sheet1.title = 'result_A1'
sheet2, sheet3, sheet4, sheet5, sheet6, sheet7 = (
    wb_new.create_sheet(sheet_title)
    for sheet_title in (
        'result_A2', 'result_B1', 'result_B2',
        'result_C1', 'result_C2', 'result_fold',
    )
)

# Header row of the duplicates sheet.
for header_cell, header_text in (('A1', 'id'), ('B1', 'word'), ('C1', 'counting')):
    sheet7[header_cell] = header_text

# Words seen so far across all processed sheets (only the keys are used).
d = {}
# Next free row in the duplicates sheet; row 1 holds the header.
fold_num = 2
def read_excel(sheet, new_sheet):
    """Copy first-seen words from ``sheet`` to ``new_sheet``; log repeats.

    Reads column B (word) and column C (count) of ``sheet`` from row 2 down
    to ``sheet.max_row``. A word not yet present in the module-level ``d``
    is registered there and written to the next free row of ``new_sheet``
    with a 1-based id. A word already present in ``d`` is printed and
    appended to the duplicates sheet ``sheet7`` instead, under a heading row
    holding ``new_sheet.title``. The module-level ``fold_num`` tracks the
    next free row of ``sheet7`` across calls.

    Rows whose column B cell is empty are skipped, so blank or trailing
    formatted rows are neither copied as a word nor reported as duplicates.

    Args:
        sheet: Source worksheet to read from.
        new_sheet: Destination worksheet for the de-duplicated rows; its
            header row is (re)written by this function.
    """
    global fold_num

    # Header row of the destination sheet.
    new_sheet['A1'] = 'id'
    new_sheet['B1'] = 'word'
    new_sheet['C1'] = 'counting'

    # Heading row in the duplicates sheet for this destination sheet.
    sheet7.cell(column=1, row=fold_num, value=new_sheet.title)
    fold_num += 1

    next_row = 2         # next free row in new_sheet (row 1 is the header)
    unique_count = 0     # running id of first-seen words in this sheet
    duplicate_count = 0  # running id of repeated words in this sheet
    for row in range(2, sheet.max_row + 1):
        w = sheet['B' + str(row)].value
        if w is None:
            # Empty cell: without this guard, None would be stored as a
            # "word" once and every later blank row logged as a duplicate.
            continue
        count = sheet['C' + str(row)].value
        if w in d:
            print(w)
            duplicate_count += 1
            sheet7.cell(column=1, row=fold_num, value=duplicate_count)
            sheet7.cell(column=2, row=fold_num, value=w)
            sheet7.cell(column=3, row=fold_num, value=count)
            fold_num += 1
        else:
            d[w] = 1
            unique_count += 1
            new_sheet.cell(column=1, row=next_row, value=unique_count)
            new_sheet.cell(column=2, row=next_row, value=w)
            new_sheet.cell(column=3, row=next_row, value=count)
            next_row += 1
# Process the source sheets in order: a word is kept only at its first
# occurrence, so the order decides which result sheet a shared word lands in.
for source_sheet, target_sheet in (
    (s1, sheet1),
    (s2, sheet2),
    (s3, sheet3),
    (s4, sheet4),
    (s5, sheet5),
    (s6, sheet6),
):
    read_excel(source_sheet, target_sheet)

wb_new.save('test_pick.xlsx')