使用python读取多重文件夹下的word(doc、docx)文件,并处理存储到excel(xls、xlsx)文件...

#-*- coding:gbk -*-
import os
import docx
from win32com import client as wc
import xlwt
import xlsxwriter

def getfilelist(filepath):
    """Recursively collect the paths of all files below ``filepath``.

    Sub-directories are descended into as they are encountered, so the
    result follows ``os.listdir`` order with each directory's files
    spliced in at the directory's position.

    Args:
        filepath: Path of the directory to scan.

    Returns:
        A list of paths (``filepath`` joined with the relative location)
        of every non-directory entry found; empty if there are none.

    Raises:
        OSError: If ``filepath`` (or a sub-directory) cannot be listed.
    """
    files = []
    for name in os.listdir(filepath):
        # os.path.join picks the platform separator and does not double it
        # when filepath already ends with one.
        child = os.path.join(filepath, name)
        if os.path.isdir(child):
            files.extend(getfilelist(child))
        else:
            files.append(child)
    return files

def getDocx(fileName):
    """Return the text extracted from the Word (.docx) file ``fileName``.

    The file is opened with ``docx.opendocx`` and the opened document is
    handed to ``docx.getdocumenttext``; whatever that call returns is passed
    back unchanged (presumably a list of paragraph strings -- confirm
    against the installed ``docx`` module).
    """
    return docx.getdocumenttext(docx.opendocx(fileName))

def doc2Docx(fileName):
    """Convert a legacy .doc file to .docx through Word and delete the source.

    The document is saved next to the original under ``fileName + "x"``
    (so ``report.doc`` becomes ``report.docx``). The original file is
    removed only after the save succeeded and Word has closed the document.

    Args:
        fileName: Path of the .doc file to convert.

    Raises:
        Exception: Any error raised by the Word COM automation calls is
            propagated; in that case the source file is left in place.
        OSError: If the source file cannot be removed after conversion.
    """
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # The second argument (12) is the file-format code handed to
            # Word's SaveAs; presumably wdFormatXMLDocument (.docx).
            doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        # Always shut Word down so a failed conversion does not leave the
        # application running in the background.
        word.Quit()
    # Reached only when the conversion succeeded.
    os.remove(fileName)

# Root folder that is searched recursively for Word documents.
filepath = "C:\\xxx\\xx\\xx\\xx\\数据集"
filelist = getfilelist(filepath)
# If the folder holds legacy .doc files, convert them first by calling
# doc2Docx(path) for every path in filelist, then rebuild filelist so that
# it lists the newly created .docx files.

# Keep only the paths ending in "docx"; these are the ones passed to getDocx.
docx_files = [path for path in filelist if path.endswith("docx")]

# xlwt is not used for the export because it fails on large documents with
# "Exception: String longer than 32767 characters"; xlsxwriter is used
# instead. 32767 characters is Excel's per-cell limit, and xlsxwriter skips
# longer strings with a warning, so the text is truncated to that length
# rather than being dropped.
EXCEL_CELL_MAX_CHARS = 32767
# Only the first 21 columns (indices 0-20) of each row are written.
MAX_COLUMNS = 21

workbook = xlsxwriter.Workbook(u'数据.xlsx')
worksheet = workbook.add_worksheet(u"数据")
for row, file_name in enumerate(docx_files):
    paragraphs = getDocx(file_name)
    # One cell per backslash-separated path component, followed by one cell
    # holding the document text (each paragraph terminated by a newline).
    cells = file_name.split("\\")
    text = "".join(paragraph + "\n" for paragraph in paragraphs)
    cells.append(text[:EXCEL_CELL_MAX_CHARS])
    for col, value in enumerate(cells[:MAX_COLUMNS]):
        worksheet.write(row, col, value)
workbook.close()

 

转载于:https://www.cnblogs.com/sixu/p/10104752.html

你可能感兴趣的:(使用python读取多重文件夹下的word(doc、docx)文件,并处理存储到excel(xls、xlsx)文件...)