python脚本之: 将目录下所有的docx文件读取到一个Excel中展示

需求:

        将指定目录下所有的docx文件,读取到一个Excel进行展示,便于统一浏览

开发流程:

1. 遍历目录(包含子目录),获取所有word(.docx)文件

2. 将word读取,转化为字典

3. 构建Excel,将字典全部写入到Excel

代码展示

from docx import Document
import xlsxwriter
import os,sys,shutil

'''
需求分析:遍历目录下所有的word,读取word,然后将读取的数据写入到Excel中,这样便于通过一个Excel表格,就可以浏览所有的word内容
'''
# Module-level list for Word file paths (full paths).
full_file_list = []
# Directory that contains this script.
base_name = os.path.dirname(os.path.abspath(__file__))
# The Word documents are expected in a 'word' sub-directory next to the script.
word_path = os.path.join(base_name, 'word')
# File name of the generated workbook.
excel_name = 'test.xlsx'
# Full output path: the workbook is written inside the 'word' directory itself.
excel_name_file = os.path.join(word_path,excel_name)



def word_path_list(word_path):
    """Recursively collect the paths of all ``.docx`` files under a directory.

    :param word_path: directory to scan; sub-directories are scanned as well.
    :return: a new list with the full path of every ``.docx`` file found, or
        ``None`` when ``word_path`` is not an existing directory. Entries of
        each directory are visited in sorted order, so the result is
        deterministic.
    """
    if not os.path.isdir(word_path):
        return None

    # A fresh list is built on every call. Accumulating into shared
    # module-level state made a second call return every path twice.
    found = []
    for entry in sorted(os.listdir(word_path)):
        full_path = os.path.join(word_path, entry)
        if os.path.isdir(full_path):
            # ``or []`` covers a directory that vanishes between the isdir()
            # check above and the recursive call.
            found.extend(word_path_list(full_path) or [])
        elif entry.lower().endswith('.docx') and not entry.startswith('~$'):
            # Require a real ".docx" extension (case-insensitive) instead of
            # any name that merely ends in "docx", and skip the "~$..." lock
            # files Word leaves next to open documents: they are not valid
            # documents and cannot be parsed.
            found.append(full_path)
    return found

def get_word_content(word_path):
    """Read every Word document in ``word_path`` into a dict of paragraph texts.

    Result shape::

        {'file path': [' first paragraph', ' second paragraph', ...]}

    :param word_path: iterable of ``.docx`` file paths (despite the singular
        name). ``None`` or an empty iterable yields an empty dict.
    :return: dict mapping each path to the list of its paragraph texts, each
        prefixed with a single space (the original output format). Only what
        ``Document.paragraphs`` exposes is read.
    """
    data_json = {}
    # word_path_list() returns None for a missing directory; treat that as
    # "nothing to read" instead of failing with a TypeError.
    for li in word_path or []:
        paragraphs = Document(li).paragraphs
        data_json[li] = [' ' + paragraph.text for paragraph in paragraphs]
    return data_json

def create_excel(data,excel_path):
    """Write the collected Word contents into a single-sheet Excel workbook.

    Columns: A = running number (starting at 1000, written as text),
    B = file path, C = file content (``str()`` of the paragraph list).

    :param data: dict mapping a file path to its list of paragraph texts.
    :param excel_path: path of the ``.xlsx`` file to create.
    :return: None
    """
    # Header row style.
    header_props = {
        'bold': True,
        'font_name': 'SimHei',
        'font_size': 12,
        'border': True,
        'align': 'center',  # horizontal centre
        'valign': 'vcenter',  # vertical centre
        'color': '#232323',  # font colour
        'bg_color': '#D9E1F2',  # background colour
    }

    # Body cell style. The vertical value must be 'vcenter': a plain 'center'
    # is a horizontal alignment and would override 'align': 'left'.
    body_props = {
        'font_name': 'SimHei',
        'font_size': 10,
        'border': True,
        'align': 'left',
        'color': '#232323',
        'valign': 'vcenter',
    }

    # The context manager closes the workbook even if a write fails part-way.
    with xlsxwriter.Workbook(excel_path) as workbook:
        worksheet = workbook.add_worksheet('数据统计')
        header = workbook.add_format(header_props)
        body = workbook.add_format(body_props)

        # Header row and basic layout.
        worksheet.write('A1','序号',header)
        worksheet.write('B1','文件名称',header)
        worksheet.write('C1','文件内容',header)
        worksheet.set_row(0,30)
        worksheet.set_column('C:C',180)

        # One row per document, starting below the header. The body format is
        # now applied; it used to be created but never passed to write().
        for row, (k, v) in enumerate(data.items(), start=1):
            num = 999 + row  # numbering starts at 1000
            worksheet.write(row, 0, str(num), body)
            worksheet.write(row, 1, str(k), body)
            # NOTE: str(v) stores the list's repr; kept so the cell text is
            # unchanged from earlier output.
            worksheet.write(row, 2, str(v), body)
    print('表格创建完成',excel_path)


if __name__ == '__main__':
    # word_path and excel_name_file are the module-level settings defined at
    # the top of the script; recomputing them here was redundant.
    p_list = word_path_list(word_path)
    if p_list is None:
        # word_path_list() returns None when the directory does not exist.
        # Stop with a clear message instead of an opaque TypeError later on.
        sys.exit('word目录不存在: ' + word_path)
    data_json = get_word_content(p_list)
    create_excel(data_json,excel_name_file)

效果展示

python脚本之: 将目录下所有的docx文件读取到一个Excel中展示_第1张图片

 

你可能感兴趣的:(excel,python,word)