Python将doc文件里的表格汇总

Python将doc文件里的表格汇总

# -*- coding: utf-8 -*-
"""
Created on Sun May  7 14:22:49 2023
@author: ypzhao
"""
# 写入docx文件
import docx
# 设置图片格式
from docx.shared import Cm
from openpyxl import Workbook
from docx import Document
from docx.shared import Pt, RGBColor
from docx.oxml.ns import qn
import os
import time


def _write_rows(table, data):
    """Copy ``data`` (a list of rows) into the cells of ``table``.

    ``zip`` stops at the shorter side, so extra data rows or extra table
    rows are ignored rather than raising IndexError.
    """
    for row, values in zip(table.rows, data):
        for cell, value in zip(row.cells, values):
            cell.text = str(value)


def handleFile(filename):
    """Demonstrate reading, extending and exporting a Word document.

    Opens ``filename`` with python-docx, prints its paragraphs and table
    rows, appends demo paragraphs, a picture, a page break, two headings and
    two tables, exports the document's second table to ``table1.xlsx`` and
    saves the modified document as ``test.docx``.

    Args:
        filename: Path of the .docx file to open.

    Note:
        ``1.png`` is read from, and ``table1.xlsx`` / ``test.docx`` are
        written to, paths relative to the current working directory.
    """
    # The document must not be bound to the name ``docx``: assigning to that
    # name inside the function made it a local variable, so the original
    # ``docx.Document(...)`` call raised UnboundLocalError. The hard-coded
    # path is also replaced by the ``filename`` argument, which was ignored.
    doc = Document(filename)

    # --- Reading paragraphs ---
    paragraphs = doc.paragraphs
    print(len(paragraphs))  # total number of paragraphs
    # Print the first paragraph, if the document has one.
    if paragraphs:
        print(paragraphs[0].text)

    # Print paragraphs at indices 2..4; slicing tolerates documents with
    # fewer than five paragraphs instead of raising IndexError.
    for paragraph in paragraphs[2:5]:
        print(paragraph.text)

    # Print every paragraph of the document.
    for paragraph in paragraphs:
        print(paragraph.text)

    # --- Reading tables: print each row as a list of cell texts ---
    for table in doc.tables:
        for row in table.rows:
            print([cell.text for cell in row.cells])

    # --- Appending paragraphs ---
    p1 = doc.add_paragraph('这是一个段落')
    p1.add_run('加粗的一句话').bold = True          # bold run
    p1.add_run("这句是斜体文字块").italic = True    # italic run

    doc.add_paragraph('这是第二个段落')
    doc.add_paragraph('这是一个段落,后面带图片')

    # Cm converts centimetres into the length unit python-docx expects.
    # Both width and height are passed here, so both are set explicitly.
    doc.add_picture('1.png', width=Cm(14), height=Cm(7))
    doc.add_paragraph('这是第二个段落')

    # Page break, then a paragraph on the new page.
    doc.add_page_break()
    doc.add_paragraph("这是新增的一页")

    # Add the headings to *this* document. The original called
    # ``Document().add_heading(...)``, which built new blank documents that
    # were discarded immediately, so the headings never reached the output.
    # NOTE(review): add_heading relies on the document defining the
    # "Heading 1"/"Heading 2" styles - confirm for non-default templates.
    doc.add_heading('正文', 1).add_run("前言")
    doc.add_heading('标题', 2)

    # --- Appending tables ---
    # Four paragraphs, each holding a single line-break run, as spacing.
    for _ in range(4):
        doc.add_paragraph().add_run('\n')

    first_scores = [
        ["语文", "数学", "英语"],
        ["100", "100", "100"],
        ["100", "100", "100"],
        ["100", "100", "100"],
        ["100", "100", "100"],
    ]
    second_scores = [
        ["政治", "历史", "地理"],
        ["100", "100", "100"],
        ["100", "100", "100"],
        ["100", "100", "100"],
        ["100", "100", "100"],
    ]

    table1 = doc.add_table(rows=5, cols=3)
    _write_rows(table1, first_scores)
    doc.add_paragraph("---------------------------------------------------------")

    # NOTE(review): this table has 4 rows while ``second_scores`` has 5, so
    # the last data row is not written (same as the original) - confirm
    # whether that is intended.
    table2 = doc.add_table(rows=4, cols=3)
    _write_rows(table2, second_scores)

    # --- Exporting a table to Excel ---
    # ``doc.tables[1]`` is the second table of the whole document; if the
    # input already contained tables this is not necessarily ``table2``.
    exported = doc.tables[1]
    column_count = len(exported.columns)
    workbook = Workbook()
    sheet = workbook.active
    for row_index in range(len(exported.rows)):
        # A separate name is used for the row buffer; the original rebound
        # ``list1`` here, clobbering the source data list.
        sheet.append(
            [exported.cell(row_index, col).text for col in range(column_count)]
        )
    workbook.save("table1.xlsx")
    doc.save('test.docx')


def getfilelist(dirname, ls):
    """Collect the paths of all files below ``dirname`` into ``ls``.

    The directory tree is walked recursively with ``os.walk`` and each file
    path (directory joined with file name) is appended to ``ls`` in walk
    order. The list is modified in place; nothing is returned.

    Args:
        dirname: Root directory to scan.
        ls: List that receives the file paths.
    """
    for folder, _subdirs, names in os.walk(dirname):
        ls.extend(os.path.join(folder, name) for name in names)

def test(path):
    """Append every table of the document at ``path`` to ``result.doc``.

    A bold, red paragraph with the source file name is written first. Then,
    for each table in the source, a caption paragraph "表格:<n>" (numbered
    from 1) and a copy of the table are appended. ``result.doc`` is opened
    from, and saved back to, the current working directory.

    Args:
        path: Path of the source document whose tables are copied.
    """
    from copy import deepcopy
    from docx import Document

    # Parse the source once; the original opened it a second time into a
    # variable that was never used.
    source = Document(path)
    result = Document("result.doc")

    # Header paragraph naming the source file, in bold red.
    header_run = result.add_paragraph('文件名:' + path).runs[0]
    header_run.font.bold = True
    header_run.font.color.rgb = RGBColor(255, 0, 0)

    for number, table in enumerate(source.tables, start=1):
        result.add_paragraph('表格:' + str(number))
        # The copied table XML is inserted right after a fresh empty
        # paragraph, which places it at the current end of the body.
        anchor = result.add_paragraph()
        # Only the table's XML element is inserted, so only that element is
        # copied rather than the whole Table proxy object.
        anchor._p.addnext(deepcopy(table._element))

    result.save("result.doc")
def init():  # reset the output file so a new run starts empty
    """Reset ``result.doc`` to a document with an empty body.

    If ``result.doc`` exists in the current working directory it is opened
    and its body content is cleared. If it does not exist yet, a new blank
    document is created instead; the original raised on the first run
    because it always tried to open the file.
    """
    from docx import Document

    if os.path.exists("result.doc"):
        doc1 = Document("result.doc")
    else:
        # First run: nothing to open yet, start from a blank document.
        doc1 = Document()
    doc1._body.clear_content()
    doc1.save("result.doc")

    
if __name__ == '__main__':
    # Empty the result document, then append the tables of every file found
    # under the input directory.
    init()
    source_files = []
    getfilelist('D:/test/', source_files)
    for source_path in source_files:
        print(f"解析{source_path}")
        test(source_path)
    print("完成 , 结果输出在result.doc里")
    # Wait 10 seconds before exiting - presumably so the final message
    # stays visible when run in a console window that closes on exit.
    time.sleep(10)

你可能感兴趣的:(【Python】,python)