将 OCR 识别结果写入 Excel,直接上代码:
# -*- coding: UTF-8 -*-
import pdfplumber
import pandas as pd
import time, json
import os, openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.styles import Border, Side
# Source directory on the local Windows drive.
# NOTE(review): presumably holds the OCR result files to convert; its use is
# outside the visible part of this file, so confirm against the caller.
file_path = r"D:\Date11\data_pdf_txt"
# Destination directory for the generated output.
# NOTE(review): presumably where the Excel workbooks are saved; confirm.
save_path = r"D:\Date11\data_pdf_excel111"
# Create the destination directory at import time; exist_ok=True makes this a
# no-op when the directory already exists.
os.makedirs(save_path, exist_ok=True)
# Build a border style
def set_border(t_border, b_border, l_border, r_border, t_color='000000', b_color='000000', l_color='000000', r_color='000000'):
    """Return an openpyxl ``Border`` assembled from four per-side settings.

    Args:
        t_border, b_border, l_border, r_border: Line style passed as
            ``border_style`` to the top, bottom, left and right ``Side``
            respectively (for example ``'medium'`` or ``'dotted'``).
        t_color, b_color, l_color, r_color: Colour passed as ``color`` to the
            matching ``Side``; each defaults to ``'000000'``.

    Returns:
        A new ``Border`` whose four sides use the given styles and colours.
    """
    top_side = Side(border_style=t_border, color=t_color)
    bottom_side = Side(border_style=b_border, color=b_color)
    left_side = Side(border_style=l_border, color=l_color)
    right_side = Side(border_style=r_border, color=r_color)
    return Border(top=top_side, bottom=bottom_side, left=left_side, right=right_side)
def format_border(ws, start_row, end_row, start_col, end_col,
                  inner_style='medium', outer_style='medium'):
    """Draw borders on every cell of a rectangular worksheet range.

    The range is walked with ``ws.iter_rows`` and explicit bounds rather than
    the ``ws[start_row:end_row]`` slice. The slice form returns a single row
    (a tuple of cells, not a tuple of rows) when ``start_row == end_row`` and
    only spans the columns up to ``ws.max_column``, so a one-row range or a
    range wider than the populated area used to raise.

    Args:
        ws: openpyxl worksheet to modify in place.
        start_row: First row of the range (1-based, inclusive).
        end_row: Last row of the range (1-based, inclusive).
        start_col: First column of the range (1-based, inclusive).
        end_col: Last column of the range (1-based, inclusive).
        inner_style: Border style for edges between cells inside the range.
        outer_style: Border style for edges on the outline of the range.
            With both defaults every edge is ``'medium'``, which matches the
            previous behaviour.

    Returns:
        The same worksheet object, to allow chaining.
    """
    rows = ws.iter_rows(min_row=start_row, max_row=end_row,
                        min_col=start_col, max_col=end_col)
    # Positions come from enumerate() rather than cell attributes so the
    # outline test does not depend on how a given openpyxl version reports
    # cell.column.
    for row_idx, row in enumerate(rows, start=start_row):
        for col_idx, cell in enumerate(row, start=start_col):
            cell.border = set_border(
                outer_style if row_idx == start_row else inner_style,
                outer_style if row_idx == end_row else inner_style,
                outer_style if col_idx == start_col else inner_style,
                outer_style if col_idx == end_col else inner_style,
            )
    return ws
def tranform_table(file_name, hd_name):
print(file_name)
head_name = os.path.splitext(hd_name)[0]
ele_dict = {1: "A", 2: "B", 3: "C", 4: "D", 5: "E", 6: "F", 7: "G", 8: "H", 9: "I", 10: "J", 11: "K", 12: "L"}
wb = openpyxl.Workbook()
a = 0
with open(file_name, 'r', encoding='utf-8') as load_json:
data_json = json.load(load_json)
data = data_json["pages"]
for pag in data:
tables = pag["tables"]
for table in tables: # 遍历所有的表排序
print(table["cells"])
table_list = table["cells"].sort(key=lambda x: (x["start_row"], x["start_column"]))
for table in tables:
a += 1
# ws = wb.active
# ws.title =
ws = wb.create_sheet('sheet{}'.format(a))
last_column = []
last_row = []
start_row=[]
start_column=[]
# for cell in table["cells"]:
# print(cell)
# if cell["start_row"] == cell["end_row"] :
# if cell["start_column"] == cell["end_column"]:
#
# # 调整行高
# ws.row_dimensions[1].height = 20
# ws.row_dimensions[2].height = 20
# # 调整列宽
# ws.column_dimensions['A'].width = 40.0
for dan in table["cells"]:
# for dan in cells:
print(dan["start_row"])
print(dan["content"])
print(type(dan["start_row"]))
try:
if dan["start_row"] == dan["end_row"] and dan["start_column"] == dan["end_column"]:
last_column.append(dan["end_column"])
last_row.append(dan["end_row"])
ws.cell(row=dan["start_row"], column=dan["start_column"]).value = dan["content"]
elif dan["start_row"] == dan["end_row"] and dan["start_column"] != dan["end_column"]:
font = ele_dict[dan["start_column"]] + str(dan["start_row"])
back = ele_dict[dan["end_column"]] + str(dan["start_row"])
last_column.append(dan["end_column"])
last_row.append(dan["end_row"])
ws.merge_cells('{}:{}'.format(font, back))
ws.cell(row=dan["start_row"], column=dan["start_column"]).value = dan["content"]
elif dan["start_row"] != dan["end_row"] and dan["start_column"] == dan["end_column"]:
font = ele_dict[dan["start_column"]] + str(dan["start_row"])
back = ele_dict[dan["end_column"]] + str(dan["end_row"])
ws.merge_cells('{}:{}'.format(font, back))
last_column.append(dan["end_column"])
last_row.append(dan["end_row"])
ws.cell(row=dan["start_row"], column=dan["start_column"]).value = dan["content"]
else:
font = ele_dict[dan["sta