编写两个文件ReadPdf.py和QTShow.py
ReadPdf.py
1. 使用正则表达式(re 模块)定义需要提取的字段(目前只读取以下 8 个字段;开户行及账户在测试中出现问题,暂未提取):
# Field name -> regular expression used to locate that field in the invoice
# text. The label separator is matched as either an ASCII colon or a
# full-width colon (written as the escape \uff1a so it cannot be silently
# folded into a second ASCII colon by an editor).
self.template_fields = {
    "发票类型": r'[\u4e00-\u9fa5]+电子普通发票',
    "发票代码": r'发票代码(.*\d+)',
    "发票号码": r'发票号码(.*\d+)',
    "开票日期": r'开票日期(.*)',
    "名称": r'名\s*称\s*[:\uff1a]\s*([\u4e00-\u9fa5]+)',
    "纳税人识别号": r'纳税人识别号\s*[:\uff1a]\s*([a-zA-Z0-9]+)',
    "金额": r'小写.*(.*[0-9.]+)',
    "收款人": r'收\s*款\s*人\s*[:\uff1a]\s*([\u4e00-\u9fa5]+)',
}
2.对提取到的字段进行处理:
def extract_fields(self):
    """Return a dict mapping every configured field name to its value.

    Each pattern in ``self.template_fields`` is searched in the text
    returned by ``self.extract_text()``. The label part of a hit is then
    removed: text after the first ``:`` is kept, otherwise text after a yen
    sign (half-width or full-width), otherwise the hit is kept unchanged.
    Fields whose pattern is not found map to an empty string.
    """
    # Search an empty string rather than None when no text was extracted.
    pdf_text = self.extract_text() or ""

    matches = {}
    for field, regex in self.template_fields.items():
        value = self.re_text(re.compile(regex), pdf_text)
        if value is None:
            # re_text found nothing; report the field as empty instead of
            # failing on the membership tests below.
            matches[field] = ""
        elif ":" in value:
            matches[field] = value.split(":")[1].strip()
        elif "\u00a5" in value:
            # Half-width yen sign.
            matches[field] = value.split("\u00a5")[1].strip()
        elif "\uffe5" in value:
            # Full-width yen sign.
            matches[field] = value.split("\uffe5")[1].strip()
        else:
            matches[field] = value
    return matches
3. ReadPdf.py 的完整代码如下:
import re
import pandas as pd
from pdfminer.high_level import extract_text
import pdfplumber
import os
import xlwt
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
class PDFExtractor:
    """Pull a fixed set of invoice fields out of the first page of a PDF.

    Every field is located with a regular expression from
    ``template_fields``; the matched text is cleaned up and its leading
    label is stripped so that only the value remains.
    """

    # Single-pass cleanup used by re_block: ASCII and ideographic spaces and
    # ASCII / full-width closing parentheses are removed, and the full-width
    # colon is mapped to an ASCII colon. Escapes are used so the full-width
    # characters cannot be silently folded into their ASCII look-alikes.
    _CLEANUP_TABLE = str.maketrans({
        " ": None,
        "\u3000": None,
        ")": None,
        "\uff09": None,
        "\uff1a": ":",
    })

    # Half-width and full-width yen signs that may precede the amount.
    _YEN_SIGNS = ("\u00a5", "\uffe5")

    def __init__(self, filename):
        """Remember the PDF path and set up the field patterns.

        Args:
            filename: Path of the PDF file to read.
        """
        self.filename = filename
        # Field name -> regular expression that locates the field. The label
        # separator may be an ASCII or a full-width colon.
        self.template_fields = {
            "发票类型": r'[\u4e00-\u9fa5]+电子普通发票',
            "发票代码": r'发票代码(.*\d+)',
            "发票号码": r'发票号码(.*\d+)',
            "开票日期": r'开票日期(.*)',
            "名称": r'名\s*称\s*[:\uff1a]\s*([\u4e00-\u9fa5]+)',
            "纳税人识别号": r'纳税人识别号\s*[:\uff1a]\s*([a-zA-Z0-9]+)',
            "金额": r'小写.*(.*[0-9.]+)',
            "收款人": r'收\s*款\s*人\s*[:\uff1a]\s*([\u4e00-\u9fa5]+)',
        }

    def extract_text(self):
        """Return the text of the first page, or "" when there is none."""
        with pdfplumber.open(self.filename) as pdf:
            if not pdf.pages:
                return ""
            # Normalise a missing text result to "" so callers can always
            # run a regex search on the return value.
            return pdf.pages[0].extract_text() or ""

    def re_text(self, bt, text):
        """Return the cleaned first match of pattern *bt* in *text*.

        The whole match (not just a capture group) is passed through
        ``re_block``. Returns None when the pattern does not match.
        """
        found = re.search(bt, text)
        if found is None:
            return None
        return self.re_block(found[0])

    def re_block(self, text):
        """Strip spaces and closing parentheses and normalise the colon."""
        return text.translate(self._CLEANUP_TABLE)

    def extract_fields(self):
        """Return a dict mapping every configured field name to its value.

        Fields whose pattern is not found map to an empty string, so every
        value in the result is a ``str``.
        """
        # Guard against extract_text() being overridden to return None.
        pdf_text = self.extract_text() or ""
        matches = {}
        for field, regex in self.template_fields.items():
            raw = self.re_text(re.compile(regex), pdf_text)
            matches[field] = self._strip_label(raw)
        return matches

    @classmethod
    def _strip_label(cls, value):
        """Drop the label in front of a matched value.

        Text after the first ``:`` is kept; failing that, text after a yen
        sign; otherwise the value is returned unchanged. None becomes "".
        """
        if value is None:
            return ""
        if ":" in value:
            return value.split(":")[1].strip()
        for sign in cls._YEN_SIGNS:
            if sign in value:
                return value.split(sign)[1].strip()
        return value
QTShow.py
import sys
import os
from PyQt5.QtWidgets import QApplication, QMainWindow, QTableWidget, QTableWidgetItem
from ReadPdf import PDFExtractor
from collections import OrderedDict
import xlwt
class MainWindow(QMainWindow):
    """Main window that lists the fields extracted from a folder of PDFs.

    On construction every ``.pdf`` file in *folder_path* is run through
    ``PDFExtractor``. Each file becomes one row of the table widget, and the
    same rows (plus a header row) are written to the ``invoice.xls``
    workbook in the current working directory.
    """

    # Upper bound applied to xlwt column widths (units of 1/256 character);
    # the value is the largest that fits an unsigned 16-bit field.
    _MAX_XLS_COL_WIDTH = 65535

    def __init__(self, folder_path="/yourfilespath"):
        """Build the table and the workbook from the PDFs in *folder_path*.

        Args:
            folder_path: Directory that is scanned (non-recursively) for
                PDF files. Defaults to the original placeholder path.
        """
        super().__init__()
        # Sorted so the row order does not depend on the order in which the
        # operating system lists the directory; the extension check ignores
        # case so "X.PDF" is picked up as well.
        file_names = sorted(
            f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
        )
        self.filename = "invoice.xls"

        # Table widget: one row per PDF file.
        self.table_widget = QTableWidget()
        self.table_widget.setRowCount(len(file_names))
        self.table_widget.setColumnCount(9)
        self.setCentralWidget(self.table_widget)

        wb = xlwt.Workbook(encoding='utf-8')
        sh = wb.add_sheet('sheet 1')
        # Column index -> length of the longest value seen in that column.
        widest = {}

        for row_num, file_name in enumerate(file_names):
            file_path = os.path.join(folder_path, file_name)
            self.matches = PDFExtractor(file_path).extract_fields()
            # The file name goes first, followed by the extracted fields.
            self.new_matches = OrderedDict([('文件名', file_name)])
            self.new_matches.update(self.matches)

            if row_num == 0:
                # The keys are the same for every file, so the header only
                # needs to be set up once.
                title = list(self.new_matches)
                self.table_widget.setColumnCount(len(title))
                self.table_widget.setHorizontalHeaderLabels(title)

            for col_num, (key, value) in enumerate(self.new_matches.items()):
                # A field that was not found may be None; show it as empty
                # instead of failing in QTableWidgetItem / len().
                text = "" if value is None else str(value)
                if row_num == 0:
                    sh.write(0, col_num, key)
                # Sheet row 0 holds the header, so data rows are shifted by 1.
                sh.write(row_num + 1, col_num, text)
                widest[col_num] = max(widest.get(col_num, 0), len(text))
                self.table_widget.setItem(
                    row_num, col_num, QTableWidgetItem(text)
                )

        # Size each sheet column for its longest value (not merely the last
        # row's), keeping the original "+ 11 characters" padding.
        for col_num, chars in widest.items():
            sh.col(col_num).width = min(
                (chars + 11) * 256, self._MAX_XLS_COL_WIDTH
            )

        # Fit the on-screen columns once, after every cell has been filled.
        self.table_widget.resizeColumnsToContents()
        wb.save(self.filename)

    def setWindowSize(self):
        """Return the table widget's current size scaled by 1.5.

        Despite its name this method does not resize anything; the caller
        is expected to pass the result to ``resize()``.
        """
        return self.table_widget.size() * 1.5
if __name__ == "__main__":
    # Script entry point: create the application object, build the main
    # window, size it from the table, show it and run the event loop.
    qt_app = QApplication(sys.argv)
    main_window = MainWindow()
    initial_size = main_window.setWindowSize()
    main_window.resize(initial_size)
    main_window.show()
    # Hand the event loop's return value to the interpreter as exit status.
    exit_code = qt_app.exec_()
    sys.exit(exit_code)