使用 Python 读取 PDF 发票内容,在窗口中显示并保存到 Excel 文件中

需要编写两个文件:ReadPdf.py 和 QTShow.py。

ReadPdf.py

1. 使用正则表达式(re 模块)定义要提取的字段(目前只读取下面这 8 个字段;开户行及账号字段在测试中提取出现问题,暂未包含):

 # Maps each invoice field name to the regular expression used to locate it
 # in the text extracted from the PDF.
 self.template_fields = {
            # A run of CJK characters followed by the fixed invoice-type suffix.
            "发票类型": r'[\u4e00-\u9fa5]+电子普通发票',
            # The label followed by everything up to the last digit on the line
            # (greedy `.*`, which does not cross a newline by default).
            "发票代码": r'发票代码(.*\d+)',
            "发票号码": r'发票号码(.*\d+)',
            # The label followed by the rest of the line.
            "开票日期": r'开票日期(.*)',
            # Label characters optionally separated by whitespace, a colon, then
            # a run of CJK characters.
            # NOTE(review): the class `[::]` lists the same colon twice; one of
            # them was presumably meant to be the fullwidth colon -- confirm.
            "名称": r'名\s*称\s*[::]\s*([\u4e00-\u9fa5]+)',
            # The label, a colon, then a run of ASCII letters and digits.
            "纳税人识别号": r'纳税人识别号\s*[::]\s*([a-zA-Z0-9]+)',
            # The label followed by the rest of the line up to the last digit
            # or dot.
            "金额": r'小写.*(.*[0-9.]+)',
            # Same shape as the name pattern above, for the payee label.
            "收款人": r'收\s*款\s*人\s*[::]\s*([\u4e00-\u9fa5]+)',
        }

2.对提取到的字段进行处理:

 def extract_fields(self):
        """Extract every configured field from the PDF text.

        Runs each pattern in ``self.template_fields`` against the text
        returned by ``self.extract_text()`` and trims the label part off
        each match.

        Returns:
            A dict with the same keys as ``self.template_fields``.
        """
        pdf_text = self.extract_text()
        # Search the PDF text with each field's regular expression.
        matches = {}
        for field, regex in self.template_fields.items():
            matches[field] = self.re_text(re.compile(regex), pdf_text)

        # Keep only the value part of each match.
        # NOTE(review): re_text (see the full listing) returns None when the
        # pattern does not match; the `in` tests below would then raise
        # TypeError -- confirm every field is always present in the input.
        for k, v in matches.items():
            if ":" in v:
                # "label:value" -> the text between the first and second colon.
                new_value = v.split(":")[1].strip()
                matches[k] = new_value
            elif "¥" in v:
                # Amount: the text following the yen sign. This branch and the
                # next one test two different Unicode yen-sign variants.
                new_value = v.split("¥")[1].strip()
                matches[k] = new_value
            elif "¥" in v:
                new_value = v.split("¥")[1].strip()
                matches[k] = new_value
            else:
                # No separator found: the matched text is kept unchanged.
                matches[k] = matches[k]
        return matches

3. ReadPdf.py 的完整代码如下:

import re
import pandas as pd
from pdfminer.high_level import extract_text
import pdfplumber
import os
import xlwt
from openpyxl import Workbook
from openpyxl.utils import get_column_letter


class PDFExtractor:
    """Extract a fixed set of invoice fields from the first page of a PDF.

    Each entry of ``template_fields`` maps a field name to the regular
    expression used to find it in the page text. ``extract_fields`` runs
    every pattern and returns the cleaned-up values.
    """

    # Translation table used by re_block: drop ASCII and ideographic spaces,
    # drop ASCII and fullwidth right parentheses, and fold the fullwidth
    # colon into the ASCII one so later splitting only has to handle ":".
    _CLEANUP_TABLE = {
        ord(" "): None,
        0x3000: None,
        ord(")"): None,
        0xFF09: None,
        0xFF1A: ":",
    }

    # Separators tried, in order, to cut the label off a matched text:
    # colon, fullwidth yen sign, halfwidth yen sign.
    _VALUE_SEPARATORS = (":", "\uffe5", "\xa5")

    def __init__(self, filename):
        """Remember the PDF path and set up the field patterns.

        Args:
            filename: Path of the PDF invoice to read.
        """
        self.filename = filename
        # Fields to extract and the regular expression that locates each one.
        # The colon classes accept both the ASCII and the fullwidth colon.
        self.template_fields = {
            "发票类型": r'[\u4e00-\u9fa5]+电子普通发票',
            "发票代码": r'发票代码(.*\d+)',
            "发票号码": r'发票号码(.*\d+)',
            "开票日期": r'开票日期(.*)',
            "名称": r'名\s*称\s*[:\uff1a]\s*([\u4e00-\u9fa5]+)',
            "纳税人识别号": r'纳税人识别号\s*[:\uff1a]\s*([a-zA-Z0-9]+)',
            "金额": r'小写.*(.*[0-9.]+)',
            "收款人": r'收\s*款\s*人\s*[:\uff1a]\s*([\u4e00-\u9fa5]+)',
        }

    def extract_text(self):
        """Return the text of the first page, or "" when there is none."""
        with pdfplumber.open(self.filename) as pdf:
            if not pdf.pages:
                return ""
            # extract_text() may yield nothing for a page without a text layer.
            return pdf.pages[0].extract_text() or ""

    def re_text(self, bt, text):
        """Search ``text`` with ``bt`` and return the normalized full match.

        Args:
            bt: A compiled pattern or a pattern string.
            text: The text to search; ``None`` is treated as "no match".

        Returns:
            The whole matched text passed through ``re_block``, or ``None``
            when nothing matches.
        """
        if text is None:
            return None
        m1 = re.search(bt, text)
        if m1 is None:
            return None
        return self.re_block(m1[0])

    def re_block(self, text):
        """Strip spaces and right parentheses and normalize colons to ":"."""
        return text.translate(self._CLEANUP_TABLE)

    def extract_fields(self):
        """Extract every configured field from the PDF.

        Returns:
            A dict with the same keys as ``template_fields``. A field whose
            pattern does not match is reported as an empty string, so the
            result always contains strings only.
        """
        pdf_text = self.extract_text() or ""
        matches = {}
        for field, regex in self.template_fields.items():
            raw = self.re_text(re.compile(regex), pdf_text)
            matches[field] = self._field_value(raw)
        return matches

    @classmethod
    def _field_value(cls, raw):
        """Cut the label off a normalized match and return the bare value."""
        if raw is None:
            return ""
        for separator in cls._VALUE_SEPARATORS:
            if separator in raw:
                # Keep the piece right after the first separator.
                return raw.split(separator)[1].strip()
        return raw


QTShow.py

import sys
import os
from PyQt5.QtWidgets import QApplication, QMainWindow, QTableWidget, QTableWidgetItem
from ReadPdf import PDFExtractor
from collections import OrderedDict
import xlwt


class MainWindow(QMainWindow):
    """Window showing the fields extracted from every PDF invoice in a folder.

    On construction the window scans the folder, extracts the fields of each
    PDF with ``PDFExtractor``, fills a table widget with one row per file and
    writes the same rows to an ``.xls`` workbook.
    """

    def __init__(self, folder_path="/yourfilespath", output_filename="invoice.xls"):
        """Build the table and write the workbook.

        Args:
            folder_path: Folder that contains the PDF invoices. The default is
                a placeholder and must be replaced with a real folder.
            output_filename: Path of the ``.xls`` file to write.

        Raises:
            OSError: If ``folder_path`` cannot be listed.
        """
        super().__init__()

        # Collect the PDF files; the extension test ignores case and the list
        # is sorted so that row order is stable between runs.
        file_names = sorted(
            f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
        )
        self.filename = output_filename

        # Table widget: one row per file; 9 columns = file name + 8 fields
        # (adjusted below once the real header is known).
        self.table_widget = QTableWidget()
        self.table_widget.setRowCount(len(file_names))
        self.table_widget.setColumnCount(9)
        self.setCentralWidget(self.table_widget)

        wb = xlwt.Workbook(encoding='utf-8')
        sh = wb.add_sheet('sheet 1')
        # Widest cell seen so far in each column, in characters (plus padding).
        column_chars = {}

        for row_num, file_name in enumerate(file_names):
            file_path = os.path.join(folder_path, file_name)
            pdf_extractor = PDFExtractor(file_path)
            # Extracted fields, with the file name placed in front of them.
            self.matches = pdf_extractor.extract_fields()
            self.new_matches = OrderedDict([('文件名', file_name)])
            self.new_matches.update(self.matches)

            if row_num == 0:
                # The header is the same for every file, so set it only once.
                title = list(self.new_matches.keys())
                self.table_widget.setColumnCount(len(title))
                self.table_widget.setHorizontalHeaderLabels(title)
                for col_num, key in enumerate(title):
                    sh.write(0, col_num, key)

            for col_num, value in enumerate(self.new_matches.values()):
                # A field that was not found may be None; show it as empty.
                text = "" if value is None else str(value)
                sh.write(row_num + 1, col_num, text)
                column_chars[col_num] = max(
                    column_chars.get(col_num, 0), len(text) + 11
                )
                self.table_widget.setItem(row_num, col_num, QTableWidgetItem(text))

        # Size each sheet column for its widest cell; the width is stored in
        # 1/256 of a character and must fit in 16 bits.
        for col_num, chars in column_chars.items():
            sh.col(col_num).width = min(chars * 256, 65535)

        # Fit the on-screen columns once, after all cells are in place.
        self.table_widget.resizeColumnsToContents()
        wb.save(self.filename)

    def setWindowSize(self):
        """Return a size 1.5 times the table widget's current size.

        Despite the name, nothing is resized here; the caller applies the
        returned size.
        """
        table_size = self.table_widget.size()*1.5
        return table_size


if __name__ == "__main__":
    # Start the Qt application, size the window from its table, and hand the
    # event loop's return code back to the shell.
    application = QApplication(sys.argv)
    main_window = MainWindow()
    initial_size = main_window.setWindowSize()
    main_window.resize(initial_size)
    main_window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)

你可能感兴趣的:(python,python,开发语言)