Python OCR 识别发票信息:使用 CnOcr 识别增值税发票

首先创建发票实体类文件 models/invoice.py(后面的识别脚本通过 from models.invoice import ... 导入它),
代码如下:

class InvoicePurchaser:
    """
    Purchaser (buyer) details printed on an invoice.

    Every field has an empty-string class-level default, so reading a field
    that was never assigned yields "" instead of raising AttributeError.
    The defaults live on the class, not on the instance: ``obj.__dict__``
    still contains only the fields that were explicitly assigned.
    """
    # Name
    name: str = ""
    # Taxpayer identification number
    identification_number: str = ""
    # Address and telephone
    address_telephone: str = ""
    # Bank of deposit and account number
    bank_account: str = ""


class InvoiceSeller:
    """
    Seller details printed on an invoice.

    Every field has an empty-string class-level default, so reading a field
    that was never assigned yields "" instead of raising AttributeError.
    The defaults live on the class, not on the instance: ``obj.__dict__``
    still contains only the fields that were explicitly assigned.
    """
    # Name
    name: str = ""
    # Taxpayer identification number
    identification_number: str = ""
    # Address and telephone
    address_telephone: str = ""
    # Bank of deposit and account number
    bank_account: str = ""

class Invoice:
    """
    Invoice details.

    All text fields are stored as strings. Each has an empty-string
    class-level default (the two nested party objects default to None), so
    reading a field that was never assigned does not raise AttributeError.
    The defaults live on the class, not on the instance: ``obj.__dict__``
    still contains only the fields that were explicitly assigned.
    """

    # Machine code
    machine_code: str = ""
    # Invoice code
    invoice_code: str = ""
    # Invoice number
    invoice_number: str = ""
    # Date of issue
    invoice_date: str = ""
    # Check code
    check_code: str = ""
    # Amount
    amount: str = ""
    # Tax amount
    tax: str = ""
    # Tax rate
    tax_rate: str = ""
    # Total of amount and tax
    total_amount: str = ""
    # Payee
    payee: str = ""
    # Reviewer
    review: str = ""
    # Drawer (person who issued the invoice)
    drawer: str = ""
    # Purchaser details; the annotation is quoted so it is not evaluated when
    # the class body runs.
    invoicePurchaser: "InvoicePurchaser | None" = None
    # Seller details; quoted for the same reason.
    invoiceSeller: "InvoiceSeller | None" = None

通过CnOcr识别发票信息

from cnocr import CnOcr
from models.invoice import Invoice, InvoicePurchaser, InvoiceSeller
import json


# Module-level recognizer used by ocr_invoice(); all CnOcr parameters are left at their defaults.
ocr = CnOcr()


# (label, Invoice attribute) pairs. They are tried in this order and the first
# label contained in a recognised text line wins.
_INVOICE_LABELS = (
    ('机器编号:', 'machine_code'),
    ('发票代码:', 'invoice_code'),
    ('发票号码:', 'invoice_number'),
    ('开票日期:', 'invoice_date'),
    ('校验码:', 'check_code'),
    ('收款人:', 'payee'),
    ('复核:', 'review'),
    ('开票人:', 'drawer'),
    ('(小写)¥', 'total_amount'),
)

# (label, attribute) pairs for fields that exist on both InvoicePurchaser and
# InvoiceSeller. They are only tried when no label in _INVOICE_LABELS matched.
_PARTY_LABELS = (
    ('称:', 'name'),
    ('纳税人识别号:', 'identification_number'),
    ('地址、电话:', 'address_telephone'),
    ('开户行及账号:', 'bank_account'),
)


def _match_label(text, labels):
    """Return (attribute, value) for the first label found in text, else None."""
    for label, attr in labels:
        if label in text:
            # Take everything after the label itself, so a value that contains
            # ':' (or a second '¥') is kept whole rather than discarded.
            return attr, text.partition(label)[2]
    return None


def ocr_invoice(img_path) -> Invoice:
    """
    Run OCR on an invoice image and collect the labelled fields it contains.

    Each item returned by ``ocr.ocr(img_path)`` is read through its ``'text'``
    entry and matched against the known labels. The text following a label
    becomes the field value, stored as-is (no stripping or conversion).

    Purchaser and seller share the same labels. For each such label the first
    text line that contains it is assigned to the purchaser, the second to the
    seller, and any further ones are ignored.

    :param img_path: passed straight to ``ocr.ocr``; presumably a path to the
        invoice image - see the CnOcr documentation for accepted inputs.
    :return: an Invoice with ``invoicePurchaser`` and ``invoiceSeller`` always
        set; a field whose label was not found in any text line is left
        unassigned.
    """
    invoice = Invoice()
    purchaser = InvoicePurchaser()
    seller = InvoiceSeller()

    # How many text lines have matched each shared purchaser/seller label.
    occurrences = {attr: 0 for _, attr in _PARTY_LABELS}

    for item in ocr.ocr(img_path):
        text = item['text']

        matched = _match_label(text, _INVOICE_LABELS)
        if matched is not None:
            attr, value = matched
            setattr(invoice, attr, value)
            continue

        matched = _match_label(text, _PARTY_LABELS)
        if matched is None:
            continue

        attr, value = matched
        occurrences[attr] += 1
        if occurrences[attr] == 1:
            setattr(purchaser, attr, value)
        elif occurrences[attr] == 2:
            setattr(seller, attr, value)

    invoice.invoicePurchaser = purchaser
    invoice.invoiceSeller = seller
    return invoice


if __name__ == "__main__":
    # Recognise the sample invoice and print it as JSON. Objects json cannot
    # serialise natively are dumped through their instance __dict__, and
    # non-ASCII characters are written unescaped.
    parsed = ocr_invoice('./images/fapiao.jpg')
    print(
        json.dumps(
            parsed,
            ensure_ascii=False,
            default=lambda o: o.__dict__,
        )
    )

你可能感兴趣的:(Python,python,ocr)