程序功能:调用百度的OCR接口,对指定目录下的所有发票(jpg、png、pdf)进行识别,最后将识别结果保存至excel中。
相关资源:百度发票识别
代码比较简单,其中access_token为发票识别api的token,fp_path是发票存放的目录,这两个填好后直接运行程序即可,最后生成以“增值税发票”命名的excel文件。
# encoding:utf-8
import time
import requests
import base64
import os
import xlwt
# Keys copied out of each OCR response's ``words_result`` mapping.
# NOTE(review): 'AmountInFiguers' presumably mirrors the API's own key
# spelling -- confirm against the API reference before "correcting" it.
target_fields = [
    'InvoiceCode',
    'InvoiceNum',
    'InvoiceType',
    'InvoiceDate',
    'SellerName',
    'SellerRegisterNum',
    'PurchaserName',
    'TotalAmount',
    'TotalTax',
    'AmountInFiguers',
    'ServiceType',
]
# Section marker kept as the original bare string literal (it reads
# "VAT invoice recognition"); it is evaluated and discarded at import time.
'''
增值税发票识别
'''
# Endpoint of the VAT-invoice OCR service; the access token is appended to it
# as a query parameter when a request is built.
_VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
# Upper bound, in seconds, for a single OCR request so that one stalled
# connection cannot hang the whole batch.
_REQUEST_TIMEOUT = 60
# API error code for which the recognizers return False instead of an empty
# dict. NOTE(review): presumably the service's "could not recognize" code --
# confirm against the API's error-code table.
_UNRECOGNIZED_ERROR_CODE = 282103


def _recognize_invoice(pic, invoice_type=None):
    """Send one invoice file to the OCR endpoint and extract the target fields.

    Args:
        pic: Path of the invoice file. The name must end in ``jpg``, ``png``
            or ``pdf`` (compared case-insensitively).
        invoice_type: Optional value for the request's ``type`` form field
            (``'roll'`` for roll invoices). ``None`` omits the field.

    Returns:
        * ``False`` when the extension is unsupported or the service answers
          with error code ``_UNRECOGNIZED_ERROR_CODE``.
        * ``{}`` when the file cannot be read, the request raises, the HTTP
          status is not OK, or the service reports any other error code.
        * Otherwise a dict holding every key of ``target_fields``; fields the
          response does not contain are set to ``''``.
    """
    data = {}

    # Reject unsupported files before reading and base64-encoding them.
    lowered = pic.lower()
    if lowered.endswith(('jpg', 'png')):
        payload_key = 'image'
    elif lowered.endswith('pdf'):
        payload_key = 'pdf_file'
    else:
        print('文件格式有误')
        return False

    try:
        # Read the file in binary mode; ``with`` closes the handle even if
        # the read fails.
        with open(pic, 'rb') as f:
            encoded = base64.b64encode(f.read()).decode("utf8")

        params = {payload_key: encoded}
        if invoice_type is not None:
            params['type'] = invoice_type

        # Replace this with your own access_token.
        access_token = ''
        request_url = _VAT_INVOICE_URL + "?access_token=" + access_token
        headers = {'content-type': 'application/x-www-form-urlencoded'}
        response = requests.post(
            request_url, data=params, headers=headers, timeout=_REQUEST_TIMEOUT
        )

        # A requests Response is falsy for 4xx/5xx statuses.
        if not response:
            return data

        result = response.json()
        print(result)

        if isinstance(result, dict) and 'error_code' in result:
            if result['error_code'] == _UNRECOGNIZED_ERROR_CODE:
                return False
            # Any other API error: report "nothing recognized".
            return data

        words = result.get('words_result') if isinstance(result, dict) else None
        if not isinstance(words, dict):
            words = {}
        for field in target_fields:
            data[field] = words.get(field, '')
    except Exception as e:
        # Best effort: a failure on one file must not abort the batch.
        print(e)
    return data


def get_normal_context(pic):
    """Recognize ``pic`` as a regular VAT invoice.

    Returns a dict of the ``target_fields`` values on success, or a falsy
    value (``False`` or ``{}``) when the file could not be recognized.
    """
    return _recognize_invoice(pic)


def get_roll_context(pic):
    """Recognize ``pic`` as a roll-type VAT invoice (request ``type='roll'``).

    Returns a dict of the ``target_fields`` values on success, or a falsy
    value (``False`` or ``{}``) when the file could not be recognized.
    """
    return _recognize_invoice(pic, invoice_type='roll')
def pics(path):
    """Return the paths of all invoice files found directly inside ``path``.

    A file qualifies when its name ends in ``jpg``, ``png`` or ``pdf``,
    compared case-insensitively so that e.g. ``SCAN.JPG`` is picked up too.
    Sub-directories are not searched.

    Args:
        path: Directory to list.

    Returns:
        A list of ``os.path.join(path, name)`` strings, sorted by file name
        so the order is stable between runs.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``path`` cannot be listed.
    """
    print('正在生成图片路径')
    supported_suffixes = ('jpg', 'png', 'pdf')
    found = [
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        if filename.lower().endswith(supported_suffixes)
    ]
    print('图片路径生成成功!')
    return found
def datas(pics, delay=0.5):
    """Recognize every invoice in ``pics`` and collect the extracted fields.

    Each file is first tried with ``get_normal_context``; if that yields a
    falsy result it is retried with ``get_roll_context``. Files that neither
    call can recognize are reported on stdout and skipped.

    Args:
        pics: Iterable of invoice file paths.
        delay: Seconds to pause after each file. The pause now happens for
            every file, not only after a failure, so consecutive OCR calls
            are always spaced out.

    Returns:
        A list with one field dict per successfully recognized invoice, in
        input order.
    """
    results = []
    for p in pics:
        # ``or`` only evaluates the roll-invoice fallback when the regular
        # recognition returned a falsy value (False or an empty dict).
        data = get_normal_context(p) or get_roll_context(p)
        if data:
            results.append(data)
        else:
            print(f'{p} 该发票不能被识别')
        time.sleep(delay)
    return results
def save(datas):
    """Write the recognized invoice records to ``增值税发票.xls``.

    The workbook gets a single sheet with a header row followed by one row
    per record. The file is written to the current working directory.

    Args:
        datas: Sequence of dicts keyed by the OCR field names listed in
            ``columns`` below. A key missing from a record is written as an
            empty cell.
    """
    print('正在写入数据!')
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('增值税发票内容登记', cell_overwrite_ok=True)

    # (record key, column header) pairs in column order; keeping them side by
    # side prevents the headers and the values from drifting apart.
    columns = [
        ('InvoiceCode', '发票代码'),
        ('InvoiceNum', '发票号码'),
        ('InvoiceType', '发票类型'),
        ('InvoiceDate', '开票日期'),
        ('SellerName', '卖方名称'),
        ('SellerRegisterNum', '卖方纳税人识别号'),
        ('PurchaserName', '购买方名称'),
        ('TotalAmount', '合计金额'),
        ('TotalTax', '合计税额'),
        ('AmountInFiguers', '价税合计'),
        ('ServiceType', '消费类型'),
    ]

    for col, (_, header) in enumerate(columns):
        sheet.write(0, col, header)

    # Each cell is written exactly once; row 0 is the header, so records
    # start at row 1.
    for row, record in enumerate(datas, start=1):
        for col, (field, _) in enumerate(columns):
            sheet.write(row, col, record.get(field, ''))

    # Announce success only after the workbook has actually been saved.
    book.save('增值税发票.xls')
    print('数据写入成功!')
def main():
    """Run the pipeline: list the invoice files, recognize them, export them."""
    print('开始执行!!!')
    # Directory that holds your invoices; change it before running.
    fp_path = r''
    # Paths -> recognized records -> spreadsheet, chained without temporaries.
    save(datas(pics(fp_path)))
    print('执行结束!')


# Only run the pipeline when this file is executed as a script.
if __name__ == '__main__':
    main()