1.准备:
1)Python开发环境, 笔者用的是3.7; 工具用的是PyCharm
2)百度云后台创建文字识别的应用, 获取AppID, API key, Secret Key
3) 百度模块
pip install baidu-aip
4) 要保存成csv需要用到pandas模块
pip install pandas
2.上路:
1)初始化百度客户端, 用来发送图片信息
2)调用通用文字接口
这边我们断点查看一下返回来的数据:
3)保存成CSV
这里不展示对返回数据的进一步处理过程, 处理后的数据保存在全局变量Company_Data中。具体的处理过程可以参考本文源码, 或者《Python使用腾讯Ocr识别文字》一文中的方法。下面把Company_Data中的数据保存成CSV。
3.结果:
读取图片内容
输出结果:
4.完整代码:
# -*- coding: utf-8 -*-
import os

import pandas
from aip import AipOcr

# Baidu OCR application credentials.
# Each value may be overridden through the environment variable named below;
# the literals are the fallbacks used when the variable is not set.
# NOTE(review): real credentials should not be committed to source control.
APP_ID = os.environ.get("BAIDU_APP_ID", "16921559")
API_KEY = os.environ.get("BAIDU_API_KEY", "HfpMM13vAnDlTRWabQVDKnk8")
SECRET_KEY = os.environ.get("BAIDU_SECRET_KEY", "EQpdKCeICwfHLWazx0vsIpRqoRkVX6pM")

# File extensions (lower case, with the leading dot) treated as images.
IMG_EXT = ['.png', '.jpg', '.jpeg', '.bmp']

# One dict per processed image; filled by saveData() and written by save2csv().
Company_Data = []

# Shared OCR client used to send image bytes to the service.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
#1 get text content
def imageReader(file_path):
    """Send one image to the general-text OCR API and return its text lines.

    Args:
        file_path: Path of the image file; it is read as raw bytes.

    Returns:
        A list holding the ``'words'`` value of every entry in the
        response's ``'words_result'`` list, in response order.

    Raises:
        KeyError: If the response has no ``'words_result'`` key. The message
            contains the file path and the full response so the cause is
            visible (presumably an error payload from the service).
    """
    with open(file_path, 'rb') as image_file:
        content = image_file.read()

    # general text API
    api_result = client.basicGeneral(content)

    if 'words_result' not in api_result:
        raise KeyError(
            "no 'words_result' in OCR response for %s: %r"
            % (file_path, api_result)
        )

    # text content
    return [entry['words'] for entry in api_result['words_result']]
#2 parse the recognised text of one image and store it as a record
def _split_label(text):
    """Split *text* at its first ':' or '：' into ``(label, value)``.

    Returns None when the text contains neither separator.
    """
    cuts = [pos for pos in (text.find(':'), text.find('：')) if pos != -1]
    if not cuts:
        return None
    cut = min(cuts)
    return text[:cut], text[cut + 1:]


def saveData(file_path):
    """OCR one image and append its labelled fields to ``Company_Data``.

    Each recognised line of the form ``label:value`` is matched against a
    fixed list of labels; the first label found inside the text before the
    separator decides the key under which the value is stored. Lines without
    a separator, or whose label is not recognised, are skipped.

    Only the first separator splits the line, so values that themselves
    contain a colon (for example a URL) are kept whole.

    Args:
        file_path: Path of the file to process. Files whose extension
            (compared case-insensitively) is not in ``IMG_EXT`` are ignored.

    Returns:
        None. The resulting dict is printed and appended to ``Company_Data``.
    """
    # separate the file name and extension
    _, ext = os.path.splitext(file_path)
    if ext.lower() not in IMG_EXT:
        return

    print(" reading the following image %s" % file_path)
    result = imageReader(file_path)

    # Order matters: the first label contained in the item name wins.
    labels = (
        '展位号', '地址', '邮编', '电话', '传真', '联系人',
        '职务', '电邮', '网址', '业务性质', '产品类型',
    )
    # Phone and fax values get a leading '\t' so that Excel does not
    # auto-calculate the cell content when it opens the CSV.
    tab_prefixed = ('电话', '传真')

    data = {}
    for text in result:
        parts = _split_label(text)
        if parts is None:
            continue
        itemname, value = parts
        for label in labels:
            if label in itemname:
                data[label] = '\t' + value if label in tab_prefixed else value
                break

    print(" Saving the data of %s" % file_path)
    print(data)
    Company_Data.append(data)
#3 get all .jpg under the path
def each_path(dir_path):
    """Run ``saveData`` on every ``.jpg`` file directly inside *dir_path*.

    A file qualifies when its final extension is ``.jpg`` (compared
    case-insensitively), so a name such as ``photo.jpg.txt`` is skipped.
    Sub-directories are not descended into.

    Args:
        dir_path: Directory whose entries are listed.

    Returns:
        None.
    """
    # get file names of current directory
    for name in os.listdir(dir_path):
        if os.path.splitext(name)[1].lower() == '.jpg':
            # os.path.join uses the separator of the running platform
            saveData(os.path.join(dir_path, name))
def save2csv():
    """Append every record in ``Company_Data`` to ``test1.csv``.

    The records are loaded into an object-dtype DataFrame and written with
    every field quoted. The file is opened in append mode.

    NOTE(review): because the file is appended to and the header is written
    on every call, running the script repeatedly adds a new header row
    before each batch.

    Returns:
        None. A Unicode error raised while writing is reported on stdout
        and otherwise ignored.
    """
    import csv  # local import: only needed for the quoting constant

    file_name = 'test1.csv'
    save = pandas.DataFrame(data=Company_Data, dtype='object')
    try:
        # csv.QUOTE_ALL is the named form of the literal 1 used before.
        save.to_csv(file_name, quoting=csv.QUOTE_ALL, mode='a+')
    except UnicodeError:
        print("编码错误,该数据无法写到文件中,直接忽略该数据")
if __name__ == '__main__':
    import sys

    # Image directory: the first command-line argument when one is given,
    # otherwise the original hard-coded location.
    default_dir = r"F:\我的坚果云\其他\Study\Python\Demo\BaiduOcr\picture"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_dir

    each_path(file_path)
    save2csv()