本文写于2021年2月11日,鼠年除夕夜,祝所有看到本文的朋友们身体健康,万事如意!
之前已经搞定了厦门房地产网签备案的图片下载,接下来就是识别图片的内容。
关于图片识别,网上有大量使用pytesseract进行识别的文章,但实际使用之后,发现它对中文的识别效果并不好,无奈之下另寻他途,发现百度的OCR还不错,每天有5000次的免费额度,对于普通个人用户来说已经足够了。关于如何使用百度OCR,可参见这篇文章:https://zoutao.blog.csdn.net/article/details/86705491
以下是识别的具体内容,本文会持续更新至实现作者的全部意图。
第一步:识别图片内容,先写入csv文件,再转换为xlsx文件
# python 3.5
# Baidu OCR usage (via aip.AipOcr); this script does not call tesseract
from aip import AipOcr
import os
import pandas as pd
from openpyxl import load_workbook
""" API """
# Baidu OCR API credentials.
# Each value can be overridden through an environment variable so the keys
# do not have to live in the source; the literals remain as fallbacks so
# existing runs behave exactly as before.
# NOTE(review): these keys are published in plain text -- consider rotating
# them and removing the fallbacks.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', '23657473')
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', 'WG43q2kD6vDUAjkGAse3Ei6y')
SECRET_KEY = os.environ.get('BAIDU_OCR_SECRET_KEY', 'IMATPqqUmSrmYvMVrwEP1siXjUvHqf44')
# Build the module-level AipOcr client used for the OCR requests.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
""" 读取图片 """
def get_file_content(filePath):
    """Return the raw bytes of the file at ``filePath``.

    The file is opened in binary mode, so the content is returned exactly
    as stored on disk, and the handle is closed before returning.
    """
    with open(filePath, 'rb') as fp:
        return fp.read()
def img_to_str(image_path):
    """Run OCR on one image and save the recognised text next to it as .xlsx.

    The image is sent to ``client.basicGeneral``. The recognised lines are
    joined with newlines, every ``,`` is removed and every ``:`` is turned
    into ``,`` so the text can be parsed as CSV. The text is written to a
    temporary ``.csv`` beside the image, converted to an ``.xlsx`` workbook
    (sheet ``data``) and the temporary file is deleted again.

    Args:
        image_path: Path of the image to recognise. The output files share
            its name with the extension replaced.

    Returns:
        The converted text (also printed to stdout).

    Raises:
        RuntimeError: If the response contains no ``words_result`` entry.
    """
    # Optional request parameters.
    options = {
        "language_type": "CHN_ENG",   # mixed Chinese / English
        "detect_direction": "true",   # detect text orientation
        "detect_language": "true",    # detect the language
        "probability": "false",       # no per-line confidence in the result
    }

    # Read the image once and send exactly the file that was asked for,
    # rather than whatever a module-level ``filePath`` happens to hold.
    image = get_file_content(image_path)
    result = client.basicGeneral(image, options)

    # Without 'words_result' there is nothing to format; fail with a message
    # that shows the response instead of hitting an unbound ``text`` later.
    if 'words_result' not in result:
        raise RuntimeError(
            'OCR response for {} has no words_result: {!r}'.format(
                image_path, result))

    oldtext = '\n'.join([w['words'] for w in result['words_result']])
    text = oldtext.replace(',', '').replace(':', ',')

    # Derive the output names from the argument instead of loop globals.
    base_path = os.path.splitext(image_path)[0]
    csv_path = base_path + '.csv'

    # An empty result cannot be parsed by pandas, so skip the spreadsheet.
    if text:
        with open(csv_path, 'w', encoding='utf-8') as fs:
            fs.write(text)
        try:
            frame = pd.read_csv(csv_path, encoding='utf-8')
            frame.to_excel(base_path + '.xlsx', sheet_name='data')
        finally:
            # Always drop the intermediate csv so the folder stays tidy,
            # even when the conversion fails.
            os.remove(csv_path)

    print(text)
    return text
if __name__ == '__main__':
    # Walk the download folder and run OCR on every .png file found in it.
    # ``root``, ``file`` and ``filePath`` stay module-level names on purpose,
    # so any code reading them as globals keeps working.
    for root, dirs, files in os.walk(r'C:\data\网签备份\2021-02-09'):
        for file in files:
            # Match the full '.png' suffix (dot included), not just the
            # last three characters of the name.
            if file.endswith('.png'):
                filePath = os.path.join(root, file)
                print(img_to_str(filePath))
# filePath = r'C:\data\网签备份\2021-02-09\clfjy.png'
# print("识别完成。")
import pandas as pd
# print('done')
# from openpyxl import load_workbook
# zhuzhai_number = int(load_workbook('C:\\baidu_ocr.xlsx')['data']['b3'].value)
# others_number = int(load_workbook('C:\\baidu_ocr.xlsx')['data']['b4'].value) - zhuzhai_number
# zhuzhai_area = int(load_workbook('C:\\baidu_ocr.xlsx')['data']['b5'].value)
# others_area = int(load_workbook('C:\\baidu_ocr.xlsx')['data']['b6'].value) - zhuzhai_area
#
# from datetime import date
# mainbook = load_workbook(r'C:\data\网签备份\test.xlsx')
# sht = mainbook['每天成交']
# x = sht.max_row
# y = sht.max_column
# # print(x,y)
# sht.cell(x+1,1).value = date.today()
# sht.cell(x+1,2).value = '厦门'
# sht.cell(x+1,3).value = '二手'
# sht.cell(x+1,4).value = '住宅'
# sht.cell(x+1,5).value = zhuzhai_number
# sht.cell(x+1,6).value = zhuzhai_area
# sht.cell(x+2,1).value = date.today()
# sht.cell(x+2,2).value = '厦门'
# sht.cell(x+2,3).value = '二手'
# sht.cell(x+2,4).value = '非住宅'
# sht.cell(x+2,5).value = others_number
# sht.cell(x+2,6).value = others_area
# mainbook.save(r'C:\data\网签备份\test.xlsx')
# mainbook.close()
#
#
#
第二步:调用xlwings,对取出的数字进行处理