### 提取图片
使用的是 PyMuPDF 库（pip 包名为 pymupdf），在代码中以 fitz 模块名导入
```
pip install pymupdf -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
```
开头安装程序所用到的文件
```
链接:https://pan.baidu.com/s/18d5X9O2BF8dm3gZI8p0J2w
提取码:ob96
```
### 提取图片文字
感觉还是百度云识别的牛逼
#### 调用百度智能云接口
![image-20201020092037384](python%20pdf%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E5%B9%B6%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E4%B8%AD%E7%9A%84%E6%96%87%E5%AD%97.assets/image-20201020092037384.png)
每天可调用 50000 次，够用了
![File:Tesseract.gif](python%20pdf%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E5%B9%B6%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E4%B8%AD%E7%9A%84%E6%96%87%E5%AD%97.assets/Tesseract.gif)
#### tesseract图像识别
##### win安装使用教程
转载:
https://gitee.com/super__man/blog/blob/master/%E5%9B%BE%E4%B9%A6/python%20tesseract-ocr%20%E5%9B%BE%E6%96%87%E8%AF%86%E5%88%AB%20%EF%BC%88windows%E5%9F%BA%E7%A1%80%E7%8E%AF%E5%A2%83%E6%90%AD%E5%BB%BA%EF%BC%89%20-%20%E6%B8%85%E9%A3%8E%E8%BD%AF%E4%BB%B6%E6%B5%8B%E8%AF%95%20-%20%E5%8D%9A%E5%AE%A2%E5%9B%AD.pdf
##### 问题1
pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.
---
原因是 pytesseract 没有找到 tesseract 程序，需要修改 pytesseract.py 中的程序路径
![image-20201020114646686](python%20pdf%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E5%B9%B6%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E4%B8%AD%E7%9A%84%E6%96%87%E5%AD%97.assets/image-20201020114646686.png)
我使用 conda 管理虚拟环境，上图是我的包路径
打开 pytesseract.py 文件
![image-20201020114805171](python%20pdf%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E5%B9%B6%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E4%B8%AD%E7%9A%84%E6%96%87%E5%AD%97.assets/image-20201020114805171.png)
将路径修改为你电脑上 tesseract.exe 的绝对路径（也可以不改库文件，在自己的代码中设置 `pytesseract.pytesseract.tesseract_cmd = r'你的路径\tesseract.exe'`）
![image-20201020114915394](python%20pdf%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E5%B9%B6%E6%8F%90%E5%8F%96%E5%9B%BE%E7%89%87%E4%B8%AD%E7%9A%84%E6%96%87%E5%AD%97.assets/image-20201020114915394.png)
再次运行就不会报错了
### 代码
```python
import os
import time
import fitz
import glob
import pytesseract
import tesserocr
from PIL import Image
from aip import AipOcr
class PdfHandle():
    """Render PDF pages to PNG images and run OCR on those images.

    Three OCR paths are offered: two Baidu AIP endpoints (``tiqu`` and
    ``tiqu2``) and a local Tesseract call (``my_tesseract``).  Recognised
    text is printed to stdout; nothing is returned.
    """

    def __init__(self, app_id='', api_key='',
                 secret_key='9HddR7LxTpEe3zhRTTU41DwbSKEOkQN0',
                 img_path='./pdf/'):
        """Store the Baidu AIP credentials and the image directory.

        All parameters are optional so ``PdfHandle()`` keeps working.

        Args:
            app_id: Baidu AIP application id.
            api_key: Baidu AIP API key.
            secret_key: Baidu AIP secret key.
            img_path: Directory page images are written to and read from.
        """
        # NOTE(review): a real-looking secret is hard-coded as the default.
        # It is kept only for backward compatibility; rotate it and pass the
        # credentials in explicitly instead of committing them.
        self.APP_ID = app_id
        self.API_KEY = api_key
        self.SECRET_KEY = secret_key
        self.img_path = img_path
        # List of (path, file name) tuples filled in by traverse().
        self.imgs_list = []

    @staticmethod
    def _first_attr(obj, *names):
        """Return the first attribute of ``obj`` found among ``names``.

        Raises:
            AttributeError: If ``obj`` has none of the attributes.
        """
        for name in names:
            attr = getattr(obj, name, None)
            if attr is not None:
                return attr
        raise AttributeError(
            '{!r} has none of the attributes: {}'.format(obj, ', '.join(names)))

    @staticmethod
    def _print_words(result):
        """Print every recognised line of an OCR response.

        Raises:
            KeyError: If ``result`` has no ``'words_result'`` entry.
        """
        for entry in result['words_result']:
            print(entry['words'])

    def _collect_images(self):
        """Rescan ``img_path`` and return a fresh ``imgs_list``.

        The list is reset first so repeated OCR runs do not process the
        same image several times.
        """
        self.imgs_list = []
        self.traverse(self.img_path)
        return self.imgs_list

    def fun1(self, filename, zoom=1000, rotate=0):
        """Render each page of a PDF to ``<img_path>/<page index>.png``.

        Args:
            filename: Glob pattern; the first matching file is opened.
            zoom: Scale in percent applied on both axes (1000 means 10x).
            rotate: Rotation in degrees applied to the render matrix.

        Raises:
            IndexError: If the pattern matches no file (same exception type
                as before, now with an explanatory message).
        """
        matches = glob.glob(filename)
        if not matches:
            raise IndexError('no file matches pattern: {}'.format(filename))
        if self.img_path:
            # The output directory may not exist on a fresh checkout.
            os.makedirs(self.img_path, exist_ok=True)

        scale = zoom / 100.0
        doc = fitz.open(matches[0])
        try:
            matrix = fitz.Matrix(scale, scale)
            if rotate:
                # Prefer the snake_case PyMuPDF names and fall back to the
                # legacy camelCase ones this code was first written against.
                matrix = self._first_attr(matrix, 'prerotate', 'preRotate')(rotate)
            for pg in range(len(doc)):
                page = doc[pg]
                render = self._first_attr(page, 'get_pixmap', 'getPixmap')
                pixmap = render(matrix=matrix, alpha=False)
                save = self._first_attr(pixmap, 'save', 'writePNG')
                save(os.path.join(self.img_path, '{}.png'.format(pg)))
        finally:
            # Release the document even if rendering a page fails.
            doc.close()

    def tiqu(self):
        """OCR the first image under ``img_path`` with ``basicAccurate``.

        Prints each recognised line to stdout.
        """
        images = self._collect_images()
        client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
        options = {'detect_direction': 'true', 'probability': 'true'}
        # Only the first image found is processed.
        for file_path, _name in images[0:1]:
            result = client.basicAccurate(self.get_file_content(file_path), options)
            self._print_words(result)

    def tiqu2(self, max_retries=None, retry_delay=2):
        """OCR every image under ``img_path`` with ``basicGeneral``.

        For each image this prints its path, the recognised lines and a
        ``=========`` separator.

        Args:
            max_retries: Maximum number of retries per image after a failed
                request.  ``None`` (the default) retries indefinitely.
            retry_delay: Seconds to wait between attempts.

        Raises:
            Exception: The last request error, once ``max_retries`` is
                exhausted.
        """
        images = self._collect_images()
        client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
        options = {
            "language_type": "CHN_ENG",
            "detect_direction": "true",
            "detect_language": "true",
            "probability": "true",
        }
        for file_path, _name in images:
            print(file_path)
            # Read the file outside the retry loop: an unreadable file is
            # not a transient error and must not be retried forever.
            image_bytes = self.get_file_content(file_path)
            failures = 0
            while True:
                try:
                    result = client.basicGeneral(image_bytes, options)
                    break
                except Exception:
                    failures += 1
                    if max_retries is not None and failures > max_retries:
                        raise
                    time.sleep(retry_delay)
            self._print_words(result)
            print('=========')

    def get_file_content(self, file_path):
        """Return the raw bytes of the file at ``file_path``."""
        with open(file_path, 'rb') as f:
            return f.read()

    def traverse(self, d):
        """Recursively append every ``.png`` under ``d`` to ``imgs_list``.

        Each entry is a ``(path, file name)`` tuple, where ``path`` is ``d``
        joined with the relative location of the file.  Entries are appended
        to the existing list; it is not cleared here because the method
        calls itself for sub-directories.
        """
        for item in os.listdir(d):
            path = os.path.join(d, item)
            if os.path.isdir(path):
                self.traverse(path)
            elif item.endswith('.png'):
                self.imgs_list.append((path, item))

    def my_tesseract(self, image_path=r'D:\bfy\my_code\task_3\pdf\6.png',
                     lang='chi_sim'):
        """Print the text Tesseract recognises in one image.

        Args:
            image_path: Image to read; defaults to the original author's
                sample file, so pass your own path.
            lang: Tesseract language code.
        """
        # The context manager closes the image file once OCR is done.
        with Image.open(image_path) as img:
            print(pytesseract.image_to_string(img, lang=lang))
if __name__ == '__main__':
    # Script entry point: OCR every extracted page image via Baidu's general
    # endpoint. Call handler.my_tesseract() instead to use local Tesseract.
    handler = PdfHandle()
    handler.tiqu2()
```
一键复制
编辑
Web IDE
原始数据
按行查看
历史