百度文字识别(OCR)接口申请网址:https://console.bce.baidu.com
登录控制台后创建文字识别应用,在应用列表中即可查看调用 API 时所需的 APP_ID、API_KEY、SECRET_KEY
pip3 install PyPDF2
pip3 install baidu-aip
pip3 install pdfkit
pip3 install pymupdf
wkhtmltopdf 下载网址:https://wkhtmltopdf.org/downloads.html
记下安装目录下 bin/wkhtmltopdf.exe 的完整路径,程序中的 path_wk 参数需要填写该路径
from PyPDF2 import PdfFileReader, PdfFileWriter
from aip import AipOcr
import pdfkit
import fitz
import os
# Directory that holds the source PDF. A raw string keeps the backslash
# literal instead of depending on '\p' being an unrecognised escape sequence
# (which newer Python versions warn about).
pdfpath = r'D:\pdf3'
# File name of the PDF to process, located inside ``pdfpath``.
pdfname = '水浒传.pdf'
# Full path of the wkhtmltopdf executable handed to pdfkit below.
path_wk = r'D:/Procedure/wkhtmltopdf/bin/wkhtmltopdf.exe'
# Credentials passed to AipOcr; replace with the values of your own application.
APP_ID = '1234567'
API_KEY = 'abcdefg'
SECRET_KEY = 'qwertyuiop'
# Processing code starts below ------------------------------------------------
pdfkit_config = pdfkit.configuration(wkhtmltopdf=path_wk)
pdfkit_options = {'encoding': 'UTF-8', }
# 将每页pdf转为png格式图片
def pdf_image():
    """Render every page of the source PDF to a PNG file.

    Opens ``pdfpath + os.sep + pdfname`` and writes one image per page into
    ``image_path``, named ``<pdfname without its last 4 chars>_<NNN>.png``
    where ``NNN`` is the 1-based page number zero-padded to three digits.

    Returns:
        range: ``range(page count)``, i.e. the zero-based page indices.
    """
    pdf = fitz.open(pdfpath + os.sep + pdfname)
    try:
        page_range = range(pdf.pageCount)
        # Must be a Matrix object: a trailing comma here would turn it into a
        # one-element tuple, which is not what ``matrix=`` expects.
        trans = fitz.Matrix(1.0, 1.0).preRotate(0)
        for pg in page_range:
            # Page object for the current index.
            page = pdf[pg]
            # Rasterise the page without an alpha channel.
            pm = page.getPixmap(matrix=trans, alpha=False)
            # Save the image next to the others, numbered from 001.
            pm.writePNG(image_path + os.sep + pdfname[:-4] + '_' + '{:0>3d}.png'.format(pg + 1))
    finally:
        # Release the document even if rendering one of the pages fails.
        pdf.close()
    return page_range
def read_png_str(page_range):
    """Run OCR on the page images and return the recognised text per page.

    Args:
        page_range: iterable of zero-based page indices; the image for index
            ``i`` is read from ``image_path`` as ``<pdf stem>_<i+1:03d>.png``.

    Returns:
        list[str]: one string per page, the recognised ``words`` entries
        concatenated without a separator.

    Raises:
        KeyError: if a service response has no ``words_result`` entry; the
            full response is included in the message for diagnosis.
    """
    def get_file_content(filePath):
        # Read a local image file as raw bytes.
        with open(filePath, 'rb') as fp:
            return fp.read()

    # Load every image first so a missing file is reported before any
    # request is sent to the OCR service.
    image_list = [
        get_file_content(image_path + os.sep + r'{}_{}.png'.format(pdfname[:-4], '%03d' % (page_num + 1)))
        for page_num in page_range
    ]
    # Create the OCR client.
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    options = {
        "language_type": "CHN_ENG",
        "detect_direction": "false",
        "detect_language": "false",
        "probability": "false",
    }
    all_pngstr = []
    total = len(image_list)
    for index, image in enumerate(image_list, start=1):
        # Progress is reported 1-based so the first image reads "1 of N".
        print('正在调用百度接口:第{}个,共{}个'.format(index, total))
        # Text recognition; the response is a dict.
        pngjson = client.basicGeneral(image, options)
        if 'words_result' not in pngjson:
            raise KeyError('words_result missing from OCR response: {}'.format(pngjson))
        all_pngstr.append(''.join(x['words'] for x in pngjson['words_result']))
    return all_pngstr
def str2pdf(page_range, all_pngstr):
    """Write the text of each page into its own PDF file.

    For every index in ``page_range`` the string ``all_pngstr[index]`` is
    converted with pdfkit and stored in ``disperse_pdfpath`` as
    ``<index + 1>.pdf``.
    """
    for page_num in page_range:
        number = page_num + 1
        print('正在将字符串写入PDF:第{}个,共{}个'.format(number, len(page_range)))
        page_text = all_pngstr[page_num]
        target = disperse_pdfpath + os.sep + '%s.pdf' % (str(number))
        pdfkit.from_string(
            page_text,
            target,
            configuration=pdfkit_config,
            options=pdfkit_options,
        )
def pdf_merge(page_range):
    """Merge the single-page PDFs into one document.

    Args:
        page_range: iterable of zero-based page indices; page ``i`` is read
            from ``disperse_pdfpath`` as ``<i + 1>.pdf``.

    Returns:
        str: path of the merged file, ``pdfpath/new_<pdfname>``.
    """
    pdf_output = PdfFileWriter()
    # Input files are kept open until the merged document has been written
    # and are then closed together.
    opened = []
    try:
        for page_num in page_range:
            print('正在合并单页:第{}个,共{}个'.format((page_num + 1), len(page_range)))
            handle = open(disperse_pdfpath + os.sep + '%s.pdf' % (str(page_num + 1)), 'rb')
            opened.append(handle)
            pdf_input = PdfFileReader(handle)
            # Each intermediate file holds exactly the page we want at index 0.
            page = pdf_input.getPage(0)
            pdf_output.addPage(page)
        newPdfPath = pdfpath + os.sep + 'new_{}'.format(pdfname)
        with open(newPdfPath, 'wb') as merged:
            pdf_output.write(merged)
    finally:
        for handle in opened:
            handle.close()
    return newPdfPath
# Folder for the rendered page images; exist_ok avoids the separate
# check-then-create step and tolerates the folder already being there.
image_path = pdfpath + os.sep + "image"
os.makedirs(image_path, exist_ok=True)
# Folder for the intermediate single-page PDFs.
disperse_pdfpath = pdfpath + os.sep + "pdf"
os.makedirs(disperse_pdfpath, exist_ok=True)
# Pipeline: PDF pages -> PNG images -> OCR text -> single-page PDFs -> merged PDF.
range_count = pdf_image()
all_th = read_png_str(range_count)
str2pdf(range_count, all_th)
pdf_merge(range_count)
pip3 install python-docx
from docx import Document
from aip import AipOcr
import pdfkit
import fitz
import os
# Directory that holds the source PDF. A raw string keeps the backslash
# literal instead of depending on '\p' being an unrecognised escape sequence
# (which newer Python versions warn about).
pdfpath = r'D:\pdf'
# File name of the PDF to process, located inside ``pdfpath``.
pdfname = '水浒传.pdf'
# Full path of the wkhtmltopdf executable handed to pdfkit below.
path_wk = r'D:/Procedure/wkhtmltopdf/bin/wkhtmltopdf.exe'
# Credentials passed to AipOcr; replace with the values of your own application.
APP_ID = '123456789'
API_KEY = 'abcdefg'
SECRET_KEY = 'qwertyuiop'
# ---------------------------------------------------------------------------
pdfkit_config = pdfkit.configuration(wkhtmltopdf=path_wk)
pdfkit_options = {'encoding': 'UTF-8', }
# 将每页pdf转为png格式图片
def pdf_image():
    """Export each page of the configured PDF as a PNG image.

    The document at ``pdfpath + os.sep + pdfname`` is opened with fitz and
    every page is saved into ``image_path`` as
    ``<pdfname without its last 4 chars>_<NNN>.png`` (``NNN`` is the 1-based
    page number, zero-padded to three digits).

    Returns:
        range: ``range(page count)`` of the document.
    """
    source = pdfpath + os.sep + pdfname
    pdf = fitz.open(source)
    for index in range(0, pdf.pageCount):
        current_page = pdf[index]
        # Scale factors of 1.0 combined with a 0-degree pre-rotation.
        transform = fitz.Matrix(1.0, 1.0).preRotate(0)
        # Rasterise the page without an alpha channel.
        pixmap = current_page.getPixmap(matrix=transform, alpha=False)
        png_name = pdfname[:-4] + '_' + '{:0>3d}.png'.format(index + 1)
        pixmap.writePNG(image_path + os.sep + png_name)
    page_range = range(pdf.pageCount)
    pdf.close()
    return page_range
# 将图片中的文字转换为字符串
def read_png_str(page_range):
    """Run OCR on the page images and return the recognised text per page.

    Args:
        page_range: iterable of zero-based page indices; the image for index
            ``i`` is read from ``image_path`` as ``<pdf stem>_<i+1:03d>.png``.

    Returns:
        list[str]: one string per page, each recognised ``words`` entry
        followed by a newline.

    Raises:
        KeyError: if a service response has no ``words_result`` entry; the
            full response is included in the message for diagnosis.
    """
    def get_file_content(filePath):
        # Read a local image file as raw bytes.
        with open(filePath, 'rb') as fp:
            return fp.read()

    # Load every image first so a missing file is reported before any
    # request is sent to the OCR service. The raw bytes are deliberately not
    # printed: dumping whole PNG files floods the console.
    image_list = [
        get_file_content(image_path + os.sep + r'{}_{}.png'.format(pdfname[:-4], '%03d' % (page_num + 1)))
        for page_num in page_range
    ]
    # Create the OCR client.
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    # Optional request parameters.
    options = {
        "language_type": "CHN_ENG",
        "detect_direction": "false",
        "detect_language": "false",
        "probability": "false",
    }
    allPngStr = []
    total = len(image_list)
    for index, image in enumerate(image_list, start=1):
        # Progress is reported 1-based so the first image reads "1 of N".
        print('正在调用百度接口:第{}个,共{}个'.format(index, total))
        # General text recognition; the response is a dict.
        pngjson = client.basicGeneral(image, options)
        if 'words_result' not in pngjson:
            raise KeyError('words_result missing from OCR response: {}'.format(pngjson))
        allPngStr.append(''.join(x['words'] + '\n' for x in pngjson['words_result']))
    return allPngStr
def str2word(allPngStr):
    """Write the recognised page texts into a Word document.

    Args:
        allPngStr: iterable of strings; each one becomes a bulleted paragraph.

    The document is saved in ``pdfpath`` under the PDF's name with its last
    four characters replaced by ``.docx``.
    """
    document = Document()
    for page_text in allPngStr:
        # Refer to the style by its name ('List Bullet'); 'ListBullet' is the
        # style id, and looking styles up by id is deprecated in python-docx.
        document.add_paragraph(page_text, style='List Bullet')
    document.save(pdfpath + os.sep + pdfname[:-4] + '.docx')
    print('处理完成')
# Folder for the rendered page images; exist_ok avoids the separate
# check-then-create step and tolerates the folder already being there.
image_path = pdfpath + os.sep + "image"
os.makedirs(image_path, exist_ok=True)
# Pipeline: PDF pages -> PNG images -> OCR text -> Word document.
range_count = pdf_image()
allPngStr = read_png_str(range_count)
str2word(allPngStr)
pip3 install pdfminer3k
pip3 install python-docx
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from docx import Document
import warnings
import os
# Path of the PDF whose text layer is extracted.
filePath = 'D:/pdf/水浒传.pdf'
# pdf2word() passes this value to open(..., 'rb'), which accepts a path, so no
# read/write OS-level descriptor has to be created at import time.
file_name = filePath
# Word document that pdf2word() fills and saves.
document = Document()
# Silence all warnings for the rest of the run.
warnings.filterwarnings("ignore")
def pdf2word():
    """Copy the text layer of the PDF into the module-level Word document.

    Every layout object that offers ``get_text`` is added to ``document`` as
    a bulleted paragraph, with non-breaking spaces replaced by plain spaces.
    The result is saved as ``filePath`` with its last four characters
    replaced by ``.docx``.
    """
    # The context manager closes the PDF once all pages have been processed.
    with open(file_name, 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        # Parser and document need a reference to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only objects exposing get_text() carry text.
                if hasattr(out, "get_text"):
                    content = out.get_text().replace(u'\xa0', u' ')
                    # Refer to the style by its name ('List Bullet');
                    # 'ListBullet' is the style id, and lookup by id is
                    # deprecated in python-docx.
                    document.add_paragraph(content, style='List Bullet')
    document.save(filePath[:-4] + '.docx')
    print('处理完成')
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    pdf2word()
参考博客:https://blog.csdn.net/dianepure/article/details/88568761