import os
import re
from PIL import Image
from pdf2image import convert_from_path, exceptions
from concurrent.futures import ProcessPoolExecutor
import tempfile
# Lift Pillow's pixel-count limit so very large page renders are not rejected.
Image.MAX_IMAGE_PIXELS = None
chunk_size = 10 # Number of pages per chunk (not referenced by the code visible in this file).
def add_poppler_to_path():
poppler_path = r"E:\software\dev\poppler\bin"
os.environ["PATH"] += os.pathsep + poppler_path
def convert_pdf_to_jpg(pdf_path):
    """Render every page of a PDF to a JPEG file under ``./image/<name>/``.

    The output folder is named after the PDF's base name with every character
    other than word characters, whitespace and hyphens removed. The page files
    keep the unmodified base name and are numbered from 1
    (``<name>_page_<n>.jpg``).

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        None. A page-count error or a decompression-bomb error is reported on
        stdout and the file is skipped; other exceptions propagate.
    """
    # Base name of the PDF without its extension.
    raw_name = os.path.splitext(os.path.basename(pdf_path))[0]
    # Strip special characters for the folder name only.
    safe_name = re.sub(r'[^\w\s-]', '', raw_name)
    print('正在处理文件:', raw_name)

    # One output folder per PDF.
    output_dir = f"./image/{safe_name}"
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Intermediate renders go to a scratch folder that is removed when
        # the ``with`` block exits.
        with tempfile.TemporaryDirectory() as scratch_dir:
            pages = convert_from_path(pdf_path, output_folder=scratch_dir)
            for page_number, page in enumerate(pages, start=1):
                target = os.path.join(
                    output_dir, f"{raw_name}_page_{page_number}.jpg"
                )
                try:
                    page.save(target, "JPEG")
                finally:
                    # Release the page's file handle and pixel data as soon
                    # as it has been written, so memory does not accumulate
                    # across the whole document.
                    page.close()
                print('正在保存的image信息:', target)
    except exceptions.PDFPageCountError:
        print(f"无法处理文件: {pdf_path} ,原因:文件格式可能有误或损坏,跳过此文件。")
    except (Image.DecompressionBombError, Image.DecompressionBombWarning):
        # Pillow raises DecompressionBombError for oversized images; the
        # Warning class only surfaces as an exception when warnings are
        # escalated to errors, so both are handled here.
        print(f"无法处理文件: {pdf_path} ,原因:图像尺寸过大,可能导致内存耗尽,跳过此文件。")
# Make the poppler executables discoverable through PATH before converting.
add_poppler_to_path()
pdf_dir = './pdf/'  # Folder scanned for input PDF files.
# Match the extension case-insensitively so files named "*.PDF" are not
# skipped, and sort so the processing order is deterministic.
pdf_files = sorted(
    os.path.join(pdf_dir, filename)
    for filename in os.listdir(pdf_dir)
    if filename.lower().endswith('.pdf')
)
if __name__ == '__main__':
    # Convert the PDFs in parallel worker processes to speed things up.
    # os.cpu_count() may return None, and integer division by 3 yields 0 on
    # machines with fewer than 3 CPUs; ProcessPoolExecutor rejects
    # max_workers <= 0, so clamp to at least one worker.
    worker_count = max(1, (os.cpu_count() or 1) // 3)
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        pending = {
            path: executor.submit(convert_pdf_to_jpg, path)
            for path in pdf_files
        }
        # Inspect every future so a failure inside a worker is reported
        # instead of being dropped silently; the remaining files still run.
        for path, future in pending.items():
            error = future.exception()
            if error is not None:
                print(f"无法处理文件: {path} ,原因:{error!r}")
# 备注:如果 PDF 超过 10M,转换时将根据 PDF 页数进行分割,以降低内存消耗,减少 out of memory 出现的频率。
# 转换的快慢与文件大小、笔记本的配置相关。
# poppler 的安装请自行百度;使用时将文件输入、输出路径以及系统变量替换即可。