PDF 转换为 JPG(Python 3.10 版本)

import os
import re
from PIL import Image
from pdf2image import convert_from_path, exceptions
from concurrent.futures import ProcessPoolExecutor
import tempfile

# Remove Pillow's pixel-count limit so very large page images can be opened.
Image.MAX_IMAGE_PIXELS = None

chunk_size = 10  # Pages per chunk. NOTE(review): not referenced by any code visible in this file.


def add_poppler_to_path(poppler_path=r"E:\software\dev\poppler\bin"):
    """Append the Poppler ``bin`` directory to this process's ``PATH``.

    Args:
        poppler_path: Directory to append. The default is the install
            location that used to be hard-coded in this function, so calling
            it without arguments behaves as before.

    The directory is added only when it is not already one of the ``PATH``
    entries, so repeated calls do not keep growing the variable. A missing or
    empty ``PATH`` is replaced by ``poppler_path`` alone instead of raising
    ``KeyError`` or producing a leading empty entry.
    """
    current = os.environ.get("PATH", "")
    if not current:
        os.environ["PATH"] = poppler_path
        return
    if poppler_path in current.split(os.pathsep):
        # Already present: nothing to do.
        return
    os.environ["PATH"] = current + os.pathsep + poppler_path


def convert_pdf_to_jpg(pdf_path):
    """Render every page of a PDF to JPEG files under ``./image/<name>/``.

    Args:
        pdf_path: Path of the PDF file to convert.

    The output folder is named after the PDF's base name with every character
    other than word characters, whitespace and hyphens removed. Each page is
    saved there as ``<base name>_page_<n>.jpg`` with ``n`` starting at 1.

    A ``PDFPageCountError`` from pdf2image, or a Pillow decompression-bomb
    error/warning, is reported on stdout and the file is skipped. Any other
    exception propagates to the caller.
    """
    # PDF file name without its extension; used for the log line and for the
    # per-page image file names.
    base_filename = os.path.splitext(os.path.basename(pdf_path))[0]
    # Strip special characters for the folder name only.
    folder_name = re.sub(r'[^\w\s-]', '', base_filename)
    print('正在处理文件:', base_filename)
    # One output folder per PDF.
    output_dir = f"./image/{folder_name}"
    os.makedirs(output_dir, exist_ok=True)

    try:
        with tempfile.TemporaryDirectory() as path:
            # The temporary directory is handed to pdf2image as its
            # output_folder for the intermediate page files.
            pages = convert_from_path(pdf_path, output_folder=path)
            try:
                for page_number, page in enumerate(pages, start=1):
                    image_path = os.path.join(
                        output_dir, f"{base_filename}_page_{page_number}.jpg"
                    )
                    page.save(image_path, "JPEG")
                    # Release the decoded pixels right away so memory use does
                    # not grow with the number of pages.
                    page.close()
                    print('正在保存的image信息:', image_path)
            finally:
                # Make sure no page keeps a file in the temporary directory
                # open when the directory is removed (matters after an error
                # part-way through the loop).
                for page in pages:
                    page.close()
    except exceptions.PDFPageCountError:
        print(f"无法处理文件: {pdf_path} ,原因:文件格式可能有误或损坏,跳过此文件。")
    except (Image.DecompressionBombError, Image.DecompressionBombWarning):
        # Pillow raises DecompressionBombError for oversized images; the
        # warning class only surfaces as an exception when warnings are
        # configured to be errors, so both are handled here.
        print(f"无法处理文件: {pdf_path} ,原因:图像尺寸过大,可能导致内存耗尽,跳过此文件。")


# Add the Poppler directory to PATH before any conversion is started.
add_poppler_to_path()

pdf_dir = './pdf/'  # Folder that holds the input PDF files

# Collect the PDF files (extension matched case-insensitively, in sorted
# order). A missing input folder yields an empty list instead of raising at
# import time, which would otherwise also happen in every worker process that
# re-imports this module.
pdf_files = (
    [
        os.path.join(pdf_dir, filename)
        for filename in sorted(os.listdir(pdf_dir))
        if filename.lower().endswith('.pdf')
    ]
    if os.path.isdir(pdf_dir)
    else []
)

if __name__ == '__main__':
    if not pdf_files:
        print(f"在 {pdf_dir} 中没有找到PDF文件。")
    else:
        # Use roughly a third of the CPUs, but always at least one worker:
        # os.cpu_count() may return None, and integer division gives 0 on
        # machines with fewer than three CPUs.
        worker_count = max(1, (os.cpu_count() or 1) // 3)
        # Convert the files in parallel.
        with ProcessPoolExecutor(max_workers=worker_count) as executor:
            submitted = [
                (pdf_path, executor.submit(convert_pdf_to_jpg, pdf_path))
                for pdf_path in pdf_files
            ]
            # Collect every result so a failure inside a worker is reported
            # instead of being silently dropped; one bad file does not stop
            # the remaining ones.
            for pdf_path, future in submitted:
                try:
                    future.result()
                except Exception as exc:
                    print(f"处理文件失败: {pdf_path} ,原因: {exc!r}")

备注:如果 PDF 超过 10 MB,可以在转换时按页数分块处理,以降低内存消耗、减少 out of memory 出现的频率(注意:上面的代码只定义了 chunk_size,并没有实际使用它,分块逻辑需要自行补充)。转换速度与文件大小和电脑配置有关。Poppler 的安装方法请自行搜索;使用时将输入/输出路径以及 Poppler 的路径替换为自己的环境即可。

你可能感兴趣的:(python,python,pdf,数学建模)