【PDF】使用python提取PDF里面的图片

为了收集论文里的网络框架图,下载了280多篇论文,一个一个打开来看不现实,所以使用python提取里面的图片,一目了然,就知道里面的框架内容了

pip install pymupdf 

网上有很多帖子,但是里面用的PyMuPDF都是老版本,很多方法名都改了,你现在照着用的话,会报错

报错:doc._getXrefLength和doc.getObjectString(i) 

所以要改方法名,新版本都不叫之前的名字了:doc._getXrefLength() 改为 doc.xref_length(),doc.getObjectString(i) 改为 doc.xref_object(i)

import fitz
import re
import os

# Raw strings keep backslashes literal, so each separator is written only once
# (doubling them inside r'...' puts two backslashes into the actual path).
file_path = r'D:\baidu\最最最\2105.13381.pdf'  # path of the PDF to process
dir_path = r'D:\baidu\最最最\图片'  # folder that receives the extracted images

def pdf2image1(path, pic_path):
    """Extract every embedded image of one PDF and save each as a PNG.

    Every xref object of the document is scanned; objects whose definition
    contains ``/Subtype /Image`` are turned into a pixmap and written to
    ``pic_path`` as ``img_<n>.png``, with ``n`` counting from 1.

    Args:
        path: Path of the PDF file to read.
        pic_path: Folder that receives the PNG files. It is created when it
            does not exist yet.
    """
    image_marker = re.compile(r"/Subtype(?= */Image)")
    if pic_path:
        # Saving into a missing folder would fail on the first image.
        os.makedirs(pic_path, exist_ok=True)
    count = 1
    # The context manager closes the document even if an image fails to save.
    with fitz.open(path) as pdf:
        # xref 0 is skipped, numbering of real objects starts at 1.
        for xref in range(1, pdf.xref_length()):
            if not image_marker.search(pdf.xref_object(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)
            colorspace = pix.colorspace
            # Anything that is neither gray nor RGB (e.g. CMYK) is converted
            # to RGB before saving. A pixmap without a colorspace is saved
            # unchanged.
            if colorspace is not None and colorspace.name not in (
                fitz.csGRAY.name,
                fitz.csRGB.name,
            ):
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save(os.path.join(pic_path, f"img_{count}.png"))
            count += 1
            pix = None  # drop the pixmap so its memory can be released

# Run the extraction for the single PDF named by file_path, writing into dir_path.
pdf2image1(path=file_path, pic_path=dir_path)

完整批处理代码:

import fitz
import re
import os
import tqdm

def pdf2image1(path, pic_path, j):
    """Extract every embedded image of one PDF and save each as a PNG.

    Every xref object of the document is scanned; objects whose definition
    contains ``/Subtype /Image`` are turned into a pixmap and written to
    ``pic_path`` as ``img_<j>_<n>.png``, with ``n`` counting from 1.

    Args:
        path: Path of the PDF file to read.
        pic_path: Folder that receives the PNG files. It is created when it
            does not exist yet.
        j: Index of the PDF within the batch; it is embedded in the file
            names so images of different PDFs do not overwrite each other.
    """
    image_marker = re.compile(r"/Subtype(?= */Image)")
    if pic_path:
        # Saving into a missing folder would fail on the first image.
        os.makedirs(pic_path, exist_ok=True)
    count = 1
    # The context manager closes the document even if an image fails to save,
    # which matters when hundreds of PDFs are processed in one run.
    with fitz.open(path) as pdf:
        # xref 0 is skipped, numbering of real objects starts at 1.
        for xref in range(1, pdf.xref_length()):
            if not image_marker.search(pdf.xref_object(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)
            colorspace = pix.colorspace
            # Anything that is neither gray nor RGB (e.g. CMYK) is converted
            # to RGB before saving. The None check avoids an AttributeError
            # for pixmaps that carry no colorspace; those are saved unchanged.
            if colorspace is not None and colorspace.name not in (
                fitz.csGRAY.name,
                fitz.csRGB.name,
            ):
                pix = fitz.Pixmap(fitz.csRGB, pix)
            new_name = f"img_{j}_{count}.png"
            print(pix)
            pix.save(os.path.join(pic_path, new_name))
            count += 1
            pix = None  # drop the pixmap so its memory can be released

# Raw strings keep backslashes literal, so each separator is written only once.
path = r'D:\baidu\最最最\新建文件夹 (3)'  # folder holding the PDFs to process
dir_path = r'D:\baidu\最最最\图片'  # folder that receives the extracted images

# Only PDF files are handed to the extractor; sorting keeps the j index of
# each file stable between runs.
fil = sorted(name for name in os.listdir(path) if name.lower().endswith('.pdf'))

# tqdm wraps the list itself (not the enumerate object) so it knows the total.
for j, name in enumerate(tqdm.tqdm(fil)):
    # Build the path from the current entry; a fixed file name here would make
    # every iteration extract the same PDF again.
    file_path = os.path.join(path, name)
    print(file_path)
    pdf2image1(file_path, dir_path, j)

你可能感兴趣的:(pdf)