提取pdf中可能的图片

import re

# PDF file to scan, and the base name shared by every exported image
# (e.g. 'Ch05-2006.pdf' -> 'Ch05').
input_pdf = 'Ch05-2006.pdf'
output_base = input_pdf.replace('-2006','').split('.')[0]

# JPEG: from the start-of-image marker (FF D8) to the first end-of-image
# marker (FF D9) that is directly followed by a newline byte.  The newline
# is part of the match and is therefore kept in the exported bytes.
jpg_pattern = re.compile(rb'\xff\xd8.*?\xff\xd9\x0a',re.DOTALL)
# PNG: from the '\x89PNG' signature prefix to the first occurrence of
# AE 42 60 82 (the CRC that closes the IEND chunk).
png_pattern = re.compile(rb'\x89\x50\x4e\x47.*?\xae\x42\x60\x82',re.DOTALL)


def find_images(pdf):
    """Scan raw PDF bytes for embedded JPEG and PNG byte sequences.

    Args:
        pdf: The complete file content as ``bytes``.

    Returns:
        A ``(jpgs, pngs)`` tuple of lists; each item is the ``bytes`` of one
        non-overlapping match of ``jpg_pattern`` / ``png_pattern``.
    """
    return jpg_pattern.findall(pdf), png_pattern.findall(pdf)


def export_images(images, base, ext):
    """Write each image to ``'<base>-NNN.<ext>'`` (NNN is 1-based, zero-padded).

    Args:
        images: Iterable of ``bytes`` objects to write, in output order.
        base: File-name prefix (may include a directory).
        ext: File extension without the leading dot, e.g. ``'jpg'``.

    Returns:
        The list of file names written, in order.  Empty input writes
        nothing and returns an empty list.
    """
    written = []
    for number, image in enumerate(images, start=1):
        output_name = '{}-{}.{}'.format(base, str(number).zfill(3), ext)
        print('  Export {}'.format(output_name))
        with open(output_name, 'wb') as f:
            f.write(image)
        written.append(output_name)
    return written


def main(pdf_path=input_pdf, base=output_base):
    """Extract every JPEG/PNG candidate from ``pdf_path`` into separate files.

    Args:
        pdf_path: Path of the PDF to read; defaults to ``input_pdf``.
        base: Output file-name prefix; defaults to ``output_base``.

    Raises:
        OSError: If the PDF cannot be read or an image cannot be written.
    """
    with open(pdf_path, 'rb') as f:
        pdf = f.read()

    jpgs, pngs = find_images(pdf)
    print('Find {} jpg and {} png in {}'.format(len(jpgs), len(pngs), pdf_path))

    export_images(jpgs, base, 'jpg')
    # The PNG pass previously iterated over the integer count and wrote the
    # last JPEG's bytes to the last JPEG's file name; it now writes each PNG
    # to its own '.png' file.
    export_images(pngs, base, 'png')


if __name__ == '__main__':
    main()

你可能感兴趣的:(提取pdf中可能的图片)