实现从前端上传的PDF中提取图片
pip install pymupdf
pip install "pypdf2<3.0"  (注意:本文使用的 PdfFileReader / PdfFileWriter 在 PyPDF2 3.0 中已被移除,因此需要安装 3.0 之前的版本)
import io
import os

import fitz
from PyPDF2 import PdfFileReader,PdfFileWriter
def upload(request):
    """Read the PDF uploaded by the front end and open it with PyPDF2.

    Args:
        request: Incoming request whose ``FILES`` mapping holds the upload
            under the key ``'file'`` (presumably a Django ``HttpRequest`` --
            confirm against the caller).
    """
    # Uploaded file object (carries the file name, size and content).
    InitFile = request.FILES.get('file')
    # Wrap the upload in a PdfFileReader. The original passed the misspelled
    # name ``Initfile`` here, which raised NameError at runtime.
    file = PdfFileReader(InitFile)
根据 PyMuPDF 官方文档,fitz.open 支持以下几种打开方式:
>>> # from a file
>>> doc = fitz.open("some.pdf")
>>> doc = fitz.open("some.file", None, "pdf") # copes with wrong extension
>>> doc = fitz.open("some.file", filetype="pdf") # copes with wrong extension
>>>
>>> # from memory
>>> doc = fitz.open("pdf", mem_area)
>>> doc = fitz.open(None, mem_area, "pdf")
>>> doc = fitz.open(stream=mem_area, filetype="pdf")
>>>
>>> # new empty PDF
>>> doc = fitz.open()
>>>
本文使用的是上面的第二类方法(from memory,即从内存中打开):它需要传入数据流 stream,根据官方文档,stream 可以是 bytes、bytearray 或 BytesIO 类型;同时还需要通过 filetype 指明文件类型。那么,如何将 PdfFileReader 对象转换成 BytesIO 呢?
# Copy the page at index 1 of the uploaded PDF into a new, single-page writer.
# NOTE(review): PyPDF2 page indices are zero-based, so this is the second
# page -- confirm that is the page intended.
writer = PdfFileWriter()
page = file.getPage(1)
writer.addPage(page)
# Serialize the one-page PDF into an in-memory buffer, then rewind the buffer
# so the consumer reads it from the start.
buffer = io.BytesIO()
writer.write(buffer)
buffer.seek(0)
# Extract the images from the in-memory single-page PDF.
getImgs(buffer)
接下来利用 PyMuPDF 的 get_page_images 函数(仅适用于 PDF)获取某一页上所有图片的列表:
def getImgs(pdf, path=r'C:\Users\DYJ'):
    """Save every image on the first page of an in-memory PDF as a PNG file.

    Args:
        pdf: PDF content held in memory (bytes, bytearray or io.BytesIO).
        path: Directory the PNG files are written to; it must already exist.
            Defaults to the directory the original code hard-coded.

    Returns:
        A list of the file paths written (``img0.png``, ``img1.png``, ...),
        in the order PyMuPDF lists the images on the page.
    """
    saved = []
    # The keyword form makes it explicit that the data is an in-memory stream
    # and that it must be parsed as a PDF.
    doc = fitz.open(stream=pdf, filetype="pdf")
    try:
        for i, img in enumerate(doc.get_page_images(0, full=True)):
            # The first item of each entry is the image's xref number.
            pix = fitz.Pixmap(doc, img[0])
            # PNG cannot store CMYK data; convert such pixmaps to RGB first.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            target = os.path.join(path, f"img{i}.png")
            # Pixmap.save replaces the deprecated/removed Pixmap.writePNG.
            pix.save(target)
            saved.append(target)
            pix = None  # drop the pixmap so its memory can be reclaimed
    finally:
        # Always release the document, even if saving an image fails.
        doc.close()
    return saved
至此一个简单的提取PDF某一页上的图片功能就实现了。
https://pymupdf.readthedocs.io/en/latest/document.html#Document.get_page_images