pdfminer读取PDF文本内容

# -*- coding: utf-8 -*-
# @Time    : 2023/8/1 13:14
# @Author  : Cocktail_py
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# Path of the PDF to read ('xx.pdf' is a placeholder file name).
file_name = r'xx.pdf'
# In-memory buffer the TextConverter writes extracted text into; it is
# emptied after every page so each print shows only that page's text.
output_string = StringIO()
with open(file_name, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for number, page in enumerate(PDFPage.create_pages(doc)):
            # Render the page; the device appends its text to output_string.
            interpreter.process_page(page)
            print("**********************************")
            # `number` is 0-based, so add 1 for a human-readable page number.
            print("{0}页的内容为".format(number + 1), output_string.getvalue())
            # Reset the buffer: drop its contents, then move the write
            # position back to the start so the next page begins at offset 0.
            output_string.truncate(0)
            output_string.seek(0)
    finally:
        # Close the converter even if a page fails to process.
        device.close()


# print(output_string.getvalue())

Extract elements from a PDF using Python

你可能感兴趣的:(个人学习记录,pdf,数据库)