python3读pdf

一、

pip install pdfplumber
# -*- coding: utf-8 -*-
import pdfplumber


file_path = ''

with pdfplumber.open(file_path) as pdf:
    for c in pdf.objects['char']:
        print(c)

二、

pip install pdfminer3k
# -*- encoding: utf-8 -*-

try:
    from urllib.request import urlopen
except:
    from urllib import urlopen

from io import StringIO

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import  LAParams

# 读取pdf的函数,返回内容
def readPdf(pdf_file):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams)

    process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
    device.close()

    content = retstr.getvalue()
    retstr.close()

    return content


url = "http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf"
pdf_file = urlopen(url) # 也可以换成本地pdf文件,用open rb模式打开
content = readPdf(pdf_file)
print(content)
pdf_file.close()

你可能感兴趣的:(python)