【Python】使用 pdfminer 读取 pdf 文件

用 Python 读取 pdf 文件是一件令人头疼的事情,本文将要使用的代码包 pdfminer 的文档中就曾评价 "PDF is evil"。

本文将提供使用该代码包读取 pdf 文件的具体代码,实现提取英文 pdf 文本中的单词的功能。


import re
import pandas as pd

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *

def text_to_words(text):
    """Split a chunk of extracted text into a list of lower-case words.

    Processing steps:

    1. A hyphen directly followed by a line break is treated as a word
       that was split across two lines and is joined back together.
    2. The text is split on any whitespace (spaces and line breaks), so
       the last word of one line is never glued to the first word of the
       next line.
    3. Leading and trailing punctuation is stripped from every token.
    4. Tokens shorter than two characters are dropped, except for the
       one-letter English words ``a``, ``A`` and ``I``.
    5. The token is lower-cased and kept only if it is purely alphabetic
       or consists of alphabetic parts joined by single hyphens
       (e.g. ``well-known``, ``state-of-the-art``).

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        A list of lower-case words in their order of appearance; an empty
        list if ``text`` contains no acceptable word.
    """
    punctuations = ',;.!?"\'()'
    # One or more hyphen-joined alphabetic parts; used with fullmatch so
    # that trailing junk such as "well-known1" is rejected.
    pattern_hyphen = re.compile(r'[a-z]+(?:-[a-z]+)+')

    # Undo end-of-line hyphenation before splitting on whitespace.
    text = text.replace('-\n', '')

    word_list_text = []
    for word in text.split():
        word = word.strip(punctuations)
        # Also skips tokens that were nothing but punctuation (now empty).
        if len(word) < 2 and word not in {'a', 'A', 'I'}:
            continue

        word = word.lower()
        if word.isalpha() or pattern_hyphen.fullmatch(word):
            word_list_text.append(word)

    return word_list_text

def pdf_to_words(path):
    """Extract the words of a PDF file into a pandas DataFrame.

    The text of every ``LTTextBox`` found on every page is collected and
    passed through ``text_to_words``; the resulting words are returned in
    reading order.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A DataFrame with the columns ``index`` (position of the word in
        the document, starting at 0) and ``word``.

        NOTE: when the document does not allow text extraction, an empty
        ``list`` is returned instead of a DataFrame. This mirrors the
        original behaviour and is kept so existing callers do not break.
    """
    word_list = []
    text_list = []

    # The parser is handed this file object, so it is kept open until all
    # pages have been processed and is closed reliably afterwards.
    with open(path, mode='rb') as pdf_file:    # binary read mode
        print('current pdf path:  ', path, '\n')

        parser = PDFParser(pdf_file)    # parser bound to the file object
        doc = PDFDocument()    # document object that will hold the parsed PDF
        # Parser and document must be linked in both directions before
        # the document can be initialized.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()    # no password supplied; the library default is used

        if not doc.is_extractable:
            # Alternative would be: raise PDFTextExtractionNotAllowed
            return word_list

        rsrcmgr = PDFResourceManager()    # manages resources shared between pages
        laparams = LAParams()    # layout-analysis parameters (library defaults)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)    # renders pages onto the device

        # in each page
        for page in doc.get_pages():
            # The page has to be processed before its layout can be fetched.
            interpreter.process_page(page)
            layout = device.get_result()    # layout object of the page just processed

            # in each layout
            for content in layout:
                if isinstance(content, LTTextBox):
                    text_list.append(content.get_text())

    # texts to words
    for text in text_list:
        word_list += text_to_words(text)

    ser_words = pd.Series(word_list)
    df_words = ser_words.reset_index()
    df_words.columns = ['index', 'word']

    return df_words


此外,需要注意的是,安装 pdfminer 时,如果使用的是 Python 3,请安装 pdfminer3k(pip install pdfminer3k),否则会安装失败。
