【Python】使用 pdfminer 读取 pdf 文件

用 Python 读取 pdf 文件是一件令人头疼的事情,本文所使用的代码包 pdfminer 在其文档中就曾评价 "PDF is evil"。

本文将提供使用该代码包读取 pdf 文件的具体代码,实现提取英文 pdf 文本中的单词的功能。

具体代码如下:

import re
import pandas as pd

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *

# Characters stripped from both ends of a token before it is classified.
_PUNCTUATION = ',;.!?"\'()'

# A hyphenated word: two or more lowercase alphabetic parts joined by '-'.
_HYPHENATED = re.compile(r'[a-z]+(?:-[a-z]+)+')


def text_to_words(text):
    """Split a chunk of extracted text into a list of lowercase words.

    Processing steps:

    1. A hyphen immediately followed by a newline is treated as a word broken
       across two lines and the two halves are rejoined.
    2. The text is split on any run of whitespace (spaces, tabs, newlines),
       so a line break separates words instead of gluing them together.
    3. Leading and trailing punctuation is stripped from every token.
    4. Tokens shorter than two characters are discarded unless they are
       'a', 'A' or 'I'.
    5. Tokens are lowercased and kept only when they are purely alphabetic
       or consist of alphabetic parts joined by hyphens.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercase words in the order they appear in ``text``.
    """
    # Rejoin words that were hyphenated at a line break.
    text = text.replace('-\n', '')

    word_list_text = []
    # split() with no argument splits on any whitespace and never yields
    # empty tokens, so remaining newlines act as word separators.
    for token in text.split():
        # Strip every surrounding punctuation mark, not just one per side,
        # so tokens such as '"Hello,"' reduce to 'Hello'.
        word = token.strip(_PUNCTUATION)

        # Apply the length filter after stripping so punctuation cannot
        # smuggle a stray single letter past it (e.g. 'x.').
        if len(word) < 2 and word not in {'a', 'A', 'I'}:
            continue

        word = word.lower()
        # fullmatch: the whole token must be a hyphenated word; a prefix
        # match would let tokens like 'co-op123' through.
        if word.isalpha() or _HYPHENATED.fullmatch(word):
            word_list_text.append(word)

    return word_list_text


def pdf_to_words(path):
    """Extract the words of a PDF file into a pandas DataFrame.

    The file is parsed with pdfminer, the text of every ``LTTextBox`` on
    every page is collected, and each text chunk is tokenized with
    ``text_to_words``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A DataFrame with two columns: ``index`` (the 0-based position of the
        word in reading order) and ``word``. When the document does not
        allow text extraction the DataFrame is empty but has the same
        columns, so callers always receive the same type.
    """
    text_list = []

    # The context manager guarantees the file is closed even if parsing
    # raises; all pdfminer work stays inside the block because it reads
    # from the open file object.
    with open(path, mode='rb') as pdf_file:
        print('current pdf path:  ', path, '\n')

        # Wire the parser and the document object to each other.
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Called without a password argument.
        # NOTE(review): presumably this defaults to an empty password --
        # confirm against the installed pdfminer version.
        doc.initialize()

        if doc.is_extractable:
            rsrcmgr = PDFResourceManager()  # resource manager passed to device and interpreter
            laparams = LAParams()  # layout-analysis parameters (defaults)
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()  # layout object for this page

                # Keep only text boxes; other layout items are skipped.
                text_list.extend(
                    content.get_text()
                    for content in layout
                    if isinstance(content, LTTextBox)
                )

    word_list = []
    for text in text_list:
        word_list.extend(text_to_words(text))

    return pd.DataFrame(
        {'index': range(len(word_list)), 'word': word_list},
        columns=['index', 'word'],
    )

 

此外,需要注意的是,如果使用的是 Python3,安装时请安装 pdfminer3k 而不是 pdfminer,否则会安装失败。本文代码采用的也是 pdfminer3k 的模块结构(例如从 pdfminer.pdfparser 中导入 PDFDocument)。

你可能感兴趣的:(Python)