用 Python 读取 PDF 文件是一件令人头疼的事情,本文所使用的代码包 pdfminer 在其文档中就曾评价 "PDF is evil"。
本文将提供使用该代码包读取 pdf 文件的具体代码,实现提取英文 pdf 文本中的单词的功能。
具体代码如下:
import re
import pandas as pd
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
def text_to_words(text):
    """Split raw text extracted from an English PDF into lower-case words.

    Processing steps:

    1. A hyphen directly followed by a line break is removed so that a word
       broken across two lines is joined again.
    2. The text is split on any whitespace; remaining line breaks therefore
       separate words instead of gluing them together.
    3. Leading and trailing punctuation is stripped from each token.
    4. Single-character tokens are dropped unless they are 'a', 'A' or 'I'.
    5. The token is lower-cased and kept only if it is purely alphabetic or
       consists of ASCII letter runs joined by hyphens (e.g. 'well-known').

    Args:
        text: The text to tokenize.

    Returns:
        A list of lower-case words in their order of appearance. Empty when
        the text contains no acceptable word.
    """
    # Characters removed from both ends of a token.
    punctuations = ',;.!?"\'()'
    # Two or more runs of lower-case ASCII letters joined by single hyphens.
    pattern_hyphen = re.compile(r'[a-z]+(?:-[a-z]+)+')

    # Re-join words that were hyphenated across a line break.
    text = text.replace('-\n', '')

    word_list_text = []
    # split() without arguments treats newlines and tabs as separators too,
    # so the last word of a line is not fused with the first of the next.
    for token in text.split():
        word = token.strip(punctuations)
        # Apply the length filter after stripping so that e.g. 'b.' is
        # treated like the bare single letter 'b'.
        if len(word) < 2 and word not in {'a', 'A', 'I'}:
            continue
        word = word.lower()
        # fullmatch rejects tokens with trailing junk such as 'well-known123'.
        if word.isalpha() or pattern_hyphen.fullmatch(word):
            word_list_text.append(word)
    return word_list_text
def pdf_to_words(path):
    """Extract the words of a PDF file into a pandas DataFrame.

    The text of every ``LTTextBox`` found on every page is collected and
    tokenized with ``text_to_words``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A DataFrame with the columns ``'index'`` (position of the word in
        the document, starting at 0) and ``'word'``. If the document does
        not allow text extraction, an empty list is returned instead.

        NOTE(review): the two return types differ (list vs. DataFrame);
        this is kept as-is for backward compatibility, callers should be
        checked before unifying it.
    """
    # The parser is built on this file object, so the file is kept open
    # until all pages have been processed; `with` closes it on every exit
    # path, including the early return and exceptions.
    with open(path, mode='rb') as pdf_file:
        print('current pdf path: ', path, '\n')

        # Wire the parser and the document object to each other.
        parser = PDFParser(pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()  # no password is passed here

        if not doc.is_extractable:
            return []

        rsrcmgr = PDFResourceManager()  # resource manager handed to device and interpreter
        laparams = LAParams()  # layout-analysis parameters (defaults)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        text_list = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()  # layout result for the page just processed
            for content in layout:
                # Only text boxes are read; other layout objects are skipped.
                if isinstance(content, LTTextBox):
                    text_list.append(content.get_text())

    # Tokenize each collected text block and concatenate the results.
    word_list = []
    for text in text_list:
        word_list += text_to_words(text)

    # reset_index() turns the positional index into a regular column.
    ser_words = pd.Series(word_list)
    df_words = ser_words.reset_index()
    df_words.columns = ['index', 'word']
    return df_words
此外需要注意:如果使用的是 Python 3,安装时请安装 pdfminer3k 而不是 pdfminer,否则会安装失败。