import io

import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage


def _extract_pdf_text(pdf_path):
    """Return the text pdfminer extracts from every page of ``pdf_path``."""
    resource_manager = PDFResourceManager()
    with io.StringIO() as buffer:
        converter = TextConverter(resource_manager, buffer, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                # PDFPage.get_pages is pdfminer's page iterator; PDFParser
                # itself has no get_pages() method.
                for page in PDFPage.get_pages(fh):
                    interpreter.process_page(page)
            return buffer.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            converter.close()


def _load_referenced_images(text):
    """Load, as RGB arrays, the image files named by ``image=<path>;`` markers.

    NOTE(review): pdfminer's plain-text output does not normally contain such
    markers; confirm which upstream step is expected to produce them.
    """
    images = []
    for line in text.split('\n'):
        pieces = line.split('image=')
        if len(pieces) < 2:
            # The line may mention "image" without carrying an actual
            # "image=" marker; there is no path to read in that case.
            continue
        image_path = pieces[1].split(';')[0]
        with Image.open(image_path) as img:
            # convert() forces the pixel data to be read while the file is open.
            images.append(np.array(img.convert('RGB')))
    return images


def _load_annotation_font(size):
    """Return Arial at ``size``, or Pillow's built-in font if Arial is missing."""
    try:
        return ImageFont.truetype('arial.ttf', size)
    except OSError:
        # arial.ttf is not installed on every platform.
        return ImageFont.load_default()


def _stamp_keyword(img, keyword, font):
    """Return a copy of RGB array ``img`` with ``keyword`` drawn on it in red.

    The label is anchored at the first foreground pixel of the
    Otsu-thresholded image, or at the top-left corner when the threshold
    leaves no foreground pixel.
    """
    # The arrays come from PIL and are therefore RGB, not OpenCV's BGR.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # np.nonzero yields (row, column) index arrays; PIL expects (x, y),
    # i.e. (column, row).
    rows, cols = np.nonzero(thresh)
    anchor = (int(cols[0]), int(rows[0])) if rows.size else (0, 0)
    # ImageDraw works on PIL images, not on NumPy arrays.
    canvas = Image.fromarray(img)
    ImageDraw.Draw(canvas).text(anchor, keyword, font=font, fill=(255, 0, 0))
    return np.array(canvas)


def search_and_highlight(pdf_path, keyword, output_path, recognizer=None):
    """Stamp ``keyword`` onto images referenced by a PDF's text and save them.

    Steps:

    1. Extract the text of every page of ``pdf_path`` with pdfminer.
    2. Load every image file named by an ``image=<path>;`` marker in that text.
    3. For each image, ask ``recognizer`` for the text it contains; when
       ``keyword`` occurs in that text, draw ``keyword`` in red on the image.
    4. Write all loaded images, in order, to ``output_path`` as one
       multi-page PDF.

    NOTE(review): thresholding pixels cannot produce text, and none of the
    libraries this file imports performs OCR. Recognition is therefore
    delegated to ``recognizer``; without one, no image is annotated.

    Args:
        pdf_path: Path of the PDF to read.
        keyword: Non-empty text to look for and to draw on matching images.
        output_path: Destination of the multi-page PDF. Nothing is written
            when this is empty/None or when no image was loaded.
        recognizer: Optional callable that receives one image as an RGB
            NumPy array and returns the text recognized in it as ``str``.

    Returns:
        The list of RGB image arrays, annotated where the keyword was found.

    Raises:
        ValueError: If ``keyword`` is empty.
        OSError: If the PDF or a referenced image file cannot be opened.
    """
    if not keyword:
        # An empty keyword is contained in every string and would "match"
        # every image, so reject it up front.
        raise ValueError("keyword must be a non-empty string")

    pdf_text = _extract_pdf_text(pdf_path)
    images = _load_referenced_images(pdf_text)

    if recognizer is not None and images:
        font = _load_annotation_font(15)  # loaded once, reused for all images
        images = [
            _stamp_keyword(img, keyword, font) if keyword in recognizer(img) else img
            for img in images
        ]

    if images and output_path:
        pages = [Image.fromarray(img) for img in images]
        pages[0].save(
            output_path,
            format='PDF',
            save_all=True,
            append_images=pages[1:],
        )
    return images