本 Python 脚本的主要功能是对当前目录及其子目录下的图片和 PDF 文件进行光学字符识别(OCR)处理。它使用 easyocr 库处理图片中的文字,使用 PyPDF2 库提取 PDF 文件中的文本,并将处理结果保存为文本文件。同时,脚本会记录详细的处理日志,方便用户跟踪处理过程和排查问题。
脚本依赖以下库:
- easyocr:用于图片的 OCR 识别。
- PyPDF2:用于读取 PDF 文件并提取文本。
- Pillow(PIL):虽然脚本中未直接使用,但 easyocr 处理图像时可能依赖。

你可以使用以下命令安装这些依赖库:
收起
bash
pip install easyocr PyPDF2 Pillow
收起
python
import os
import time
import easyocr
from PyPDF2 import PdfReader
from PIL import Image
导入了处理文件系统、时间、OCR 识别、PDF 读取和图像处理所需的库。
收起
python
# Directory handed to easyocr.Reader as its model_storage_directory.
# It is created up front; exist_ok=True makes repeated runs harmless.
model_storage_directory = './easyocr_models'
os.makedirs(name=model_storage_directory, exist_ok=True)
定义了 easyocr 模型的存储目录,并确保该目录存在。
收起
python
def check_network(url='https://www.baidu.com', timeout=5):
    """Report whether a request to ``url`` can be opened.

    Args:
        url: Address to probe. Defaults to the Baidu home page, which is
            what the script has always used.
        timeout: Seconds to wait before giving up.

    Returns:
        True if the URL could be opened, False on any ordinary failure
        (unreachable host, timeout, malformed URL, ...).
    """
    import urllib.request

    try:
        # Only the success of the request matters; the context manager
        # closes the response instead of leaking it.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt and SystemExit. Those now propagate.
        return False
该函数尝试访问百度网站,以检查网络连接是否正常。如果能成功访问,则返回 True,否则返回 False。
收起
python
# Build the module-level EasyOCR reader that process_image() relies on.
try:
    print("Initializing EasyOCR...")
    print(f"Model storage directory: {os.path.abspath(model_storage_directory)}")
    if check_network():
        print("Downloading models (this may take several minutes)...")
    else:
        # Being offline is only fatal when the model files are missing.
        # Let EasyOCR try the local copies; if it really needs to download,
        # the failure is reported by the handler below.
        print("Network connection failed. Please check your internet connection.")
        print("Trying to continue with previously downloaded models...")
    reader = easyocr.Reader(
        ['ch_sim', 'en'],
        model_storage_directory=model_storage_directory,
        download_enabled=True,
        verbose=True,
    )
    print("EasyOCR initialized successfully")
except Exception as e:
    print(f"Failed to initialize EasyOCR: {str(e)}")
    # exit() is a helper injected by the site module and is not guaranteed
    # to exist; raising SystemExit is the portable equivalent.
    raise SystemExit(1)
这段代码先检查网络连接,然后初始化 easyocr 的 Reader,并在需要时下载 easyocr 所需的模型,支持中文(简体)和英文识别。
python
def process_image(image_path):
    """Run OCR on one image and return the recognised text.

    The module-level EasyOCR ``reader`` is asked to read ``image_path``;
    element 1 of every returned item is taken as its text and the pieces
    are joined with newlines. Any exception is reported on stdout and an
    empty string is returned instead.
    """
    try:
        detections = reader.readtext(image_path)
        return '\n'.join(detection[1] for detection in detections)
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return ""
该函数使用 easyocr 对图片进行 OCR 识别,提取识别结果中的文本并拼接成字符串返回。
python
def process_pdf(pdf_path):
    """Return the text PyPDF2 extracts from every page of a PDF.

    Page texts are concatenated in page order with no separator. Any
    exception is reported on stdout and an empty string is returned.
    """
    try:
        # Named ``document`` so it is not confused with the module-level
        # EasyOCR ``reader``.
        document = PdfReader(pdf_path)
        page_texts = [page.extract_text() for page in document.pages]
        return "".join(page_texts)
    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {str(e)}")
        return ""
该函数使用 PyPDF2 读取 PDF 文件的每一页,并提取文本拼接成字符串返回。
python
def save_text(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)
main
收起
python
class _TeeStdout(object):
    """Stand-in for sys.stdout that copies every write into a UTF-8 log file."""

    def __init__(self, terminal, filename):
        self.terminal = terminal
        self.log = open(filename, "a", encoding='utf-8')

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def log_only(self, message):
        """Append ``message`` to the log file without echoing it to the terminal."""
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def close(self):
        self.log.close()

    def __getattr__(self, name):
        # Forward isatty(), encoding, etc. so code that probes sys.stdout
        # keeps working while the tee is installed.
        if name in ('terminal', 'log'):
            raise AttributeError(name)
        return getattr(self.terminal, name)


def _choose_output_folder(candidates):
    """Return the first candidate directory that can be created, or None."""
    for folder in candidates:
        try:
            os.makedirs(folder, exist_ok=True)
        except OSError as e:
            print(f"Failed to create output directory {folder}: {str(e)}")
            continue
        print(f"Using output directory: {os.path.abspath(folder)}")
        return folder
    return None


def _unique_output_path(output_folder, base_name, used_paths):
    """Return a ``.txt`` path in ``output_folder`` not yet handed out in this run."""
    candidate = os.path.join(output_folder, f"{base_name}.txt")
    counter = 1
    while os.path.normcase(candidate) in used_paths:
        candidate = os.path.join(output_folder, f"{base_name}_{counter}.txt")
        counter += 1
    used_paths.add(os.path.normcase(candidate))
    return candidate


def main():
    """Extract text from every image and PDF under the current directory.

    An output directory is picked from a short list of candidates, stdout
    is mirrored into ``ocr_log.txt`` inside it, and each supported file
    found by walking ``.`` is converted into a ``.txt`` file there. Errors
    on individual files are logged and do not stop the run.

    Raises:
        SystemExit: If none of the candidate output directories can be created.
    """
    import sys

    # Candidate output locations, tried in order.
    output_folder = _choose_output_folder([
        './output_text',                          # relative to the current directory
        os.path.expanduser('~/ocr_output'),       # user's home directory
        os.path.join(os.getcwd(), 'ocr_output'),  # current working directory
    ])
    if output_folder is None:
        print("Error: Could not create any output directory")
        sys.exit(1)

    log_file = os.path.join(output_folder, 'ocr_log.txt')

    # Mirror stdout into the log for the duration of the run only.
    original_stdout = sys.stdout
    tee = _TeeStdout(original_stdout, log_file)
    sys.stdout = tee
    try:
        print("OCR Processing Log\n")
        print(f"Starting OCR processing at {time.strftime('%Y-%m-%d %H:%M:%S')}")

        image_extensions = {'.bmp', '.jpg', '.jpeg', '.png', '.tiff', '.gif'}
        # Output names already used in this run. Inputs that share a stem
        # (a.jpg / a.pdf, or the same name in two folders) would otherwise
        # overwrite each other's result.
        used_paths = set()

        for root, dirs, files in os.walk('.'):
            # Sorted traversal keeps the collision suffixes stable between runs.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                base_name, ext = os.path.splitext(file)
                ext = ext.lower()
                if ext in image_extensions:
                    kind = "image"
                elif ext == '.pdf':
                    kind = "PDF"
                else:
                    continue
                try:
                    print(f"Processing {kind}: {file_path}")
                    if kind == "image":
                        text = process_image(file_path)
                    else:
                        text = process_pdf(file_path)
                    if not text:
                        # The extractors return "" both for failures and
                        # for files without any recognisable text.
                        print(f"Warning: no text extracted from {file_path}")
                    output_path = _unique_output_path(output_folder, base_name, used_paths)
                    save_text(text, output_path)
                    print(f"Successfully processed {kind}: {file_path} -> {output_path}")
                    # Written through the tee's own handle so the log keeps a
                    # single encoding and its lines stay in order.
                    tee.log_only(f"Success: {file_path} -> {output_path}\n")
                except Exception as e:
                    error_msg = f"Error processing {file_path}: {str(e)}"
                    print(error_msg)
                    tee.log_only(error_msg + "\n")
    finally:
        # Always hand the real stdout back and flush/close the log.
        sys.stdout = original_stdout
        tee.close()
main 函数在选定输出目录后,创建 ocr_log.txt 日志文件,将标准输出重定向到该日志文件,同时保留在终端的输出,并记录日志头部信息和处理开始时间。随后遍历当前目录及其子目录:对图片文件调用 process_image 函数处理,对 PDF 文件调用 process_pdf 函数处理。将处理结果保存为文本文件,并在日志中记录成功或失败信息。
python
# Script entry point: run the pipeline only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()
当脚本作为主程序运行时,调用 main 函数开始执行。
使用方法:将上述代码保存为一个 Python 文件(例如 ocr_process.py),然后在终端中运行:
bash
python ocr_process.py
注意事项:
- easyocr 模型下载可能需要一定时间,首次运行脚本时请确保网络连接稳定,耐心等待模型下载完成。
- PyPDF2 只能提取文本内容,若 PDF 为扫描版或加密文件,可能无法正常提取文本。
- 如果处理过程中出现错误,可以查看 ocr_log.txt 以获取详细的错误信息。

完整代码
import os
import time
import easyocr
from PyPDF2 import PdfReader
from PIL import Image
# Model download location: this directory is handed to easyocr.Reader as
# its model_storage_directory. exist_ok=True makes repeated runs harmless.
model_storage_directory = './easyocr_models'
os.makedirs(name=model_storage_directory, exist_ok=True)
# 检查网络连接
def check_network(url='https://www.baidu.com', timeout=5):
    """Report whether a request to ``url`` can be opened.

    Args:
        url: Address to probe. Defaults to the Baidu home page, which is
            what the script has always used.
        timeout: Seconds to wait before giving up.

    Returns:
        True if the URL could be opened, False on any ordinary failure
        (unreachable host, timeout, malformed URL, ...).
    """
    import urllib.request

    try:
        # Only the success of the request matters; the context manager
        # closes the response instead of leaking it.
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt and SystemExit. Those now propagate.
        return False
# Initialise the module-level EasyOCR reader that process_image() relies on.
try:
    print("Initializing EasyOCR...")
    print(f"Model storage directory: {os.path.abspath(model_storage_directory)}")
    if check_network():
        print("Downloading models (this may take several minutes)...")
    else:
        # Being offline is only fatal when the model files are missing.
        # Let EasyOCR try the local copies; if it really needs to download,
        # the failure is reported by the handler below.
        print("Network connection failed. Please check your internet connection.")
        print("Trying to continue with previously downloaded models...")
    reader = easyocr.Reader(
        ['ch_sim', 'en'],
        model_storage_directory=model_storage_directory,
        download_enabled=True,
        verbose=True,
    )
    print("EasyOCR initialized successfully")
except Exception as e:
    print(f"Failed to initialize EasyOCR: {str(e)}")
    # exit() is a helper injected by the site module and is not guaranteed
    # to exist; raising SystemExit is the portable equivalent.
    raise SystemExit(1)
def process_image(image_path):
    """Run OCR on one image and return the recognised text.

    The module-level EasyOCR ``reader`` is asked to read ``image_path``;
    element 1 of every returned item is taken as its text and the pieces
    are joined with newlines. Any exception is reported on stdout and an
    empty string is returned instead.
    """
    try:
        # Extract with EasyOCR, then merge all recognised fragments.
        detections = reader.readtext(image_path)
        return '\n'.join(detection[1] for detection in detections)
    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return ""
def process_pdf(pdf_path):
    """Return the text PyPDF2 extracts from every page of a PDF.

    Page texts are concatenated in page order with no separator. Any
    exception is reported on stdout and an empty string is returned.
    """
    try:
        # Named ``document`` so it is not confused with the module-level
        # EasyOCR ``reader``.
        document = PdfReader(pdf_path)
        page_texts = [page.extract_text() for page in document.pages]
        return "".join(page_texts)
    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {str(e)}")
        return ""
def save_text(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, replacing any existing file."""
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)
class _TeeStdout(object):
    """Stand-in for sys.stdout that copies every write into a UTF-8 log file."""

    def __init__(self, terminal, filename):
        self.terminal = terminal
        self.log = open(filename, "a", encoding='utf-8')

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def log_only(self, message):
        """Append ``message`` to the log file without echoing it to the terminal."""
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def close(self):
        self.log.close()

    def __getattr__(self, name):
        # Forward isatty(), encoding, etc. so code that probes sys.stdout
        # keeps working while the tee is installed.
        if name in ('terminal', 'log'):
            raise AttributeError(name)
        return getattr(self.terminal, name)


def _choose_output_folder(candidates):
    """Return the first candidate directory that can be created, or None."""
    for folder in candidates:
        try:
            os.makedirs(folder, exist_ok=True)
        except OSError as e:
            print(f"Failed to create output directory {folder}: {str(e)}")
            continue
        print(f"Using output directory: {os.path.abspath(folder)}")
        return folder
    return None


def _unique_output_path(output_folder, base_name, used_paths):
    """Return a ``.txt`` path in ``output_folder`` not yet handed out in this run."""
    candidate = os.path.join(output_folder, f"{base_name}.txt")
    counter = 1
    while os.path.normcase(candidate) in used_paths:
        candidate = os.path.join(output_folder, f"{base_name}_{counter}.txt")
        counter += 1
    used_paths.add(os.path.normcase(candidate))
    return candidate


def main():
    """Extract text from every image and PDF under the current directory.

    An output directory is picked from a short list of candidates, stdout
    is mirrored into ``ocr_log.txt`` inside it, and each supported file
    found by walking ``.`` is converted into a ``.txt`` file there. Errors
    on individual files are logged and do not stop the run.

    Raises:
        SystemExit: If none of the candidate output directories can be created.
    """
    import sys

    # Candidate output locations, tried in order.
    output_folder = _choose_output_folder([
        './output_text',                          # relative to the current directory
        os.path.expanduser('~/ocr_output'),       # user's home directory
        os.path.join(os.getcwd(), 'ocr_output'),  # current working directory
    ])
    if output_folder is None:
        print("Error: Could not create any output directory")
        sys.exit(1)

    log_file = os.path.join(output_folder, 'ocr_log.txt')

    # Mirror stdout into the log for the duration of the run only.
    original_stdout = sys.stdout
    tee = _TeeStdout(original_stdout, log_file)
    sys.stdout = tee
    try:
        print("OCR Processing Log\n")
        print(f"Starting OCR processing at {time.strftime('%Y-%m-%d %H:%M:%S')}")

        image_extensions = {'.bmp', '.jpg', '.jpeg', '.png', '.tiff', '.gif'}
        # Output names already used in this run. Inputs that share a stem
        # (a.jpg / a.pdf, or the same name in two folders) would otherwise
        # overwrite each other's result.
        used_paths = set()

        for root, dirs, files in os.walk('.'):
            # Sorted traversal keeps the collision suffixes stable between runs.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                base_name, ext = os.path.splitext(file)
                ext = ext.lower()
                if ext in image_extensions:
                    kind = "image"
                elif ext == '.pdf':
                    kind = "PDF"
                else:
                    continue
                try:
                    print(f"Processing {kind}: {file_path}")
                    if kind == "image":
                        text = process_image(file_path)
                    else:
                        text = process_pdf(file_path)
                    if not text:
                        # The extractors return "" both for failures and
                        # for files without any recognisable text.
                        print(f"Warning: no text extracted from {file_path}")
                    output_path = _unique_output_path(output_folder, base_name, used_paths)
                    save_text(text, output_path)
                    print(f"Successfully processed {kind}: {file_path} -> {output_path}")
                    # Written through the tee's own handle so the log keeps a
                    # single encoding and its lines stay in order.
                    tee.log_only(f"Success: {file_path} -> {output_path}\n")
                except Exception as e:
                    error_msg = f"Error processing {file_path}: {str(e)}"
                    print(error_msg)
                    tee.log_only(error_msg + "\n")
    finally:
        # Always hand the real stdout back and flush/close the log.
        sys.stdout = original_stdout
        tee.close()
# Script entry point: run the pipeline only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()