有一个需求,需要从excel文件中提取插入的音频文件。通过查找发现,在文档中插入的对象、附件都是ole文件,因此查找用python获取ole文件的方法。尝试了一些方法,发现都不能获取到原文件名称,最后发现oletools库可以获取到原文件名称。
经过测试,可实现从word/excel/ppt中提取插入的附件、对象等文件(除word、excel、ppt文档外)。
对文件有要求,必须为docx、xlsx、pptx,因为这三种文件作为压缩文件打开后,有专门存放ole对象与图片的文件夹。
import olefile
from oletools import oleobj
import os
import shutil
from zipfile import ZipFile
class GetOleFileUtil:
    """Extract files embedded as OLE objects in docx / xlsx / pptx documents.

    The document is opened as a zip archive, the ``oleObject*.bin`` entries
    in its ``embeddings`` folder are parsed with ``olefile`` / ``oletools``,
    and the payload of each ``\\x01Ole10Native`` stream is written to disk
    under the file name recorded inside the OLE object.
    """

    def __init__(self):
        pass

    def save_olefile(self, file, savepath):
        """Extract every embedded OLE payload of ``file`` below ``savepath``.

        Args:
            file: Path to a ``.docx``, ``.xlsx`` or ``.pptx`` document.
            savepath: Directory in which a sub-folder named after the
                document is (re)created to receive the extracted files.

        Returns:
            ``(True, output_folder)`` on success, or ``(False, message)``
            when the file type is unsupported or extraction failed.
        """
        # Top-level folder inside the zip package for each document type.
        name_dict = {
            'xlsx': 'xl',
            'docx': 'word',
            'pptx': 'ppt',
        }
        # Lower-case the extension so that e.g. "Book.XLSX" is accepted too.
        extension = os.path.splitext(file)[-1].replace('.', '').lower()
        if extension not in name_dict:
            return False, f'提取ole文件异常:不支持的文件类型{extension!r}'

        try:
            # Embedded objects are stored in "<root>/embeddings/" (plural).
            prefix = f'{name_dict[extension]}/embeddings/'
            savefolder = self.create_folder(file, savepath)
            with ZipFile(file, 'r') as archive:
                for entry in archive.infolist():
                    entry_name = entry.filename.lower()
                    if not entry_name.startswith(prefix):
                        continue
                    if not (entry_name.endswith('.bin') and 'ole' in entry_name):
                        continue
                    with archive.open(entry.filename) as f:
                        if not olefile.isOleFile(f):
                            continue
                        with olefile.OleFileIO(f) as ole:
                            self._dump_native_stream(ole, entry.filename, savefolder)
        except Exception as ex:
            # Top-level boundary: report the failure to the caller as a
            # message instead of propagating it.
            return False, f'提取ole文件异常:{ex}'
        return True, savefolder

    def _dump_native_stream(self, ole, entry_name, savefolder):
        """Write the payload of one OLE object's native stream to ``savefolder``.

        Objects without an ``\\x01Ole10Native`` stream and objects that are
        only links are skipped. The stream is always closed before returning.
        """
        stream = None
        try:
            try:
                stream = ole.openstream('\x01Ole10Native')
                opkg = oleobj.OleNativeStream(stream)
            except IOError:
                print('不是ole文件')
                # Skip this object; continuing would use an undefined or
                # stale ``opkg``.
                return
            if opkg.is_link:
                print('是链接不是文件,跳过')
                return

            ole_filename = self.re_decode(opkg.filename)
            ole_src_path = self.re_decode(opkg.src_path)
            ole_temp_path = self.re_decode(opkg.temp_path)
            print(f'文件名{ole_filename},源路径{ole_src_path},缓存路劲{ole_temp_path}')

            # The embedded name comes from the document and is untrusted:
            # keep only its final path component.
            filename = os.path.join(
                savefolder, self._safe_name(ole_filename, entry_name))
            try:
                print(f'导出ole中的文件:{filename}')
                with open(filename, 'wb') as writer:
                    remaining = opkg.actual_size
                    while remaining > 0:
                        wanted = min(oleobj.DUMP_CHUNK_SIZE, remaining)
                        data = stream.read(wanted)
                        writer.write(data)
                        remaining -= len(data)
                        if len(data) != wanted:
                            # Stream ended early; stop instead of looping.
                            print(f'读取数据不完整:期望{wanted}字节,实际{len(data)}字节')
                            break
            except Exception:
                print('在转存时出现错误:')
                raise
        finally:
            if stream is not None:
                stream.close()

    @staticmethod
    def _safe_name(ole_filename, entry_name):
        """Return a plain file name with no directory components.

        Falls back to the base name of the zip entry when the embedded name
        is empty or consists only of directory references.
        """
        candidate = os.path.basename((ole_filename or '').replace('\\', '/')).strip()
        if candidate in ('', '.', '..'):
            candidate = os.path.basename(entry_name)
        return candidate

    @staticmethod
    def re_decode(s, encoding='gbk'):
        """Re-interpret ``s`` by encoding it as ISO-8859-1 and decoding as ``encoding``.

        Presumably the names arrive as GBK bytes that were decoded as
        Latin-1 (TODO confirm against the installed oletools version).
        If ``s`` is empty or cannot be round-tripped, it is returned
        unchanged so that one odd name does not abort the extraction.
        """
        if not s:
            return s
        try:
            return s.encode('iso-8859-1').decode(encoding)
        except UnicodeError:
            return s

    @staticmethod
    def create_folder(file, savepath):
        """Create an empty folder named after ``file`` inside ``savepath``.

        An existing folder of that name is deleted first, together with
        everything in it. Returns the path of the folder.
        """
        filename = os.path.basename(os.path.splitext(file)[0])
        new_folder = os.path.join(savepath, filename)
        if not os.path.exists(new_folder):
            os.makedirs(new_folder)
            print(f'创建文件夹成功:{new_folder}')
        else:
            print(f'文件夹已存在,删除再创建:{new_folder}')
            shutil.rmtree(new_folder)
            os.makedirs(new_folder)
        return new_folder
将文档作为压缩文件打开。因为word、excel、ppt各种文档存放ole文件的文件夹不同,所以使用字典区分各文档对应的文件夹名称。查找文件夹embeddings中的文件,判断是否为ole文件,再获取ole文件的源文件名称、存放路径。最后读取ole流并写入文件,即可保存成功。
1-【Python】导出docx格式Word文档中的文本、图片和附件等__清风来叙的博客-CSDN博客