.docx 文件本质上就是一个压缩文件。将一个 .docx 文件直接解压后,可以看到 _rels、docProps、word 三个文件夹和文件 [Content_Types].xml,其中我们要找的图片就在 word/media 目录内。
因此,要提取 Word 文档内的图片,可以先将 .docx 文件解压,再从 word/media 目录内提取图片,最后将解压后的临时文件删除即可。
import zipfile
import os
import shutil
def word2img(word_path, result_path):
    """Extract the pictures embedded in a .docx file into ``result_path``.

    A .docx file is a zip archive whose embedded pictures are stored as
    direct children of ``word/media/``. Each picture is written to
    ``result_path`` as ``<document name>_<picture name>``.

    The pictures are streamed straight out of the archive, so no temporary
    copy of the document and no temporary extraction directory is created
    next to the source file.

    Args:
        word_path: Path to the .docx file.
        result_path: Directory that receives the pictures. It is created
            when missing, but only if the archive has a ``word/media/``
            folder.

    Returns:
        The string ``'no pictures found'`` when the archive has no
        ``word/media/`` entries; otherwise a generator yielding the path of
        every entry currently present in ``result_path`` (including entries
        that were there before the call).

    Raises:
        FileNotFoundError: If ``word_path`` does not exist.
        zipfile.BadZipFile: If ``word_path`` is not a valid zip archive.
    """
    media_prefix = 'word/media/'
    # os.path.basename understands the separators of the current platform,
    # so the prefix is the bare document name on both POSIX and Windows.
    doc_name = os.path.splitext(os.path.basename(word_path))[0]

    with zipfile.ZipFile(word_path, 'r') as archive:
        # Names inside a zip archive always use '/' as the separator.
        media_entries = [
            name for name in archive.namelist()
            if name.startswith(media_prefix)
        ]
        if not media_entries:
            return 'no pictures found'

        os.makedirs(result_path, exist_ok=True)
        for entry in media_entries:
            picture = entry[len(media_prefix):]
            # Skip the directory entry itself and anything nested deeper
            # than word/media/: only direct children are pictures.
            if not picture or '/' in picture:
                continue
            target = os.path.join(result_path, f'{doc_name}_{picture}')
            with archive.open(entry) as src, open(target, 'wb') as dst:
                shutil.copyfileobj(src, dst)

    return (os.path.join(result_path, pic) for pic in os.listdir(result_path))
pip install python-docx
import os
import docx
import re
def word2img2(word_path, result_path):
    """Save the images referenced by a .docx document into ``result_path``.

    The document is opened with ``docx.Document`` and the relationships of
    its main part are scanned. Every internal relationship whose target
    reference contains ``"image"`` is written to ``result_path`` as
    ``<document name>_<image file name>``.

    Args:
        word_path: Path to the .docx file.
        result_path: Directory that receives the images. It is created
            when missing, but only if at least one image is found.

    Returns:
        None. The images are written to disk as a side effect.
    """
    # os.path.basename understands the separators of the current platform,
    # so the prefix is the bare document name on both POSIX and Windows.
    doc_name = os.path.splitext(os.path.basename(word_path))[0]

    doc = docx.Document(word_path)
    for rel in doc.part._rels.values():
        # External relationships (e.g. a hyperlink whose URL happens to
        # contain "image") have no target part to read bytes from.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        os.makedirs(result_path, exist_ok=True)
        # Keep only the last path component, e.g. "media/image1.png"
        # becomes "image1.png"; a reference without "/" is used as-is.
        img_name = rel.target_ref.rpartition('/')[2]
        target = os.path.join(result_path, f'{doc_name}_{img_name}')
        with open(target, "wb") as f:
            f.write(rel.target_part.blob)