通过 Python 提取 docx 文件中的文本内容,包括段落、表格、文本框(文本域)、页眉页脚、目录、超链接、脚注等各处文本。
import os
import re
import docx
import lxml
import shutil
import logging
from io import BytesIO
from lxml import etree
from pydocx import PyDocX
from docx.shared import Pt
from docx.oxml.ns import qn
from zipfile import ZipFile
# 脚注提取与写入
class TransFootnote(object):
    """Read the footnote text of a .docx file and write replacement text back.

    The archive is unpacked into a private temporary directory,
    ``word/footnotes.xml`` is edited there, and the archive is rebuilt from
    the unpacked files.  Typical use is ``get_footnote_text()`` followed by
    ``save_footnote_text(translated)``; the second call also removes the
    temporary directory.
    """

    # Values of a footnote's ``type`` attribute that denote layout footnotes
    # (separator lines etc.) rather than notes written by the author.
    _NON_CONTENT_TYPES = ('separator', 'continuationSeparator', 'continuationNotice')

    def __init__(self, docx_path):
        """Remember the document path; nothing is read until extraction."""
        self.footnotes = None      # XML elements that receive the new text
        self.docx_path = docx_path
        self.footnote_xml = None   # path of the unpacked word/footnotes.xml
        self.tree = None           # parsed footnotes.xml, kept for writing back
        self._extract_dir = None   # temporary directory holding the unpacked archive
        self._member_names = []    # archive members, in their original order

    def _cleanup(self):
        """Remove the temporary extraction directory, if one exists."""
        if self._extract_dir:
            shutil.rmtree(self._extract_dir, ignore_errors=True)
            self._extract_dir = None

    @classmethod
    def _is_layout_footnote(cls, element):
        """Return True if *element* carries a separator-like ``type`` attribute."""
        for name, value in element.items():
            if name.endswith('}type') and value in cls._NON_CONTENT_TYPES:
                return True
        return False

    def get_footnote_text(self):
        """Extract the text of every footnote and blank it in the parsed tree.

        Returns:
            A list with one string per footnote that contains non-blank text
            (the concatenation of all its text pieces), or ``None`` when the
            document has no ``word/footnotes.xml`` or an error occurred.
        """
        # Local import: the file's import block does not provide tempfile.
        import tempfile

        self._cleanup()
        try:
            # A private temp directory avoids guessing a sibling folder name
            # from the file extension and cannot clash with user data.
            self._extract_dir = tempfile.mkdtemp(prefix='docx_footnote_')
            with ZipFile(self.docx_path) as docx_zip:
                self._member_names = docx_zip.namelist()
                docx_zip.extractall(self._extract_dir)
            self.footnote_xml = os.path.join(self._extract_dir, 'word', 'footnotes.xml')
            if not os.path.exists(self.footnote_xml):
                # Nothing to translate, so nothing to keep on disk either.
                self._cleanup()
                return None
            self.tree = etree.parse(self.footnote_xml)
            footnotes = []
            footnotes_text = []
            for element in self.tree.getroot():
                # Skip comments/processing instructions and layout footnotes.
                if not isinstance(element.tag, str) or self._is_layout_footnote(element):
                    continue
                pieces = []
                for child in element.iter():
                    if not isinstance(child.tag, str):
                        continue
                    if child.text and child.text.strip():
                        pieces.append(child.text)
                        child.text = ''
                        # The first text-bearing node of a footnote later
                        # receives the whole replacement text.
                        if len(pieces) == 1:
                            footnotes.append(child)
                if pieces:
                    footnotes_text.append(''.join(pieces))
            self.footnotes = footnotes
            return footnotes_text
        except Exception as e:
            logging.error('Error: ' + str(e), exc_info=True)
            self._cleanup()
            return None

    def save_footnote_text(self, trans_text):
        """Write *trans_text* into the footnotes and rebuild the .docx file.

        Args:
            trans_text: Replacement strings, aligned with the list returned by
                ``get_footnote_text``.  A falsy value leaves the document
                untouched.

        The temporary extraction directory is removed in every case.  Errors
        are logged rather than raised.
        """
        # Local import: the file's import block only provides ZipFile.
        from zipfile import ZIP_DEFLATED

        tmp_path = self.docx_path + '.tmp'
        try:
            if trans_text and self.tree is not None and self.footnotes:
                for target, text in zip(self.footnotes, trans_text):
                    target.text = text
                self.tree.write(self.footnote_xml, encoding='utf-8',
                                xml_declaration=True, standalone=True)
                # Build the new archive beside the original and swap it in
                # only once complete, so a failure cannot truncate the document.
                with ZipFile(tmp_path, mode='w', compression=ZIP_DEFLATED) as new_docx_file:
                    for name in self._member_names:
                        new_docx_file.write(os.path.join(self._extract_dir, name), name)
                os.replace(tmp_path, self.docx_path)
        except Exception as e:
            logging.error('Error: ' + str(e), exc_info=True)
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        finally:
            self._cleanup()
class DocxFileTrans(object):
    """Extract, transform and write back the text of a .docx document.

    Each ``get_docx_*`` method is a two-step generator: the first ``send(None)``
    yields the list of extracted strings, the second ``send(translated)``
    writes the replacement strings into the document and yields ``None``.
    ``file_character_num`` accumulates the length of all extracted text.
    """

    def __init__(self):
        self.file_character_num = 0

    def _translate_texts(self, texts):
        """Return the replacement for each string in *texts*.

        Placeholder implementation: prefixes every string with ``***``.
        Override to plug in a real translation step.
        """
        return ["***" + text for text in texts]

    @staticmethod
    def _tag_endswith(element, suffixes):
        """Return True if *element* has a string tag ending with *suffixes*.

        Non-element nodes such as XML comments have a non-string tag and are
        never matched.
        """
        tag = element.tag
        return isinstance(tag, str) and tag.endswith(suffixes)

    @staticmethod
    def _drain_run_text(paragraph):
        """Return the joined text of *paragraph*'s runs and blank those runs."""
        pieces = []
        for run in paragraph.runs:
            if run.text:
                pieces.append(run.text)
                run.text = ''
        return ''.join(pieces)

    # Convert the text of a DOCX file
    def docx_file_trans(self, file_info):
        """Transform every text part of a document and save the result.

        Args:
            file_info: Mapping with ``'filename'`` (input path) and
                ``'trans_file'`` (output path).

        Returns:
            ``(input_value, output_value)``: dicts mapping each part name to
            its extracted and its replaced text respectively.
        """
        file = docx.Document(file_info['filename'])
        # Creating the generators extracts nothing yet; docx_trans_text
        # starts them in a fixed order.
        header = self.get_docx_header(file)
        footer = self.get_docx_footer(file)
        directory = self.get_docx_catalog(file)
        text_domain = self.get_docx_textbox(file)
        hyperlinks = self.get_docx_hyperlink(file)
        form = self.get_docx_table(file)
        paragraph = self.get_docx_paragraph(file)
        input_value, output_value = self.docx_trans_text(
            header, footer, directory, text_domain, hyperlinks, form, paragraph)
        # Font settings
        self.docx_set_style(file)
        trans_file = file_info['trans_file']
        logging.info('file.save(trans_file)')
        file.save(trans_file)
        # Footnotes are handled on the saved file through its raw XML.
        tfnote = TransFootnote(trans_file)
        footnote_text = tfnote.get_footnote_text()
        input_value['footnote'] = footnote_text
        if footnote_text:
            self.file_character_num += sum(len(text) for text in footnote_text)
            footnote_text = self._translate_texts(footnote_text)
        output_value['footnote'] = footnote_text
        # Always called, so the footnote helper can release its working files.
        tfnote.save_footnote_text(footnote_text)
        return input_value, output_value

    # Transform the text collected from all document parts
    def docx_trans_text(self, header, footer, directory, text_domain, hyperlinks, form, paragraph):
        """Drive the part generators: extract, transform in one batch, write back.

        Each argument is a generator created by the matching ``get_docx_*``
        method.  Returns ``(input_value, output_value)`` keyed by part name;
        a part whose write-back failed is missing from ``output_value``.
        """
        parts = [
            ('header', header),
            ('footer', footer),
            ('directory', directory),
            ('text_domain', text_domain),
            ('hyperlinks', hyperlinks),
            ('form', form),
            ('paragraph', paragraph),
        ]
        input_value, output_value = {}, {}
        for name, part in parts:
            input_value[name] = part.send(None)
        # All parts are transformed as one flat list and sliced apart again.
        file_text_total = [text for name, _ in parts for text in input_value[name]]
        trans_file_text_total = self._translate_texts(file_text_total)
        start = 0
        for name, part in parts:
            end = start + len(input_value[name])
            chunk = trans_file_text_total[start:end]
            start = end
            try:
                part.send(chunk)
                output_value[name] = chunk
            except Exception as e:
                logging.error('Error in %s: %s', name, e, exc_info=True)
            finally:
                part.close()
        return input_value, output_value

    def _swap_paragraph_text(self, paragraphs):
        """Two-step generator that yields paragraph texts, then replaces them."""
        texts = [parg.text for parg in paragraphs]
        self.file_character_num += sum(len(text) for text in texts)
        new_texts = yield texts
        if new_texts is None:
            return
        for parg, text in zip(paragraphs, new_texts):
            try:
                parg.text = text
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # Header text
    def get_docx_header(self, file):
        """Yield the non-empty header paragraph texts, then write replacements."""
        headers = [parg for section in file.sections
                   for parg in section.header.paragraphs if parg.text]
        yield from self._swap_paragraph_text(headers)

    # Footer text
    def get_docx_footer(self, file):
        """Yield the non-empty footer paragraph texts, then write replacements."""
        footers = [parg for section in file.sections
                   for parg in section.footer.paragraphs if parg.text]
        yield from self._swap_paragraph_text(footers)

    # Table of contents
    def get_docx_catalog(self, file):
        """Yield the text of runs inside top-level ``sdt`` blocks, then replace it.

        The table of contents is recognised purely by tag: any direct child of
        the body whose tag ends with ``main}sdt``.
        """
        child_iters = []
        for child in file.element.body:
            if not self._tag_endswith(child, 'main}sdt'):
                continue
            for ci in child.iter():
                if self._tag_endswith(ci, 'main}r') and (ci.text or '').strip():
                    child_iters.append(ci)
        catalog = [ci.text for ci in child_iters]
        self.file_character_num += sum(len(text) for text in catalog)
        catalog = yield catalog
        if catalog is None:
            return
        for ci, cl in zip(child_iters, catalog):
            try:
                ci.text = cl
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # Text boxes
    def get_docx_textbox(self, file):
        """Yield one string per text-box paragraph, then write replacements.

        Runs and paragraph-property markers are collected beneath every
        element whose tag ends with ``AlternateContent`` or ``textbox``.  A
        ``pPr`` marker starts a new entry; the yielded list begins with an
        empty entry for any runs seen before the first marker.
        """
        child_iters = []
        seen = set()
        for child in file.element.body.iter():
            if not self._tag_endswith(child, ('AlternateContent', 'textbox')):
                continue
            for ci in child.iter():
                # A textbox nested in an AlternateContent is reached through
                # both matches; keep each node once so the second visit cannot
                # overwrite the replacement text.  id() is stable here because
                # child_iters keeps every recorded node alive.
                if self._tag_endswith(ci, ('main}r', 'main}pPr')) and id(ci) not in seen:
                    seen.add(id(ci))
                    child_iters.append(ci)
        textbox = ['']
        for ci in child_iters:
            if ci.tag.endswith('main}pPr'):
                textbox.append('')
                continue
            text = ci.text or ''
            if text:
                textbox[-1] += text
                # Only text-bearing runs are blanked.
                # NOTE(review): presumably assigning text to a python-docx run
                # element replaces all of its content — confirm.
                ci.text = ''
        self.file_character_num += sum(len(text) for text in textbox)
        textbox = yield textbox
        if textbox is None:
            return
        index, written = 0, False
        for ci in child_iters:
            if ci.tag.endswith('main}pPr'):
                index += 1
                written = False
            elif not written:
                # The first run of each paragraph carries the whole new text.
                try:
                    ci.text = textbox[index]
                    written = True
                except Exception as e:
                    logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # Hyperlinks
    def get_docx_hyperlink(self, file):
        """Yield the non-blank text nodes inside hyperlinks, then replace them."""
        child_iters = []
        child_texts = []
        for child in file.element.body.iter():
            if not self._tag_endswith(child, 'main}hyperlink'):
                continue
            for ci in child.iter():
                if self._tag_endswith(ci, 'main}t') and ci.text and ci.text.strip():
                    child_iters.append(ci)
                    child_texts.append(ci.text)
                    ci.text = ''
        self.file_character_num += sum(len(text) for text in child_texts)
        trans_texts = yield child_texts
        if trans_texts is None:
            return
        for ci, tt in zip(child_iters, trans_texts):
            try:
                ci.text = tt
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # Body paragraphs
    def get_docx_paragraph(self, file):
        """Yield the run text of each non-empty paragraph, then append replacements.

        Existing runs are blanked during extraction; the replacement is added
        as a new run at the end of the paragraph.
        """
        paragraphs_nonull = [parg for parg in file.paragraphs if parg.text]
        paragraphs_run_text = [self._drain_run_text(parg) for parg in paragraphs_nonull]
        self.file_character_num += sum(len(text) for text in paragraphs_run_text)
        paragraphs_run_text_trs = yield paragraphs_run_text
        if paragraphs_run_text_trs is None:
            return
        for parg, prtt in zip(paragraphs_nonull, paragraphs_run_text_trs):
            try:
                parg.add_run(prtt)
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # Tables
    def get_docx_table(self, file):
        """Yield the run text of each non-empty table paragraph, then replace it.

        The replacement goes into the paragraph's first run, or into a new run
        when the paragraph has none.
        """
        tables_parg = []
        table_run_text = []
        for table in file.tables:
            for row in table.rows:
                for cell in row.cells:
                    # Filter per cell, after earlier cells were drained: a
                    # paragraph reached a second time is empty by then.
                    for paragraph in [parg for parg in cell.paragraphs if parg.text]:
                        tables_parg.append(paragraph)
                        table_run_text.append(self._drain_run_text(paragraph))
        self.file_character_num += sum(len(text) for text in table_run_text)
        table_run_text = yield table_run_text
        if table_run_text is None:
            return
        for tp, trt in zip(tables_parg, table_run_text):
            try:
                runs = tp.runs
                if runs:
                    runs[0].text = trt
                else:
                    tp.add_run(trt)
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # Font settings
    def docx_set_style(self, file):
        """Set the font of every paragraph style to 微软雅黑, including East Asian text."""
        for style in file.styles:
            if not isinstance(style, docx.styles.style._ParagraphStyle):
                continue
            try:
                style.font.name = u'微软雅黑'
                # The East Asian font is a separate attribute that font.name
                # does not set.
                style._element.rPr.rFonts.set(qn('w:eastAsia'), u'微软雅黑')
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
测试:
# Demo run: only executed when the file is run as a script, so importing the
# module no longer tries to open the hard-coded sample document.
if __name__ == '__main__':
    path_in = 'E:/data/converted.docx'
    path_out = 'E:/data/converted(trans).docx'
    content = {"filename": path_in, "trans_file": path_out}
    df = DocxFileTrans()
    df.docx_file_trans(content)
目前 Python 对 Word 文档的解析支持并不完善,需要组合多种手段才能完整提取文档内容;以上代码即通过不同方法的组合,提取 docx 文档各部分的文本内容: