python docx文档内容提取与写入(汇总)

通过 python 提取 docx 文件中的文本内容,并将处理后的文本写回文档,包括:段落、表格、文本框、页眉页脚、目录、超链接、脚注等各处文本

import os
import re
import docx
import lxml
import shutil
import logging
from io import BytesIO
from lxml import etree
from pydocx import PyDocX
from docx.shared import Pt
from docx.oxml.ns import qn
from zipfile import ZipFile

# 脚注提取与写入
class TransFootnote(object):
    """Extract and write back the footnote text of a .docx file.

    The archive is unpacked into a private temporary directory,
    ``word/footnotes.xml`` is edited there, and the archive is re-packed over
    ``docx_path``.  Intended use: call ``get_footnote_text()``, transform the
    returned strings, then pass the same-length list to
    ``save_footnote_text()``.
    """

    def __init__(self, docx_path):
        """Remember the document path; nothing is opened until extraction."""
        self.docx_path = docx_path
        # First text-bearing node of every footnote; new text is written here.
        self.footnotes = None
        # Parsed footnotes.xml tree and its on-disk location once extracted.
        self.tree = None
        self.footnote_xml = None
        # Private scratch directory and the archive member names, in order.
        self._extract_dir = None
        self._member_names = []

    def _cleanup(self):
        """Remove the scratch directory, if one is currently in use."""
        if self._extract_dir and os.path.exists(self._extract_dir):
            shutil.rmtree(self._extract_dir, ignore_errors=True)
        self._extract_dir = None

    def get_footnote_text(self):
        """Return one string per footnote, or None when there is nothing to do.

        Every text node of a footnote is blanked and its text concatenated;
        the first such node is remembered so the replacement text can later be
        written into it.  Returns None if the document has no
        ``word/footnotes.xml`` or if any error occurs (the error is logged).
        """
        import tempfile

        self._cleanup()
        try:
            # A private temp dir avoids clobbering a sibling directory that
            # happens to share the document's base name.
            self._extract_dir = tempfile.mkdtemp(prefix='docx_footnote_')
            with ZipFile(self.docx_path) as docx_zip:
                docx_zip.extractall(self._extract_dir)
                self._member_names = docx_zip.namelist()
            self.footnote_xml = os.path.join(self._extract_dir, 'word', 'footnotes.xml')
            if not os.path.exists(self.footnote_xml):
                self._cleanup()
                return None
            self.tree = etree.parse(self.footnote_xml)
            anchors = []
            footnotes_text = []
            for index, footnote in enumerate(self.tree.getroot()):
                # The first two children are skipped, as in the original code;
                # presumably Word's separator / continuation-separator
                # entries -- TODO confirm against the OOXML schema.
                if index <= 1:
                    continue
                pieces = []
                for node in footnote.iter():
                    if node.text and node.text.strip():
                        pieces.append(node.text)
                        node.text = ''
                        if len(pieces) == 1:
                            anchors.append(node)
                if pieces:
                    footnotes_text.append(''.join(pieces))
            self.footnotes = anchors
            return footnotes_text
        except Exception as e:
            logging.error('Error: ' + str(e), exc_info=True)
            self._cleanup()
            return None

    def save_footnote_text(self, trans_text):
        """Write ``trans_text`` into the footnotes and re-pack the document.

        ``trans_text`` is paired positionally with the footnotes found by
        ``get_footnote_text()``.  A falsy value only discards the scratch
        directory.  Errors are logged, never raised; the scratch directory is
        always removed.
        """
        from zipfile import ZIP_DEFLATED

        try:
            if not trans_text:
                return
            if self.tree is None or self.footnotes is None or not self._extract_dir:
                logging.error('Error: footnotes have not been extracted')
                return
            for node, text in zip(self.footnotes, trans_text):
                node.text = text
            self.tree.write(self.footnote_xml, encoding='utf-8', xml_declaration=True, standalone=True)
            # Build the new archive beside the original and swap it in, so a
            # failure half-way through cannot leave a truncated document.
            tmp_path = self.docx_path + '.tmp'
            try:
                with ZipFile(tmp_path, mode='w', compression=ZIP_DEFLATED) as new_docx_file:
                    for name in self._member_names:
                        new_docx_file.write(os.path.join(self._extract_dir, name), name)
                os.replace(tmp_path, self.docx_path)
            finally:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
        except Exception as e:
            logging.error('Error: ' + str(e), exc_info=True)
        finally:
            self._cleanup()


class DocxFileTrans(object):
    def __init__(self):
        self.file_character_num = 0

    # DOCX文本转换
    def docx_file_trans(self, file_info):
        """Convert the text of one .docx file and save the result.

        Args:
            file_info: mapping with ``'filename'`` (document to read) and
                ``'trans_file'`` (path the converted document is saved to).

        Returns:
            ``(input_value, output_value)``: dicts keyed by document part
            (the keys produced by ``docx_trans_text`` plus ``'footnote'``)
            holding the original and the converted texts respectively.
        """
        document = docx.Document(file_info['filename'])

        # One extraction generator per document part.  They are only started
        # inside docx_trans_text, in the order they are passed below.
        header = self.get_docx_header(document)
        footer = self.get_docx_footer(document)
        directory = self.get_docx_catalog(document)
        text_domain = self.get_docx_textbox(document)
        hyperlinks = self.get_docx_hyperlink(document)
        form = self.get_docx_table(document)
        paragraph = self.get_docx_paragraph(document)

        input_value, output_value = self.docx_trans_text(
            header, footer, directory, text_domain, hyperlinks, form, paragraph)

        # Font settings are applied before the document is written out.
        self.docx_set_style(document)

        trans_file = file_info['trans_file']
        logging.info('file.save(trans_file)')
        document.save(trans_file)

        # Footnotes are handled on the saved file, after the main document
        # has been written.
        tfnote = TransFootnote(trans_file)
        footnote_text = tfnote.get_footnote_text()
        input_value['footnote'] = footnote_text
        if footnote_text:
            self.file_character_num += sum(len(text) for text in footnote_text)
            footnote_text = ["***" + text for text in footnote_text]
        output_value['footnote'] = footnote_text
        tfnote.save_footnote_text(footnote_text)

        # Previously computed but discarded; returning it lets callers inspect
        # what was read and written.
        return input_value, output_value


    # 文档内的文本翻译
    def docx_trans_text(self, header, footer, directory, text_domain, hyperlinks, form, paragraph):
        input_value, output_value = {}, {}
        file_text_total = []
        text_num = [0]
        input_value['header'] = header.send(None)
        input_value['footer'] = footer.send(None)
        input_value['directory'] = directory.send(None)
        input_value['text_domain'] = text_domain.send(None)
        input_value['hyperlinks'] = hyperlinks.send(None)
        input_value['form'] = form.send(None)
        input_value['paragraph'] = paragraph.send(None)
        for key in input_value.keys():
            file_text_total.extend(input_value[key])
            text_num.append(text_num[-1] + len(input_value[key]))
        # 文本转换
        trans_file_text_total = ["***"+text for text in file_text_total]

        file_part = [header, footer, directory, text_domain, hyperlinks, form, paragraph]
        part_name = ['header', 'footer', 'directory', 'text_domain', 'hyperlinks', 'form', 'paragraph']
        for i in range(len(text_num)-1):
            try:
                file_part[i].send(trans_file_text_total[text_num[i]:text_num[i+1]])
                output_value[part_name[i]] = trans_file_text_total[text_num[i]:text_num[i+1]]
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
                print('error:',part_name[i])
            finally:
                file_part[i].close()
        return input_value, output_value


    # 获取页眉文本
    def get_docx_header(self, file):
        headers = [parg for section in file.sections for parg in section.header.paragraphs if parg.text]
        header_text = [header.text for header in headers]
        self.file_character_num += sum([len(text) for text in header_text])
        header_text = yield header_text
        if header_text == None:
            return
        for hd, ht in zip(headers, header_text):
            try:
                hd.text = ht
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None
    
    # 获取页脚文本
    def get_docx_footer(self, file):
        footers = [parg for section in file.sections for parg in section.footer.paragraphs if parg.text]
        footer_text = [footer.text for footer in footers]
        self.file_character_num += sum([len(text) for text in footer_text])
        footer_text = yield footer_text
        if footer_text == None:
            return
        for fo, ft in zip(footers, footer_text):
            try:
                fo.text = ft
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None
    
    # docx中目录翻译
    def get_docx_catalog(self, file):
        children = file.element.body.getchildren()
        child_iters = []
        for child in children:
            # 通过类型判断目录
            if child.tag.endswith('main}sdt'):
                for ci in child.iter():
                    if ci.tag.endswith('main}r') and ci.text.strip():
                        child_iters.append(ci)
        catalog = [ci.text for ci in child_iters]
        self.file_character_num += sum([len(text) for text in catalog])
        catalog = yield catalog
        if catalog == None:
            return
        for ci, cl in zip(child_iters, catalog):
            try:
                ci.text = cl
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None
            
    # docx中文本框翻译
    def get_docx_textbox(self, file):
        children = file.element.body.iter()
        child_iters = []
        for child in children:
            # 通过类型判断目录
            if child.tag.endswith(('AlternateContent', 'textbox')):
                for ci in child.iter():
                    if ci.tag.endswith(('main}r', 'main}pPr')):
                        child_iters.append(ci)
        textbox = ['']
        for ci in child_iters:
            if ci.tag.endswith('main}pPr'):
                textbox.append('')
            else:
                textbox[-1] += ci.text
            ci.text = ''
        self.file_character_num += sum([len(text) for text in textbox])
        textbox = yield textbox
        if textbox == None:
            return
        i, k = 0, 0
        for ci in child_iters:
            if ci.tag.endswith('main}pPr'):
                i += 1
                k = 0
            elif k == 0:
                try:
                    ci.text = textbox[i]
                    k = 1
                except Exception as e:
                    logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # docx中超链接翻译
    def get_docx_hyperlink(self, file):
        child_iters = []
        child_texts = []
        children = file.element.body.iter()
        for child in children:
            if child.tag.endswith('main}hyperlink'):
                for ci in child.iter():
                    if ci.tag.endswith( 'main}t') and ci.text and ci.text.strip():
                            child_iters.append(ci)
                            child_texts.append(ci.text)
                            ci.text = ''
        self.file_character_num += sum([len(text) for text in child_texts])
        trans_texts = yield child_texts
        if trans_texts == None:
            return
        for ci, tt in zip(child_iters, trans_texts):
            try:
                ci.text = tt
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None
    
    # docx中段落翻译
    def get_docx_paragraph(self, file):
        paragraphs_nonull = [parg for parg in file.paragraphs if parg.text]
        paragraphs_run_text = []
        for parg in paragraphs_nonull:
            runt = []
            for run in parg.runs:
                if run.text:
                    runt.append(run.text)
                    run.text = ''
            paragraphs_run_text.append(''.join(runt))
        self.file_character_num += sum([len(text) for text in paragraphs_run_text])
        paragraphs_run_text_trs = yield paragraphs_run_text
        if paragraphs_run_text_trs == None:
            return
        for parg, prtt in zip(paragraphs_nonull, paragraphs_run_text_trs):
            try:
                parg.add_run(prtt)
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None

    # docx中表格翻译
    def get_docx_table(self, file):
        tables_parg = []
        table_run_text = []
        for table in file.tables:
            cell_set = [cell for row in table.rows for cell in row.cells]
            for cell in cell_set:
                paragraphs_nonull = [parg for parg in cell.paragraphs if parg.text]
                tables_parg.extend(paragraphs_nonull)
                for paragraph in paragraphs_nonull:
                    runt = []
                    for run in paragraph.runs:
                        if run.text:
                            runt.append(run.text)
                            run.text = ''
                    table_run_text.append(''.join(runt))
        self.file_character_num += sum([len(text) for text in table_run_text])
        table_run_text = yield table_run_text
        if table_run_text == None:
            return
        for tp, trt in zip(tables_parg, table_run_text):
            try:
                tp.runs[0].text = trt
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)
        yield None
        
    # docx字体设置
    def docx_set_style(self, file, font_name=u'微软雅黑'):
        """Set the font of every paragraph style in the document.

        Args:
            file: the opened document whose ``styles`` are updated.
            font_name: font applied both as the style's font name and as its
                ``w:eastAsia`` font attribute.  Defaults to the value that
                used to be hard-coded.

        A failure on one style is logged and the remaining styles are still
        processed.
        """
        for style in file.styles:
            if not isinstance(style, docx.styles.style._ParagraphStyle):
                continue
            try:
                # Use the style object from the iteration directly instead of
                # looking it up again by name.
                style.font.name = font_name
                style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
            except Exception as e:
                logging.error('Error: ' + str(e), exc_info=True)

测试:

# Demo run: convert one document.  Guarded so that importing this module does
# not touch the file system.
if __name__ == '__main__':
    path_in = 'E:/data/converted.docx'
    path_out = 'E:/data/converted(trans).docx'
    content = {"filename": path_in, "trans_file": path_out}
    df = DocxFileTrans()
    df.docx_file_trans(content)

总结

目前python对word的解析支持不完全,需通过各种手段组合提取word文档内容,以上代码就是通过不同方法组合提取docx 文档各部分文本内容:

  • docx包自带函数直接提取段落、表格、页眉页脚
  • docx包根据标签手动提取目录、文本框、超链接
  • 将docx解析为xml文件手动提取脚注

代码中用了协程(生成器 send)的写法:先统一提取各部分文本,再统一写回。相比直接对每一部分文本逐个提取写入可以稍微提一点速度(50页文档用时3.301->3.067);如果对提取内容有后续操作(如批量翻译),则会加速很多。

你可能感兴趣的:(文档处理,python)