python pptx文本提取

最近做的项目需要把 PPT 中的文本提取出来用于其他任务,现把提取文本的过程摘出来做个记录。需要注意的是,组合形状(group shape)内部还可以嵌套其他形状,因此要用递归的方式进行解析(见 iter_shape 函数)。

import pptx

def iter_shape(shape, text_shapes):
    """Collect the text-bearing shapes reachable from *shape*.

    A shape that exposes a ``shapes`` collection is treated as a group and
    is walked recursively; the group itself is never collected. Any other
    shape is appended to *text_shapes* when its ``has_text_frame`` is true.

    Args:
        shape: A slide shape, possibly a group containing further shapes.
        text_shapes: List that is extended in place with the matching
            shapes, in depth-first order.
    """
    # Detect groups by their child collection instead of comparing exact
    # classes; this avoids relying on ``pptx.shapes.group`` having been
    # loaded as a side effect of ``import pptx``.
    if hasattr(shape, "shapes"):
        # Presence check rather than truthiness: an empty group is still a
        # group and must not fall through to the leaf branch.
        for child in shape.shapes:
            iter_shape(child, text_shapes)
        return
    if shape.has_text_frame:
        text_shapes.append(shape)

# Paragraph text in the presentation
def pptx_paragraph_text(file):
    """Return the text of every paragraph found in the presentation.

    Args:
        file: An opened presentation object (the caller in this file passes
            the result of ``pptx.Presentation``).

    Returns:
        list: One string per paragraph, ordered by slide, then by shape as
        collected by ``iter_shape``, then by paragraph within the shape.
    """
    text_shapes = []
    for slide in file.slides:
        for shape in slide.shapes:
            # iter_shape descends into grouped shapes and appends every
            # shape that has a text frame to text_shapes.
            iter_shape(shape, text_shapes)
    return [
        paragraph.text
        for shape in text_shapes
        for paragraph in shape.text_frame.paragraphs
    ]


# Table text in the presentation
def pptx_table_text(file):
    """Return the text of every table cell found in the presentation.

    Only shapes placed directly on a slide are inspected; this function
    does not descend into grouped shapes.

    Args:
        file: An opened presentation object (the caller in this file passes
            the result of ``pptx.Presentation``).

    Returns:
        list: One string per cell, ordered by slide, then by table shape,
        then in the order produced by ``table.iter_cells()``.
    """
    return [
        cell.text
        for slide in file.slides
        for shape in slide.shapes
        if shape.has_table
        for cell in shape.table.iter_cells()
    ]

def pptx_file_translate(ppt_file):
    """Open a PowerPoint file and extract its paragraph and table text.

    Args:
        ppt_file: Passed unchanged to ``pptx.Presentation``.

    Returns:
        tuple: ``(para_text, table_text)``, the results of
        ``pptx_paragraph_text`` and ``pptx_table_text`` for the opened
        presentation.
    """
    # Open the presentation
    presentation = pptx.Presentation(ppt_file)
    # Paragraph text
    para_text = pptx_paragraph_text(presentation)
    # Table text
    table_text = pptx_table_text(presentation)
    return para_text, table_text

你可能感兴趣的:(文档处理)