# Batch-convert .ppt files to .pptx with Python, and batch-extract the text of each presentation into a Word document.

import os
from pptx import Presentation
from docx import Document
import os.path
import win32com.client

class PPT2Word(object):
    """Extract the text of a .pptx file and save it as a same-named .docx.

    Args:
        filepath: Path of the presentation to read. It is opened with
            ``Presentation`` as soon as the object is constructed.
    """

    def __init__(self, filepath):
        self.wordfile = Document()
        self.filepath = filepath
        self.pptx = Presentation(self.filepath)

    def main(self):
        """Copy every text-frame paragraph into the document and save it.

        Slides and their shapes are visited in iteration order. Shapes whose
        ``has_text_frame`` is false are skipped. Each paragraph's text becomes
        one paragraph of the output document, which is written next to the
        source file with the extension replaced by ``.docx``.
        """
        for slide in self.pptx.slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    self.wordfile.add_paragraph(paragraph.text)

        # Swap only the final extension. A plain str.replace would also
        # rewrite ".pptx"/".ppt" inside directory names, and would leave the
        # path untouched for other spellings (e.g. ".PPTX"), making save()
        # overwrite the source presentation.
        save_path = os.path.splitext(self.filepath)[0] + ".docx"
        self.wordfile.save(save_path)


# Usage: create a working directory that contains this script plus one or
# more sub-folders, each holding the presentations to convert, then run the
# script from inside that directory.
if __name__ == "__main__":
    powerpoint = win32com.client.Dispatch('PowerPoint.Application')
    # NOTE(review): the return value is discarded; presumably this call is only
    # here to generate the COM type-library cache. Confirm it is still needed.
    win32com.client.gencache.EnsureDispatch('PowerPoint.Application')
    powerpoint.Visible = 1

    try:
        # Only sub-folders hold presentations. Filtering on isdir() skips this
        # script (whatever it is named) and any other loose files, which
        # os.listdir() below could not open.
        dir_list = [
            name for name in os.listdir()
            if os.path.isdir(name) and name != ".idea"
        ]
        print(dir_list)

        for folder in dir_list:
            ppt_list = os.listdir(folder)
            print(ppt_list)

            # Stems of the legacy .ppt files in this folder. A same-named
            # .pptx is (re)generated from them, so it must not be processed
            # a second time as an independent entry.
            legacy_stems = {
                os.path.normcase(os.path.splitext(name)[0])
                for name in ppt_list
                if os.path.splitext(name)[1].lower() == ".ppt"
            }

            for ppt in ppt_list:
                stem, ext = os.path.splitext(ppt)
                ext = ext.lower()

                # Skip Office lock files ("~$name") and everything that is
                # not a presentation, e.g. the .docx files of an earlier run.
                if ppt.startswith("~$") or ext not in (".ppt", ".pptx"):
                    continue
                if ext == ".pptx" and os.path.normcase(stem) in legacy_stems:
                    continue

                # A legacy .ppt file is first re-saved as .pptx.
                if ext == ".ppt":
                    # The path handed to PowerPoint must be absolute.
                    sub_path = os.path.abspath(os.path.join(folder, ppt))
                    ppt1 = powerpoint.Presentations.Open(sub_path)
                    try:
                        ppt1.SaveAs(os.path.splitext(sub_path)[0] + '.pptx')
                    finally:
                        # Close it so converted files do not pile up as open
                        # PowerPoint windows.
                        ppt1.Close()
                    ppt = stem + '.pptx'
                    print(ppt)

                # Extract the text of the .pptx file and save it.
                ppt2word = PPT2Word(filepath=os.path.join(folder, ppt))
                ppt2word.main()
    finally:
        # Quit PowerPoint only when nothing else is open in it, so that
        # presentations this script did not open are left alone.
        if powerpoint.Presentations.Count == 0:
            powerpoint.Quit()

# You may also be interested in: (python)