Win10 系统下使用 Python 3.8 将 Word 文档转为 txt

此功能依赖 Python 的 win32com 模块(属于 pywin32 包),需先安装:

python -m pip install pypiwin32

 

# coding=utf-8
'''
word 文档信息提取
'''
import os,fnmatch
from win32com import client as wc
from win32com.client import Dispatch

# Value 4 is wdFormatDOSText in Word's WdSaveFormat enumeration (plain-text output).
_WD_FORMAT_DOS_TEXT = 4


def word2txt(filepath, savepath=''):
	"""Convert a .doc/.docx file to a .txt file using Word COM automation.

	The text file keeps the document's base name and is written to
	``savepath``, or next to the source document when ``savepath`` is empty.

	Args:
		filepath: Path of the Word document to convert.
		savepath: Directory for the generated .txt file. Empty (the default)
			means "same directory as ``filepath``".

	Returns:
		None. For any other extension a message is printed and nothing is
		converted.

	Raises:
		Errors raised by the COM calls (Word missing, file not found, ...)
		are not caught here and propagate to the caller.
	"""
	# 1. Split the input into its directory and its file name.
	dirs, filename = os.path.split(filepath)

	# 2. Swap the Word extension for ".txt". This works on the bare file name:
	#    building it from the full path would make os.path.join() below drop
	#    savepath whenever filepath is absolute.
	if fnmatch.fnmatch(filename, '*.doc'):
		new_name = filename[:-4] + '.txt'
	elif fnmatch.fnmatch(filename, '*.docx'):
		new_name = filename[:-5] + '.txt'
	else:
		print('仅支持 doc和docx格式')
		return None

	# 3. Choose the output directory and build the target path.
	if not savepath:
		savepath = dirs
	# Word runs as a separate process, so hand it absolute paths.
	# NOTE(review): presumably Word resolves relative paths against its own
	# working directory rather than Python's -- confirm.
	source_path = os.path.abspath(filepath)
	new_path = os.path.abspath(os.path.join(savepath, new_name))

	# 4. Start (or attach to) Word and open the document.
	wordapp = wc.Dispatch('Word.Application')
	try:
		document = wordapp.Documents.Open(source_path)
		try:
			# 5. Save the document content as plain text.
			document.SaveAs(new_path, _WD_FORMAT_DOS_TEXT)
		finally:
			# Close the document even if saving failed.
			document.Close()
	finally:
		# Shut Word down so no background WINWORD process is left behind, but
		# only when no other document is open: Dispatch may have attached to
		# an instance the user is working in.
		if wordapp.Documents.Count == 0:
			wordapp.Quit()
	return None


if __name__ == '__main__':
	# Demo run: resolve three sample file names to absolute paths.
	# Only the first one is converted; the other two are defined but unused.
	filepath1, filepath2, filepath3 = (
		os.path.abspath(sample)
		for sample in (r'文档1.doc', r'文档2.docx', r'pdf文档.pdf')
	)
	word2txt(filepath1)

亲测有效 

 

你可能感兴趣的:(python,python,word转txt)