本文讲述如何通过 Python 多线程,将当前目录下的 .doc 文档转为 .pdf、.docx 文档转为 .pdf,以及将 .doc 文档转为 .docx(在 Python 中,如需直接对 Word 文档进行读写,通常只能处理 .docx 格式的文档,因此需要先借助 Word 本身把 .doc 转存为 .docx)。
其中涉及到的知识点有如下几点:
import os, re
import threading
import pythoncom
from win32com.client import DispatchEx
# Mutex shared by every worker thread.
lock = threading.Lock()
# Counting semaphore that admits at most 12 threads at the same time.
semaphore = threading.Semaphore(12)
# Walk the tree rooted at `pre` and start one conversion thread per .doc file.
for root, _dirs, files in os.walk(pre):
    for file in files:
        # Word's "~$..." owner files are not real documents, and only
        # genuine .doc files (in any letter case) need converting to .docx.
        if file.startswith('~$') or not file.lower().endswith('.doc'):
            continue
        # Join with the directory currently being visited rather than `pre`,
        # so files found in subdirectories resolve to paths that exist.
        threading.Thread(
            target=WordConvertToOther.DocToDocx,
            args=(os.path.join(root, file),),
        ).start()
if __name__ == '__main__':
    # Cap the number of threads inside a conversion at 12.
    semaphore = threading.Semaphore(12)
    # Mutex used by the conversion functions.
    lock = threading.Lock()
    # Absolute path of the current working directory, with a trailing
    # backslash.
    pre = os.path.realpath('./') + '\\'
    for root, _dirs, files in os.walk(pre):
        for file in files:
            # Word's "~$..." owner files are not real documents, and only
            # genuine .doc files (in any letter case) need converting.
            if file.startswith('~$') or not file.lower().endswith('.doc'):
                continue
            # os.walk descends into subdirectories, so the path must be
            # built from the directory being visited, not from `pre`.
            docpath = os.path.join(root, file)
            # Save the .doc file as .docx.
            threading.Thread(
                target=WordConvertToOther.DocToDocx, args=(docpath,)
            ).start()
            # To save .doc/.docx files as PDF instead, accept both
            # extensions in the filter above and start
            # WordConvertToOther.DocToPdf here.
# The semaphore bounds how many threads run this section concurrently.
with semaphore:
    # COM must be initialised on each thread that uses it.
    pythoncom.CoInitialize()
    try:
        word = DispatchEx('Word.Application')
        try:
            doc = word.Documents.Open(docpath)
            # Raw string with an escaped dot, so only a literal ".doc"
            # suffix is rewritten. FileFormat=12 stores the file as .docx.
            doc.SaveAs(re.sub(r'\.doc$', '.docx', docpath), FileFormat=12)
        finally:
            # Always shut Word down, even when opening or saving fails.
            word.Quit()
    finally:
        # Balance the CoInitialize call above.
        pythoncom.CoUninitialize()
def DocToDocx(docpath):
    """Save the Word document at ``docpath`` as a .docx file next to it.

    Failures are reported on stdout instead of being raised, so one bad
    file does not stop the other worker threads.

    Args:
        docpath: Path of the document to convert. The output path is the
            same path with its extension replaced by ``.docx``.
    """
    # splitext swaps only the real extension, in any letter case, which the
    # old unescaped, case-sensitive '.doc$' regex did not guarantee.
    target = os.path.splitext(docpath)[0] + '.docx'
    # `with` releases both primitives even if cleanup below fails. The
    # original released the lock last inside `finally`, so an error in
    # word.Quit() left it held forever and blocked every other worker.
    # Holding the lock for the whole conversion means conversions run one
    # at a time, exactly as before.
    with semaphore, lock:
        com_ready = False
        word = None
        try:
            # COM must be initialised on each thread that uses it.
            pythoncom.CoInitialize()
            com_ready = True
            # Create the Word application COM object.
            word = DispatchEx('Word.Application')
            # Open the source document.
            doc = word.Documents.Open(docpath)
            # FileFormat=12 stores the document as .docx.
            doc.SaveAs(target, FileFormat=12)
            doc.Close()
        except Exception:
            # Report the file that failed.
            print(docpath + ':无法打开')
        else:
            print(os.path.basename(docpath) + " : 转换完成")
        finally:
            try:
                # `word` is still None when Word itself failed to start.
                if word is not None:
                    word.Quit()
            finally:
                # Only balance a CoInitialize that actually succeeded.
                if com_ready:
                    pythoncom.CoUninitialize()
这里与doc转docx类似,直接附上函数代码:
def DocToPdf(docpath):
    """Save the .doc/.docx document at ``docpath`` as a PDF next to it.

    Failures are reported on stdout instead of being raised, so one bad
    file does not stop the other worker threads.

    Args:
        docpath: Path of the document to convert. The output path is the
            same path with its extension replaced by ``.pdf``.
    """
    # The old regex '\.doc.*' cut the path at the FIRST ".doc" it found,
    # so a directory name containing ".doc" produced a wrong output path.
    # splitext only replaces the final extension.
    target = os.path.splitext(docpath)[0] + '.pdf'
    # `with` releases both primitives even if cleanup below fails. The
    # original released the lock last inside `finally`, so an error in
    # word.Quit() left it held forever and blocked every other worker.
    with semaphore, lock:
        com_ready = False
        word = None
        try:
            # COM must be initialised on each thread that uses it.
            pythoncom.CoInitialize()
            com_ready = True
            word = DispatchEx('Word.Application')
            doc = word.Documents.Open(docpath)
            # FileFormat=17 stores the document as PDF.
            doc.SaveAs(target, FileFormat=17)
            doc.Close()
        except Exception:
            print(docpath + ':无法打开')
        else:
            print(os.path.basename(docpath) + " : 转换完成")
        finally:
            try:
                # `word` is still None when Word itself failed to start.
                if word is not None:
                    word.Quit()
            finally:
                # Only balance a CoInitialize that actually succeeded.
                if com_ready:
                    pythoncom.CoUninitialize()
import os, re
import threading
import pythoncom
from win32com.client import DispatchEx
class WordConvertToOther:
    """Convert Word documents to other formats through Word's COM interface.

    The methods are static and are meant to be used as thread targets. They
    rely on the module-level ``semaphore`` and ``lock`` objects being
    defined before a conversion starts.
    """

    @staticmethod
    def _convert(docpath, extension, file_format):
        """Open ``docpath`` in Word and save it with a new extension.

        Failures are reported on stdout instead of being raised, so one bad
        file does not stop the other worker threads.

        Args:
            docpath: Path of the document to convert.
            extension: Extension of the output file, including the dot.
            file_format: Value passed to Word's ``SaveAs`` as ``FileFormat``.
        """
        # splitext replaces only the final extension, in any letter case.
        # The old regexes could also rewrite an earlier ".doc" in the path.
        target = os.path.splitext(docpath)[0] + extension
        # `with` releases both primitives even if cleanup below fails. The
        # original released the lock last inside `finally`, so an error in
        # word.Quit() left it held forever and blocked every other worker.
        # Holding the lock for the whole conversion means conversions run
        # one at a time, exactly as before.
        with semaphore, lock:
            com_ready = False
            word = None
            try:
                # COM must be initialised on each thread that uses it.
                pythoncom.CoInitialize()
                com_ready = True
                # Create the Word application COM object.
                word = DispatchEx('Word.Application')
                doc = word.Documents.Open(docpath)
                doc.SaveAs(target, FileFormat=file_format)
                doc.Close()
            except Exception:
                # Report the file that failed.
                print(docpath + ':无法打开')
            else:
                print(os.path.basename(docpath) + " : 转换完成")
            finally:
                try:
                    # `word` is still None when Word failed to start.
                    if word is not None:
                        word.Quit()
                finally:
                    # Only balance a CoInitialize that actually succeeded.
                    if com_ready:
                        pythoncom.CoUninitialize()

    @staticmethod
    def DocToDocx(docpath):
        """Save the .doc document at ``docpath`` as .docx (FileFormat 12)."""
        WordConvertToOther._convert(docpath, '.docx', 12)

    @staticmethod
    def DocToPdf(docpath):
        """Save the .doc/.docx document at ``docpath`` as PDF (FileFormat 17)."""
        WordConvertToOther._convert(docpath, '.pdf', 17)
if __name__ == '__main__':
    # Cap the number of threads inside a conversion at 12.
    semaphore = threading.Semaphore(12)
    # Mutex used by the conversion methods.
    lock = threading.Lock()
    # Absolute path of the current working directory, with a trailing
    # backslash.
    pre = os.path.realpath('./') + '\\'
    for root, _dirs, files in os.walk(pre):
        for file in files:
            # Word's "~$..." owner files are not real documents, and only
            # .doc/.docx files (in any letter case) can be exported to PDF.
            if file.startswith('~$'):
                continue
            if not file.lower().endswith(('.doc', '.docx')):
                continue
            # os.walk descends into subdirectories, so the path must be
            # built from the directory being visited, not from `pre`.
            docpath = os.path.join(root, file)
            # Save the .doc/.docx file as PDF.
            threading.Thread(
                target=WordConvertToOther.DocToPdf, args=(docpath,)
            ).start()
            # To save .doc files as .docx instead, restrict the filter
            # above to '.doc' and start WordConvertToOther.DocToDocx here.