目录
文件格式转换
一、doc与docx的转换
二、xls与xlsx的转换
三、docx转换txt
四、xlsx转换txt
import os
import time
from win32com import client as wc
# Batch-convert the files found in path1 into path2 through the Office/WPS
# COM automation interface (Windows only; needs pywin32 plus WPS or Word).
path1 = r'存储输入的doc文件的绝对路径'  # directory holding the input files
path2 = r'存储输出的docx文件的绝对路径'  # directory that receives the output

for file in os.listdir(path1):
    in_file = os.path.abspath(os.path.join(path1, file))
    if file.endswith('.doc'):
        word = wc.Dispatch("kwps.Application")  # drive WPS to do the conversion
        # Swap only the extension. A plain str.replace("doc", "docx") would
        # also rewrite "doc" inside the base name (document.doc ->
        # docxument.docx).
        out_name = os.path.splitext(file)[0] + '.docx'
        out_file = os.path.abspath(os.path.join(path2, out_name))
        print(in_file)
        print(out_file)
        try:
            doc = word.Documents.Open(in_file)
            try:
                # 12 is the file-format code passed to SaveAs for .docx output.
                doc.SaveAs(out_file, 12, False, "", True, "", False, False,
                           False, False)
            finally:
                doc.Close()
            print('转换成功')
        finally:
            # Always release the COM application, even when the conversion
            # fails, so no orphaned WPS process keeps the file locked.
            word.Quit()
        time.sleep(3)  # give the application time to release the file
    else:
        # Files without a .doc extension keep their name and are re-saved
        # through Word into the output directory.
        # NOTE(review): this also feeds non-Word files to Word, exactly as
        # the original script did -- confirm that is intended.
        word = wc.Dispatch("Word.Application")
        out_name = file
        out_file = os.path.abspath(os.path.join(path2, out_name))
        try:
            doc = word.Documents.Open(in_file)
            print(in_file)
            print(out_file)
            try:
                doc.SaveAs(out_file, 12, False, "", True, "", False, False,
                           False, False)
            finally:
                doc.Close()
            time.sleep(5)
            print('复制成功')
        finally:
            word.Quit()
        # NOTE(review): the source lost its indentation; this long pause is
        # assumed to belong to this branch (its comment said it prevents the
        # next file from being opened before this one is closed).
        time.sleep(40)
import os
import pandas as pd
# Directory that holds the source .xls files, and the directory that receives
# the converted .xlsx files.
xls_directory = r'xls文件的绝对路径'
xlsx_directory = r'xlsx的绝对路径'

# Convert every .xls workbook found in the source directory.
for filename in os.listdir(xls_directory):
    if not filename.endswith('.xls'):  # only handle files with the .xls extension
        continue

    # Replace only the extension; str.replace would rewrite every ".xls"
    # occurring anywhere in the file name.
    xlsx_name = os.path.splitext(filename)[0] + '.xlsx'
    xls_path = os.path.join(xls_directory, filename)
    xlsx_path = os.path.join(xlsx_directory, xlsx_name)

    # sheet_name=None loads every worksheet as {sheet name: DataFrame}.
    # The default (first sheet only) silently dropped the other sheets.
    all_sheets = pd.read_excel(xls_path, sheet_name=None)

    # Write each worksheet back under its original name.
    with pd.ExcelWriter(xlsx_path) as writer:
        for sheet_name, sheet_data in all_sheets.items():
            sheet_data.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Converted: {filename} -> {xlsx_name}")

print("Conversion completed.")
1. 安装 python-docx 包(在 cmd 中执行):
pip install python-docx
2. 运行以下代码(运行前请确保文件是 docx 格式;doc 格式不支持,需先转换为 docx)
import os
from docx import Document
def doc_to_txt(doc_path, txt_path):
    """Dump the paragraph text of one Word document into a text file.

    Args:
        doc_path: Path of the document handed to ``Document``.
        txt_path: Path of the text file to create or overwrite.

    Only the text of ``doc.paragraphs`` is written, one paragraph per line,
    encoded as UTF-8.
    """
    document = Document(doc_path)

    # Gather the text first so the output file is only opened once the
    # document has been read.
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    text = '\n'.join(lines)

    with open(txt_path, mode='w', encoding='utf-8') as out:
        out.write(text)
def batch_convert_docs_to_txt(doc_dir, txt_dir):
    """Convert every .docx file in ``doc_dir`` to a .txt file in ``txt_dir``.

    Args:
        doc_dir: Directory scanned (non-recursively) for documents.
        txt_dir: Directory that receives the text files; created if missing.

    Each output file keeps the base name of its source document. Legacy
    ``.doc`` files are reported and skipped instead of being passed to
    ``doc_to_txt``, because python-docx can only open the .docx format and
    one such file would otherwise abort the whole batch.
    """
    os.makedirs(txt_dir, exist_ok=True)

    for filename in os.listdir(doc_dir):
        stem, ext = os.path.splitext(filename)
        ext = ext.lower()  # accept .DOCX / .Docx as well

        if ext == '.doc':
            print(f"Skipped {filename}: convert it to .docx first")
            continue
        if ext != '.docx':
            continue
        if filename.startswith('~$'):
            # Word lock files of open documents are not real documents.
            continue

        doc_path = os.path.join(doc_dir, filename)
        txt_path = os.path.join(txt_dir, stem + '.txt')
        doc_to_txt(doc_path, txt_path)
        print(f"Converted {doc_path} to {txt_path}")
# Folder that receives the generated .txt files.
txt_directory = r'txt文件目录'
# Folder holding the documents to convert.
doc_directory = r'docx文件目录'

# Convert the whole source folder in one pass.
batch_convert_docs_to_txt(doc_dir=doc_directory, txt_dir=txt_directory)
import pandas as pd
import openpyxl
import time
import os
# Export every worksheet of every .xlsx workbook in `inpath` to a
# comma-separated text file named <workbook>_<sheet>.txt in `outpath`.

# Input directory
inpath = r'xlsx文件目录'
# Output directory
outpath = r'txt文件目录'

print('START:' + str(time.ctime()))

for afile in os.listdir(inpath):
    # Compare the real extension. The old afile[-4:] check also matched names
    # without a dot, for which afile[:-5] then cut off the wrong characters.
    stem, ext = os.path.splitext(afile)
    if ext.lower() != '.xlsx':
        continue

    print(afile)
    name = os.path.join(inpath, afile)

    # Open the workbook once and reuse it for every sheet, instead of loading
    # it with openpyxl and then re-reading the file for each sheet. The
    # context manager also closes the file afterwards.
    with pd.ExcelFile(name) as workbook:
        sheets = workbook.sheet_names
        for sheet in sheets:
            print(sheet)
            # header=None: the first row is treated as data.
            df = workbook.parse(sheet, header=None)
            print('开始写入txt文件...')
            # Write the sheet without header row or index column.
            txt_path = os.path.join(outpath, stem + '_' + sheet + '.txt')
            df.to_csv(txt_path, header=False, sep=',', index=False)
            print('文件写入成功!')

print('END:' + str(time.ctime()))
PS:代码非原创,借鉴了其他博主的内容,此处仅做归纳整理。