import os
import codecs
from bs4 import BeautifulSoup
def generate_output_filename(file_path, save_path):
    """Build the output ``.txt`` path for an input file.

    Args:
        file_path: Path of the input file; only its base name is used.
        save_path: Directory the output file should be placed in.

    Returns:
        ``save_path`` joined with the input's base name, with the final
        extension replaced by ``.txt``.
    """
    # Base name without its last extension (e.g. "page.mht" -> "page").
    stem, _ext = os.path.splitext(os.path.basename(file_path))
    return os.path.join(save_path, stem + '.txt')
def get_content_from_mht(soup):
    """Return the text content of the parsed document as a string.

    This is only an example extractor: it returns the text found under the
    ``<body>`` tag and can be adapted to the structure of the actual MHT
    files being processed.

    Args:
        soup: Parsed document exposing a ``body`` attribute whose value
            provides ``get_text()`` (e.g. a ``BeautifulSoup`` object).

    Returns:
        The text of ``soup.body``, or ``""`` when the document has no body.
    """
    body = soup.body
    if body is None:
        return ""
    return body.get_text()
def _convert_single_mht(mht_path, save_path_1, save_path_2):
    """Convert one ``.mht`` file and write its text to each distinct target."""
    # Read and parse the input BEFORE opening any output, so a missing or
    # undecodable input cannot leave truncated/empty output files behind.
    with open(mht_path, 'r', encoding='utf-8') as f_in:
        soup = BeautifulSoup(f_in, 'html.parser')
    content = get_content_from_mht(soup)

    # dict.fromkeys de-duplicates while keeping order: when both save
    # directories are the same, the file is written once instead of being
    # opened for writing through two handles at the same time.
    targets = dict.fromkeys(
        generate_output_filename(mht_path, save_dir)
        for save_dir in (save_path_1, save_path_2)
    )
    for target in targets:
        out_dir = os.path.dirname(target)
        if out_dir:
            # Create the destination directory if it does not exist yet.
            os.makedirs(out_dir, exist_ok=True)
        # newline='' disables newline translation, so the bytes written
        # are the UTF-8 encoding of ``content`` unchanged.
        with open(target, 'w', encoding='utf-8', newline='') as f_out:
            f_out.write(content)


def convert_mht_to_txt(path, save_path_1, save_path_2):
    """Extract the text of ``.mht`` file(s) and save it as UTF-8 ``.txt`` files.

    If ``path`` is a directory it is walked recursively and every file whose
    name ends with ``.mht`` is converted. If it is a single file ending with
    ``.mht`` only that file is converted. Any other ``path`` is ignored.

    Each input is read as UTF-8 text and passed straight to the HTML parser;
    no MIME / transfer-encoding decoding is performed here.

    Args:
        path: A directory to scan, or the path of one ``.mht`` file.
        save_path_1: Directory for the first copy of each ``.txt`` file.
        save_path_2: Directory for the second copy. May equal
            ``save_path_1``, in which case the file is written only once.

    Note:
        Output names are derived from the input's base name only, so inputs
        with the same name in different sub-directories overwrite each other.
    """
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for name in files:
                if name.endswith('.mht'):
                    _convert_single_mht(
                        os.path.join(root, name), save_path_1, save_path_2
                    )
    elif os.path.isfile(path) and path.endswith('.mht'):
        _convert_single_mht(path, save_path_1, save_path_2)
# Example usage.
# Guarded so that importing this module does not prompt for input.
if __name__ == "__main__":
    path = input("请输入要处理的文件路径:")
    save_path_1 = input("请输入第一个保存文件的位置(按回车默认保存):")
    save_path_2 = input("请输入第二个保存文件的位置(按回车默认不保存):")
    if save_path_1 == '':
        # Default location used when the first prompt is left empty.
        save_path_1 = '/storage/emulated/0/文件/代码print相关/结构图/字符图/'
    # An empty second answer reuses the first save location.
    convert_mht_to_txt(path, save_path_1, save_path_2 or save_path_1)
注意：上述代码仅适用于处理 .mht 格式的文件。
如果您想要将转义字符转换为相应的文字形式，可以借助 Python 的内置模块 html。以下是修改后的代码：
import os
import codecs
from bs4 import BeautifulSoup
import html
def generate_output_filename(file_path, save_path):
    """Build the output ``.txt`` path for an input file.

    Args:
        file_path: Path of the input file; only its base name is used.
        save_path: Directory the output file should be placed in.

    Returns:
        ``save_path`` joined with the input's base name, with the final
        extension replaced by ``.txt``.
    """
    # Base name without its last extension (e.g. "page.mht" -> "page").
    stem, _ext = os.path.splitext(os.path.basename(file_path))
    return os.path.join(save_path, stem + '.txt')
def get_content_from_mht(soup):
    """Return the text content of the parsed document, entities unescaped.

    This is only an example extractor: it returns the text found under the
    ``<body>`` tag and can be adapted to the structure of the actual MHT
    files being processed.

    Args:
        soup: Parsed document exposing a ``body`` attribute whose value
            provides ``get_text()`` (e.g. a ``BeautifulSoup`` object).

    Returns:
        The text of ``soup.body`` after ``html.unescape``, or ``""`` when
        the document has no body.
    """
    body = soup.body
    if body is None:
        return ""
    # Convert remaining HTML character references (e.g. "&amp;") to text.
    # NOTE(review): the parser normally decodes entities already, so this
    # pass mainly affects text that was escaped twice in the source --
    # confirm this is the intended behavior for the input files.
    return html.unescape(body.get_text())
def _convert_single_mht(mht_path, save_path_1, save_path_2):
    """Convert one ``.mht`` file and write its text to each distinct target."""
    # Read and parse the input BEFORE opening any output, so a missing or
    # undecodable input cannot leave truncated/empty output files behind.
    with open(mht_path, 'r', encoding='utf-8') as f_in:
        soup = BeautifulSoup(f_in, 'html.parser')
    content = get_content_from_mht(soup)

    # dict.fromkeys de-duplicates while keeping order: when both save
    # directories are the same, the file is written once instead of being
    # opened for writing through two handles at the same time.
    targets = dict.fromkeys(
        generate_output_filename(mht_path, save_dir)
        for save_dir in (save_path_1, save_path_2)
    )
    for target in targets:
        out_dir = os.path.dirname(target)
        if out_dir:
            # Create the destination directory if it does not exist yet.
            os.makedirs(out_dir, exist_ok=True)
        # newline='' disables newline translation, so the bytes written
        # are the UTF-8 encoding of ``content`` unchanged.
        with open(target, 'w', encoding='utf-8', newline='') as f_out:
            f_out.write(content)


def convert_mht_to_txt(path, save_path_1, save_path_2):
    """Extract the text of ``.mht`` file(s) and save it as UTF-8 ``.txt`` files.

    If ``path`` is a directory it is walked recursively and every file whose
    name ends with ``.mht`` is converted. If it is a single file ending with
    ``.mht`` only that file is converted. Any other ``path`` is ignored.

    Each input is read as UTF-8 text and passed straight to the HTML parser;
    no MIME / transfer-encoding decoding is performed here.

    Args:
        path: A directory to scan, or the path of one ``.mht`` file.
        save_path_1: Directory for the first copy of each ``.txt`` file.
        save_path_2: Directory for the second copy. May equal
            ``save_path_1``, in which case the file is written only once.

    Note:
        Output names are derived from the input's base name only, so inputs
        with the same name in different sub-directories overwrite each other.
    """
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for name in files:
                if name.endswith('.mht'):
                    _convert_single_mht(
                        os.path.join(root, name), save_path_1, save_path_2
                    )
    elif os.path.isfile(path) and path.endswith('.mht'):
        _convert_single_mht(path, save_path_1, save_path_2)
# Example usage.
# Guarded so that importing this module does not prompt for input.
if __name__ == "__main__":
    path = input("请输入要处理的文件路径:")
    save_path_1 = input("请输入第一个保存文件的位置(按回车默认保存):")
    save_path_2 = input("请输入第二个保存文件的位置(按回车默认不保存):")
    if save_path_1 == '':
        # Default location used when the first prompt is left empty.
        save_path_1 = '/storage/emulated/0/文件/代码print相关/结构图/字符图/'
    # An empty second answer reuses the first save location.
    convert_mht_to_txt(path, save_path_1, save_path_2 or save_path_1)
在这个版本的代码中，使用 html.unescape() 函数将转义字符转换为相应的文字形式。这样在保存为文本文件时，转义字符就会以正常的文本形式显示。