【常用代码】文件夹递归转换代码文件字符集为utf8 bom

代码来自 ChatGPT。前几个版本的 bug 主要是没有考虑 0D0A(CRLF)与 0A(LF)行尾混用等问题。UTF-8 BOM 与不带 BOM 的 UTF-8 哪个更好,尚未仔细调查。

# C:\Users\xxx\AppData\Local\Programs\Python\Python313\python.exe code_utf8.py code_folder
import os
import sys
import chardet # pip install chardet

# File-name suffixes (lower-case, with leading dot) of the C/C++ source and
# header files that this script will convert.
CODE_EXTENSIONS = {".c", ".h", ".cpp", ".hpp", ".cc", ".cxx"}

# Byte sequence that marks a file as "UTF-8 with BOM".
_UTF8_BOM = b"\xef\xbb\xbf"


def _normalize_to_crlf(text):
    """Return *text* with CRLF, lone LF and lone CR all rewritten as CRLF."""
    # Collapse everything to LF first so that an existing CRLF is not doubled.
    return text.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "\r\n")


def _decode_bytes(content):
    """Decode raw file bytes to ``str``.

    Strict UTF-8 is tried first; chardet is only consulted for content that
    is not valid UTF-8.

    Args:
        content: The complete file content as ``bytes``.

    Returns:
        The decoded text, or ``None`` when chardet cannot name an encoding.

    Raises:
        UnicodeDecodeError: The bytes are not valid in the detected encoding.
        LookupError: chardet named an encoding Python has no codec for.
    """
    # NUL bytes are valid UTF-8 but in a source file they almost certainly
    # mean BOM-less UTF-16/32, so leave such content to chardet.
    if b"\x00" not in content:
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError:
            pass

    encoding = chardet.detect(content)["encoding"]
    if not encoding:
        return None

    # GB18030 is a superset of GB2312 and GBK, so it decodes everything those
    # codecs accept and also text that uses characters outside plain GB2312.
    if encoding.lower() in ("gb2312", "gbk"):
        encoding = "gb18030"

    # Strict decoding: a wrong guess must fail loudly rather than be written
    # back with U+FFFD replacement characters.
    return content.decode(encoding)


def convert_to_utf8_bom(file_path):
    """Rewrite *file_path* in place as UTF-8 with BOM and CRLF line endings.

    Files that already start with the UTF-8 BOM are left untouched. Empty
    files and files whose encoding cannot be determined are skipped. Any
    error is reported on stdout and the file is left as it was, so one bad
    file does not abort a batch run.

    Args:
        file_path: Path of the file to convert.

    Returns:
        None. Progress and errors are printed.
    """
    try:
        with open(file_path, "rb") as f:
            content = f.read()

        if content.startswith(_UTF8_BOM):
            print(f"文件 {file_path} 已经是 UTF-8 BOM 编码,跳过。")
            return

        # An empty file has no detectable encoding; skip it like an
        # undetectable one.
        text = _decode_bytes(content) if content else None
        if text is None:
            print(f"无法检测文件 {file_path} 的编码,跳过。")
            return

        # Encode before opening for writing so that an encoding failure
        # cannot leave a truncated file behind.
        data = _normalize_to_crlf(text).encode("utf-8-sig")

        with open(file_path, "wb") as f:
            f.write(data)

        print(f"转换 {file_path} 为 UTF-8 BOM 编码,行尾统一为 CRLF")
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job.
        print(f"转换 {file_path} 失败: {e}")

def process_directory(directory):
    """Convert every matching source file found under *directory*.

    The tree is walked recursively with ``os.walk``; each file whose name
    ends with one of ``CODE_EXTENSIONS`` is passed to
    ``convert_to_utf8_bom``.

    Args:
        directory: Root directory to scan.
    """
    # CODE_EXTENSIONS holds lower-case suffixes, so compare against the
    # lower-cased name: MAIN.C or Foo.HPP must be converted too.
    suffixes = tuple(CODE_EXTENSIONS)
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(suffixes):
                convert_to_utf8_bom(os.path.join(root, file))

if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the directory to scan.
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print("用法: python3 convert_to_utf8_bom.py <代码目录>")
        sys.exit(1)

    (code_dir,) = arguments
    if os.path.isdir(code_dir):
        process_directory(code_dir)
        print("转换完成。")
    else:
        # Not an existing directory: report it and exit with failure status.
        print(f"错误: {code_dir} 不是一个有效的目录")
        sys.exit(1)

你可能感兴趣的:(python)