使用 chardet 检测文件编码及处理乱码

def save_data(line, path="new微博评论.csv"):
    """Append one piece of already-decoded text to a UTF-8 output file.

    Args:
        line: Text to append. It is written verbatim; no line terminator
            is added.
        path: Destination file. Defaults to the CSV name this script has
            always written to, so existing one-argument calls are unaffected.
    """
    # newline="" disables newline translation, so whatever line ending the
    # text already carries ("\n" or "\r\n") is written back unchanged.
    # Plain append mode is enough: the file is never read back here.
    with open(path, "a", newline="", encoding="utf-8") as out:
        out.write(line)
# Re-encode "微博评论.csv" line by line: every line that is valid UTF-8 is
# appended to the output file through save_data(); a line that is not valid
# UTF-8 is reported on stdout with its 1-based line number and skipped.
with open("微博评论.csv", "rb") as f:  # binary mode: decoding is done by hand
    for i, line in enumerate(f, start=1):
        try:
            n_line = line.decode("utf8")
        except UnicodeDecodeError as e:
            # Only decoding problems are expected here; anything else
            # (e.g. a failed write) should surface instead of being printed
            # once per line.
            print(type(e), e)
            print("=========================")
            print(i, line)
        else:
            save_data(n_line)

使用 chardet 检测编码

import chardet
 
def judge(data):
    """Return the encoding name that chardet guesses for ``data``.

    ``data`` is handed unchanged to ``chardet.detect``; only the
    ``"encoding"`` entry of the returned mapping is passed back.
    NOTE(review): chardet's documentation says this entry can be ``None``
    when no guess is possible -- callers should be prepared for that.
    """
    detection = chardet.detect(data)
    return detection["encoding"]
 
def error(e, q=1):
    """Show a message, wait for the user to press Enter, then optionally quit.

    Args:
        e: Message to display; it is passed to ``input()`` as the prompt.
        q: When truthy (the default) the process is terminated with exit
            status 0 once the user has confirmed; when falsy the function
            simply returns.

    Raises:
        SystemExit: If ``q`` is truthy.
    """
    import sys  # local import so the file's import section stays untouched

    input(e)  # blocks, keeping the message visible until Enter is pressed
    if q:
        # sys.exit() is always available, whereas the bare exit() helper is
        # injected by the ``site`` module and is absent under ``python -S``
        # or in frozen builds. The exit status (0) is kept as before.
        sys.exit(0)
 
def trans(path):
    r"""Read a text file of unknown encoding and return its lines.

    The encoding is guessed from the raw bytes with ``judge()``.

    Args:
        path: Path of the file to read.

    Returns:
        A list of strings. The decoded text is split on "\n" and every piece
        is right-stripped. If that yields a single piece (no "\n" in the
        text), that piece is split on "\r" instead and the parts are
        returned as they are.

    If the bytes cannot be decoded, the exception is printed and ``error()``
    is called with its default arguments.
    """
    # Context manager so the handle is closed even if read() fails.
    with open(path, "rb") as fh:
        data = fh.read()

    coding = judge(data)
    if coding is None:
        # No guess available (e.g. an empty file): try UTF-8 rather than
        # letting decode(None) fail with a TypeError.
        coding = "utf-8"
    elif coding == "GB2312":
        # GBK is a superset of GB2312, so it also decodes characters that
        # fall outside the narrower charset reported by the detector.
        coding = "GBK"

    try:
        text = data.decode(coding)
    except (UnicodeError, LookupError) as e:
        # UnicodeError: bytes do not match the guessed encoding.
        # LookupError: the guessed name is not a codec Python knows.
        print(e)
        error("[!] 无法使用此文本,请使用utf8编码的文本")
    else:
        arr = [piece.rstrip() for piece in text.split("\n")]
        if len(arr) == 1:
            # No "\n" at all: treat bare "\r" as the line separator.
            return arr[0].split("\r")
        return arr
 
# Demo: decode "123.txt" with trans() and print the value it returns.
print(trans("123.txt"))

 

你可能感兴趣的:(python基础)