Python:清洗同时含有多种编码的 txt/csv 文件

# -*- coding: utf-8 -*-
"""
Created on Wed Feb 23 11:25:05 2022

@author: mw
"""

from tqdm import tqdm 

# Source file and the cleaned output file.
file = 'idc201902-1.csv'
new_file = '2_idc201902-1.csv'

# Read the file as raw bytes: its columns use different encodings, so it
# cannot be decoded as a whole with a single codec.  The context manager
# guarantees the handle is closed even if the read fails.
with open(file, 'rb') as file1:
    r = file1.read()

# Split the raw content into lines.
r = r.split(b'\n')

# Drop the non-data lines at the top of the file: the first three lines,
# then the second of the lines that remain.
# NOTE(review): presumably the line kept at index 0 is the column header --
# confirm against the actual file layout.
del r[0:3]
del r[1]

# Split every line into its comma-separated fields (still bytes).
r = [line.split(b',') for line in tqdm(r)]

# Normalise the rows that have at least five fields.
for fields in tqdm(r):
    if len(fields) >= 5:
        fields[0] = fields[0].strip()
        fields[1] = fields[1].strip()
        # The third column is UTF-8 while the other columns are GBK, so
        # re-encode it to GBK to make the whole row decodable with one codec.
        fields[2] = fields[2].decode('utf8').encode('gbk')

# Write the cleaned rows; only the first five fields of a row are kept.
with open(new_file, "w", encoding="gbk") as f:
    print('写入替换文本...')
    for fields in tqdm(r):
        if len(fields) >= 5:
            f.write(','.join(col.decode('gbk') for col in fields[:5]) + '\n')
        else:
            # Rows with fewer than five fields are reported and skipped.
            print('错误行')
            print(fields)

全部代码

from tqdm import tqdm 
import re

def nth_repl(s, sub, repl, n):
    """Replace the n-th occurrence of ``sub`` in ``s`` with ``repl``.

    :s: the string to search
    :sub: the substring to be replaced
    :repl: the replacement string
    :n: 1-based index of the occurrence to replace

    Returns the new string, or ``s`` unchanged when it contains fewer than
    ``n`` occurrences of ``sub`` (this includes ``n < 1``).

    -------------------------
    Replace the 7th occurrence:
    nth_repl(z, '|', '_', 7)
    """
    pos = s.find(sub)
    seen = 1
    # Walk from match to match until the n-th one is reached or the
    # matches run out.
    while pos != -1 and seen != n:
        # pos + 1 resumes the search just after the start of the last match.
        pos = s.find(sub, pos + 1)
        seen += 1
    # The loop can also end because the search failed on the very step that
    # made the counter reach n, so the counter alone is not a reliable
    # signal: only replace when a match position was actually found.
    if pos == -1:
        return s
    return s[:pos] + repl + s[pos + len(sub):]

# Clean the data
def file_sub(old_file, new_file):
    """Clean a UTF-16 comma-separated text file and save it as UTF-8.

    The number of commas on the first line is taken as the expected
    separator count.  Every line has all whitespace and NUL characters
    removed.  A line with more commas than expected has its 4th comma
    replaced by ``_`` repeatedly until the count matches; a line with the
    expected number of commas, or fewer, is otherwise left as it is.

    :old_file: path of the UTF-16 encoded input file
    :new_file: path of the UTF-8 output file (overwritten)
    """
    # One compiled pattern strips whitespace and NUL characters in a single
    # pass; a raw string keeps ``\s`` a regex escape rather than an invalid
    # string escape.
    junk = re.compile(r'[\s\x00]')
    # Index of the comma that is merged away when a row has too many.
    merge_at = 4

    file_data = []  # cleaned lines, in input order

    with open(old_file, "r", encoding="utf-16") as f:
        print('开始替换...')
        rows = f.readlines()
        # An empty input has no first line to measure.
        sep_cnt = rows[0].count(',') if rows else 0

        for line in tqdm(rows):
            a = junk.sub('', line)
            # Merge surplus commas.  The loop also requires the 4th comma to
            # exist, so every pass removes exactly one comma and the loop is
            # guaranteed to terminate.
            while a.count(',') > max(sep_cnt, merge_at - 1):
                a = nth_repl(a, ',', '_', merge_at)
            file_data.append(a)

    with open(new_file, "w", encoding="utf-8") as f:   # write the cleaned text
        print('写入替换文本...')
        for line in tqdm(file_data):
            f.write(line + '\n')

    print('批量替换完成')
 
def main():
    """Clean ``erp.txt`` and write the result to ``erp.csv``."""
    source_path, target_path = 'erp.txt', 'erp.csv'
    file_sub(source_path, target_path)

你可能感兴趣的:(python txt csv文件同时含有多种编码清洗)