需求:有一个文本,其中存在一些网络上常见的 Unicode 转义码(形如 \uXXXX),现在需要对文本中的这些乱码进行处理,把它们还原成可读的文字
思路:对所有unicode码进行查找替换
# -*- coding: utf-8 -*-
# @Time : 2018/7/28 14:51
# @Author : xiangchaoming
# @QQ : 239036082
import re
from functools import reduce
# Input and output locations used by the script.
INPUT_PATH = "用户购买产品详细电话姓名地址.csv"
OUTPUT_PATH = "c.csv"

# A literal backslash, "u", then exactly four hexadecimal digits.  Restricting
# the digits to hex (instead of any word character) keeps sequences such as
# "\uzzzz" from being treated as escapes they cannot be decoded as.
UNICODE_ESCAPE_RE = re.compile(r"\\u([0-9a-fA-F]{4})")


def find_unicode_escapes(text):
    r"""Return the distinct ``\uXXXX`` escapes found in *text*.

    The result preserves the order in which each escape first appears.
    An empty list is returned when *text* contains no escapes.
    """
    # dict.fromkeys de-duplicates in linear time while keeping insertion order.
    return list(dict.fromkeys(m.group(0) for m in UNICODE_ESCAPE_RE.finditer(text)))


def decode_unicode_escapes(text):
    r"""Replace every literal ``\uXXXX`` escape in *text* with its character.

    Example: the 18-character input ``\u91d1\u601d\u542b`` becomes the three
    characters U+91D1, U+601D, U+542B.  Text that is not a well-formed escape
    (four hex digits after ``\u``) is left untouched.

    The substitution is done in a single pass, so a character produced by one
    escape is never re-interpreted as the start of another escape.
    """
    return UNICODE_ESCAPE_RE.sub(lambda m: chr(int(m.group(1), 16)), text)


def main():
    """Read the input CSV, decode its unicode escapes and write the result."""
    print("读取文件》》》")
    # The context manager guarantees the input handle is closed after reading.
    with open(INPUT_PATH, 'r', encoding='utf-8') as src:
        text = src.read()

    print("乱码处理中》》》")
    # Report each distinct escape once, together with the character it maps to.
    for code in find_unicode_escapes(text):
        print(code, " ===> ")
        print("*" * 15, decode_unicode_escapes(code))
    text = decode_unicode_escapes(text)

    print("筛选结束\n开始写入》》》")
    # NOTE(review): no encoding is given, so the platform default is used, and
    # errors="ignore" silently drops characters that encoding cannot represent.
    # Kept as in the original script -- confirm whether UTF-8 output is wanted.
    with open(OUTPUT_PATH, "w", errors="ignore") as dst:
        dst.write(text)
    print("运行结束")


if __name__ == "__main__":
    main()