文本中unicode乱码处理

需求:有一个文本,其中夹杂着一些从网络上抓取来的 Unicode 转义码(形如 \u91d1),导致内容显示为乱码,现在需要把这些转义码还原成正常字符。
思路:用正则表达式查找文本中所有的 \uXXXX 转义码,并将其替换为对应的字符。

# -*- coding: utf-8 -*-
# @Time    : 2018/7/28 14:51
# @Author  : xiangchaoming
# @QQ      : 239036082
import re
from functools import reduce

# One literal "\uXXXX" escape: a backslash, "u", then exactly four HEX digits.
# (The hex-only class matters: "\w{4}" would also match things like "\user".)
_ESCAPE = re.compile(r"\\u[0-9a-fA-F]{4}")
# One or more escapes back to back, so surrogate pairs are decoded together.
_ESCAPE_RUN = re.compile(r"(?:\\u[0-9a-fA-F]{4})+")


def decode_unicode_escapes(text):
    r"""Replace literal ``\uXXXX`` escape sequences in *text* with real characters.

    Only a backslash followed by ``u`` and exactly four hexadecimal digits is
    treated as an escape; anything else (for example ``C:\users``) is left
    untouched.  Consecutive escapes are decoded as one run of UTF-16 code
    units, so a surrogate pair such as ``\ud83d\ude00`` becomes a single
    character instead of two unusable halves.

    Example:
        ``\u91d1\u601d\u542b 18958111888`` -> the three decoded characters
        followed by `` 18958111888``.

    Args:
        text: The string that may contain literal ``\uXXXX`` sequences.

    Returns:
        A new string with every escape replaced by the character it encodes.
    """

    def _decode_run(match):
        # Strip the "\u" prefixes, leaving the raw big-endian UTF-16 code units.
        code_units = bytes.fromhex(match.group(0).replace("\\u", ""))
        # "surrogatepass" keeps an unpaired surrogate instead of raising.
        return code_units.decode("utf-16-be", "surrogatepass")

    return _ESCAPE_RUN.sub(_decode_run, text)


def main():
    """Read the source CSV, decode its ``\\uXXXX`` escapes and write ``c.csv``."""
    print("读取文件》》》")

    # Context manager so the input handle is always closed.
    with open("用户购买产品详细电话姓名地址.csv", 'r', encoding='utf-8') as source:
        f_read = source.read()

    print("乱码处理中》》》")

    # Log each distinct escape once, in order of first appearance.
    for code in dict.fromkeys(_ESCAPE.findall(f_read)):
        decoded = decode_unicode_escapes(code)
        print(code, " ===> ")
        # Non-printable results (e.g. a lone surrogate half) are shown escaped
        # so the progress log itself cannot fail on them.
        print("*"*15, decoded if decoded.isprintable() else ascii(decoded))

    # Single pass over the text instead of one full replace per escape.
    f_read = decode_unicode_escapes(f_read)

    print("筛选结束\n开始写入》》》")

    # Write UTF-8 explicitly: with the platform default codec, errors="ignore"
    # would silently drop every decoded character the locale cannot represent.
    # errors="ignore" is kept so an unpaired surrogate is skipped, not fatal.
    with open("c.csv", "w", encoding="utf-8", errors="ignore") as f2:
        f2.write(f_read)

    print("运行结束")


if __name__ == "__main__":
    main()

你可能感兴趣的:(Python)