from pandas.io.json import json_normalize
import pandas as pd
import json
import time
# Load the raw JSON text once; every benchmark below parses this same string.
# A context manager is used so the file handle is closed right after reading.
with open('AgriculturalDisease_train_annotations.json') as json_file:
    data_str = json_file.read()
# ---------------- Benchmark: json_normalize ----------------
start_time = time.time()
for _ in range(300):
    data_list = json.loads(data_str)
    # pd.json_normalize is the supported public location of this function;
    # the pandas.io.json.json_normalize path is deprecated.
    df = pd.json_normalize(data_list)
end_time = time.time()
print(end_time - start_time)  # about 109 seconds in the author's run
# ---------------- Benchmark: build the rows by hand ----------------
# Each iteration re-parses the JSON text, keeps only the two wanted keys of
# every record, and builds the DataFrame from the resulting list of rows.
column_names = ["disease_class", "image_id"]
start_time = time.time()
for _ in range(300):
    data_list = json.loads(data_str)
    data = [[record[name] for name in column_names] for record in data_list]
    df = pd.DataFrame(data, columns=column_names)
end_time = time.time()
print(end_time - start_time)  # about 22 seconds in the author's run
# ---------------- Benchmark: pd.read_json ----------------
# Imported here so this snippet stays self-contained.
from io import StringIO

start_time = time.time()
for _ in range(300):
    # read_json expects a path or file-like object; passing the JSON text
    # itself is deprecated, so the string is wrapped in a StringIO buffer.
    df = pd.read_json(StringIO(data_str), orient='records')
end_time = time.time()
print(end_time - start_time)  # about 36 seconds in the author's run
# Imported here so this snippet stays self-contained.
from io import StringIO

# Variant 1: let pandas parse the JSON text (wrapped in a buffer, because
# passing literal JSON text to read_json is deprecated).
df2 = pd.read_json(StringIO(data_str), orient='records')
# Variant 2: parse with the json module and build the rows by hand.
data_list = json.loads(data_str)
data = [[d["disease_class"], d["image_id"]] for d in data_list]
df = pd.DataFrame(data, columns=["disease_class", "image_id"])
# A bare `df.head(5)` only displays something in an interactive session;
# print it so the preview also appears when run as a script.
print(df.head(5))
三种代码输出均如下
disease_class image_id
0 1 62fd8bf4d53a1b94fbac16738406f10b.jpg
1 1 0bdec5cccbcade6b6e94087cb5509d98.jpg
2 1 8951e940341f77c8d361c1872c67b16d.jpg
3 1 7ed158da58c451f75fb790530d6f19cc.jpg
4 1 9b7399aa-1c3c-4137-ae4e-196cd23fe573___FREC_Sc...
技巧:先将复杂的json串整理成以下格式,再使用 data_list = json.loads(data_str) 读取即可:
{"error_code":40007,"error_msg":"fail to recognize"}
[{"department": "abcdef",
"query_result": {"code": "1000", "description": "1000"},
"is_invoice": 1,
"imageName": "./imgs/8888888.jpeg",
"reco_result": {"total": "", "invoice_no": "123", "create_date": "", "check_code": ""}}]
./out_file 目录下两个json文件内容如下:
out_01.txt 内容为:"{"name_ID":"12343","name":"张三","身份编码":"未知"}"
out_02.txt 内容为:"{"name_ID":"12344","name":"李四","身份编码":"98983"}"
import json
import os
def img_w_h(text_path):
    """Read every file directly inside *text_path* as UTF-8 text.

    Args:
        text_path: Directory whose entries are read. Every entry is opened as
            a file, so the directory is expected to contain only files.

    Returns:
        A ``(data_str_list, img_name_list)`` pair of parallel lists: the text
        of each file and the corresponding file name.
    """
    data_str_list = []
    img_name_list = []
    # os.listdir returns entries in arbitrary order; sort them so the two
    # returned lists are deterministic from run to run.
    for filename in sorted(os.listdir(text_path)):
        file_path = text_path+'/'+filename
        print("获取文件:",file_path)
        # Use a context manager so each handle is closed as soon as it is read.
        with open(file_path,"r",encoding='UTF-8') as text_file:
            data_str_list.append(text_file.read())
        img_name_list.append(filename)
    print("data_str_list",data_str_list)
    return data_str_list,img_name_list
def json_to_excel(data_str_list):
    """Collect the "身份编码" value of every record whose value is "未知".

    Each input string is expected to hold one JSON object, optionally
    preceded by a UTF-8 byte-order mark and optionally wrapped in one extra
    pair of double quotes (the layout shown for the sample out_*.txt files).

    Args:
        data_str_list: Iterable of raw file contents.

    Returns:
        A list with one "未知" entry per matching record.

    Raises:
        json.JSONDecodeError: If a cleaned-up string is not valid JSON.
    """
    data_all = []
    for data_str in data_str_list:
        content = data_str
        # Drop the BOM when present, but still process strings without one
        # (previously such input left `content` undefined or was skipped).
        if content.startswith(u'\ufeff'):
            content = content[1:]
        # Strip surrounding whitespace first so a trailing newline does not
        # defeat the removal of the outer quotes below.
        content = content.strip()
        if len(content) >= 2 and content[0] == '"' and content[-1] == '"':
            content = content[1:-1]
        text = json.loads(content)
        # .get() tolerates records that lack the key altogether.
        if isinstance(text, dict) and text.get("身份编码") == "未知":
            data_all.append(text["身份编码"])
    return data_all
if __name__ == "__main__":
    # Directory whose files are loaded and then filtered.
    text_path = "./out_file"
    # img_w_h returns the file contents together with the file names.
    data_str_list, img_name_list = img_w_h(text_path)
    # Keep only the "身份编码" values selected by json_to_excel.
    data_all = json_to_excel(data_str_list)
    print("data_all:",data_all)
输出:
获取文件: ./out_file/out_01.txt
获取文件: ./out_file/out_02.txt
data_str_list ['\ufeff"{"name_ID":"12343","name":"张三","身份编码":"未知"}"', '\ufeff"{"name_ID":"12344","name":"李四","身份编码":"98983"}"']
data_all: ['未知']
数据格式一:json数据格式
"""
[{"department": "abcdef",
"query_result": {"code": "1000", "description": "1000"},
"is_invoice": 1,
"imageName": "./imgs/8888888.jpeg",
"reco_result": {"total": "", "invoice_no": "01111111", "create_date": "", "check_code": "", "invoice_code": ""}},
{"department": "abcdef",
"query_result": {},
"is_invoice": 0,
"imageName": "./imgs/51111111.jpeg",
"reco_result": {}},
...]
"""
import json
import pandas as pd
import xlrd
# Path of the input workbook that is passed to get_data() further down.
excel_path = "C:\\Users\\Desktop\\test_data.xlsx"
def read_excel(excel_path):
    """Return the first-column value of each row after the first in "Sheet1".

    The workbook at *excel_path* is opened with xlrd. Row 0 is skipped
    (presumably a header row -- confirm against the actual workbook).
    """
    sheet = xlrd.open_workbook(excel_path).sheet_by_name("Sheet1")
    return [sheet.row_values(row_index)[0] for row_index in range(1, sheet.nrows)]
# Keys read from the nested "reco_result" / "query_result" objects, in the
# order they appear in each output row.
_RECO_KEYS = ("total", "invoice_no", "create_date", "check_code", "invoice_code")
_QUERY_KEYS = ("code", "description")


def _all_or_nan(mapping, keys):
    """Return ``mapping[k]`` for every key, or "NAN" for all of them if any lookup fails."""
    try:
        return tuple(mapping[key] for key in keys)
    except (KeyError, TypeError, IndexError):
        return ("NAN",) * len(keys)


def get_data(excel_path):
    """Flatten the JSON records stored in the workbook into row tuples.

    Every cell returned by ``read_excel(excel_path)`` is parsed as a JSON
    list of record objects. A record missing any required top-level key is
    reported with a printed message and skipped. The nested "reco_result"
    and "query_result" objects are all-or-nothing: if any of their expected
    keys is missing, every field of that group becomes "NAN".

    Args:
        excel_path: Path handed to ``read_excel``.

    Returns:
        A list of 14-tuples ``(imageNo, businessType, total, invoice_no,
        create_date, check_code, invoice_code, is_invoice, billId, imageName,
        applyNum, department, code, description)``.

    Raises:
        json.JSONDecodeError: If a cell does not contain valid JSON.
    """
    All_data = []
    for cell in read_excel(excel_path):  # one JSON list per worksheet cell
        for data_dict in json.loads(cell):  # one record per list element
            try:
                imageNo = data_dict["imageNo"]
                businessType = data_dict["businessType"]
                reco_result = data_dict["reco_result"]
                is_invoice = data_dict["is_invoice"]
                billId = data_dict["billId"]
                imageName = data_dict["imageName"]
                applyNum = data_dict["applyNum"]
                department = data_dict["department"]
                query_result = data_dict["query_result"]
            except (KeyError, TypeError, IndexError):
                # Required top-level field missing: report and skip the record.
                print("数据格式出错!")
                continue
            total, invoice_no, create_date, check_code, invoice_code = _all_or_nan(
                reco_result, _RECO_KEYS)
            code, description = _all_or_nan(query_result, _QUERY_KEYS)
            All_data.append((imageNo, businessType, total, invoice_no, create_date, check_code,
                             invoice_code, is_invoice, billId, imageName,
                             applyNum, department, code, description))
    return All_data
# Column labels, in the same order as the tuples produced by get_data().
column_names = [
    "imageNo", "businessType", "total", "invoice_no", "create_date", "check_code",
    "invoice_code", "is_invoice", "billId", "imageName",
    "applyNum", "department", "code", "description",
]
All_data = get_data(excel_path)
df = pd.DataFrame(All_data, index=None, columns=column_names)
# Write the flattened records to a spreadsheet.
df.to_excel('C:\\Users\\Desktop/001.xls')
print("done!")
最原始数据:
{"41196516":"{\"type\":\"身份证正面\",\"name\":\"徐XX\",\"sex\":\"男\",\"people\":\"汉\",...,"41196243":"{\"error_code\"
处理成如下json文件:(非常不正规)
{"41196516":"{"type":"身份证正面","name":"徐XX","sex":"男","people":"汉","birthday":"19XX年7月XX日","address":"广州市花都区*****号","id_number":"4401***15","issue_authority":"广州市XXX局","validity":"20XX.XX.13-20XX.XX.13","time_cost":{"recognize":348,"preprocess":28},"complete":true,"border_covered":false,"head_covered":false,"head_blurred":false,"gray_image":true,"error_code":0,"error_msg":"OK"}",
"41196243":"{"error_code":40007,"error_msg":"fail to recognize"}",
"41196510":"{"type":"二代身份证","name":"魏XX","sex":"男","people":"汉","birthday":"19XX年9月XX日","address":"江苏省江阴市XXX号","id_number":"320XXX17","time_cost":{"recognize":398,"preprocess":29},"complete":true,"border_covered":false,"head_covered":false,"head_blurred":false,"gray_image":false,"error_code":0,"error_msg":"OK"}",
"41197139":"{"type":"身份证背面","issue_authority":"佛山市XXX分局","validity":"2005.XX.XX-2025.XX.XX","time_cost":{"recognize":464,"preprocess":48},"complete":true,"error_code":0,"error_msg":"OK"}"}
格式化展示:
{"41196516":"{"type":"身份证正面",
"name":"徐XX",
"sex":"男",
"people":"汉",
"birthday":"19XX年7月XX日",
"address":"广州市花都区*****号",
"id_number":"4401***15",
"issue_authority":"广州市XXX局",
"validity":"20XX.XX.13-20XX.XX.13",
"time_cost":{"recognize":348,"preprocess":28},
"complete":true,
"border_covered":false,
"head_covered":false,
"head_blurred":false,
"gray_image":true,
"error_code":0,
"error_msg":"OK"}",
"41196243":"{"error_code":40007,"error_msg":"fail to recognize"}",
"41196510":"{"type":"二代身份证",
"name":"魏XX",
"sex":"男",
"people":"汉",
"birthday":"19XX年9月XX日",
"address":"江苏省江阴市XXX号",
"id_number":"320XXX17",
"time_cost":{"recognize":398,"preprocess":29},
"complete":true,
"border_covered":false,
"head_covered":false,
"head_blurred":false,
"gray_image":false,
"error_code":0,
"error_msg":"OK"}",
"41197139":"{"type":"身份证背面",
"issue_authority":"佛山市XXX分局",
"validity":"2005.XX.XX-2025.XX.XX",
"time_cost":{"recognize":464,"preprocess":48},
"complete":true,
"error_code":0,
"error_msg":"OK"}"}
解析代码如下:
import json
import pandas as pd
# Read the raw recognition results; the context manager closes the file
# handle as soon as the text has been read.
with open('D:/XXX/XXX文档/reize_result20181227.txt',"r",encoding="utf-8") as result_file:
    data_str = result_file.read()
# Remove every backslash so the escaped inner JSON strings become plain text.
data_str0 = data_str.replace("\\","")
print(data_str0)
imgName_list = []
# Top-level keys copied into each row; a missing key becomes "NAN".
_ID_TEXT_FIELDS = ("type", "name", "sex", "people", "birthday", "address",
                   "id_number", "issue_authority", "validity")
_ID_FLAG_FIELDS = ("complete", "border_covered", "head_covered",
                   "head_blurred", "gray_image")


def _iter_id_records(text):
    """Yield ``(image_id, raw_json, record)`` for every ``"id":"{...}"`` entry in *text*."""
    decoder = json.JSONDecoder()
    pos = 0
    while True:
        # Each entry looks like  "<id>":"{ ...json object... }"
        marker = text.find('":"{', pos)
        if marker == -1:
            return
        quote = text.rfind('"', pos, marker)
        key_start = pos if quote == -1 else quote + 1
        image_id = text[key_start:marker]
        start = marker + 3  # index of the "{" that opens the record
        # raw_decode parses exactly one JSON value and reports where it ends,
        # so nested objects and whatever follows the record need no slicing.
        record, pos = decoder.raw_decode(text, start)
        yield image_id, text[start:pos], record


def get_data(data_str0):
    """Turn the backslash-stripped recognition dump into a list of row tuples.

    *data_str0* is the text of the form ``{"<id>":"{...}","<id>":"{...}"}``
    that results from removing the backslashes of the original file. The
    records are located by scanning instead of fixed-width slicing, so the
    function works with or without a trailing newline, with entries placed
    on separate lines, and with ids of any length.

    Args:
        data_str0: The un-escaped result text.

    Returns:
        A list of 19-tuples ``(imgName, type_, name, sex, people, birthday,
        address, id_number, issue_authority, validity, recognize, preprocess,
        complete, border_covered, head_covered, head_blurred, gray_image,
        error_code, error_msg)``. Absent optional fields are "NAN"; the two
        timing fields are both "NAN" unless "time_cost" provides both.

    Raises:
        json.JSONDecodeError: If a record is not valid JSON.
        KeyError: If a record lacks "error_code" or "error_msg".
    """
    All_data = []
    for imgName, img_str1, record in _iter_id_records(data_str0):
        print("imgName", imgName)
        print("img_str1:", img_str1)
        text_values = tuple(record.get(key, "NAN") for key in _ID_TEXT_FIELDS)
        try:
            time_cost = record["time_cost"]
            timings = (time_cost["recognize"], time_cost["preprocess"])
        except (KeyError, TypeError):
            timings = ("NAN", "NAN")
        flag_values = tuple(record.get(key, "NAN") for key in _ID_FLAG_FIELDS)
        # error_code / error_msg are mandatory and deliberately not defaulted.
        All_data.append((imgName,) + text_values + timings + flag_values
                        + (record["error_code"], record["error_msg"]))
    return All_data
# Column labels, in the same order as the tuples produced by get_data().
column_names = [
    "imgName", "type_", "name", "sex", "people", "birthday",
    "address", "id_number", "issue_authority", "validity",
    "recognize", "preprocess", "complete", "border_covered",
    "head_covered", "head_blurred", "gray_image", "error_code", "error_msg",
]
All_data = get_data(data_str0)
df = pd.DataFrame(All_data, index=None, columns=column_names)
# Write the flattened records to a spreadsheet.
df.to_excel('D:/XXX/XXX文档/reize_result20181227.xls')
复杂json解析
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)
import json
def Read_txt(txt_path):
    """Return the lines of the UTF-8 text file *txt_path* without newline characters.

    Only "\\n" characters are stripped (from both ends of each line); other
    whitespace is kept.

    Args:
        txt_path: Path of the file to read.

    Returns:
        A list with one string per line of the file.
    """
    # The context manager closes the file even if reading raises.
    with open(txt_path, "r", encoding="utf-8") as f:
        return [line.strip('\n') for line in f]
# Header of the output file. The first six columns describe the record; the
# rest are per-document-type fields, with '||' separating the groups.
_HEADER = ['file_name','error_code','expense_type_msg','type_msg','expense_type','score',"taxi_mileage","taxi_time_get_on","taxi_amount_little","taxi_place","taxi_date","taxi_invoice_code","taxi_invoice_no","taxi_time_get_off","taxi_waiting_time",'||','buyer_name','amount_big','amount_little','invoice_code','issuer','seller_tax_id','machine_code','checker','check_code','seller_bank_info','buyer_address','pretax_amount','date','receiptor','invoice_no','buyer_tax_id','tax_amount','seller_name','seller_address','invoice_name','buyer_bank_info','count','name','tax','unit_price','amount','tax_rate','type','unit','tax_rate','||','dinge_amount_little','dinge_invoice_no','dinge_invoice_code','||','train_entrance_info','train_date_time','train_purchase_type','train_no','train_station_to','train_seat','train_id_name','train_little','train_seat_class','train_station_from','train_issuing_station','train_invoice_no','train_issuing_no']
_GROUP_SEP = '||'
_VALUE_COLUMNS = _HEADER[6:]

# Values of "type_msg" that select each extraction branch.
_IMG_TYPES = {
    'taxi_ticket': u'\u51fa\u79df\u8f66\u7968',
    'e_invoice': u'\u589e\u503c\u7a0e\u7535\u5b50\u666e\u901a\u53d1\u7968',
    'invoice': u'\u589e\u503c\u7a0e\u666e\u901a\u53d1\u7968',
    'invoice_zhuan': u'\u589e\u503c\u7a0e\u4e13\u7528\u53d1\u7968',
    'juan_invoice': u'\u589e\u503c\u7a0e\u666e\u901a\u53d1\u7968(\u5377\u7968)',
    'dinge_invoice': u'\u5b9a\u989d\u53d1\u7968',
    'train_ticket': u'\u706b\u8f66\u7968',
}
_VAT_TYPES = (_IMG_TYPES['e_invoice'], _IMG_TYPES['invoice'],
              _IMG_TYPES['invoice_zhuan'], _IMG_TYPES['juan_invoice'])

# (output column, key inside "recognize_result") pairs; all are mandatory.
_TAXI_FIELDS = (
    ("taxi_mileage", "mileage"), ("taxi_time_get_on", "time_get_on"),
    ("taxi_amount_little", "amount_little"), ("taxi_place", "place"),
    ("taxi_invoice_code", "invoice_code"), ("taxi_date", "date"),
    ("taxi_invoice_no", "invoice_no"), ("taxi_time_get_off", "time_get_off"),
    ("taxi_waiting_time", "waiting_time"),
)
_TRAIN_FIELDS = (
    ("train_entrance_info", "entrance_info"), ("train_date_time", "date_time"),
    ("train_purchase_type", "purchase_type"), ("train_no", "train_no"),
    ("train_station_to", "station_to"), ("train_seat", "seat"),
    ("train_id_name", "id_name"), ("train_little", "amount_little"),
    ("train_seat_class", "seat_class"), ("train_station_from", "station_from"),
    ("train_issuing_station", "issuing_station"), ("train_invoice_no", "invoice_no"),
    ("train_issuing_no", "issuing_no"),
)
_DINGE_FIELDS = (
    ("dinge_amount_little", "amount_little"), ("dinge_invoice_no", "invoice_no"),
    ("dinge_invoice_code", "invoice_code"),
)
# VAT invoices: column name and source key are identical.
_VAT_REQUIRED = ('buyer_name', 'amount_big', 'amount_little', 'invoice_code',
                 'seller_tax_id', 'check_code', 'date', 'invoice_no',
                 'buyer_tax_id', 'seller_name', 'invoice_name')
# Optional VAT fields. Keys in one group are read in order and the first
# missing key abandons the rest of that group only.
_VAT_OPTIONAL_GROUPS = (
    ('issuer',), ('machine_code', 'checker'), ('seller_bank_info',),
    ('buyer_address', 'pretax_amount'), ('receiptor',), ('tax_amount',),
    ('seller_address',), ('buyer_bank_info',),
)
_VAT_DETAIL_KEYS = ('count', 'name', 'tax', 'unit_price', 'amount',
                    'tax_rate', 'type', 'unit')


def _extract_values(type_msg, result):
    """Return a fresh column -> value mapping for one recognition result."""
    # Start from empty strings for every record so that nothing extracted
    # from an earlier record can leak into this row.
    values = dict.fromkeys((c for c in _VALUE_COLUMNS if c != _GROUP_SEP), '')
    if type_msg == _IMG_TYPES['taxi_ticket']:
        mandatory = _TAXI_FIELDS
    elif type_msg == _IMG_TYPES['train_ticket']:
        mandatory = _TRAIN_FIELDS
    elif type_msg == _IMG_TYPES['dinge_invoice']:
        mandatory = _DINGE_FIELDS
    elif type_msg in _VAT_TYPES:
        mandatory = tuple((key, key) for key in _VAT_REQUIRED)
    else:
        return values  # unknown document type: all detail columns stay empty

    for column, key in mandatory:
        values[column] = result[key]['item_words']

    if type_msg in _VAT_TYPES:
        for group in _VAT_OPTIONAL_GROUPS:
            try:
                for key in group:
                    words = result[key]['item_words']
                    if key == 'buyer_address':
                        # Commas would break the comma-separated output line.
                        words = str(words).replace(',', '_')
                    values[key] = words
            except (LookupError, TypeError):
                pass  # optional field absent: keep the empty default
        try:
            first_detail = result['details'][0]
            for key in _VAT_DETAIL_KEYS:
                values[key] = first_detail[key]['item_words']
        except (LookupError, TypeError):
            pass  # no (complete) detail line available
    return values


def _format_row(filename, result_dict):
    """Build the comma-separated output line for one recognition result."""
    error_code = result_dict['error_code']
    expense_type_msg = result_dict['expense_type_msg']
    type_msg = result_dict['type_msg']
    expense_type = result_dict['expense_type']
    score = result_dict['score']
    values = _extract_values(type_msg, result_dict['recognize_result'])
    cells = [filename, str(error_code), expense_type_msg, type_msg,
             expense_type, str(score)]
    cells.extend(col if col == _GROUP_SEP else str(values[col])
                 for col in _VALUE_COLUMNS)
    return ','.join(cells)


def Parse_data(data_list,save_path):
    """Write the recognition results in *data_list* as comma-separated text.

    The output goes to ``save_path + 'result_BX3010000201906_xls.txt'``:
    first a header line, then one line per input entry.

    * An entry ending in ``jpg`` has no result and is written back unchanged.
    * Any other entry is ``<file name>,<JSON list of results>``; one row is
      written per result. If the JSON cannot be parsed or a mandatory field
      is missing, the line ``json read error!`` is written instead (rows
      already written for earlier results of the same entry are kept).

    Detail columns are reset for every result, so a row only contains fields
    of its own document type.

    Args:
        data_list: Iterable of input lines (see ``Read_txt``).
        save_path: Directory prefix, concatenated verbatim with the file name.

    Returns:
        None.
    """
    # The context manager closes the output file even if a write fails.
    with open(save_path+'result_BX3010000201906_xls.txt','w+') as f:
        f.write(','.join(_HEADER))
        f.write(u"\n")
        for data in data_list:
            if data.endswith('jpg'):
                f.write(data)
                f.write(u"\n")
                continue
            # Everything after the first comma is the JSON payload.
            filename, _, filejson = data.partition(",")
            print('filename2:', filename)
            try:
                for result_dict in json.loads(filejson):
                    row = _format_row(filename, result_dict)
                    f.write(row)
                    f.write(u"\n")
            except (ValueError, LookupError, TypeError):
                # ValueError covers malformed JSON; LookupError/TypeError
                # cover missing or mistyped mandatory fields.
                f.write("json read error!")
                f.write(u"\n")
                print('json read error!')
if __name__ == '__main__':
    # Input file with one recognition result per line, and the directory
    # prefix under which Parse_data creates its output file.
    txt_path = './ok/result_BX3010000201906.txt'
    save_path = './ok/'
    read_txt = Read_txt(txt_path)
    # Parse_data writes its output to disk; the returned value is kept only
    # because the original script bound it to this name.
    parse_data = Parse_data(read_txt,save_path)
    print("done!")
import json
import os
def get_img(file_path):
    """Return "<directory>/<file name>" for every file found below *file_path*.

    The tree is traversed with os.walk, so files in sub-directories are
    included; the directory part is exactly what os.walk reports.
    """
    return [
        folder + "/" + filename
        for folder, _subdirs, filenames in os.walk(file_path)
        for filename in filenames
    ]
def json_str(file_path):
    """Build one {"ImageName", "id"} dict per file found below *file_path*.

    "ImageName" is "/image/bus/" followed by the path returned by get_img();
    "id" is the same fixed string for every entry.
    """
    return [
        {"ImageName": "/image/bus/" + image_path, "id": "8abs63twy2001"}
        for image_path in get_img(file_path)
    ]
# Directory that is scanned for files.
file_path = "./image/ocr"
dict_str = json_str(file_path)
# NOTE: from here on the name json_str refers to the serialized text and no
# longer to the function defined above.
json_str = json.dumps(dict_str)
with open("./dict_str_to_json.json","w") as json_file:
    json_file.write(json_str)
print("done!")
其中xml文件为labelimg软件标注图片生成的
import json
import xml.etree.ElementTree as ET
def convert_annotation(xml_path):
    """Extract the bounding box of every <object> in an annotation XML file.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        A dict mapping each object's <name> text to a tuple
        ``(xmin, ymin, xmax, ymax)`` of ints. When several objects share a
        name, the last one wins.

    Note:
        A module-level ``CLASSES`` collection is consulted to warn about
        unexpected names; it is not defined in this snippet and must be
        provided by the surrounding script. Objects with an unexpected name
        are reported but still included in the result.
    """
    # Pass the path straight to ElementTree so that it opens and closes the
    # file itself; the previous explicit open() was never closed.
    root = ET.parse(xml_path).getroot()
    data = {}
    for obj in root.iter('object'):
        cls = obj.find('name').text
        if cls not in CLASSES:
            print('not a valid object', xml_path)
            print(cls)
        xmlbox = obj.find('bndbox')
        b = tuple(int(xmlbox.find(tag).text)
                  for tag in ('xmin', 'ymin', 'xmax', 'ymax'))
        data[str(cls)]=b
        print('data["'+str(cls)+'"]:',data[str(cls)])
    return data
if __name__ == "__main__":
    xml_path = './temp/tempImg_0.xml'
    data = convert_annotation(xml_path)
    json_str = json.dumps(data)
    # The JSON file is written next to the XML file: same path with the last
    # four characters (".xml") replaced by ".json".
    json_path = xml_path[:-4]+".json"
    with open(json_path, "w") as json_file:
        json_file.write(json_str)
    print("done!")
非标准xml解析
{idcard201320110100002=
<data>
<message>
<status>2</status>
<value>识别完成</value>
</message>
<cardsinfo>
<card type="idcard_face">
<item desc="姓名"></item>
<item desc="性别"></item>
<item desc="民族"></item>
<item desc="出生"></item>
<item desc="住址"></item>
<item desc="公民身份号码"></item>
<item desc="人像">/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAH7AY8DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL+Pu4qNXaRsE8U8w8807YI1zQBEpKtxVe7y4xVlfmah4cnn/Gqleg6UAYczys5AHH0pqeaG4AH4VrMi7zwKjcDPQUgP//Zitem>
</card>
</cardsinfo>
</data>==@@==##}
首先以文本的形式读入,删除xml文本前后多余的字符串,形成标准的xml
<data>
<message>
<status>2</status>
<value>识别完成</value>
</message>
<cardsinfo>
<card type="idcard_face">
<item desc="姓名"></item>
<item desc="性别"></item>
<item desc="民族"></item>
<item desc="出生"></item>
<item desc="住址"></item>
<item desc="公民身份号码"></item>
<item desc="人像">/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAH7AY8DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL+Pu4qNXaRsE8U8w8807YI1zQBEpKtxVe7y4xVlfmah4cnn/Gqleg6UAYczys5AHH0pqeaG4AH4VrMi7zwKjcDPQUgP//Zitem>
</card>
</cardsinfo>
</data>
以上识别错误的xml没有
标签。
解析代码如下:
#encoding='utf-8'
import csv
import os
import xml.etree.ElementTree as ET
def gettxt(input_txt):
    """Return "<directory>/<file name>" for every ".txt" file below *input_txt*.

    The tree is traversed with os.walk, so sub-directories are included.
    """
    return [
        folder + '/' + filename
        for folder, _subdirs, filenames in os.walk(input_txt)
        for filename in filenames
        if filename.endswith('.txt')
    ]
def _child_text(parent, tag):
    """Return the text of child *tag* of *parent*, or '' when the child is absent."""
    node = parent.find(tag)
    return node.text if node is not None else ''


def Idcard(txtpath):
    """Parse one wrapped ID-card recognition result.

    The file is expected to look like ``{<name>=<data>...</data>==@@==##}``.
    The XML part is written next to the input as ``<base>.xml`` and then
    parsed.

    The name and the XML span are located by searching for the first ``=``
    and the last ``</data>``, so names of any length and files with or
    without a trailing newline are handled. If either marker is missing, the
    fixed offsets of the original layout (21-character name, 10 trailing
    characters) are used instead.

    Args:
        txtpath: Path of the ``.txt`` result file (UTF-8).

    Returns:
        A 9-tuple ``(filename, status, value, 姓名, 性别, 民族, 出生, 住址,
        公民身份号码)``. ``status`` and ``value`` are '' when the XML has no
        <message> element; an <item> that is absent yields '', and an <item>
        that is present but empty yields None.

    Raises:
        xml.etree.ElementTree.ParseError: If the extracted text is not
            well-formed XML.
    """
    # The context manager closes the input file right after reading.
    with open(txtpath,'r',encoding='utf-8') as txt_file:
        datastr = txt_file.read()
    print('type(datastr):',type(datastr))

    sep = datastr.find('=')
    if sep == -1:
        sep = 22  # legacy layout: "{" + 21-character name + "="
    closing = datastr.rfind('</data>')
    if closing != -1:
        datastr2 = datastr[sep + 1:closing + len('</data>')]
    else:
        datastr2 = datastr[sep + 1:-10]  # legacy layout: fixed-width trailer

    xml_file_path = os.path.splitext(txtpath)[0] + '.xml'
    with open(xml_file_path,'w',encoding='utf-8') as f:
        f.write(datastr2)
    filename = datastr[1:sep]
    print('filename:',filename)

    root = ET.parse(xml_file_path).getroot()
    status = value = ''
    for obj in root.iter('data'):
        message = obj.find('message')
        if message is None:
            continue
        status = _child_text(message, 'status')
        value = _child_text(message, 'value')

    # Defaults for the six reported fields; other <item> entries (such as the
    # portrait) are collected too but not returned.
    data = dict.fromkeys(('姓名', '性别', '民族', '出生', '住址', '公民身份号码'), '')
    for info in root.iter('item'):
        print('type(info.attrib):', type(info.attrib))
        desc = info.attrib.get('desc')
        if desc is not None:  # skip items without a "desc" attribute
            data[desc] = info.text
    return filename,status,value,data['姓名'],data['性别'],data['民族'],data['出生'],data['住址'],data['公民身份号码']
if __name__ == '__main__':
    # Root that gettxt() walks for ".txt" result files, and the CSV to create.
    xml_path = './idcard2019000001225417.xml'
    excelpath = 'C:\\Users\\user\\Desktop\\wentong\\idcard_face.csv'
    txt_paths = gettxt(xml_path)
    # One tuple per result file, in the order gettxt() returned the paths.
    All_data = [Idcard(txtpath) for txtpath in txt_paths]
    field_idcard = ('filename','status','value','姓名','性别','民族','出生','住址','公民身份号码')
    with open(excelpath,'w',newline = '',encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Header row first, then all parsed records.
        csv_writer.writerow(field_idcard)
        csv_writer.writerows(All_data)
以上纯属个人原创仅供新手json解析参考,老手请忽略,谢谢