python 读取json与xml格式化等处理

文章目录

      • 三种 pandas 读取 json 文件的方式
      • json2excel
        • 批量读取 json 文件(中文 json)
        • 读取excel中某列的json数据(每个单元格数据格式为:数据格式一)
        • 特殊json文件格式化
        • 多票据数据解析(一张图片有几张增值税发票或火车票)
      • 写json文件
      • xml2json
      • xml2excel

三种 pandas 读取 json 文件的方式

from pandas.io.json import json_normalize
import pandas as pd
import json
import time
 
# Load the raw JSON text once. The context manager guarantees the file handle
# is closed (the original left it open for the lifetime of the process).
with open('AgriculturalDisease_train_annotations.json') as json_file:
    data_str = json_file.read()

# ------------------------ Benchmark: json_normalize ------------------------
# Each benchmark repeats the full parse 300 times so the timings are comparable.
start_time = time.time()
for i in range(0, 300):
    data_list = json.loads(data_str)
    df = json_normalize(data_list)
end_time = time.time()
print (end_time - start_time)  # took about 109 s in the author's run

# --------------------- Benchmark: hand-built DataFrame ---------------------
start_time = time.time()
for i in range(0, 300):
    data_list = json.loads(data_str)
    data = [[d["disease_class"], d["image_id"]] for d in data_list]
    df = pd.DataFrame(data, columns=["disease_class", "image_id"])
end_time = time.time()
print (end_time - start_time)  # took about 22 s in the author's run

# -------------------------- Benchmark: read_json ---------------------------
start_time = time.time()
for i in range(0, 300):
    df = pd.read_json(data_str, orient='records')
end_time = time.time()
print (end_time - start_time)  # took about 36 s in the author's run

# Final result via read_json.
df2 = pd.read_json(data_str, orient='records')

# Final result via the hand-built approach (the fastest of the three above).
data_list = json.loads(data_str)
data = [[d["disease_class"], d["image_id"]] for d in data_list]
df = pd.DataFrame(data, columns=["disease_class", "image_id"])
df.head(5)

三种代码输出均如下
  disease_class	      	  image_id
0		1		62fd8bf4d53a1b94fbac16738406f10b.jpg
1		1		0bdec5cccbcade6b6e94087cb5509d98.jpg
2		1		8951e940341f77c8d361c1872c67b16d.jpg
3		1		7ed158da58c451f75fb790530d6f19cc.jpg
4		1		9b7399aa-1c3c-4137-ae4e-196cd23fe573___FREC_Sc...

技巧:先将复杂的json串整理成以下两种格式之一(单个对象,或由对象组成的列表),再使用data_list = json.loads(data_str)读取即可

{"error_code":40007,"error_msg":"fail to recognize"}

[{"department": "abcdef",
 "query_result": {"code": "1000", "description": "1000"}, 
 "is_invoice": 1, 
 "imageName": "./imgs/8888888.jpeg", 
 "reco_result": {"total": "", "invoice_no": "123", "create_date": "", "check_code": ""}}]

json2excel

批量读取 json 文件(中文 json)

./out_file下两个json文件内容如下:

out_01.txt 内容为:"{"name_ID":"12343","name":"张三","身份编码":"未知"}"
out_02.txt 内容为:"{"name_ID":"12344","name":"李四","身份编码":"98983"}"
import json
import os

def img_w_h(text_path):
    """Read every regular file directly inside *text_path* as UTF-8 text.

    Args:
        text_path: Directory to scan (not recursive).

    Returns:
        A ``(data_str_list, img_name_list)`` pair of parallel lists: the text
        of each file and the matching file name.

    Entries are visited in sorted order so the result is reproducible
    (``os.listdir`` returns names in arbitrary order). Sub-directories are
    skipped instead of crashing ``open``.
    """
    data_str_list = []
    img_name_list = []
    for filename in sorted(os.listdir(text_path)):
        file_path = text_path+'/'+filename
        if not os.path.isfile(file_path):
            continue
        print("获取文件:",file_path)
        # Context manager: the original never closed the handles it opened.
        with open(file_path,"r",encoding='UTF-8') as text_file:
            data_str = text_file.read()
        data_str_list.append(data_str)
        img_name_list.append(filename)
    print("data_str_list",data_str_list)
    return data_str_list,img_name_list

def json_to_excel(data_str_list):
    """Collect the "身份编码" value of every record whose value is "未知".

    Each input string holds one JSON object, optionally preceded by a UTF-8
    byte-order mark and optionally wrapped in one extra pair of double quotes
    (``"{...}"``), which is the format of the sample files.

    Args:
        data_str_list: Iterable of raw file contents.

    Returns:
        A list with one "未知" entry per matching record.

    Raises:
        json.JSONDecodeError: If a non-empty entry is not valid JSON.
        KeyError: If a record has no "身份编码" key.

    The original only handled strings that started with a BOM and silently
    ignored everything else; BOM-less input is now processed as well.
    """
    data_all = []
    for data_str in data_str_list:
        content = data_str
        if content.startswith(u'\ufeff'):
            content = content[1:]
        # Tolerate surrounding whitespace such as a trailing newline, which
        # would otherwise defeat the quote stripping below.
        content = content.strip()
        if not content:
            continue
        if len(content) >= 2 and content[0] == '"' and content[-1] == '"':
            content = content[1:-1]
        text = json.loads(content)
        if text["身份编码"] =="未知":
            data_all.append(text["身份编码"])
    return data_all

if __name__ == "__main__":
    # Script entry point: load every file under ./out_file, then report the
    # records that json_to_excel selects.
    text_path = "./out_file"
    loaded = img_w_h(text_path)
    data_str_list, img_name_list = loaded
    data_all = json_to_excel(data_str_list)
    print("data_all:",data_all)

输出:
获取文件: ./out_file/out_01.txt
获取文件: ./out_file/out_02.txt
data_str_list ['\ufeff"{"name_ID":"12343","name":"张三","身份编码":"未知"}"', '\ufeff"{"name_ID":"12344","name":"李四","身份编码":"98983"}"']
data_all: ['未知']

读取excel中某列的json数据(每个单元格数据格式为:数据格式一)

数据格式一:json数据格式

"""
[{"department": "abcdef",
 "query_result": {"code": "1000", "description": "1000"}, 
 "is_invoice": 1, 
 "imageName": "./imgs/8888888.jpeg", 
 "reco_result": {"total": "", "invoice_no": "01111111", "create_date": "", "check_code": "", "invoice_code": ""}}, 
 {"department": "abcdef",
 "query_result": {}, 
 "is_invoice": 0, 
 "imageName": "./imgs/51111111.jpeg", 
 "reco_result": {}},
 ...]
"""
import json
import pandas as pd
import xlrd

excel_path = "C:\\Users\\Desktop\\test_data.xlsx"
def read_excel(excel_path):
    """Return the first-column values of sheet "Sheet1", skipping row 0.

    Args:
        excel_path: Path of the workbook to open with ``xlrd``.

    Returns:
        A list with the column-0 value of every row from index 1 onwards
        (row 0 is presumably a header row - confirm against the data).
    """
    book = xlrd.open_workbook(excel_path)
    first_sheet = book.sheet_by_name("Sheet1")
    return [
        first_sheet.row_values(row_index)[0]
        for row_index in range(1, first_sheet.nrows)
    ]

def get_data(excel_path):
    """Flatten the JSON records stored in the first column of a workbook.

    Every cell returned by ``read_excel`` must contain a JSON list of record
    objects. One output tuple is produced per record.

    Args:
        excel_path: Path of the workbook, forwarded to ``read_excel``.

    Returns:
        A list of 14-tuples ``(imageNo, businessType, total, invoice_no,
        create_date, check_code, invoice_code, is_invoice, billId, imageName,
        applyNum, department, code, description)``.

    Raises:
        json.JSONDecodeError: If a cell does not contain valid JSON.

    Records lacking a required top-level key are reported on stdout and
    skipped. Fields of the nested ``reco_result`` / ``query_result`` objects
    default to "NAN" individually, so one missing key no longer discards the
    sibling fields that are present.
    """
    All_data = []
    for cell in read_excel(excel_path):      # one entry per worksheet cell
        for data_dict in json.loads(cell):   # one entry per record in the cell
            try:
                imageNo = data_dict["imageNo"]
                businessType = data_dict["businessType"]
                reco_result = data_dict["reco_result"]
                is_invoice = data_dict["is_invoice"]
                billId = data_dict["billId"]
                imageName = data_dict["imageName"]
                applyNum = data_dict["applyNum"]
                department = data_dict["department"]
                query_result = data_dict["query_result"]
            except (KeyError, TypeError):
                # Missing required key, or the record is not a JSON object.
                print("数据格式出错!")
                continue

            # A non-object payload is treated like an empty one: all "NAN".
            if not isinstance(reco_result, dict):
                reco_result = {}
            if not isinstance(query_result, dict):
                query_result = {}

            total = reco_result.get("total", "NAN")
            invoice_no = reco_result.get("invoice_no", "NAN")
            create_date = reco_result.get("create_date", "NAN")
            check_code = reco_result.get("check_code", "NAN")
            invoice_code = reco_result.get("invoice_code", "NAN")
            code = query_result.get("code", "NAN")
            description = query_result.get("description", "NAN")

            All_data.append((imageNo, businessType, total, invoice_no, create_date, check_code,
                             invoice_code, is_invoice, billId, imageName,
                             applyNum, department, code, description))

    return All_data
# Column labels, in the same order as the tuples produced by get_data.
column_names = [
    "imageNo", "businessType", "total", "invoice_no", "create_date", "check_code",
    "invoice_code", "is_invoice", "billId", "imageName",
    "applyNum", "department", "code", "description",
]
All_data = get_data(excel_path)
df = pd.DataFrame(All_data, index=None, columns=column_names)
# Write the flattened records to a spreadsheet, then signal completion.
df.to_excel('C:\\Users\\Desktop/001.xls')
print("done!")

特殊json文件格式化

最原始数据:
{"41196516":"{\"type\":\"身份证正面\",\"name\":\"徐XX\",\"sex\":\"男\",\"people\":\"汉\",...,"41196243":"{\"error_code\"

处理成如下json文件:(非常不正规)
{"41196516":"{"type":"身份证正面","name":"XX","sex":"","people":"","birthday":"19XX年7XX","address":"广州市花都区*****","id_number":"4401***15","issue_authority":"广州市XXX","validity":"20XX.XX.13-20XX.XX.13","time_cost":{"recognize":348,"preprocess":28},"complete":true,"border_covered":false,"head_covered":false,"head_blurred":false,"gray_image":true,"error_code":0,"error_msg":"OK"}",
"41196243":"{"error_code":40007,"error_msg":"fail to recognize"}",
"41196510":"{"type":"二代身份证","name":"XX","sex":"","people":"","birthday":"19XX年9XX","address":"江苏省江阴市XXX","id_number":"320XXX17","time_cost":{"recognize":398,"preprocess":29},"complete":true,"border_covered":false,"head_covered":false,"head_blurred":false,"gray_image":false,"error_code":0,"error_msg":"OK"}",
"41197139":"{"type":"身份证背面","issue_authority":"佛山市XXX分局","validity":"2005.XX.XX-2025.XX.XX","time_cost":{"recognize":464,"preprocess":48},"complete":true,"error_code":0,"error_msg":"OK"}"}

格式化展示:
{"41196516":"{"type":"身份证正面",
			  "name":"徐XX",
			  "sex":"男",
			  "people":"汉",
			  "birthday":"19XX年7月XX日",
			  "address":"广州市花都区*****号",
			  "id_number":"4401***15",
			  "issue_authority":"广州市XXX局",
			  "validity":"20XX.XX.13-20XX.XX.13",
			  "time_cost":{"recognize":348,"preprocess":28},
			  "complete":true,
			  "border_covered":false,
			  "head_covered":false,
			  "head_blurred":false,
			  "gray_image":true,
			  "error_code":0,
			  "error_msg":"OK"}",
"41196243":"{"error_code":40007,"error_msg":"fail to recognize"}",
"41196510":"{"type":"二代身份证",
			 "name":"魏XX",
			 "sex":"男",
			 "people":"汉",
			 "birthday":"19XX年9月XX日",
			 "address":"江苏省江阴市XXX号",
			 "id_number":"320XXX17",
			 "time_cost":{"recognize":398,"preprocess":29},
			 "complete":true,
			 "border_covered":false,
			 "head_covered":false,
			 "head_blurred":false,
			 "gray_image":false,
			 "error_code":0,
			 "error_msg":"OK"}",
"41197139":"{"type":"身份证背面",
		    "issue_authority":"佛山市XXX分局",
			"validity":"2005.XX.XX-2025.XX.XX",
			"time_cost":{"recognize":464,"preprocess":48},
			"complete":true,
			"error_code":0,
			"error_msg":"OK"}"}

解析代码如下:

import json
import pandas as pd


# Read the raw recognition dump; the context manager closes the handle, which
# the original one-liner never did.
with open('D:/XXX/XXX文档/reize_result20181227.txt',"r",encoding="utf-8") as raw_file:
    data_str = raw_file.read()
# Every value in the dump is an escaped JSON string; dropping all backslashes
# yields the non-standard `"id":"{...}"` layout that get_data understands.
data_str0 = data_str.replace("\\","")
print(data_str0)

imgName_list = []
def get_data(data_str0):
    """Parse the backslash-stripped id-card dump into one tuple per image.

    The input looks like ``{"<id>":"{...json...}","<id>":"{...json...}"}``,
    i.e. each value is a JSON object wrapped in an extra pair of quotes, so
    the text as a whole is not valid JSON.

    Args:
        data_str0: The dump with every backslash already removed.

    Returns:
        A list of 19-tuples ``(imgName, type_, name, sex, people, birthday,
        address, id_number, issue_authority, validity, recognize, preprocess,
        complete, border_covered, head_covered, head_blurred, gray_image,
        error_code, error_msg)``. Missing optional fields become "NAN".

    Raises:
        json.JSONDecodeError: If a record body is not valid JSON.
        KeyError: If a record lacks "error_code" or "error_msg".
    """
    decoder = json.JSONDecoder()
    # Optional fields copied verbatim, in output order, around the timings.
    leading_keys = ("type", "name", "sex", "people", "birthday", "address",
                    "id_number", "issue_authority", "validity")
    trailing_keys = ("complete", "border_covered", "head_covered",
                     "head_blurred", "gray_image")

    # Split once: each record body starts right after `":"{`, and the id of
    # that record is the tail of the preceding segment. (The original redid
    # this split twice per record.)
    segments = data_str0.split("\":\"{")

    All_data = []
    for head, tail in zip(segments, segments[1:]):
        # The id is whatever follows the last quote of the previous segment,
        # so ids of any length work (the original assumed 8 characters).
        imgName = head.rpartition("\"")[2]
        print("imgName", imgName)

        # raw_decode parses the first complete JSON object and ignores the
        # trailing `","<next id>` or closing `"}` text. This also handles the
        # last record when the input has no trailing newline, which the
        # original string clean-up could not.
        candidate = "{" + tail
        data_list, end = decoder.raw_decode(candidate)
        print("img_str1:", candidate[:end])

        # Both timings are reported only when the whole group is present.
        try:
            time_cost = data_list["time_cost"]
            recognize, preprocess = time_cost["recognize"], time_cost["preprocess"]
        except (KeyError, TypeError):
            recognize = preprocess = "NAN"

        # Required fields: a KeyError here propagates, as it did originally.
        error_code = data_list["error_code"]
        error_msg = data_list["error_msg"]

        All_data.append(
            (imgName,)
            + tuple(data_list.get(key, "NAN") for key in leading_keys)
            + (recognize, preprocess)
            + tuple(data_list.get(key, "NAN") for key in trailing_keys)
            + (error_code, error_msg)
        )
    return All_data
# Column labels, in the same order as the tuples produced by get_data.
idcard_columns = [
    "imgName", "type_", "name", "sex", "people", "birthday",
    "address", "id_number", "issue_authority", "validity",
    "recognize", "preprocess", "complete", "border_covered",
    "head_covered", "head_blurred", "gray_image", "error_code", "error_msg",
]
All_data = get_data(data_str0)

# Write the parsed records to a spreadsheet.
df = pd.DataFrame(All_data, index=None, columns=idcard_columns)
df.to_excel('D:/XXX/XXX文档/reize_result20181227.xls')

复杂json解析

  • 报错1:json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)
    原因在于:字符串里用单引号来标识字符。
    解决办法如下:将字符串里的单引号替换成双引号

多票据数据解析(一张图片有几张增值税发票或火车票)

import json


def Read_txt(txt_path):
    """Return the lines of a UTF-8 text file without their newline characters.

    Args:
        txt_path: Path of the file to read.

    Returns:
        A list with one string per line; only "\\n" characters are stripped
        from both ends, other whitespace is kept.
    """
    # The context manager closes the file even if reading raises.
    with open(txt_path, "r", encoding="utf-8") as f:
        return [line.strip('\n') for line in f]


# Ticket-type labels (Chinese) exactly as they appear in a record's "type_msg".
_IMG_TYPES = {
    'taxi_ticket': u'\u51fa\u79df\u8f66\u7968',
    'e_invoice': u'\u589e\u503c\u7a0e\u7535\u5b50\u666e\u901a\u53d1\u7968',
    'invoice': u'\u589e\u503c\u7a0e\u666e\u901a\u53d1\u7968',
    'invoice_zhuan': u'\u589e\u503c\u7a0e\u4e13\u7528\u53d1\u7968',
    'juan_invoice': u'\u589e\u503c\u7a0e\u666e\u901a\u53d1\u7968(\u5377\u7968)',
    'dinge_invoice': u'\u5b9a\u989d\u53d1\u7968',
    'train_ticket': u'\u706b\u8f66\u7968'
}

# Leading columns of every row, taken from the record itself.
_META_COLUMNS = ['file_name', 'error_code', 'expense_type_msg', 'type_msg',
                 'expense_type', 'score']

# (output column, key inside "recognize_result") pairs; every key is required
# for a ticket of the corresponding type.
_TAXI_FIELDS = (
    ('taxi_mileage', 'mileage'),
    ('taxi_time_get_on', 'time_get_on'),
    ('taxi_amount_little', 'amount_little'),
    ('taxi_place', 'place'),
    ('taxi_date', 'date'),
    ('taxi_invoice_code', 'invoice_code'),
    ('taxi_invoice_no', 'invoice_no'),
    ('taxi_time_get_off', 'time_get_off'),
    ('taxi_waiting_time', 'waiting_time'),
)
_DINGE_FIELDS = (
    ('dinge_amount_little', 'amount_little'),
    ('dinge_invoice_no', 'invoice_no'),
    ('dinge_invoice_code', 'invoice_code'),
)
_TRAIN_FIELDS = (
    ('train_entrance_info', 'entrance_info'),
    ('train_date_time', 'date_time'),
    ('train_purchase_type', 'purchase_type'),
    ('train_no', 'train_no'),
    ('train_station_to', 'station_to'),
    ('train_seat', 'seat'),
    ('train_id_name', 'id_name'),
    ('train_little', 'amount_little'),
    ('train_seat_class', 'seat_class'),
    ('train_station_from', 'station_from'),
    ('train_issuing_station', 'issuing_station'),
    ('train_invoice_no', 'invoice_no'),
    ('train_issuing_no', 'issuing_no'),
)

# VAT-invoice columns in output order; column name == key in the record.
_INVOICE_COLUMNS = (
    'buyer_name', 'amount_big', 'amount_little', 'invoice_code', 'issuer',
    'seller_tax_id', 'machine_code', 'checker', 'check_code',
    'seller_bank_info', 'buyer_address', 'pretax_amount', 'date', 'receiptor',
    'invoice_no', 'buyer_tax_id', 'tax_amount', 'seller_name',
    'seller_address', 'invoice_name', 'buyer_bank_info',
)
# Invoice fields that may be absent without invalidating the record.
_INVOICE_OPTIONAL = frozenset((
    'issuer', 'machine_code', 'checker', 'seller_bank_info', 'buyer_address',
    'pretax_amount', 'receiptor', 'tax_amount', 'seller_address',
    'buyer_bank_info',
))
# Columns read from the first entry of the invoice's "details" list.
# 'tax_rate' appears twice on purpose: the existing output layout has it twice.
_DETAIL_COLUMNS = ('count', 'name', 'tax', 'unit_price', 'amount', 'tax_rate',
                   'type', 'unit', 'tax_rate')

# Everything after the meta columns; '||' separates the ticket-type sections.
_VALUE_COLUMNS = (
    [column for column, _ in _TAXI_FIELDS] + ['||']
    + list(_INVOICE_COLUMNS) + list(_DETAIL_COLUMNS) + ['||']
    + [column for column, _ in _DINGE_FIELDS] + ['||']
    + [column for column, _ in _TRAIN_FIELDS]
)


def _optional_words(result, key):
    """Return ``result[key]['item_words']``, or '' when the field is absent."""
    try:
        return result[key]['item_words']
    except (LookupError, TypeError):
        return ''


def _ticket_values(type_msg, result):
    """Build the value columns of one row, starting from all-blank fields."""
    values = dict.fromkeys(_VALUE_COLUMNS, '')
    values['||'] = '||'

    if type_msg == _IMG_TYPES['taxi_ticket']:
        required = _TAXI_FIELDS
    elif type_msg == _IMG_TYPES['train_ticket']:
        required = _TRAIN_FIELDS
    elif type_msg == _IMG_TYPES['dinge_invoice']:
        required = _DINGE_FIELDS
    elif type_msg in (_IMG_TYPES['e_invoice'], _IMG_TYPES['invoice'],
                      _IMG_TYPES['invoice_zhuan'], _IMG_TYPES['juan_invoice']):
        required = [(key, key) for key in _INVOICE_COLUMNS
                    if key not in _INVOICE_OPTIONAL]
        for key in _INVOICE_OPTIONAL:
            values[key] = _optional_words(result, key)
        # Commas would shift the columns of the comma-separated output.
        values['buyer_address'] = str(values['buyer_address']).replace(',', '_')
        # Only the first detail line is exported.
        try:
            detail = result['details'][0]
        except (LookupError, TypeError):
            detail = {}
        for key in _DETAIL_COLUMNS:
            values[key] = _optional_words(detail, key)
    else:
        # Unknown ticket type: the row carries only the meta columns.
        required = ()

    # Required fields raise on absence so the caller can flag the input line.
    for column, key in required:
        values[column] = result[key]['item_words']
    return values


def Parse_data(data_list,save_path):
    """Flatten multi-ticket recognition results into a comma-separated file.

    Each input line is either a bare image name ending in "jpg" (copied to the
    output unchanged) or ``<file name>,<JSON list of ticket results>``. One
    output row is written per ticket result.

    Args:
        data_list: Iterable of input lines (see ``Read_txt``).
        save_path: Output directory prefix; the file name
            'result_BX3010000201906_xls.txt' is appended to it verbatim, so it
            should end with a path separator.

    Returns:
        None. The result is the written file.

    A line whose JSON cannot be parsed, or whose record lacks a required
    field, gets a "json read error!" line; rows already written for earlier
    tickets of that line are kept and the line's remaining tickets are skipped.

    Every row starts from blank fields. The original reused one dictionary for
    the whole run, so values of an earlier ticket leaked into the columns of
    later rows. Optional invoice fields are now read independently of each
    other instead of in all-or-nothing groups.
    """
    # NOTE(review): the output uses the platform default encoding, as before,
    # while the input is read as UTF-8 - confirm what consumers expect.
    with open(save_path+'result_BX3010000201906_xls.txt','w+') as f:
        f.write(','.join(_META_COLUMNS + _VALUE_COLUMNS))
        f.write(u"\n")
        for data in data_list:
            if data.endswith('jpg'):
                f.write(data)
                f.write(u"\n")
                continue

            # Everything after the first comma is the JSON payload.
            filename, _, filejson = data.partition(",")
            print('filename2:', filename)
            try:
                for result_dict in json.loads(filejson):
                    row = [filename,
                           str(result_dict['error_code']),
                           result_dict['expense_type_msg'],
                           result_dict['type_msg'],
                           result_dict['expense_type'],
                           str(result_dict['score'])]
                    values = _ticket_values(result_dict['type_msg'],
                                            result_dict['recognize_result'])
                    row.extend(str(values[column]) for column in _VALUE_COLUMNS)
                    f.write(','.join(row))
                    f.write(u"\n")
            except (ValueError, LookupError, TypeError, AttributeError):
                f.write("json read error!")
                f.write(u"\n")
                print('json read error!')



if __name__ == '__main__':
    # Script entry point: read the raw result lines, then write the flattened
    # table next to them.
    save_path = './ok/'
    txt_path = './ok/result_BX3010000201906.txt'
    read_txt = Read_txt(txt_path)
    parse_data = Parse_data(read_txt, save_path)
    print("done!")

写json文件

import json
import os


def get_img(file_path):
    """Return the path of every file found under *file_path*, recursively.

    Each path is the directory reported by ``os.walk`` joined to the file name
    with a literal "/".
    """
    return [
        folder + "/" + entry
        for folder, _subdirs, entries in os.walk(file_path)
        for entry in entries
    ]
	
def json_str(file_path):
    """Build one record per file under *file_path*.

    Every record is ``{"ImageName": "/image/bus/" + <path>, "id": <fixed id>}``
    where <path> is the path returned by ``get_img``.
    """
    return [
        {"ImageName": "/image/bus/" + image, "id": "8abs63twy2001"}
        for image in get_img(file_path)
    ]
	
file_path = "./image/ocr"
dict_str = json_str(file_path)
# Use a separate name for the serialized text: the original assigned it to
# `json_str`, which replaced the json_str() function defined above.
json_text = json.dumps(dict_str)

with open("./dict_str_to_json.json","w") as json_file:
    json_file.write(json_text)
    print("done!")

xml2json

其中xml文件为labelimg软件标注图片生成的

import json
import xml.etree.ElementTree as ET


def convert_annotation(xml_path, classes=None):
    """Extract the bounding box of each labelled object from an annotation XML.

    Args:
        xml_path: Path of the XML file; it is expected to contain ``object``
            elements with a ``name`` child and a ``bndbox`` child holding
            ``xmin``/``ymin``/``xmax``/``ymax``.
        classes: Optional collection of valid class names. When omitted, a
            module-level ``CLASSES`` is used if one exists; if neither is
            available the validity check is skipped (the original raised
            NameError because ``CLASSES`` is not defined in this script).

    Returns:
        A dict mapping class name to ``(xmin, ymin, xmax, ymax)`` as ints.
        If several objects share a class name, the last one wins.

    Unknown classes are only reported on stdout; they are still included in
    the result, as before.
    """
    if classes is None:
        classes = globals().get('CLASSES')

    # Passing the path lets ElementTree open and close the file itself; the
    # original opened a handle and never closed it.
    tree = ET.parse(xml_path)
    root = tree.getroot()

    data = {}
    for obj in root.iter('object'):
        cls = obj.find('name').text
        if classes is not None and cls not in classes:
            print('not a valid object', xml_path)
            print(cls)
        xmlbox = obj.find('bndbox')
        b = (int(xmlbox.find('xmin').text),
             int(xmlbox.find('ymin').text),
             int(xmlbox.find('xmax').text),
             int(xmlbox.find('ymax').text))
        data[str(cls)]=b
        print('data["'+str(cls)+'"]:',data[str(cls)])
    return data


if __name__ == "__main__":
    # Script entry point: convert one annotation file and store the result
    # next to it, with the ".xml" suffix swapped for ".json".
    xml_path = './temp/tempImg_0.xml'
    data = convert_annotation(xml_path)
    json_str = json.dumps(data)

    with open(xml_path[:-4] + ".json", "w") as json_file:
        json_file.write(json_str)
        print("done!")

xml2excel

非标准xml解析

{idcard201320110100002=
	<data>
		<message>
			<status>2</status>
			<value>识别完成</value>
		</message>
		<cardsinfo>
			<card type="idcard_face">
				<item desc="姓名"></item>
				<item desc="性别"></item>
				<item desc="民族"></item>
				<item desc="出生"></item>
				<item desc="住址"></item>
				<item desc="公民身份号码"></item>
				<item desc="人像">/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAH7AY8DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL+Pu4qNXaRsE8U8w8807YI1zQBEpKtxVe7y4xVlfmah4cnn/Gqleg6UAYczys5AHH0pqeaG4AH4VrMi7zwKjcDPQUgP//Z</item>
				</card>
		</cardsinfo>
	</data>==@@==##}

首先以文本的形式读入,删除xml文本前后多余的字符串,形成标准的xml


	<data>
		<message>
			<status>2</status>
			<value>识别完成</value>
		</message>
		<cardsinfo>
			<card type="idcard_face">
				<item desc="姓名"></item>
				<item desc="性别"></item>
				<item desc="民族"></item>
				<item desc="出生"></item>
				<item desc="住址"></item>
				<item desc="公民身份号码"></item>
				<item desc="人像">/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcUFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAH7AY8DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL+Pu4qNXaRsE8U8w8807YI1zQBEpKtxVe7y4xVlfmah4cnn/Gqleg6UAYczys5AHH0pqeaG4AH4VrMi7zwKjcDPQUgP//Z</item>
				</card>
		</cardsinfo>
	</data>

以上识别错误的xml没有 标签。
解析代码如下:

#encoding='utf-8'
import csv
import os
import xml.etree.ElementTree as ET

def gettxt(input_txt):
    """Return the path of every ".txt" file found under *input_txt*, recursively.

    Each path is the directory reported by ``os.walk`` joined to the file name
    with a literal '/'.
    """
    return [
        folder+'/'+entry
        for folder, _subdirs, entries in os.walk(input_txt)
        for entry in entries
        if entry.endswith('.txt')
    ]

def Idcard(txtpath):
    """Parse one wrapped id-card result file into a flat tuple.

    The file content looks like ``{<name>=<data>...</data>==@@==##}``. The XML
    part is written to a sibling file with the ".xml" suffix (a side effect
    the original also had) and then parsed.

    Args:
        txtpath: Path of the ".txt" result file, read as UTF-8.

    Returns:
        ``(filename, status, value, 姓名, 性别, 民族, 出生, 住址, 公民身份号码)``.
        Fields that are absent from the XML are returned as ''.

    Raises:
        ValueError: If the text contains no '=' or no '>' delimiter.
        xml.etree.ElementTree.ParseError: If the extracted XML is malformed.
    """
    with open(txtpath,'r',encoding='utf-8') as txt_file:
        datastr = txt_file.read()
    print('type(datastr):',type(datastr))

    # Locate the XML by its delimiters rather than the fixed offsets
    # [23:-10] / [1:22], which only worked for 21-character names and a file
    # ending in exactly one newline.
    eq_pos = datastr.index('=')
    xml_end = datastr.rindex('>') + 1
    datastr2 = datastr[eq_pos + 1:xml_end]
    xml_file_path = txtpath[:-4]+'.xml'
    with open(xml_file_path,'w',encoding='utf-8') as f:
        f.write(datastr2)

    header = datastr[:eq_pos]
    filename = header[1:] if header.startswith('{') else header
    print('filename:',filename)

    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Defaults are set up front so a document without <data>, <message> or
    # <item> elements yields blanks instead of an unbound-variable error.
    status = ''
    value = ''
    data = dict.fromkeys(('姓名', '性别', '民族', '出生', '住址', '公民身份号码'), '')
    for obj in root.iter('data'):
        message = obj.find('message')
        if message is not None:
            status_node = message.find('status')
            value_node = message.find('value')
            if status_node is not None:
                status = status_node.text
            if value_node is not None:
                value = value_node.text
        for info in root.iter('item'):
            print('type(info.attrib):', type(info.attrib))
            desc = info.attrib.get('desc')
            if desc is None:
                # An item without a description cannot be mapped to a column.
                continue
            data[desc] = info.text
    return filename,status,value,data['姓名'],data['性别'],data['民族'],data['出生'],data['住址'],data['公民身份号码']

if __name__ == '__main__':
	# Script entry point: parse every result file found under xml_path and
	# write one CSV row per file.
	xml_path = './idcard2019000001225417.xml'
	excelpath = 'C:\\Users\\user\\Desktop\\wentong\\idcard_face.csv'
	txt_paths = gettxt(xml_path)
	All_data = [Idcard(txtpath) for txtpath in txt_paths]
	field_idcard = ('filename','status','value','姓名','性别','民族','出生','住址','公民身份号码')
	with open(excelpath,'w',newline = '',encoding='utf-8') as csv_file:
		csv_writer = csv.writer(csv_file)
		# Header row first, then one row per parsed file.
		csv_writer.writerow(field_idcard)
		csv_writer.writerows(All_data)

以上纯属个人原创仅供新手json解析参考,老手请忽略,谢谢

你可能感兴趣的:(python基础及相关)