数据转换篇---json to xml格式
1、json to xml
2、分割数据集
3、提取数据
参考
1、json to xml
'''
将json文件转为类似voc中的xml格式
'''
import os
import numpy as np
import codecs
from sklearn. model_selection import train_test_split
import json
from glob import glob
import cv2
import shutil
# Input directory holding the labelme ``*.json`` files and the dataset root
# under which ``img/``, ``Annotations/``, ``JPEGImages/`` and
# ``ImageSets/Main/`` live.
labelme_path = "/app/dataset/json/"
saved_path = "/app/dataset/"
isUseTest = True


def _bounding_box(points):
    """Return ``(xmin, ymin, xmax, ymax)`` for a list of ``[x, y]`` points.

    Returns ``None`` when the list is empty or the points do not span a
    positive width and height, so the caller can skip the shape.
    """
    pts = np.array(points, dtype=float)
    if pts.ndim != 2 or pts.shape[0] == 0 or pts.shape[1] < 2:
        return None
    xmin, xmax = float(pts[:, 0].min()), float(pts[:, 0].max())
    ymin, ymax = float(pts[:, 1].min()), float(pts[:, 1].max())
    if xmax <= xmin or ymax <= ymin:
        return None
    return xmin, ymin, xmax, ymax


def _valid_objects(shapes):
    """Yield ``(label, (xmin, ymin, xmax, ymax))`` for every non-degenerate shape."""
    for shape in shapes:
        box = _bounding_box(shape["points"])
        if box is not None:
            yield shape["label"], box


def build_voc_xml(image_name, width, height, channels, shapes):
    """Build the VOC-style annotation document for one image.

    Args:
        image_name: File name written into ``<filename>``.
        width, height, channels: Image dimensions written into ``<size>``.
        shapes: The ``"shapes"`` list of a labelme JSON file; each entry needs
            a ``"label"`` and a ``"points"`` list of ``[x, y]`` pairs.

    Returns:
        The XML document as a string. Each non-degenerate shape becomes one
        ``<object>`` whose ``<bndbox>`` is the axis-aligned box around its
        points, with coordinates truncated to integers.

    NOTE(review): the element markup had been stripped from the original
    string literals, leaving only the text content. The ``object`` / ``name``
    / ``difficult`` / ``bndbox`` / ``xmin``.. names are the ones the
    extraction step of this pipeline looks up; the remaining element names
    follow the Pascal VOC annotation layout -- confirm against your consumer.
    """
    # Local import: the surrounding script uses ``xml`` as a variable name.
    from xml.sax.saxutils import escape

    lines = [
        '<annotation>',
        '\t<folder>ECM</folder>',
        '\t<filename>' + escape(str(image_name)) + '</filename>',
        '\t<source>',
        '\t\t<database>ECM_Data</database>',
        '\t\t<annotation>ECM</annotation>',
        '\t\t<image>flickr</image>',
        '\t\t<flickrid>NULL</flickrid>',
        '\t</source>',
        '\t<owner>',
        '\t\t<flickrid>NULL</flickrid>',
        '\t\t<name>XT</name>',
        '\t</owner>',
        '\t<size>',
        '\t\t<width>' + str(width) + '</width>',
        '\t\t<height>' + str(height) + '</height>',
        '\t\t<depth>' + str(channels) + '</depth>',
        '\t</size>',
        '\t<segmented>0</segmented>',
    ]
    for label, (xmin, ymin, xmax, ymax) in _valid_objects(shapes):
        lines.extend([
            '\t<object>',
            # Escape so labels containing &, < or > keep the file well-formed.
            '\t\t<name>' + escape(str(label)) + '</name>',
            '\t\t<pose>Unspecified</pose>',
            '\t\t<truncated>1</truncated>',
            '\t\t<difficult>0</difficult>',
            '\t\t<bndbox>',
            '\t\t\t<xmin>' + str(int(xmin)) + '</xmin>',
            '\t\t\t<ymin>' + str(int(ymin)) + '</ymin>',
            '\t\t\t<xmax>' + str(int(xmax)) + '</xmax>',
            '\t\t\t<ymax>' + str(int(ymax)) + '</ymax>',
            '\t\t</bndbox>',
            '\t</object>',
        ])
    lines.append('</annotation>')
    return '\n'.join(lines)


def convert_labelme_to_voc():
    """Convert every labelme JSON under ``labelme_path`` and copy the images.

    Writes one ``Annotations/<stem>.xml`` per JSON file and copies all
    ``img/*.jpg`` files into ``JPEGImages/``.

    Raises:
        FileNotFoundError: if the ``.jpg`` matching a JSON file cannot be read.
    """
    for sub_dir in ("Annotations", "JPEGImages/", "ImageSets/Main/"):
        os.makedirs(saved_path + sub_dir, exist_ok=True)

    json_paths = glob(labelme_path + "*.json")
    files = [p.replace("\\", "/").split("/")[-1].split(".json")[0] for p in json_paths]
    print(files)

    for json_file_ in files:
        json_filename = labelme_path + json_file_ + ".json"
        with open(json_filename, "r", encoding="utf-8") as fp:
            json_file = json.load(fp)

        image_path = saved_path + 'img/' + json_file_ + ".jpg"
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError("cannot read image for annotation: " + image_path)
        height, width, channels = image.shape

        shapes = json_file["shapes"]
        document = build_voc_xml(json_file_ + ".jpg", width, height, channels, shapes)
        with codecs.open(saved_path + "Annotations/" + json_file_ + ".xml", "w", "utf-8") as xml_out:
            xml_out.write(document)
        for label, (xmin, ymin, xmax, ymax) in _valid_objects(shapes):
            print(json_filename, xmin, ymin, xmax, ymax, label)

    image_files = glob("/app/dataset/img/" + "*.jpg")
    print("copy image files to VOC007/JPEGImages/")
    for image in image_files:
        shutil.copy(image, saved_path + "JPEGImages/")


if __name__ == "__main__":
    convert_labelme_to_voc()
2、分割数据集
import os
import random

# Directory scanned for ``*.xml`` annotations and directory receiving the
# ``trainval.txt`` / ``train.txt`` / ``val.txt`` / ``test.txt`` id lists.
XML_FILE_PATH = "/app/dataset/Annotations/"
SAVE_BASE_PATH = "/app/dataset/ImageSets/Main"
# Fraction of trainval that goes to train, and fraction of everything that
# goes to trainval (the rest becomes test).
train_percent = 0.9
trainval_percent = 1


def split_names(names, trainval_ratio, train_ratio, rng=random):
    """Randomly partition *names* into trainval/train/val/test lists.

    Args:
        names: Sequence of sample ids.
        trainval_ratio: Fraction of all names placed in ``trainval``; the
            remainder goes to ``test``.
        train_ratio: Fraction of ``trainval`` placed in ``train``; the
            remainder goes to ``val``.
        rng: Object providing ``sample(population, k)``; defaults to the
            ``random`` module.

    Returns:
        A dict with keys ``"trainval"``, ``"train"``, ``"val"`` and ``"test"``.
        Each list keeps the relative order of *names*.
    """
    total = len(names)
    trainval_count = int(total * trainval_ratio)
    train_count = int(trainval_count * train_ratio)

    trainval_picks = rng.sample(range(total), trainval_count)
    # Sets make the membership tests below O(1) instead of a list scan.
    train_idx = set(rng.sample(trainval_picks, train_count))
    trainval_idx = set(trainval_picks)

    splits = {"trainval": [], "train": [], "val": [], "test": []}
    for index, name in enumerate(names):
        if index in trainval_idx:
            splits["trainval"].append(name)
            splits["train" if index in train_idx else "val"].append(name)
        else:
            splits["test"].append(name)
    return splits


def write_split_files():
    """Split the annotation stems found in ``XML_FILE_PATH`` and write the id lists."""
    # os.listdir order is arbitrary; sort so the lists are written in a stable order.
    xml_names = sorted(f for f in os.listdir(XML_FILE_PATH) if f.endswith(".xml"))
    stems = [f[:-4] for f in xml_names]  # drop the ".xml" suffix

    splits = split_names(stems, trainval_percent, train_percent)
    print("train and val size", len(splits["trainval"]))
    print("train size", len(splits["train"]))

    os.makedirs(SAVE_BASE_PATH, exist_ok=True)
    for split_name, members in splits.items():
        with open(os.path.join(SAVE_BASE_PATH, split_name + '.txt'), 'w') as handle:
            handle.writelines(member + '\n' for member in members)


if __name__ == "__main__":
    write_split_files()
3、提取数据
import xml. etree. ElementTree as ET
from os import getcwd
import os
# Working directory at start-up.
wd = getcwd()

# Class names; a class id is the position of the name in this list.
classes = [
    "ElectricBox",
    "Dustbin_opening",
]

# (year, split) pairs to process, one output listing per pair.
sets = [('2022', split_name) for split_name in ('train', 'val', 'test')]

# Path templates; each takes ``.format(year, <split name or image id>)``.
IMAGE_IDS = "/app/project/error_dataset{}/ImageSets/Main/{}.txt"
OPEN_XML_PATH = "/app/project/error_dataset{}/Annotations/{}.xml"
IMAGE_WRITER_PATH = "/app/project/error_dataset{}/JPEGImages/{}.jpg"
DATA_TXT = "../data/data_txt/{}_{}.txt"
def convert_annotation(year, image_id, list_file, xml_path_template=None,
                       image_path_template=None, class_names=None):
    """Append one listing line for *image_id* to *list_file*.

    The line is the image path followed by one `` xmin,ymin,xmax,ymax,cls_id``
    field per kept object, then a newline. Objects are skipped when their name
    is not a known class or when they are marked ``difficult`` == 1.

    Args:
        year: First value substituted into the path templates.
        image_id: Second value substituted into the path templates.
        list_file: Writable text stream receiving the line.
        xml_path_template: Template of the annotation path; defaults to the
            module-level ``OPEN_XML_PATH``.
        image_path_template: Template of the image path written to the line;
            defaults to the module-level ``IMAGE_WRITER_PATH``.
        class_names: Ordered class names; defaults to the module-level
            ``classes``. The class id is the index in this sequence.
    """
    if xml_path_template is None:
        xml_path_template = OPEN_XML_PATH
    if image_path_template is None:
        image_path_template = IMAGE_WRITER_PATH
    if class_names is None:
        class_names = classes

    # Open as bytes so the XML parser picks the encoding itself instead of the
    # locale default, and close the handle as soon as parsing is done.
    with open(xml_path_template.format(year, image_id), 'rb') as in_file:
        root = ET.parse(in_file).getroot()

    list_file.write(image_path_template.format(year, image_id))
    for obj in root.iter('object'):
        difficult_node = obj.find('difficult')
        # A missing or empty <difficult> element counts as "not difficult".
        difficult = difficult_node.text if difficult_node is not None else None
        cls = obj.find('name').text
        if cls not in class_names or int(difficult or 0) == 1:
            continue
        cls_id = class_names.index(cls)
        xmlbox = obj.find('bndbox')
        # int(float(...)) also accepts coordinates stored as "12.0".
        box = [int(float(xmlbox.find(tag).text)) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        list_file.write(" " + ",".join(str(value) for value in box) + ',' + str(cls_id))
    list_file.write('\n')
# The output directory is the same for every split, so create it once up front.
save_data_path = '/'.join(DATA_TXT.split('/')[:-1])
if save_data_path:
    os.makedirs(save_data_path, exist_ok=True)

# Write one listing file per (year, split): a line per image id read from the
# matching ImageSets file.
for year, image_set in sets:
    with open(IMAGE_IDS.format(year, image_set)) as ids_file:
        image_ids = ids_file.read().split()
    # The context manager closes the listing even if an annotation fails to parse.
    with open(DATA_TXT.format(year, image_set), 'w') as list_file:
        for image_id in image_ids:
            convert_annotation(year, image_id, list_file)
参考
'''
将json文件转为类似voc中的xml格式
'''
import os
import numpy as np
import codecs
from sklearn. model_selection import train_test_split
import json
from glob import glob
import cv2
import shutil
# Input directory holding the labelme ``*.json`` files and the dataset root
# under which ``img/``, ``Annotations/``, ``JPEGImages/`` and
# ``ImageSets/Main/`` live.
labelme_path = "/app/dataset/json/"
saved_path = "/app/dataset/"
# When true, the split step that follows holds out a test set.
isUseTest = True


def _bounding_box(points):
    """Return ``(xmin, ymin, xmax, ymax)`` for a list of ``[x, y]`` points.

    Returns ``None`` when the list is empty or the points do not span a
    positive width and height, so the caller can skip the shape.
    """
    pts = np.array(points, dtype=float)
    if pts.ndim != 2 or pts.shape[0] == 0 or pts.shape[1] < 2:
        return None
    xmin, xmax = float(pts[:, 0].min()), float(pts[:, 0].max())
    ymin, ymax = float(pts[:, 1].min()), float(pts[:, 1].max())
    if xmax <= xmin or ymax <= ymin:
        return None
    return xmin, ymin, xmax, ymax


def _valid_objects(shapes):
    """Yield ``(label, (xmin, ymin, xmax, ymax))`` for every non-degenerate shape."""
    for shape in shapes:
        box = _bounding_box(shape["points"])
        if box is not None:
            yield shape["label"], box


def build_voc_xml(image_name, width, height, channels, shapes):
    """Build the VOC-style annotation document for one image.

    Args:
        image_name: File name written into ``<filename>``.
        width, height, channels: Image dimensions written into ``<size>``.
        shapes: The ``"shapes"`` list of a labelme JSON file; each entry needs
            a ``"label"`` and a ``"points"`` list of ``[x, y]`` pairs.

    Returns:
        The XML document as a string. Each non-degenerate shape becomes one
        ``<object>`` whose ``<bndbox>`` is the axis-aligned box around its
        points, with coordinates truncated to integers.

    NOTE(review): the element markup had been stripped from the original
    string literals, leaving only the text content. The element names used
    here follow the Pascal VOC annotation layout -- confirm against whatever
    consumes these files.
    """
    # Local import: the surrounding script uses ``xml`` as a variable name.
    from xml.sax.saxutils import escape

    lines = [
        '<annotation>',
        '\t<folder>ECM</folder>',
        '\t<filename>' + escape(str(image_name)) + '</filename>',
        '\t<source>',
        '\t\t<database>ECM_Data</database>',
        '\t\t<annotation>ECM</annotation>',
        '\t\t<image>flickr</image>',
        '\t\t<flickrid>NULL</flickrid>',
        '\t</source>',
        '\t<owner>',
        '\t\t<flickrid>NULL</flickrid>',
        '\t\t<name>XT</name>',
        '\t</owner>',
        '\t<size>',
        '\t\t<width>' + str(width) + '</width>',
        '\t\t<height>' + str(height) + '</height>',
        '\t\t<depth>' + str(channels) + '</depth>',
        '\t</size>',
        '\t<segmented>0</segmented>',
    ]
    for label, (xmin, ymin, xmax, ymax) in _valid_objects(shapes):
        lines.extend([
            '\t<object>',
            # Escape so labels containing &, < or > keep the file well-formed.
            '\t\t<name>' + escape(str(label)) + '</name>',
            '\t\t<pose>Unspecified</pose>',
            '\t\t<truncated>1</truncated>',
            '\t\t<difficult>0</difficult>',
            '\t\t<bndbox>',
            '\t\t\t<xmin>' + str(int(xmin)) + '</xmin>',
            '\t\t\t<ymin>' + str(int(ymin)) + '</ymin>',
            '\t\t\t<xmax>' + str(int(xmax)) + '</xmax>',
            '\t\t\t<ymax>' + str(int(ymax)) + '</ymax>',
            '\t\t</bndbox>',
            '\t</object>',
        ])
    lines.append('</annotation>')
    return '\n'.join(lines)


def convert_labelme_to_voc():
    """Convert every labelme JSON under ``labelme_path`` and copy the images.

    Writes one ``Annotations/<stem>.xml`` per JSON file and copies all
    ``img/*.jpg`` files into ``JPEGImages/``.

    Raises:
        FileNotFoundError: if the ``.jpg`` matching a JSON file cannot be read.
    """
    for sub_dir in ("Annotations", "JPEGImages/", "ImageSets/Main/"):
        os.makedirs(saved_path + sub_dir, exist_ok=True)

    json_paths = glob(labelme_path + "*.json")
    files = [p.replace("\\", "/").split("/")[-1].split(".json")[0] for p in json_paths]
    print(files)

    for json_file_ in files:
        json_filename = labelme_path + json_file_ + ".json"
        with open(json_filename, "r", encoding="utf-8") as fp:
            json_file = json.load(fp)

        image_path = saved_path + 'img/' + json_file_ + ".jpg"
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError("cannot read image for annotation: " + image_path)
        height, width, channels = image.shape

        shapes = json_file["shapes"]
        document = build_voc_xml(json_file_ + ".jpg", width, height, channels, shapes)
        with codecs.open(saved_path + "Annotations/" + json_file_ + ".xml", "w", "utf-8") as xml_out:
            xml_out.write(document)
        for label, (xmin, ymin, xmax, ymax) in _valid_objects(shapes):
            print(json_filename, xmin, ymin, xmax, ymax, label)

    image_files = glob("/app/dataset/img/" + "*.jpg")
    print("copy image files to VOC007/JPEGImages/")
    for image in image_files:
        shutil.copy(image, saved_path + "JPEGImages/")


if __name__ == "__main__":
    convert_labelme_to_voc()
# Split the annotation stems into trainval/test and train/val, then write one
# id list per split under ImageSets/Main/.
txtsavepath = saved_path + "ImageSets/Main/"
os.makedirs(txtsavepath, exist_ok=True)

total_files = glob("/app/dataset/Annotations/*.xml")
# Sort first: glob order is arbitrary, and the fixed random_state values below
# only give a repeatable split when the input order is stable.
total_files = sorted(
    i.replace("\\", "/").split("/")[-1].split(".xml")[0] for i in total_files
)

# Compute every split before opening any output file, so a failing split does
# not leave truncated, half-written lists behind.
trainval_files = []
test_files = []
if isUseTest:
    trainval_files, test_files = train_test_split(total_files, test_size=0.2, random_state=42)
else:
    trainval_files = total_files
train_files, val_files = train_test_split(trainval_files, test_size=0.15, random_state=55)

with open(os.path.join(txtsavepath, 'trainval.txt'), 'w') as ftrainval:
    for file in trainval_files:
        ftrainval.write(file + "\n")
with open(os.path.join(txtsavepath, 'train.txt'), 'w') as ftrain:
    for file in train_files:
        ftrain.write(file + "\n")
with open(os.path.join(txtsavepath, 'val.txt'), 'w') as fval:
    for file in val_files:
        fval.write(file + "\n")
with open(os.path.join(txtsavepath, 'test.txt'), 'w') as ftest:
    for file in test_files:
        print(file)
        ftest.write(file + "\n")