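# Annotation file format conversion (标注文件格式转换):
#   I.   XML <-> JSON (一、xml和json相互转化)
#   II.  XML <-> TXT  (二、xml和txt相互转化)
#   III. TXT <-> JSON (三、txt和json相互转化)
#
# I. XML <-> JSON (一、xml和json相互转化)
# 1. XML to JSON (1、xml转json)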
import os
import glob
import json
import shutil
import numpy as np
import xml. etree. ElementTree as ET
# Base directory under which the script's output folders are (re)created.
path2 = "."
# Id given to the first bounding-box annotation; later annotations count up from it.
START_BOUNDING_BOX_ID = 1
def get(root, name):
    """Return the list produced by ``root.findall(name)`` (empty when nothing matches)."""
    matches = root.findall(name)
    return matches
def get_and_check(root, name, length):
    """Find the children of ``root`` named ``name`` and validate how many there are.

    Args:
        root: An ElementTree element to search with ``findall``.
        name: Tag (or path) passed to ``root.findall``.
        length: Expected number of matches. ``0`` (or a negative value)
            disables the count check.

    Returns:
        The single matching element when ``length == 1``; otherwise the list
        of all matching elements.

    Raises:
        NotImplementedError: If nothing matches, or if ``length > 0`` and the
            number of matches differs from ``length``.
    """
    found = root.findall(name)
    if not found:
        raise NotImplementedError('Can not find %s in %s.' % (name, root.tag))
    if length > 0 and len(found) != length:
        raise NotImplementedError(
            'The size of %s is supposed to be %d, but is %d.' % (name, length, len(found))
        )
    if length == 1:
        # Callers asking for exactly one element get the element itself, not a list.
        return found[0]
    return found
def convert(xml_list, json_file):
    """Convert a list of VOC-style XML annotation files into one COCO-style JSON file.

    Relies on the module-level names ``pre_define_categories`` (name -> id
    mapping), ``only_care_pre_define_categories`` (when true, objects whose
    class is not pre-defined are skipped) and ``START_BOUNDING_BOX_ID``, plus
    the helpers ``get`` and ``get_and_check``.

    Args:
        xml_list: Iterable of paths to XML annotation files.
        json_file: Path of the JSON file to write.

    Raises:
        AssertionError: If a bounding box has ``xmax <= xmin`` or ``ymax <= ymin``.
        NotImplementedError: Propagated from ``get_and_check`` when a required
            tag is missing or duplicated.
    """
    json_dict = {"images": [], "type": "instances", "annotations": [], "categories": []}
    categories = pre_define_categories.copy()
    bnd_id = START_BOUNDING_BOX_ID
    # Number of objects seen per class name, including classes that get skipped.
    all_categories = {}

    for index, xml_f in enumerate(xml_list):
        root = ET.parse(xml_f).getroot()
        # The image is assumed to share the XML file's base name, with a .jpg suffix.
        filename = os.path.basename(xml_f)[:-4] + ".jpg"
        image_id = 20190000001 + index
        size = get_and_check(root, 'size', 1)
        width = int(get_and_check(size, 'width', 1).text)
        height = int(get_and_check(size, 'height', 1).text)
        json_dict['images'].append(
            {'file_name': filename, 'height': height, 'width': width, 'id': image_id}
        )

        for obj in get(root, 'object'):
            category = get_and_check(obj, 'name', 1).text
            all_categories[category] = all_categories.get(category, 0) + 1
            if category not in categories:
                if only_care_pre_define_categories:
                    continue
                # Use max()+1 rather than len()+1 so a new id can never collide
                # with a pre-defined id when those ids are not exactly 1..N.
                new_id = max(categories.values(), default=0) + 1
                print("[warning] category '{}' not in 'pre_define_categories'({}), create new id: {} automatically".format(category, pre_define_categories, new_id))
                categories[category] = new_id
            category_id = categories[category]

            bndbox = get_and_check(obj, 'bndbox', 1)
            xmin = int(float(get_and_check(bndbox, 'xmin', 1).text))
            ymin = int(float(get_and_check(bndbox, 'ymin', 1).text))
            xmax = int(float(get_and_check(bndbox, 'xmax', 1).text))
            ymax = int(float(get_and_check(bndbox, 'ymax', 1).text))
            # Raised explicitly (same exception type as before) so the checks
            # still run when Python is started with -O.
            if xmax <= xmin:
                raise AssertionError("xmax <= xmin, {}".format(xml_f))
            if ymax <= ymin:
                raise AssertionError("ymax <= ymin, {}".format(xml_f))

            o_width = xmax - xmin
            o_height = ymax - ymin
            ann = {
                'area': o_width * o_height,
                'iscrowd': 0,
                'image_id': image_id,
                'bbox': [xmin, ymin, o_width, o_height],
                'category_id': category_id,
                'id': bnd_id,
                'ignore': 0,
                'segmentation': [],
            }
            json_dict['annotations'].append(ann)
            bnd_id += 1

    for cate, cid in categories.items():
        json_dict['categories'].append({'supercategory': 'none', 'id': cid, 'name': cate})

    # Context manager guarantees the file is flushed and closed even on error.
    with open(json_file, 'w') as json_fp:
        json_fp.write(json.dumps(json_dict))

    print("------------create {} done--------------".format(json_file))
    print("find {} categories: {} -->>> your pre_define_categories {}: {}".format(len(all_categories), all_categories.keys(), len(pre_define_categories), pre_define_categories.keys()))
    print("category: id --> {}".format(categories))
    print(categories.keys())
    print(categories.values())
if __name__ == '__main__':
    # Script entry point: split the XML annotations into train/val subsets,
    # write one COCO JSON per subset, list the sample names in train.txt /
    # val.txt and copy the matching .jpg images into per-subset folders.
    classes = ['person']
    pre_define_categories = {}
    for i, cls in enumerate(classes):
        pre_define_categories[cls] = i + 1
    only_care_pre_define_categories = True

    train_ratio = 0.9
    save_json_train = 'instances_train2017.json'
    save_json_val = 'instances_val2017.json'
    xml_dir = "data/coco/annotations"

    xml_list = glob.glob(xml_dir + "/*.xml")
    xml_list = np.sort(xml_list)
    # Fixed seed so the train/val split is reproducible between runs.
    np.random.seed(100)
    np.random.shuffle(xml_list)

    train_num = int(len(xml_list) * train_ratio)
    xml_list_train = xml_list[:train_num]
    xml_list_val = xml_list[train_num:]

    convert(xml_list_train, save_json_train)
    convert(xml_list_val, save_json_val)

    # Recreate the (empty) output folders, as the original script did.
    for out_dir in ("/annotations", "/images/train2017", "/images/val2017"):
        if os.path.exists(path2 + out_dir):
            shutil.rmtree(path2 + out_dir)
        os.makedirs(path2 + out_dir)

    # The images are copied into these folders, which the original never
    # created, so shutil.copyfile failed on the first image.
    train_img_dir = path2 + "/data/coco/train2017/"
    val_img_dir = path2 + "/data/coco/val2017/"
    os.makedirs(train_img_dir, exist_ok=True)
    os.makedirs(val_img_dir, exist_ok=True)

    with open("train.txt", "w") as f1:
        for xml_path in xml_list_train:
            # Each image is expected next to its XML file, with a .jpg suffix.
            img = xml_path[:-4] + ".jpg"
            f1.write(os.path.basename(xml_path)[:-4] + "\n")
            shutil.copyfile(img, train_img_dir + os.path.basename(img))

    with open("val.txt", "w") as f2:
        for xml_path in xml_list_val:
            img = xml_path[:-4] + ".jpg"
            f2.write(os.path.basename(xml_path)[:-4] + "\n")
            shutil.copyfile(img, val_img_dir + os.path.basename(img))

    print("-------------------------------")
    print("train number:", len(xml_list_train))
    print("val number:", len(xml_list_val))
# 2. JSON to XML (2、json转xml)
import os
import time
import json
import pandas as pd
from tqdm import tqdm
from pycocotools. coco import COCO
# Path of the COCO-style JSON annotation file to convert.
anno = 'C:/Users/user/Desktop/val/instances_val2017.json'
# Directory that receives the generated XML files.
xml_dir = 'C:/Users/user/Desktop/val/xml/'
# Load the annotation file through pycocotools; this runs at import time.
coco = COCO( anno)
# Category records of the dataset; read by trans_id().
cats = coco. loadCats( coco. getCatIds( ) )
# Timestamp string of the current local time (not used in the visible code).
dttm = time. strftime( "%Y%m%d%H%M%S" , time. localtime( ) )
def trans_id(category_id):
    """Return the position of ``category_id`` within the module-level ``cats`` list.

    Args:
        category_id: The ``'id'`` value of a category record.

    Returns:
        Zero-based index of the first record in ``cats`` whose ``'id'`` equals
        ``category_id``.

    Raises:
        ValueError: If no record in ``cats`` has that id.
    """
    category_ids = [cat['id'] for cat in cats]
    return category_ids.index(category_id)
def convert(anno, xml_dir):
    """Write one VOC-style XML annotation file per image of a COCO-style JSON file.

    The XML tag names were missing from the original string literals (the
    markup had been stripped); they are restored here following the usual
    Pascal VOC layout: ``annotation`` > ``filename``, ``size``
    (``width``/``height``/``depth``) and one ``object`` per box with ``name``,
    ``truncated``, ``difficult`` and ``bndbox``
    (``xmin``/``ymin``/``xmax``/``ymax``).

    Args:
        anno: Path of the JSON file; it must contain ``images``,
            ``categories`` and ``annotations`` lists.
        xml_dir: Directory that receives the XML files (created if missing).
            Each file is named after the image's ``file_name`` with the
            extension replaced by ``.xml``.

    Raises:
        KeyError: If an annotation refers to a ``category_id`` that is not
            listed under ``categories``.
    """
    # Imported locally because only this function needs it.
    from xml.sax.saxutils import escape

    with open(anno, 'r') as load_f:
        dataset = json.load(load_f)

    imgs = dataset['images']
    # Look names up by category id directly. The original combined a positional
    # index into one list with an id-sorted second list, which only agrees when
    # the JSON already lists its categories in ascending id order.
    name_by_id = {cat['id']: cat['name'] for cat in dataset['categories']}
    categories = [name_by_id[cid] for cid in sorted(name_by_id)]
    print('categories = ', categories)

    # Group annotations by image once instead of filtering all of them per image.
    annos_by_image = {}
    for ann in dataset['annotations']:
        annos_by_image.setdefault(ann['image_id'], []).append(ann)

    os.makedirs(xml_dir, exist_ok=True)

    for img in tqdm(imgs):
        file_name = img['file_name']
        xml_content = [
            '<?xml version="1.0" encoding="utf-8"?>',
            "<annotation>",
            "\t<filename>" + escape(file_name) + "</filename>",
            "\t<size>",
            "\t\t<width>" + str(img['width']) + "</width>",
            "\t\t<height>" + str(img['height']) + "</height>",
            "\t\t<depth>" + "3" + "</depth>",
            "\t</size>",
        ]
        for ann in annos_by_image.get(img['id'], []):
            # COCO boxes are [x, y, width, height]; VOC wants the two corners.
            bbox = ann["bbox"]
            cate_name = name_by_id[ann["category_id"]]
            xml_content.extend([
                "\t<object>",
                "\t\t<name>" + escape(cate_name) + "</name>",
                "\t\t<truncated>0</truncated>",
                "\t\t<difficult>0</difficult>",
                "\t\t<bndbox>",
                "\t\t\t<xmin>" + str(int(bbox[0])) + "</xmin>",
                "\t\t\t<ymin>" + str(int(bbox[1])) + "</ymin>",
                "\t\t\t<xmax>" + str(int(bbox[0] + bbox[2])) + "</xmax>",
                "\t\t\t<ymax>" + str(int(bbox[1] + bbox[3])) + "</ymax>",
                "\t\t</bndbox>",
                "\t</object>",
            ])
        xml_content.append("</annotation>")

        # splitext drops only the extension; splitting on the letter 'j'
        # truncated every file name that contained a "j".
        xml_path = os.path.join(xml_dir, os.path.splitext(file_name)[0] + '.xml')
        print(xml_path)
        with open(xml_path, 'w+', encoding="utf8") as xml_file:
            xml_file.write('\n'.join(xml_content))
# Run the conversion with the module-level paths when executed as a script.
if __name__ == "__main__":
    convert(anno, xml_dir)
# II. XML <-> TXT (二、xml和txt相互转化)
# 1. XML to TXT (1、xml转txt)
import xml. etree. ElementTree as ET
import pickle
import os
from os import listdir, getcwd
from os. path import join
from PIL import Image
# Names of the dataset splits; each one is read from data/ImageSets/<name>.txt.
sets = [ 'train' , 'test' , 'val' ]
# Class names to keep; a class's position in this list is the id written to the label file.
classes = [ 'two_wheeler' ]
def convert(size, box):
    """Turn a corner-style box into a size-normalised centre/extent tuple.

    Args:
        size: ``(width, height)`` of the image.
        box: ``(xmin, xmax, ymin, ymax)`` of the box.

    Returns:
        ``(x, y, w, h)``: box centre and box extent, each multiplied by the
        reciprocal of the matching image dimension.
    """
    x_scale = 1. / size[0]
    y_scale = 1. / size[1]
    x_min, x_max, y_min, y_max = box[0], box[1], box[2], box[3]
    center_x = (x_min + x_max) / 2.0 * x_scale
    center_y = (y_min + y_max) / 2.0 * y_scale
    extent_x = (x_max - x_min) * x_scale
    extent_y = (y_max - y_min) * y_scale
    return (center_x, center_y, extent_x, extent_y)
def convert_annotation(image_id):
    """Convert one XML annotation file into a text label file.

    Reads ``data/Annotations/<image_id>.xml`` and writes
    ``data/labels/<image_id>.txt``. Each kept object becomes one line of the
    form ``class_id x y w h``, where ``class_id`` is the position of the
    object's name in the module-level ``classes`` list and the four numbers
    come from ``convert((width, height), (xmin, xmax, ymin, ymax))``.
    Objects whose name is not in ``classes``, or whose ``difficult`` flag is
    1, are skipped. An image can hold several objects, so the label file can
    hold several lines.

    Args:
        image_id: Base name (without extension) shared by the XML and the
            label file.
    """
    # Parse inside a context manager so the input handle is always closed.
    with open('data/Annotations/%s.xml' % (image_id), encoding='utf-8') as in_file:
        root = ET.parse(in_file).getroot()

    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)

    # The output file was never closed before, so buffered lines could be lost.
    with open('data/labels/%s.txt' % (image_id), 'w', encoding='utf-8') as out_file:
        for obj in root.iter('object'):
            # Treat a missing <difficult> tag as "not difficult" instead of
            # failing with AttributeError.
            difficult_node = obj.find('difficult')
            difficult = difficult_node.text if difficult_node is not None else '0'
            cls = obj.find('name').text
            if cls not in classes or int(difficult) == 1:
                continue
            cls_id = classes.index(cls)
            xmlbox = obj.find('bndbox')
            # Order expected by convert(): xmin, xmax, ymin, ymax.
            b = (
                float(xmlbox.find('xmin').text),
                float(xmlbox.find('xmax').text),
                float(xmlbox.find('ymin').text),
                float(xmlbox.find('ymax').text),
            )
            print(image_id, cls, b)
            bb = convert((w, h), b)
            out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
wd = getcwd()
print(wd)

# The label directory is shared by every split, so create it once up front.
os.makedirs('data/labels/', exist_ok=True)

for image_set in sets:
    # For every split this does two things:
    # 1. writes the image path of each sample ('data/images/<id>.jpg') into
    #    data/<split>.txt, so the images can be located later;
    # 2. converts each sample's XML annotation into a label file holding the
    #    class ids and bounding boxes (see convert_annotation).
    with open('data/ImageSets/%s.txt' % (image_set)) as ids_file:
        image_ids = ids_file.read().strip().split()
    with open('data/%s.txt' % (image_set), 'w') as list_file:
        for image_id in image_ids:
            list_file.write('data/images/%s.jpg\n' % (image_id))
            convert_annotation(image_id)
# 2. TXT to XML (2、txt转xml)
import glob
import cv2
# Templates for a Pascal VOC style annotation file. The element tags were
# missing from the original literals (only the text content was left), which
# made the generated files plain text rather than XML. The tag names below are
# reconstructed from the standard VOC layout.
# NOTE(review): confirm against a reference VOC file if downstream tools are strict.

# Placeholders, in order: file name, image width, image height, image depth.
xml_head = '''<annotation>
    <folder>VOC2007</folder>
    <filename>{}</filename>
    <source>
        <database>The VOC2007 Database</database>
        <annotation>PASCAL VOC2007</annotation>
        <image>flickr</image>
    </source>
    <size>
        <width>{}</width>
        <height>{}</height>
        <depth>{}</depth>
    </size>
    <segmented>0</segmented>
'''
# Placeholders, in order: class name, xmin, ymin, xmax, ymax.
xml_obj = '''    <object>
        <name>{}</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>{}</xmin>
            <ymin>{}</ymin>
            <xmax>{}</xmax>
            <ymax>{}</ymax>
        </bndbox>
    </object>
'''
# Closes the root element opened by xml_head.
xml_end = '''</annotation>
'''
# Convert YOLO-style txt labels to XML, one file per image found in <root>/train.
import os  # this script section only imports glob and cv2 itself

root = 'D:/A-new-tjw/works/2022.14-/data/'
# Class names; the integer class id on each label line indexes this list.
labels = ['mask', 'face', 'incorrect mask']
txt_Lists = glob.glob(root + 'train' + '/*.jpg')
print(len(txt_Lists))
cnt = 0
for txt_path in txt_Lists:
    # basename/splitext handle both path separators and dotted file names;
    # splitting on '\\' and '.' only worked for simple names on Windows.
    filename = os.path.splitext(os.path.basename(txt_path))[0]
    txt = root + 'train_txt/' + filename + '.txt'
    jpg = root + 'train/' + filename + '.jpg'
    xml = root + 'train_xml/' + filename + '.xml'
    print(txt)
    print(jpg)
    print(xml)

    img = cv2.imread(jpg)
    if img is None:
        # cv2.imread returns None instead of raising when the image is unreadable.
        print('[warning] cannot read image, skipped:', jpg)
        continue
    img_h, img_w = img.shape[0], img.shape[1]
    print('h_factor:', img_h, ' w_factor:', img_w)
    head = xml_head.format(str(filename), str(img_w), str(img_h), "3")

    objs = []
    with open(txt, 'r') as f:
        for line in f:
            # split() tolerates repeated spaces or tabs; blank lines are skipped.
            yolo_datas = line.split()
            if not yolo_datas:
                continue
            label = int(float(yolo_datas[0]))
            # Label values are fractions of the image size; scale back to pixels.
            center_x = round(float(yolo_datas[1]) * img_w)
            center_y = round(float(yolo_datas[2]) * img_h)
            bbox_width = round(float(yolo_datas[3]) * img_w)
            bbox_height = round(float(yolo_datas[4]) * img_h)
            xmin = str(int(center_x - bbox_width / 2))
            ymin = str(int(center_y - bbox_height / 2))
            xmax = str(int(center_x + bbox_width / 2))
            ymax = str(int(center_y + bbox_height / 2))
            objs.append(xml_obj.format(labels[label], xmin, ymin, xmax, ymax))

    with open(xml, 'w') as f_xml:
        f_xml.write(head + ''.join(objs) + xml_end)
    cnt += 1
print(cnt)
# III. TXT <-> JSON (三、txt和json相互转化)
# 1. TXT to JSON (1、txt转json)
import os
import json
import cv2
import random
import time
from PIL import Image
# Directory in which the generated JSON file (val.json) is written.
coco_format_save_path= 'D:\\A-new-tjw\\works\\2022.5.19\\people\\labels_json\\val'
# Text file with one class name per line; line order defines the category ids.
yolo_format_classes_path= 'D:\\A-new-tjw\\works\\2022.5.19\\people\\people.names'
# Directory holding the per-image txt label files.
yolo_format_annotation_path= 'D:\\A-new-tjw\\works\\2022.5.19\\people\\labels_txt\\val'
# Directory holding the images.
img_pathDir= 'D:\\A-new-tjw\\works\\2022.5.19\\people\\images\\val'
# Build the category list: ids are 1-based line numbers of the class-name file.
with open(yolo_format_classes_path, 'r') as fr:
    lines1 = fr.readlines()
categories = []
for j, label in enumerate(lines1):
    label = label.strip()
    categories.append({'id': j + 1, 'name': label, 'supercategory': 'None'})

write_json_context = dict()
write_json_context['info'] = {'description': '', 'url': '', 'version': '', 'year': 2021, 'contributor': '', 'date_created': '2021-07-25'}
write_json_context['licenses'] = [{'id': 1, 'name': None, 'url': None}]
write_json_context['categories'] = categories
write_json_context['images'] = []
write_json_context['annotations'] = []

imageFileList = os.listdir(img_pathDir)
for i, imageFile in enumerate(imageFileList):
    imagePath = os.path.join(img_pathDir, imageFile)
    # Only the size is needed; the context manager releases the file handle.
    with Image.open(imagePath) as image:
        W, H = image.size

    img_context = {}
    img_context['file_name'] = imageFile
    img_context['height'] = H
    img_context['width'] = W
    img_context['date_captured'] = '2021-07-25'
    img_context['id'] = i
    img_context['license'] = 1
    img_context['color_url'] = ''
    img_context['flickr_url'] = ''
    write_json_context['images'].append(img_context)

    # Derive the label name from the image stem; the fixed [:6] slice only
    # worked when every image stem was exactly six characters long.
    txtFile = os.path.splitext(imageFile)[0] + '.txt'
    txtPath = os.path.join(yolo_format_annotation_path, txtFile)
    if not os.path.isfile(txtPath):
        # An image without a label file simply contributes no annotations.
        continue
    with open(txtPath, 'r') as fr:
        lines = fr.readlines()
    for j, line in enumerate(lines):
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        class_id, x, y, w, h = fields
        class_id, x, y, w, h = int(class_id), float(x), float(y), float(w), float(h)
        # Label values are centre/size fractions; convert to pixel corners.
        xmin = (x - w / 2) * W
        ymin = (y - h / 2) * H
        xmax = (x + w / 2) * W
        ymax = (y + h / 2) * H
        w = w * W
        h = h * H

        bbox_dict = {}
        # Unique as long as a label file has fewer than 10000 lines.
        bbox_dict['id'] = i * 10000 + j
        bbox_dict['image_id'] = i
        # Category ids are 1-based (see the category list above).
        bbox_dict['category_id'] = class_id + 1
        bbox_dict['iscrowd'] = 0
        height, width = abs(ymax - ymin), abs(xmax - xmin)
        bbox_dict['area'] = height * width
        bbox_dict['bbox'] = [xmin, ymin, w, h]
        bbox_dict['segmentation'] = [[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]]
        write_json_context['annotations'].append(bbox_dict)

# Make sure the output directory exists before writing the result.
os.makedirs(coco_format_save_path, exist_ok=True)
name = os.path.join(coco_format_save_path, "val" + '.json')
with open(name, 'w') as fw:
    json.dump(write_json_context, fw, indent=2)
# 2. JSON to TXT (2、json转txt)
import json
import os
def json_txt(json_path, txt_path):
    """Write one txt label file per image from a COCO-style JSON file.

    For every entry under ``images`` a file ``<txt_path>/<image id>.txt`` is
    created (empty when the image has no annotations). Each annotation of the
    image becomes one line ``category_id x y w h``, where ``x``/``y`` is the
    box centre and ``w``/``h`` the box size, divided by the image width or
    height and rounded to 6 decimals. This is the inverse of the box
    conversion in the txt-to-JSON section of this file.

    The previous version divided each bbox value by the sum of the four bbox
    values, which is not a normalisation by image size and failed with
    ``ZeroDivisionError`` on an all-zero box.

    Args:
        json_path: Path of the JSON file to read.
        txt_path: Directory for the txt files (created if missing); a trailing
            separator is accepted but no longer required.

    Raises:
        ValueError: If an annotated image has no usable ``width``/``height``.
    """
    os.makedirs(txt_path, exist_ok=True)

    with open(json_path, 'r') as f:
        data = json.load(f)
    images = data.get("images") or []
    annotations = data.get("annotations") or []

    # Group once so the work is linear instead of images x annotations.
    boxes_by_image = {}
    for ann in annotations:
        boxes_by_image.setdefault(ann.get("image_id"), []).append(ann)

    for image in images:
        image_id = image.get("id")
        img_w, img_h = image.get("width"), image.get("height")
        lines = []
        for ann in boxes_by_image.get(image_id, []):
            if not img_w or not img_h:
                raise ValueError(
                    "image %r has no usable width/height; cannot normalise its boxes" % (image_id,)
                )
            # COCO boxes are [left, top, width, height] in pixels.
            left, top, box_w, box_h = ann.get('bbox')
            values = [
                (left + box_w / 2) / img_w,
                (top + box_h / 2) / img_h,
                box_w / img_w,
                box_h / img_h,
            ]
            lines.append("{} {} {} {} {}\n".format(
                ann.get("category_id"), *[round(v, 6) for v in values]))
        with open(os.path.join(txt_path, str(image_id) + '.txt'), 'w') as out_file:
            out_file.writelines(lines)
def class_txt(json_path, class_txt_path):
    """Append the ``id name`` pair of every category in a JSON file to a text file.

    Args:
        json_path: Path of the JSON file to read; its ``categories`` entry is
            expected to be a list of records with ``id`` and ``name``.
        class_txt_path: Path of the text file to write. It is opened in append
            mode, so calling this twice on the same file duplicates the lines.
    """
    with open(json_path, 'r') as f:
        data = json.load(f)
    categories_value = data.get("categories") or []
    with open(class_txt_path, 'a') as file0:
        file0.writelines(
            "{} {}\n".format(cat.get("id"), cat.get('name')) for cat in categories_value
        )
# Convert train.json into per-image txt files under train_annotations/ (runs at import time).
json_txt( "train.json" , "train_annotations/" )