制作VOC格式的数据集

由原始标注文件(json格式)转换为VOC格式的标注文件

代码中所用的json文件是2019年天池布匹缺陷检测竞赛开源的数据

import os
import numpy as np
import codecs
import json
from glob import glob
import cv2
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd

# Integer label -> defect names that share that label. Several defect names
# collapse onto one label (e.g. label 2 and label 20 each cover multiple names).
_label_to_defect_names = (
    (1, ('破洞',)),
    (2, ('水渍', '油渍', '污渍')),
    (3, ('三丝',)),
    (4, ('结头',)),
    (5, ('花板跳',)),
    (6, ('百脚',)),
    (7, ('毛粒',)),
    (8, ('粗经',)),
    (9, ('松经',)),
    (10, ('断经',)),
    (11, ('吊经',)),
    (12, ('粗维',)),
    (13, ('纬缩',)),
    (14, ('浆斑',)),
    (15, ('整经结',)),
    (16, ('星跳', '跳花')),
    (17, ('断氨纶',)),
    (18, ('稀密档', '浪纹档', '色差档')),
    (19, ('磨痕', '轧痕', '修痕', '烧毛痕')),
    (20, ('死皱', '云织', '双纬', '双经', '跳纱', '筘路', '纬纱不良')),
)

# Flattened lookup: defect name -> integer label.
defect_name2label = {
    defect: label_id
    for label_id, defects in _label_to_defect_names
    for defect in defects
}

# 1. Source data directories (one per training round of the competition data)
image_path1 = "/research/tianchi_denim/smartdiagnosisofclothflaw_round1train2_datasets/guangdong1_round1_train2_20190828/defect_Images/"  # round-1 "train2" defect images
image_path2 = "/research/tianchi_denim/smartdiagnosisofclothflaw_round1train1_datasets/guangdong1_round1_train1_20190818/defect_Images/"  # round-1 "train1" defect images
saved_path = "VOC2007/"  # root of the generated VOC dataset (relative to the working directory)

# 2. Create the required VOC sub-directories.
# exist_ok=True replaces the exists()-then-makedirs() pattern: it is a no-op on
# reruns and cannot fail if the directory appears between the check and the call.
for voc_subdir in ("Annotations", "JPEGImages/", "ImageSets/Main/"):
    os.makedirs(saved_path + voc_subdir, exist_ok=True)

# 3. Competition annotation files in JSON format, one per training round
json_file1 = "/research/tianchi_denim/smartdiagnosisofclothflaw_round1train2_datasets/guangdong1_round1_train2_20190828/Annotations/anno_train.json"
json_file2 = "/research/tianchi_denim/smartdiagnosisofclothflaw_round1train1_datasets/guangdong1_round1_train1_20190818/Annotations/anno_train.json"
files = [1]  # NOTE(review): purpose unclear from this block; kept because it is a module-level name
# 4. Read the annotation JSON files and write one VOC XML file per image.
#
# NOTE(review): the element names below follow the PASCAL VOC annotation
# layout (annotation/folder/filename/source/owner/size/segmented/object/bndbox).
# The previous literals carried only the values with no tags, which is not
# parseable XML — confirm the restored names against the consumer of these files.


def _voc_header_xml(file_name, width, height, channels):
    """Return the opening part of a VOC annotation document as a string.

    Covers everything from ``<annotation>`` through ``<segmented>``; the
    caller appends the ``<object>`` elements and the closing tag.

    Args:
        file_name: image file name written into ``<filename>``.
        width: image width written into ``<size><width>``.
        height: image height written into ``<size><height>``.
        channels: channel count written into ``<size><depth>``.
    """
    return ''.join([
        '<annotation>\n',
        '\t<folder>VOC2007</folder>\n',
        '\t<filename>' + file_name + '</filename>\n',
        '\t<source>\n',
        '\t\t<database>The UAV autolanding</database>\n',
        '\t\t<annotation>UAV AutoLanding</annotation>\n',
        '\t\t<image>flickr</image>\n',
        '\t\t<flickrid>NULL</flickrid>\n',
        '\t</source>\n',
        '\t<owner>\n',
        '\t\t<flickrid>NULL</flickrid>\n',
        '\t\t<name>GuangDongDec</name>\n',
        '\t</owner>\n',
        '\t<size>\n',
        '\t\t<width>' + str(width) + '</width>\n',
        '\t\t<height>' + str(height) + '</height>\n',
        '\t\t<depth>' + str(channels) + '</depth>\n',
        '\t</size>\n',
        '\t\t<segmented>0</segmented>\n',
    ])


def _voc_object_xml(label, xmin, ymin, xmax, ymax):
    """Return one VOC ``<object>`` element (label plus bounding box) as a string."""
    return ''.join([
        '\t<object>\n',
        '\t\t<name>' + str(label) + '</name>\n',
        '\t\t<pose>Unspecified</pose>\n',
        '\t\t<truncated>1</truncated>\n',
        '\t\t<difficult>0</difficult>\n',
        '\t\t<bndbox>\n',
        '\t\t\t<xmin>' + str(xmin) + '</xmin>\n',
        '\t\t\t<ymin>' + str(ymin) + '</ymin>\n',
        '\t\t\t<xmax>' + str(xmax) + '</xmax>\n',
        '\t\t\t<ymax>' + str(ymax) + '</ymax>\n',
        '\t\t</bndbox>\n',
        '\t</object>\n',
    ])


# NOTE(review): every image is assumed to be 2446x1000 with 3 channels; the
# size is not read from the image files — confirm this holds for the dataset.
height, width, channels = 1000, 2446, 3

for json_file in [json_file1, json_file2]:
    # Close the handle deterministically and decode as UTF-8 regardless of the
    # platform locale (the defect names are non-ASCII).
    with open(json_file, "r", encoding="utf-8") as json_handle:
        anno_result = pd.read_json(json_handle)

    # groupby visits each image's rows once instead of re-filtering the whole
    # frame for every image name.
    for file_name, rows in tqdm(anno_result.groupby('name', sort=False)):
        xml_parts = [_voc_header_xml(file_name, width, height, channels)]

        for box, defect_name in zip(rows['bbox'], rows['defect_name']):
            # box is indexed as [xmin, ymin, xmax, ymax].
            xmin, ymin, xmax, ymax = np.array(box)[:4]
            label = defect_name2label[defect_name]
            # Skip degenerate boxes (zero or negative width/height).
            if xmax <= xmin or ymax <= ymin:
                continue
            xml_parts.append(_voc_object_xml(label, xmin, ymin, xmax, ymax))

        xml_parts.append('</annotation>')

        # splitext drops only the final extension, so image names containing
        # extra dots keep their full stem.
        xml_path = saved_path + "Annotations/" + os.path.splitext(file_name)[0] + ".xml"
        with codecs.open(xml_path, "w", "utf-8") as xml_out:
            xml_out.write(''.join(xml_parts))

# 5. Copy the source .jpg images into <saved_path>/JPEGImages/.
jpeg_dir = saved_path + "JPEGImages/"

for image_path in [image_path1, image_path2]:
    # os.path.join works whether or not the source path ends with a slash.
    image_files = glob(os.path.join(image_path, "*.jpg"))
    # Report the real destination (the old message had a "VOC007" typo).
    print("copy image files to " + jpeg_dir)
    for image in tqdm(image_files):
        shutil.copy(image, jpeg_dir)

# 6. Split the annotated samples into trainval/test, then trainval into
#    train/val, and write each list to <saved_path>/ImageSets/Main/.


def _write_name_list(path, names):
    """Write one sample name per line to ``path``, replacing any existing file."""
    with open(path, 'w') as name_file:
        name_file.writelines(name + "\n" for name in names)


txtsavepath = saved_path + "ImageSets/Main/"

# Sample names are the annotation file names without directory or ".xml".
# They are sorted because glob() returns matches in arbitrary order, which
# would otherwise make the seeded splits below differ between runs/machines.
total_files = sorted(
    os.path.splitext(os.path.basename(xml_path))[0]
    for xml_path in glob(saved_path + "Annotations/*.xml")
)

# An integer test_size is an absolute sample count (a float would be a
# fraction): 986 samples go to test, then 986 of the remainder go to val.
# random_state seeds the shuffle so the same input yields the same split.
trainval_files, test_files = train_test_split(total_files, test_size=986, random_state=41)
train_files, val_files = train_test_split(trainval_files, test_size=986, random_state=42)

# Files are written only after both splits succeeded, so a failed split does
# not leave truncated list files behind.
_write_name_list(os.path.join(txtsavepath, 'trainval.txt'), trainval_files)
_write_name_list(os.path.join(txtsavepath, 'test.txt'), test_files)
_write_name_list(os.path.join(txtsavepath, 'train.txt'), train_files)
_write_name_list(os.path.join(txtsavepath, 'val.txt'), val_files)

你可能感兴趣的:(python,人工智能,python,开发语言)