Python 制作Pascal VOC数据集

下图是Pascal VOC数据集格式。
1、Annotations目录是存放xml文件;
2、ImageSets目录是存放txt文件,主要是测试集、训练集、验证集等文件名称的集合;
3、JPEGImages目录是存放图片文件(jpg);

Python 制作Pascal VOC数据集_第1张图片
Pascal VOC数据集格式

下面代码主要是生成指定的训练集、验证集。
比如我训练集图片名称前缀是cx_train,验证集图片名称前缀是cx_dev_val,所以在生成txt文件时,就通过判断前缀,来分别写入。

代码实现思路

1、从Annotations目录下读取所有xml文件的文件名列表;
2、把xml文件名列表随机打乱一下,这样在训练的时候,各个分类是随机读取,不会出现某一个分类聚集读取,从而影响训练效果。当然,你也可以在训练的时候选择随机打乱,道理是一样的;
3、创建将要写入的txt文件。这里示范了训练集、验证集,其他的可自行添加;
4、读取目录文件,通过前缀判断,写入txt文件。

# -*- coding: utf-8 -*-
import os
import random


def main(xmlfilepath='/Users/ll/Desktop/VOCdevkit/VOC2020/Annotations/',
         setpath='/Users/ll/Desktop/VOCdevkit/VOC2020/ImageSets/Main/',
         train_prefix='cx_train',
         val_prefix='cx_dev_val'):
    """Write train.txt / val.txt for a Pascal VOC dataset, split by name prefix.

    Every ``*.xml`` entry in ``xmlfilepath`` is visited in random order. Its
    name without the ``.xml`` extension is written to ``train.txt`` when it
    starts with ``train_prefix`` and to ``val.txt`` when it starts with
    ``val_prefix``. Entries that are not ``.xml`` files, or that match neither
    prefix, are only reported on stdout.

    Args:
        xmlfilepath: Directory that holds the annotation ``.xml`` files.
        setpath: Existing directory in which ``train.txt`` and ``val.txt``
            are (re)created.
        train_prefix: File-name prefix that marks a training sample.
        val_prefix: File-name prefix that marks a validation sample.

    Raises:
        OSError: If ``xmlfilepath`` cannot be listed or the output files
            cannot be opened for writing.
    """
    total_xml = os.listdir(xmlfilepath)
    num = len(total_xml)
    print('total count:%d' % num)

    # Shuffle the directory listing so samples of one class do not end up
    # clustered together in the generated files.
    shuffled = random.sample(total_xml, num)

    ptrain = os.path.join(setpath, 'train.txt')
    pval = os.path.join(setpath, 'val.txt')

    # The context manager closes both files even if a write fails.
    with open(ptrain, 'w') as ftrain, open(pval, 'w') as fval:
        for filename in shuffled:
            if not filename.endswith('.xml'):
                # Not an annotation file (e.g. .DS_Store): report and skip.
                print('name:', filename)
                continue
            stem = filename[:-4]
            # Decide by prefix, not by substring, so that a name that merely
            # contains the marker somewhere in the middle is not misfiled.
            if stem.startswith(train_prefix):
                ftrain.write(stem + '\n')
            elif stem.startswith(val_prefix):
                fval.write(stem + '\n')
            else:
                print('name:', stem)


# Generate the split files only when this file is run as a script.
if __name__ == "__main__":
    main()

随机数据集

有时候不需要指定数据集,只是从一个大的原始数据集中,随机选取一部分当作训练集、一部分当作验证集、一部分当作测试集。

# -*- coding: utf-8 -*-
import os
import random


def main(xmlfilepath='/Users/ll/Desktop/VOCdevkit/VOC2020/Annotations/',
         setpath='/Users/ll/Desktop/VOCdevkit/VOC2020/ImageSets/Main/',
         train_val_percent=0.9,
         train_percent=0.9):
    """Randomly split Pascal VOC annotations into trainval/train/val/test lists.

    The ``*.xml`` entries of ``xmlfilepath`` are split at random:
    ``train_val_percent`` of them go to ``trainval.txt`` and the rest to
    ``test.txt``; of the trainval part, ``train_percent`` goes to
    ``train.txt`` and the remainder to ``val.txt``. Each line of an output
    file is an annotation name without its ``.xml`` extension.

    Args:
        xmlfilepath: Directory that holds the annotation ``.xml`` files.
        setpath: Existing directory in which the four ``.txt`` files are
            (re)created.
        train_val_percent: Fraction of all annotations used for train + val.
        train_percent: Fraction of the train + val part used for training.

    Raises:
        OSError: If ``xmlfilepath`` cannot be listed or an output file
            cannot be opened for writing.
        ValueError: From ``random.sample`` when a fraction yields a sample
            size that is negative or larger than the population.
    """
    # Keep only annotation files. Counting stray entries (e.g. .DS_Store)
    # would skew the printed counts and the resulting split sizes.
    stems = []
    for entry in os.listdir(xmlfilepath):
        if entry.endswith('.xml'):
            stems.append(entry[:-4])
        else:
            print('name:', entry)

    num = len(stems)
    print('total count:%d' % num)
    train_val_count = int(num * train_val_percent)
    train_count = int(train_val_count * train_percent)
    print('train & val count:%d' % train_val_count)
    print('train count:%d' % train_count)
    val_count = train_val_count - train_count
    print('val count:%d' % val_count)
    test_count = num - train_val_count
    print('test count:%d' % test_count)

    # Sample from sequences (random.sample does not accept sets), then turn
    # the picks into sets so the membership tests in the loop are O(1).
    train_val_picks = random.sample(range(num), train_val_count)
    train_ids = set(random.sample(train_val_picks, train_count))
    train_val_ids = set(train_val_picks)

    ptrainval = os.path.join(setpath, 'trainval.txt')
    ptrain = os.path.join(setpath, 'train.txt')
    ptest = os.path.join(setpath, 'test.txt')
    pval = os.path.join(setpath, 'val.txt')

    # The context manager closes all four files even if a write fails.
    with open(ptrainval, 'w') as ftrainval, open(ptrain, 'w') as ftrain, \
            open(ptest, 'w') as ftest, open(pval, 'w') as fval:
        for i, stem in enumerate(stems):
            line = stem + '\n'
            if i not in train_val_ids:
                ftest.write(line)
                continue
            ftrainval.write(line)
            if i in train_ids:
                ftrain.write(line)
            else:
                fval.write(line)


# Generate the split files only when this file is run as a script.
if __name__ == "__main__":
    main()

你可能感兴趣的:(Python 制作Pascal VOC数据集)