1. 数据集简介
PASCAL VOC 挑战赛主要有 Classification、Detection、Segmentation、Person Layout、Action Classification 这几类子任务,一共有20个类别,包括:
- person
- bird, cat, cow, dog, horse, sheep
- aeroplane, bicycle, boat, bus, car, motorbike, train
- bottle, chair, dining table, potted plant, sofa, tv/monitor
文件目录结构:
- Annotations:包含xml文件,其中有检测、分类等任务的标签
- ImageSets:定义了训练集、验证集与测试集的划分
- JPEGImages:原始图像
- SegmentationClass:语义分割的标签 (RGB)
- SegmentationObject:实例分割的标签 (RGB)
PS: 此处主要介绍语义分割部分。
2. 数据集的下载
PASCAL VOC 2012 数据集的下载地址:http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html
由于下载得到的语义分割标签是按类别颜色着色的彩色图像,训练前需要额外将其转换为单通道灰度图像(每个像素的值即对应类别的 trainId)。
3. 数据集的标签
# One entry per PASCAL VOC segmentation class, listed in ascending trainId
# order. The final entry is the object-boundary band rather than a real class.
labels = [
    #      name                 id  trainId  color (R, G, B)
    Label('background',          0,       0, (  0,   0,   0)),
    Label('aeroplane',           1,       1, (128,   0,   0)),
    Label('bicycle',             2,       2, (  0, 128,   0)),
    Label('bird',                3,       3, (128, 128,   0)),
    Label('boat',                4,       4, (  0,   0, 128)),
    Label('bottle',              5,       5, (128,   0, 128)),
    Label('bus',                 6,       6, (  0, 128, 128)),
    Label('car',                 7,       7, (128, 128, 128)),
    Label('cat',                 8,       8, ( 64,   0,   0)),
    Label('chair',               9,       9, (192,   0,   0)),
    Label('cow',                10,      10, ( 64, 128,   0)),
    Label('dining table',       11,      11, (192, 128,   0)),
    Label('dog',                12,      12, ( 64,   0, 128)),
    Label('horse',              13,      13, (192,   0, 128)),
    Label('motorbike',          14,      14, ( 64, 128, 128)),
    Label('person',             15,      15, (192, 128, 128)),
    Label('potted plant',       16,      16, (  0,  64,   0)),
    Label('sheep',              17,      17, (128,  64,   0)),
    Label('sofa',               18,      18, (  0, 192,   0)),
    Label('train',              19,      19, (128, 192,   0)),
    Label('tv monitor',         20,      20, (  0,  64, 128)),
    Label('bordering region',  255,      21, (224, 224, 192)),
]
PS: PASCAL VOC 分割数据集中将物体的边界区域标记为 bordering region,表示这些区域可以是任何类别,包括 background,在计算精度时将忽略该部分像素。
4. 数据集生成
所生成的data list文件格式为:
2007_000032
2007_000039
2007_000063
2007_000068
2007_000121
2007_000170
...
image 位于 JPEGImages 目录下,例如 2007_000032.jpg
mask 位于 SegmentationClass 目录下,例如 2007_000032_trainIds.png(由下面的脚本生成)
import os
import sys
import shutil
import numpy as np
from PIL import Image
from collections import namedtuple
# Immutable record describing one segmentation class.
#
#   name     Human-readable class name, e.g. 'car' or 'person'; used as the
#            key when the class table is reported.
#   id       Integer ID associated with the class in the label table.
#   trainId  Integer written into the generated single-channel training
#            masks. Those masks are stored as uint8, so the value must not
#            exceed 255.
#   color    (R, G, B) tuple that paints this class in the colour-coded
#            segmentation masks.
Label = namedtuple('Label', ('name', 'id', 'trainId', 'color'))
# One entry per PASCAL VOC segmentation class, listed in ascending trainId
# order. The final entry is the object-boundary band rather than a real class.
labels = [
    #      name                 id  trainId  color (R, G, B)
    Label('background',          0,       0, (  0,   0,   0)),
    Label('aeroplane',           1,       1, (128,   0,   0)),
    Label('bicycle',             2,       2, (  0, 128,   0)),
    Label('bird',                3,       3, (128, 128,   0)),
    Label('boat',                4,       4, (  0,   0, 128)),
    Label('bottle',              5,       5, (128,   0, 128)),
    Label('bus',                 6,       6, (  0, 128, 128)),
    Label('car',                 7,       7, (128, 128, 128)),
    Label('cat',                 8,       8, ( 64,   0,   0)),
    Label('chair',               9,       9, (192,   0,   0)),
    Label('cow',                10,      10, ( 64, 128,   0)),
    Label('dining table',       11,      11, (192, 128,   0)),
    Label('dog',                12,      12, ( 64,   0, 128)),
    Label('horse',              13,      13, (192,   0, 128)),
    Label('motorbike',          14,      14, ( 64, 128, 128)),
    Label('person',             15,      15, (192, 128, 128)),
    Label('potted plant',       16,      16, (  0,  64,   0)),
    Label('sheep',              17,      17, (128,  64,   0)),
    Label('sofa',               18,      18, (  0, 192,   0)),
    Label('train',              19,      19, (128, 192,   0)),
    Label('tv monitor',         20,      20, (  0,  64, 128)),
    Label('bordering region',  255,      21, (224, 224, 192)),
]
####################################################################################
# Number of entries in the label table; the last trainId is the one reserved
# for the 'bordering region' entry.
num_classes = 22
unspecified_id = num_classes - 1

# Per-label views of the table, in table order.
train_id = [label.trainId for label in labels]
valid_labels = {label.name: label.id for label in labels}

# The table must be sorted by trainId and contain exactly num_classes rows.
assert list(train_id) == sorted(train_id) and len(train_id) == num_classes

# Pack every colour into a single integer: r<<16 + g<<8 + b.
color_codes = [
    (label.color[0] << 16) + (label.color[1] << 8) + label.color[2]
    for label in labels
]

# Sort the (colour code, trainId) pairs by colour code so the codes can be
# searched as ascending bin edges. id_mapping holds the sorted codes and
# id_key holds the trainId found at the same position.
sorted_pairs = sorted(zip(color_codes, train_id))
id_mapping = np.array([code for code, _ in sorted_pairs], dtype='int')
id_key = np.array([tid for _, tid in sorted_pairs], dtype='int')

print('valid class: ', valid_labels)
print('train_id: ', train_id)
print('unspecified_id: ', unspecified_id)
print('id_key: ', id_key)
print('id_mapping: ', id_mapping)
"""
valid class: {'background': 0, 'aeroplane': 1, 'bicycle': 2, 'bird': 3, 'boat': 4, 'bottle': 5, 'bus': 6, 'car': 7, 'cat': 8, 'chair': 9, 'cow': 10, 'dining table': 11, 'dog': 12, 'horse': 13, 'motorbike': 14, 'person': 15, 'potted plant': 16, 'sheep': 17, 'sofa': 18, 'train': 19, 'tv monitor': 20, 'bordering region': 255}
train_id: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
unspecified_id: 21
id_key: [ 0 4 16 20 2 6 18 8 12 10 14 1 5 17 3 7 19 9 13 11 15 21]
id_mapping: [ 0 128 16384 16512 32768 32896 49152 4194304
4194432 4227072 4227200 8388608 8388736 8404992 8421376 8421504
8437760 12582912 12583040 12615680 12615808 14737600]
"""
####################################################################################
# Path of PASCAL VOC 2012 Dataset
"""
VOCdevkit
├─VOC2012
| ├─ImageSets
| ├─JPEGImages
| └─SegmentationClass
└─generate_voc_data.py
data_list_file:
img_name0
img_name1
img_name2
...
"""
# Every path is resolved relative to the directory that contains this script.
data_dir = os.path.abspath(os.path.dirname(__file__))

# Images, colour-coded masks and the official split lists, in that order.
img_dir, mask_dir, img_sets_dir = (
    os.path.join(data_dir, sub_path)
    for sub_path in (
        'VOC2012/JPEGImages',
        'VOC2012/SegmentationClass',
        'VOC2012/ImageSets/Segmentation',
    )
)
####################################################################################
def _rgb_to_train_ids(rgb, color_codes, code_train_ids, fallback_id):
    """Translate a colour-coded mask into a uint8 array of train ids.

    Args:
        rgb: (H, W, 3) array holding the R, G and B channels of the mask.
        color_codes: ascending 1-D array of packed colours (r<<16 + g<<8 + b).
        code_train_ids: 1-D array with the train id for each packed colour,
            aligned with ``color_codes``.
        fallback_id: train id assigned to pixels whose colour is not listed
            in ``color_codes``.

    Returns:
        (H, W) uint8 array of train ids.
    """
    # Widen first so the shifts below cannot overflow an 8-bit channel.
    rgb = np.asarray(rgb, dtype=np.uint32)
    encoded = np.left_shift(rgb[:, :, 0], 16) + np.left_shift(rgb[:, :, 1], 8) + rgb[:, :, 2]
    flat = encoded.ravel()
    index = np.digitize(flat, color_codes, right=True)
    # digitize returns len(color_codes) for values above the largest code;
    # clamp so the lookups below stay in range.
    index = np.minimum(index, len(color_codes) - 1)
    # Only an exact colour match identifies a class. Anything else would
    # otherwise be silently assigned to the next larger colour code.
    known = color_codes[index] == flat
    train_ids = np.where(known, code_train_ids[index], fallback_id)
    return train_ids.reshape(encoded.shape).astype('uint8')


# For every split: check that the listed files exist, create the missing
# '<name>_trainIds.png' masks, and copy the split list next to this script.
print()
for split in ('train', 'val', 'trainval', 'test'):
    list_path = os.path.join(img_sets_dir, '%s.txt' % split)
    with open(list_path, 'r') as f:
        # The image name is the first token of each line; skip blank lines.
        img_sets = [line.split()[0] for line in f if line.strip()]
    for i, img in enumerate(img_sets, 1):
        img_path = os.path.join(img_dir, img + '.jpg')
        assert os.path.exists(img_path), 'Cannot find the image: {}.'.format(img_path)
        # No masks are looked up for the test split.
        if split != 'test':
            mask_path = os.path.join(mask_dir, img + '.png')
            assert os.path.exists(mask_path), 'Cannot find the mask: {}.'.format(mask_path)
            new_mask_path = os.path.join(mask_dir, img + '_trainIds.png')
            # Masks left over from an earlier run are reused as-is.
            if not os.path.exists(new_mask_path):
                # Close the source file as soon as the pixels are copied out.
                with Image.open(mask_path) as mask_img:
                    mask = np.array(mask_img.convert('RGB'), dtype=np.uint32)
                new_mask = _rgb_to_train_ids(mask, id_mapping, id_key, unspecified_id)
                Image.fromarray(new_mask).save(new_mask_path)
            print('\rGenerating masks_trainIds: %d' % i, end=' ')
            sys.stdout.flush()
    shutil.copy(list_path, os.path.join(data_dir, '%s.txt' % split))
    print('\n{}: Found {} images in the folder {}.'.format(split, len(img_sets), img_dir))
    if split != 'test':
        # Masks were only verified for the non-test splits.
        print('{}: Found {} masks in the folder {}.'.format(split, len(img_sets), mask_dir))
    print('{}: Created data list in {}.'.format(split, data_dir))
####################################################################################
# Class weights derived from per-class pixel frequencies in the train split.
print()
# One histogram bin per train id: edges 0, 1, ..., num_classes.
bin_edges = np.arange(num_classes + 1)
class_count = np.zeros(num_classes, dtype='int64')

# Accumulate the per-class pixel totals over every train mask.
with open(os.path.join(data_dir, 'train.txt'), 'r') as f:
    img_sets = [line.split()[0] for line in f]
for i, img in enumerate(img_sets, 1):
    mask = np.array(Image.open(os.path.join(mask_dir, img + '_trainIds.png')))
    pixel_hist, _ = np.histogram(mask, bins=bin_edges)
    class_count += pixel_hist
    print('\rComputing class weight: %d' % i, end=' ')
    sys.stdout.flush()

# Frequencies and weights with the unspecified_id entry included.
# Each weight is 1 / ln(1.02 + p), where p is the class frequency.
class_p_unspecified = class_count / class_count.sum()
class_weight_unspecified = 1 / np.log(class_p_unspecified + 1.02)

# Same computation with the last (unspecified_id) entry left out.
specified_count = class_count[:-1]
class_p = specified_count / specified_count.sum()
class_weight = 1 / np.log(class_p + 1.02)
def array2string(array, format='%.6f'):
    """Render every element of ``array`` with ``format``, joined by ', '.

    Args:
        array: iterable of values to render.
        format: printf-style template applied to each element.

    Returns:
        The formatted elements separated by ', '; an empty string when
        ``array`` has no elements.
    """
    rendered = (format % value for value in array)
    return ', '.join(rendered)
# Write the label table and the class statistics to args.txt as plain text.
print()
args_path = os.path.join(data_dir, 'args.txt')

# (section title, rendered values) pairs, written in this order after the
# two leading entries below.
array_sections = (
    ('train_id:', array2string(train_id, '%d')),
    ('pixel counts for each class:', array2string(class_count, '%d')),
    ('class probability including unspecified_id:', array2string(class_p_unspecified)),
    ('class weight including unspecified_id:', array2string(class_weight_unspecified)),
    ('class probability excluding unspecified_id:', array2string(class_p)),
    ('class weight excluding unspecified_id:', array2string(class_weight)),
)

with open(args_path, 'w') as f:
    # write() is used because each payload is a single string; writelines()
    # would iterate such a string one character at a time.
    f.write('valid class:\n')
    f.write('{}\n\n'.format(valid_labels))
    f.write('unspecified_id: {}\n\n'.format(unspecified_id))
    for title, rendered in array_sections:
        f.write(title + '\n')
        f.write(rendered + '\n\n')
print('Generated class weight in {}.'.format(args_path))