voc_dataset
class VOCDataset:
Initialization
This mainly sets up the directories of the various files and determines class_names.
def __init__(self, root, transform=None, target_transform=None, is_test=False, keep_difficult=False, label_file=None):
"""Dataset for VOC data.
Args:
root: the root of the VOC2007 or VOC2012 dataset, the directory contains the following sub-directories:
Annotations, ImageSets, JPEGImages, SegmentationClass, SegmentationObject.
"""
self.root = pathlib.Path(root)
self.transform = transform
self.target_transform = target_transform
if is_test:
image_sets_file = self.root / "ImageSets/Main/test.txt"
else:
image_sets_file = self.root / "ImageSets/Main/trainval.txt"
self.ids = VOCDataset._read_image_ids(image_sets_file) # collect the ids of all the images
self.keep_difficult = keep_difficult
# if the labels file exists, read in the class names
label_file_name = self.root / "labels.txt"
if os.path.isfile(label_file_name):
class_string = ""
with open(label_file_name, 'r') as infile:
for line in infile:
class_string += line.rstrip()
# classes should be a comma separated list
classes = class_string.split(',')
# prepend BACKGROUND as first class
classes.insert(0, 'BACKGROUND')
classes = [ elem.replace(" ", "") for elem in classes]
self.class_names = tuple(classes)
logging.info("VOC Labels read from file: " + str(self.class_names))
else:
logging.info("No labels file, using default VOC classes.")
self.class_names = ('BACKGROUND',
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor')
self.class_dict = {class_name: i for i, class_name in enumerate(self.class_names)}
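A minimal usage sketch (the root path here is hypothetical; it assumes the standard VOC2007 layout described in the docstring):
# "/data/VOCdevkit/VOC2007" is a hypothetical path
dataset = VOCDataset("/data/VOCdevkit/VOC2007", is_test=False)
print(len(dataset.ids))           # number of trainval image ids
print(dataset.class_dict["dog"])  # 12 with the default VOC class list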
__getitem__
image_id is the file id of the index-th image; from it we obtain the boxes, labels, and image data, then apply the corresponding transforms.
def __getitem__(self, index):
image_id = self.ids[index]
# boxes: (num_of_objects_per_image, 4), labels: (num_of_objects_per_image,)
boxes, labels, is_difficult = self._get_annotation(image_id)
if not self.keep_difficult:
boxes = boxes[is_difficult == 0]
labels = labels[is_difficult == 0]
# image: the cv2 image data (in RGB) of the index-th image
image = self._read_image(image_id)
# at this point, labels is (num_of_objects_per_image,)
if self.transform:
image, boxes, labels = self.transform(image, boxes, labels)
# after transform, labels is still (num_of_objects_per_image,); the shape is unchanged
if self.target_transform:
boxes, labels = self.target_transform(boxes, labels)
# after target_transform, labels is torch.Size([num_priors]) (3000 here): the shapes are now unified across images, which is why target_transform is the key step
return image, boxes, labels
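The difficult-object filtering above relies on NumPy boolean-mask indexing; here is the same step in isolation (all values made up):
import numpy as np

boxes = np.array([[10, 10, 50, 50],
                  [20, 20, 80, 80],
                  [ 5,  5, 30, 30]], dtype=np.float32)
labels = np.array([12, 7, 12], dtype=np.int64)
is_difficult = np.array([0, 1, 0], dtype=np.uint8)

# keep_difficult=False drops every row whose difficult flag is set
boxes = boxes[is_difficult == 0]    # shape (2, 4)
labels = labels[is_difficult == 0]  # shape (2,)
print(boxes.shape, labels.shape)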
Reading the image ids
# input is the path of an image-set file such as test.txt; each line holds one image id and is appended to ids
@staticmethod
def _read_image_ids(image_sets_file):
ids = []
with open(image_sets_file) as f:
for line in f:
ids.append(line.rstrip())
return ids
Reading the annotations
Locate annotation_file from image_id, parse the XML file, and extract each bndbox
(assume the image contains num_of_objects_per_image objects).
The returned results are:
boxes (box coordinates): (num_of_objects_per_image, 4)
labels (class index of each object): (num_of_objects_per_image,)
is_difficult (VOC's per-object "difficult" flag, marking objects considered hard to detect; it is often all zeros, and such objects are usually excluded from official evaluation): (num_of_objects_per_image,)
def _get_annotation(self, image_id):
annotation_file = self.root / f"Annotations/{image_id}.xml"
objects = ET.parse(annotation_file).findall("object")
boxes = []
labels = []
is_difficult = []
for object in objects:
class_name = object.find('name').text.lower().strip()
# we're only concerned with classes in our list
if class_name in self.class_dict:
bbox = object.find('bndbox')
# VOC annotations use 1-based pixel coordinates (the MATLAB convention), so subtract 1 to convert to 0-based
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
boxes.append([x1, y1, x2, y2])
labels.append(self.class_dict[class_name])
is_difficult_str = object.find('difficult').text
is_difficult.append(int(is_difficult_str) if is_difficult_str else 0)
return (np.array(boxes, dtype=np.float32),
np.array(labels, dtype=np.int64),
np.array(is_difficult, dtype=np.uint8))
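To make the parsing concrete, here is the same ElementTree logic applied to a hand-written minimal annotation (the object and its coordinates are made up):
import xml.etree.ElementTree as ET

xml_string = """
<annotation>
  <object>
    <name>dog</name>
    <difficult>0</difficult>
    <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
  </object>
</annotation>
"""
for obj in ET.fromstring(xml_string).findall("object"):
    class_name = obj.find("name").text.lower().strip()
    bbox = obj.find("bndbox")
    # subtract 1 to turn VOC's 1-based coordinates into 0-based ones
    box = [float(bbox.find(tag).text) - 1 for tag in ("xmin", "ymin", "xmax", "ymax")]
    print(class_name, box)  # dog [47.0, 239.0, 194.0, 370.0]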
Reading the image data
Locate the image file from image_id, read it, and convert the color layout (BGR to RGB).
def _read_image(self, image_id):
image_file = self.root / f"JPEGImages/{image_id}.jpg"
image = cv2.imread(str(image_file))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
return image
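One caveat: cv2.imread does not raise on a missing file but silently returns None, so a defensive variant could look like this sketch (read_image_checked is a hypothetical name, not in the repo):
import pathlib
import cv2

def read_image_checked(root, image_id):
    # hypothetical defensive variant of _read_image
    image_file = pathlib.Path(root) / f"JPEGImages/{image_id}.jpg"
    image = cv2.imread(str(image_file))
    if image is None:  # cv2.imread returns None on failure
        raise FileNotFoundError(f"cannot read image: {image_file}")
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)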
Processing labels and bboxes
As seen above, __getitem__ applies two processing steps, one transform and one target_transform:
if self.transform:
image, boxes, labels = self.transform(image, boxes, labels)
if self.target_transform:
boxes, labels = self.target_transform(boxes, labels)
train_transform: TrainAugmentation
First consider transform; it is defined as follows.
train_transform = TrainAugmentation(config.image_size, config.image_mean, config.image_std)
if self.transform:
image, boxes, labels = self.transform(image, boxes, labels)
TrainAugmentation is defined below. The pattern is interesting: every transform in augment (including the bare lambda) is called with (img, boxes, labels) and returns the same triple, so they can be chained freely; a minimal Compose sketch follows the class.
Input shapes:
- boxes: (num_of_objects_per_image, 4)
- labels: (num_of_objects_per_image,)
The output shapes are unchanged.
class TrainAugmentation:
def __init__(self, size, mean=0, std=1.0):
"""
Args:
size: the size of the final image.
mean: mean pixel value per channel.
"""
self.mean = mean
self.size = size
self.augment = Compose([
ConvertFromInts(), # convert the image to np.float32
PhotometricDistort(), # photometric distortion (random color/brightness jitter)
Expand(self.mean),
RandomSampleCrop(),
RandomMirror(),
ToPercentCoords(),
Resize(self.size),
SubtractMeans(self.mean),
lambda img, boxes=None, labels=None: (img / std, boxes, labels),
ToTensor(),
])
def __call__(self, img, boxes, labels):
"""
Args:
img: the output of cv.imread in RGB layout.
boxes: bounding boxes in the form of (x1, y1, x2, y2).
labels: labels of boxes.
"""
return self.augment(img, boxes, labels)
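For reference, a Compose implementing this calling convention is presumably along the following lines (a sketch, not necessarily the repo's exact code):
class Compose:
    """Chain transforms that each take and return (img, boxes, labels)."""
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, boxes=None, labels=None):
        for t in self.transforms:
            img, boxes, labels = t(img, boxes, labels)
        return img, boxes, labels
This is also why the bare lambda at the end of the list fits in: it satisfies the same calling convention as the transform classes.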
target_transform: MatchPrior
Here target_transform is defined as follows. Its job is to convert the annotated data into the targets we need for anchor regression.
config.priors:(num_priors, 4)
target_transform = MatchPrior(config.priors, config.center_variance,
config.size_variance, 0.5)
The MatchPrior code is shown below (ssd.py).
__init__
initializes a few parameters
__call__
a pattern worth learning: later calls to the instance go down this path, turning boxes into the locations we need
class MatchPrior(object):
def __init__(self, center_form_priors, center_variance, size_variance, iou_threshold):
self.center_form_priors = center_form_priors
self.corner_form_priors = box_utils.center_form_to_corner_form(center_form_priors)
self.center_variance = center_variance
self.size_variance = size_variance
self.iou_threshold = iou_threshold
def __call__(self, gt_boxes, gt_labels):
if type(gt_boxes) is np.ndarray:
gt_boxes = torch.from_numpy(gt_boxes)
if type(gt_labels) is np.ndarray:
gt_labels = torch.from_numpy(gt_labels)
boxes, labels = box_utils.assign_priors(gt_boxes, gt_labels,
self.corner_form_priors, self.iou_threshold) # the key matching step (assign_priors, below)
boxes = box_utils.corner_form_to_center_form(boxes)
locations = box_utils.convert_boxes_to_locations(boxes, self.center_form_priors, self.center_variance, self.size_variance)
return locations, labels
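convert_boxes_to_locations is not shown in this section; assuming it follows the standard SSD offset encoding, a sketch would be:
import torch

def convert_boxes_to_locations_sketch(center_form_boxes, center_form_priors,
                                      center_variance, size_variance):
    # standard SSD offset encoding (an assumption about box_utils, not verbatim repo code):
    #   l_xy = (g_xy - p_xy) / (p_wh * center_variance)
    #   l_wh = log(g_wh / p_wh) / size_variance
    return torch.cat([
        (center_form_boxes[..., :2] - center_form_priors[..., :2])
        / center_form_priors[..., 2:] / center_variance,
        torch.log(center_form_boxes[..., 2:] / center_form_priors[..., 2:])
        / size_variance,
    ], dim=-1)
Typical SSD configs use center_variance=0.1 and size_variance=0.2, which rescale the regression targets so the four components have comparable magnitudes.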
At this point I had a question:
each image can have a different number of boxes and labels; the author extracts them all, but stacking samples into one batch would then give ragged, inconsistent lengths, so how is batching implemented? (Update 2019-09-24: target_transform, i.e. MatchPrior, unifies the lengths: every image is mapped to fixed-size tensors of shape (num_priors, 4) and (num_priors,).)
Also note that this VOCDataset does not inherit from torch's Dataset class; a DataLoader only needs __getitem__ and __len__ (the latter presumably defined elsewhere in the class), so duck typing suffices. Both points are demonstrated below.
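A toy stand-in dataset makes this concrete (every name and shape here is illustrative): after MatchPrior each sample has fixed-size tensors, so DataLoader's default collate can stack a batch.
import torch
from torch.utils.data import DataLoader

class ToyDataset:  # no Dataset inheritance, mirroring VOCDataset
    def __init__(self, n, num_priors=3000):
        self.n, self.num_priors = n, num_priors

    def __len__(self):
        return self.n

    def __getitem__(self, index):
        image = torch.randn(3, 300, 300)                         # after ToTensor
        locations = torch.randn(self.num_priors, 4)              # MatchPrior output
        labels = torch.zeros(self.num_priors, dtype=torch.int64)
        return image, locations, labels

images, locations, labels = next(iter(DataLoader(ToyDataset(8), batch_size=4)))
print(images.shape, locations.shape, labels.shape)
# torch.Size([4, 3, 300, 300]) torch.Size([4, 3000, 4]) torch.Size([4, 3000])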
The assign_priors code is as follows.
def assign_priors(gt_boxes, gt_labels, corner_form_priors,
iou_threshold):
"""Assign ground truth boxes and targets to priors.
Args:
gt_boxes (num_targets, 4): ground truth boxes.
gt_labels (num_targets): labels of targets.
corner_form_priors (num_priors, 4): corner form priors.
Returns:
boxes (num_priors, 4): real values for priors.
labels (num_priors): labels for priors.
"""
# size: num_priors x num_targets; iou_of is the author's function for computing the overlap (IoU)
ious = iou_of(gt_boxes.unsqueeze(0), corner_form_priors.unsqueeze(1))
# size: num_priors
best_target_per_prior, best_target_per_prior_index = ious.max(1)
# size: num_targets
best_prior_per_target, best_prior_per_target_index = ious.max(0)
for target_index, prior_index in enumerate(best_prior_per_target_index):
best_target_per_prior_index[prior_index] = target_index
# 2.0 is used to make sure every target has a prior assigned
best_target_per_prior.index_fill_(0, best_prior_per_target_index, 2)
# size: num_priors
labels = gt_labels[best_target_per_prior_index]
labels[best_target_per_prior < iou_threshold] = 0 # the background id
boxes = gt_boxes[best_target_per_prior_index]
return boxes, labels
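To watch the matching behave, here is a toy run; it assumes the assign_priors above is in scope, and the iou_of below is a minimal re-implementation written only for this sketch (the repo has its own):
import torch

def iou_of(boxes0, boxes1, eps=1e-5):
    # corner-form IoU with broadcasting, minimal version for the demo
    lt = torch.max(boxes0[..., :2], boxes1[..., :2])
    rb = torch.min(boxes0[..., 2:], boxes1[..., 2:])
    wh = (rb - lt).clamp(min=0)
    overlap = wh[..., 0] * wh[..., 1]
    area0 = (boxes0[..., 2] - boxes0[..., 0]) * (boxes0[..., 3] - boxes0[..., 1])
    area1 = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    return overlap / (area0 + area1 - overlap + eps)

gt_boxes = torch.tensor([[0.0, 0.0, 0.4, 0.4],
                         [0.5, 0.5, 1.0, 1.0]])
gt_labels = torch.tensor([12, 7])
priors = torch.tensor([[0.0, 0.0, 0.5, 0.5],   # best prior for target 0
                       [0.4, 0.4, 0.9, 0.9],   # IoU ~0.47 with target 1
                       [0.6, 0.6, 1.0, 1.0]])  # best prior for target 1

boxes, labels = assign_priors(gt_boxes, gt_labels, priors, iou_threshold=0.5)
print(labels)       # tensor([12,  0,  7]): the middle prior falls below 0.5, so it becomes background
print(boxes.shape)  # torch.Size([3, 4]); every prior gets a target box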