<annotation>
    <folder>VOC2007</folder>
    <filename>000001.jpg</filename>
    <source>
        <database>The VOC2007 Database</database>
        <annotation>PASCAL VOC2007</annotation>
        <image>flickr</image>
        <flickrid>341012865</flickrid>
    </source>
    <owner>
        <flickrid>Fried Camels</flickrid>
        <name>Jinky the Fruit Bat</name>
    </owner>
    <size>
        <width>353</width>
        <height>500</height>
        <depth>3</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>dog</name>
        <pose>Left</pose>
        <truncated>1</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>48</xmin>
            <ymin>240</ymin>
            <xmax>195</xmax>
            <ymax>371</ymax>
        </bndbox>
    </object>
    <object>
        <name>person</name>
        <pose>Left</pose>
        <truncated>1</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>8</xmin>
            <ymin>12</ymin>
            <xmax>352</xmax>
            <ymax>498</ymax>
        </bndbox>
    </object>
</annotation>
An XML document starts with a declaration line, followed by the description of its document element (see any XML reference). In the annotation above, the first line <annotation> and the last line </annotation> form a tag pair that is the document's unique root element. XML elements can be nested, and every element must have a closing tag; for example, in <folder>VOC2007</folder>, the text VOC2007 between the tag pair is the element's content. When parsing an annotation file, we mainly need the content between <object> and </object>, as in the following snippet (a preview of the load_annotations method built later):
import xml.etree.ElementTree as ET

anno = ET.parse(annotation_path).getroot()
boxes = []
classes = []
for obj in anno.iter("object"):  # locate each <object> element
    difficult = int(obj.find("difficult").text) == 1
    if not self.use_difficult and difficult:
        continue
    _box = obj.find("bndbox")  # locate the <bndbox> under this <object>
    box = [
        _box.find("xmin").text,  # read its four children: box = [xmin, ymin, xmax, ymax]
        _box.find("ymin").text,
        _box.find("xmax").text,
        _box.find("ymax").text,
    ]
    # VOC coordinates are 1-based; subtract 1 to make them 0-based
    TO_REMOVE = 1
    box = tuple(
        map(lambda x: x - TO_REMOVE, list(map(float, box)))
    )
    boxes.append(box)
    name = obj.find("name").text.lower().strip()
    classes.append(self.name2id[name])
boxes = np.array(boxes, dtype=np.float32)
# Refer to "https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211"
The COCO dataset has two common releases, coco2014 and coco2017; both use the same JSON annotation format.
A brief note on the JSON format: a JSON object is an unordered collection of name/value pairs. An object begins with a left brace { and ends with a right brace }; each name is followed by a colon :, and name/value pairs are separated by commas ,.
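Since the annotation file is plain JSON, it can be inspected directly with Python's json module before reaching for the COCO API. A minimal sketch (the file path is hypothetical):

import json

# hypothetical path to a COCO-style annotation file
with open("annotations/instances_val2017.json") as f:
    coco_json = json.load(f)

print(coco_json.keys())                     # typically: info, licenses, images, annotations, categories
print(coco_json["images"][0])               # the first image record
print(coco_json["annotations"][0]["bbox"])  # [x, y, w, h]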
COCO datasets are usually loaded through the official COCO API (pycocotools). The annotation JSON stores all annotation information in one dictionary whose main fields are outlined below; a short pycocotools sketch follows the field descriptions.
{
    "info": info,                 # dict
    "images": [image],            # list of dicts
    "annotations": [annotation],  # list of dicts
    "categories": [category]      # list of dicts
}
The "info" field stores dataset-level information:
"info": {
    "description": "COCO 2017 Dataset",  # dataset description
    "url": "http://cocodataset.org",     # download URL
    "version": "1.0",                    # version
    "year": 2017,                        # year
    "contributor": "COCO Consortium",    # contributor
    "date_created": "2017/09/01"         # creation date
},
The "images" field stores per-image information; each {} describes one image:
"images": [
    {
        "license": 4,
        "file_name": "000000397133.jpg",  # file name
        "coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",  # COCO-hosted URL
        "height": 427,  # image height
        "width": 640,   # image width
        "date_captured": "2013-11-14 17:02:52",  # capture date
        "flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",  # flickr URL
        "id": 397133    # image ID (unique per image)
    },
    ……
],
The "annotations" field stores the annotations themselves:
"annotations": [
    {
        "segmentation": [  # object boundary (polygon vertices)
            [
                224.24, 297.18,  # first point: x, y
                228.29, 297.18,  # second point: x, y
                234.91, 298.29,
                ……
                225.34, 297.55
            ]
        ],
        "area": 1481.3806499999994,  # region area
        "iscrowd": 0,        # 0: single object (polygon); 1: crowd region (RLE)
        "image_id": 397133,  # ID of the image this annotation belongs to (matches an "images" id)
        "bbox": [217.62, 240.54, 38.99, 57.75],  # bounding box [x, y, w, h]
        "category_id": 44,   # category ID (matches a "categories" id)
        "id": 82445  # annotation ID; an image can contain several objects, so every object gets its own unique ID
    },
    ……
]
The "categories" field stores category information:
"categories": [  # category descriptions
    {
        "supercategory": "person",  # supercategory
        "id": 1,            # category ID (0 is conventionally reserved for background)
        "name": "person"    # category name
    },
    {
        "supercategory": "vehicle",
        "id": 2,
        "name": "bicycle"
    },
    {
        "supercategory": "vehicle",
        "id": 3,
        "name": "car"
    },
    ……
]
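As a minimal sketch of loading these fields through pycocotools (the annotation path is hypothetical):

from pycocotools.coco import COCO

# load the annotation JSON and build lookup indices (path is hypothetical)
coco = COCO("annotations/instances_val2017.json")

img_id = coco.getImgIds()[0]                  # first image ID
img_info = coco.loadImgs(img_id)[0]           # dict: file_name, height, width, ...
ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=None)
anns = coco.loadAnns(ann_ids)                 # list of annotation dicts
for ann in anns:
    print(ann["category_id"], ann["bbox"])    # bbox is [x, y, w, h]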
A custom Dataset class inherits from torch.utils.data.Dataset and must implement the three methods __init__, __len__, and __getitem__; it should also define a CLASSES_NAME attribute listing every class in the dataset (a possible definition is sketched below).
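The initializer below references VOCDataset.CLASSES_NAME without showing its definition. A plausible sketch is the 20 PASCAL VOC class names; the leading background entry is an assumption, mirroring the background-at-index-0 convention used for COCO later. The imports cover the methods that follow:

import os
import xml.etree.ElementTree as ET

import cv2
import numpy as np
import torch
import torch.utils.data


class VOCDataset(torch.utils.data.Dataset):
    # assumption: background first, so the 20 real VOC classes get ids 1..20
    CLASSES_NAME = (
        "__background__",
        "aeroplane", "bicycle", "bird", "boat", "bottle",
        "bus", "car", "cat", "chair", "cow",
        "diningtable", "dog", "horse", "motorbike", "person",
        "pottedplant", "sheep", "sofa", "train", "tvmonitor",
    )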
In the initializer, set up the dataset root, image sizes, the dataset splits, augmentation, and related state:
def __init__(self, root_dir, train_set=None, resize_size=None, use_difficult=False, is_train=True,
             transform=None):
    if resize_size is None:
        resize_size = [800, 1333]
    self.root = root_dir
    self.use_difficult = use_difficult
    self.train_set = train_set
    self.img_ids = list()
    # collect (root_path, image_id) pairs from the chosen ImageSets/Main splits
    for (year, name) in self.train_set:
        root_path = os.path.join(self.root, 'VOC' + year)
        for line in open(os.path.join(root_path, 'ImageSets/Main', name + '.txt')):
            self.img_ids.append((root_path, line.strip()))
    self.name2id = dict(zip(VOCDataset.CLASSES_NAME, range(len(VOCDataset.CLASSES_NAME))))
    self.id2name = {v: k for k, v in self.name2id.items()}
    self.resize_size = resize_size
    self.mean = [0.485, 0.456, 0.406]
    self.std = [0.229, 0.224, 0.225]
    self.train = is_train
    self.transform = transform
    print("INFO=====>voc dataset init finished!")
A __len__ method must return the number of samples in the dataset:
def __len__(self):
    return len(self.img_ids)
The __getitem__ method reads one image and its labels from the dataset and returns them as a dict:
def __getitem__(self, index):
    img_id = self.img_ids[index]
    img = self.load_image(img_id)
    annot = self.load_annotations(img_id)
    sample = {'img': img, 'annot': annot}
    if self.transform:
        sample = self.transform(sample)
    return sample
Next, build the image loader load_image and the label reader load_annotations:
def load_annotations(self, img_id):
    anno = ET.parse(os.path.join(img_id[0], 'Annotations', img_id[1] + '.xml')).getroot()
    annotations = np.zeros((0, 5))
    for obj in anno.iter("object"):
        difficult = int(obj.find("difficult").text) == 1
        if not self.use_difficult and difficult:
            continue
        _box = obj.find("bndbox")
        name = obj.find("name").text.lower().strip()
        box = [float(_box.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")]
        # VOC coordinates are 1-based; shift them to 0-based
        TO_REMOVE = 1
        box = [x - TO_REMOVE for x in box]
        # one row per object: [xmin, ymin, xmax, ymax, class_id]
        row = np.zeros((1, 5))
        row[0, :4] = np.array(box, dtype=np.float32)
        row[0, 4] = self.name2id[name]
        annotations = np.append(annotations, row, axis=0)
    return annotations
def load_image(self, img_id):
    img_path = os.path.join(img_id[0], 'JPEGImages', img_id[1] + '.jpg')
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img.astype(np.float32) / 255.
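As a quick sanity check, one raw sample can be pulled straight from the dataset; the root path and split name here are hypothetical:

# hypothetical VOC root containing VOC2007/{Annotations,ImageSets,JPEGImages}
dataset = VOCDataset(root_dir="H:/DataSets/VOC", train_set=[('2007', 'trainval')])
sample = dataset[0]
print(sample['img'].shape)    # (H, W, 3), float32 RGB in [0, 1]
print(sample['annot'].shape)  # (num_objects, 5): [xmin, ymin, xmax, ymax, class_id]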
While loading, the data can be augmented, resized, and normalized; for this, build the Normalizer, Augmenter, and Resizer_ transforms:
class Resizer_(object):
    def __init__(self, img_sizes=None):
        if img_sizes is None:
            img_sizes = [480, 640]
        self.img_sizes = img_sizes

    def __call__(self, sample):
        min_side, max_side = self.img_sizes
        image, annots = sample['img'], sample['annot']
        height, width, _ = image.shape
        smallest_side = min(width, height)
        largest_side = max(width, height)
        # scale so the short side reaches min_side, unless the long side would exceed max_side
        scale = min_side / smallest_side
        if largest_side * scale > max_side:
            scale = max_side / largest_side
        nw, nh = int(scale * width), int(scale * height)
        image_resized = cv2.resize(image, (nw, nh))
        # pad up to a multiple of 32 so backbone downsampling stays aligned;
        # keep the input dtype (a fixed uint8 buffer would truncate normalized float values)
        pad_w = 32 - nw % 32
        pad_h = 32 - nh % 32
        image_paded = np.zeros(shape=[nh + pad_h, nw + pad_w, 3], dtype=image_resized.dtype)
        image_paded[:nh, :nw, :] = image_resized
        if annots is None:
            return image_paded
        else:
            # boxes scale with the image
            annots[..., [0, 2]] = annots[..., [0, 2]] * scale
            annots[..., [1, 3]] = annots[..., [1, 3]] * scale
            return {'img': torch.from_numpy(image_paded).to(torch.float32), 'annot': torch.from_numpy(annots),
                    'scale': scale}
class Augmenter(object):
    """Randomly flip the image (and its boxes) horizontally."""

    def __call__(self, sample, flip_x=0.5):
        if np.random.rand() < flip_x:
            image, annots = sample['img'], sample['annot']
            image = image[:, ::-1, :].copy()  # copy so the flipped view is contiguous for later cv2 calls
            rows, cols, channels = image.shape
            # mirror the x coordinates: new_x1 = W - old_x2, new_x2 = W - old_x1
            x1 = annots[:, 0].copy()
            x2 = annots[:, 2].copy()
            x_tmp = x1.copy()
            annots[:, 0] = cols - x2
            annots[:, 2] = cols - x_tmp
            sample = {'img': image, 'annot': annots}
        return sample
class Normalizer(object):
    def __init__(self, mean=None, std=None):
        if mean is None:
            mean = [0.485, 0.456, 0.406]
        if std is None:
            std = [0.229, 0.224, 0.225]
        self.mean = np.array([[mean]])
        self.std = np.array([[std]])

    def __call__(self, sample):
        image, annots = sample['img'], sample['annot']
        return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots}
To keep every image in a batch the same size, build a collate_fn for the DataLoader that pads all images and labels in the batch to a common shape:
def collate_fn(sample):
    imgs_list = [s['img'] for s in sample]
    annots_list = [s['annot'] for s in sample]
    scales = [s['scale'] for s in sample]
    assert len(imgs_list) == len(annots_list)
    batch_size = len(annots_list)
    pad_imgs_list = []
    pad_annots_list = []
    h_list = [int(s.shape[0]) for s in imgs_list]
    w_list = [int(s.shape[1]) for s in imgs_list]
    max_h = max(np.array(h_list))
    max_w = max(np.array(w_list))
    # pad every image to the largest height/width in the batch
    for i in range(batch_size):
        img = imgs_list[i]
        pad_imgs_list.append(
            torch.nn.functional.pad(img, (0, 0, 0, int(max_w - img.shape[1]), 0, int(max_h - img.shape[0])), value=0.))
    # pad every annotation tensor to the largest object count, filling with -1
    max_num = 0
    for i in range(batch_size):
        n = annots_list[i].shape[0]
        if n > max_num:
            max_num = n
    for i in range(batch_size):
        pad_annots_list.append(
            torch.nn.functional.pad(annots_list[i], (0, 0, 0, max_num - annots_list[i].shape[0]), value=-1))
    batch_imgs = torch.stack(pad_imgs_list).permute(0, 3, 1, 2)  # NHWC -> NCHW
    batch_annots = torch.stack(pad_annots_list)
    return {'img': batch_imgs, 'annot': batch_annots, 'scale': scales}
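Putting the VOC pieces together: the three transforms chain with torchvision's transforms.Compose (it simply calls each transform in order on the sample dict), and collate_fn plugs into the DataLoader. A minimal sketch, with a hypothetical dataset path:

from torch.utils.data import DataLoader
from torchvision import transforms

# hypothetical VOC root containing VOC2007/{Annotations,ImageSets,JPEGImages}
dataset = VOCDataset(root_dir="H:/DataSets/VOC", train_set=[('2007', 'trainval')],
                     transform=transforms.Compose([Normalizer(), Augmenter(), Resizer_()]))
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch['img'].shape)    # [4, 3, max_h, max_w], NCHW
print(batch['annot'].shape)  # [4, max_num, 5], padded with -1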
COCO has its own official API (pycocotools); in a PyTorch environment we can build a COCODataset that inherits from torchvision.datasets.CocoDetection:
from torchvision.datasets import CocoDetection
import torch
import numpy as np
from torchvision import transforms
import cv2
from PIL import Image
import random
import torch.nn.functional


def flip(img, annots):
    img = img.transpose(Image.FLIP_LEFT_RIGHT)
    w = img.width
    if annots.shape[0] != 0:
        # mirror the x coordinates of the xyxy boxes
        xmin = w - annots[:, 2]
        xmax = w - annots[:, 0]
        annots[:, 2] = xmax
        annots[:, 0] = xmin
    return img, annots
class COCODataset(CocoDetection):
    CLASSES_NAME = (
        '__back_ground__', 'person', 'bicycle', 'car', 'motorcycle',
        'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
        'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
        'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle',
        'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
        'banana', 'apple', 'sandwich', 'orange', 'broccoli',
        'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
        'couch', 'potted plant', 'bed', 'dining table', 'toilet',
        'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
        'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
        'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush')

    def __init__(self, imgs_path, anno_path, resize_size=None, is_train=True, transform=None):
        super().__init__(imgs_path, anno_path)
        if resize_size is None:
            resize_size = [800, 1333]
        print("INFO====>check annos, filtering invalid data......")
        # keep only images that have at least one usable annotation
        ids = []
        for img_id in self.ids:
            ann_id = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
            ann = self.coco.loadAnns(ann_id)
            if self._has_valid_annotation(ann):
                ids.append(img_id)
        self.ids = ids
        # map the sparse COCO category IDs to contiguous IDs starting at 1 (0 is background)
        self.category2id = {v: i + 1 for i, v in enumerate(self.coco.getCatIds())}
        self.id2category = {v: k for k, v in self.category2id.items()}
        self.transform = transform
        self.resize_size = resize_size
        self.mean = [0.40789654, 0.44719302, 0.47026115]
        self.std = [0.28863828, 0.27408164, 0.27809835]
        self.train = is_train
    def __getitem__(self, index):
        img, ann = super().__getitem__(index)
        ann = [o for o in ann if o['iscrowd'] == 0]  # drop crowd annotations
        boxes = [o['bbox'] for o in ann]
        boxes = np.array(boxes, dtype=np.float32)
        # xywh --> xyxy
        boxes[..., 2:] = boxes[..., 2:] + boxes[..., :2]
        if self.train:
            if random.random() < 0.5:
                img, boxes = flip(img, boxes)
            if self.transform is not None:
                img, boxes = self.transform(img, boxes)
        img = np.array(img)
        img, boxes = self.preprocess_img_boxes(img, boxes, self.resize_size)
        classes = [o['category_id'] for o in ann]
        classes = [self.category2id[c] for c in classes]
        img = transforms.ToTensor()(img)
        # normalization is deferred to collate_fn:
        # img = transforms.Normalize(self.mean, self.std, inplace=True)(img)
        boxes = torch.from_numpy(boxes)
        classes = torch.LongTensor(classes)
        return {'img': img, 'boxes': boxes, 'classes': classes}
    @staticmethod
    def preprocess_img_boxes(image, boxes, input_ksize):
        """Resize the image and its boxes, then pad the image to a multiple of 32.

        Returns:
            image_paded: resized and padded image
            boxes: [N, 4] boxes scaled to the resized image
        """
        min_side, max_side = input_ksize
        h, w, _ = image.shape
        smallest_side = min(w, h)
        largest_side = max(w, h)
        scale = min_side / smallest_side
        if largest_side * scale > max_side:
            scale = max_side / largest_side
        nw, nh = int(scale * w), int(scale * h)
        image_resized = cv2.resize(image, (nw, nh))
        pad_w = 32 - nw % 32
        pad_h = 32 - nh % 32
        image_paded = np.zeros(shape=[nh + pad_h, nw + pad_w, 3], dtype=np.uint8)
        image_paded[:nh, :nw, :] = image_resized
        if boxes is None:
            return image_paded
        else:
            boxes[:, [0, 2]] = boxes[:, [0, 2]] * scale
            boxes[:, [1, 3]] = boxes[:, [1, 3]] * scale
            return image_paded, boxes
    @staticmethod
    def _has_only_empty_bbox(annot):
        # True if every box has width or height <= 1 pixel
        return all(any(o <= 1 for o in obj['bbox'][2:]) for obj in annot)

    def _has_valid_annotation(self, annot):
        if len(annot) == 0:
            return False
        if self._has_only_empty_bbox(annot):
            return False
        return True
    def collate_fn(self, sample):
        imgs_list = [s['img'] for s in sample]
        boxes_list = [s['boxes'] for s in sample]
        classes_list = [s['classes'] for s in sample]
        assert len(imgs_list) == len(boxes_list) == len(classes_list)
        batch_size = len(boxes_list)
        pad_imgs_list = []
        pad_boxes_list = []
        pad_classes_list = []
        h_list = [int(s.shape[1]) for s in imgs_list]
        w_list = [int(s.shape[2]) for s in imgs_list]
        max_h = max(np.array(h_list))
        max_w = max(np.array(w_list))
        # pad images (already CHW) to the batch maximum, then normalize
        for i in range(batch_size):
            img = imgs_list[i]
            pad_imgs_list.append(transforms.Normalize(self.mean, self.std, inplace=True)(
                torch.nn.functional.pad(img, (0, int(max_w - img.shape[2]), 0, int(max_h - img.shape[1])), value=0.)))
        # pad boxes/classes to the maximum object count, filling with -1
        max_num = 0
        for i in range(batch_size):
            n = boxes_list[i].shape[0]
            if n > max_num:
                max_num = n
        for i in range(batch_size):
            pad_boxes_list.append(
                torch.nn.functional.pad(boxes_list[i], (0, 0, 0, max_num - boxes_list[i].shape[0]), value=-1))
            pad_classes_list.append(
                torch.nn.functional.pad(classes_list[i], (0, max_num - classes_list[i].shape[0]), value=-1))
        batch_boxes = torch.stack(pad_boxes_list)
        batch_classes = torch.stack(pad_classes_list)
        batch_imgs = torch.stack(pad_imgs_list)
        return {'img': batch_imgs, 'boxes': batch_boxes, 'classes': batch_classes}
if __name__ == "__main__":
    import torch.utils.data
    import matplotlib.pyplot as plt

    dataset = COCODataset("H:/DataSets/COCO/coco2017/train2017",
                          "H:/DataSets/COCO/coco2017/annotations/instances_train2017.json")
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=1)
    cmap = plt.get_cmap('tab20b')
    colors = [cmap(i) for i in np.linspace(0, 1, len(dataset.CLASSES_NAME))]
    for num, data in enumerate(train_loader):
        image = np.asarray(data['img'].squeeze(0).permute(1, 2, 0) * 255, dtype='uint8')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        boxes = data['boxes'].squeeze(0)
        classes = data['classes'].squeeze(0)
        # draw each box and its class name in a per-class color
        for i, box in enumerate(boxes):
            pt1 = (int(box[0]), int(box[1]))
            pt2 = (int(box[2]), int(box[3]))
            color = tuple(255 * j for j in colors[int(classes[i])][:3])
            cv2.rectangle(image, pt1, pt2, color)
            cls = COCODataset.CLASSES_NAME[int(classes[i])]
            cv2.putText(image, cls, (int(box[0]), int(box[1]) + 20), 0, 1, color, 2)
        cv2.imshow('img', image)
        cv2.waitKey(0)