<database>The VOC2007 Database</database>
<annotation>PASCAL VOC2007</annotation>
<flickrid>Fried Camels</flickrid>
<name>Jinky the Fruit Bat</name>
第一行 <annotation> 和最后一行 </annotation> 组成一个标签对,是xml文档里唯一的根元素;xml元素可以进行嵌套,所有元素都必须要有结束标签,如 <name> 必须以 </name> 结束。
在标注文件中主要需要获取 <name>(类别名称)和 <bndbox>(边界框坐标)。
import xml.etree.ElementTree as ET
# Parse one Pascal VOC annotation file and collect its boxes and class ids.
# This is an excerpt from a dataset method: `self` and `annotation_path`
# come from the surrounding code.
anno = ET.parse(annotation_path).getroot()
boxes = []
classes = []
# Offset subtracted from every coordinate (makes the pixel indexes 0-based).
# Refer to "https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211"
TO_REMOVE = 1
for obj in anno.iter("object"):  # visit every <object> element
    difficult = int(obj.find("difficult").text) == 1
    if not self.use_difficult and difficult:
        continue  # skip objects flagged as difficult unless they were requested
    _box = obj.find("bndbox")  # the <bndbox> child holds the four coordinates
    # box = [xmin, ymin, xmax, ymax], still as strings at this point
    box = [
        _box.find("xmin").text,
        _box.find("ymin").text,
        _box.find("xmax").text,
        _box.find("ymax").text,
    ]
    box = tuple(
        map(lambda x: x - TO_REMOVE, list(map(float, box)))
    )
    boxes.append(box)
    name = obj.find("name").text.lower().strip()
    classes.append(self.name2id[name])
boxes = np.array(boxes, dtype=np.float32)
json格式文件简介:对象是一个无序的“‘名称/值’对”集合。一个对象以 {(左括号)开始,以 }(右括号)结束。每个“名称”后跟一个 :(冒号);“‘名称/值’对”之间使用 ,(逗号)分隔。
"info": info, # dict
"images": [license], # list ,内部是dict
"annotations": [annotation], # list ,内部是dict
"categories": # list ,内部是dict
"info": { # 数据集信息描述
"description": "COCO 2017 Dataset", # 数据集描述
"url": "http://cocodataset.org", # 下载地址
"version": "1.0", # 版本
"year": 2017, # 年份
"contributor": "COCO Consortium", # 提供者
"date_created": "2017/09/01" # 数据创建日期
"images": [
"license": 4,
"file_name": "000000397133.jpg", # 图片名
"coco_url": "http://images.cocodataset.org/val2017/000000397133.jpg",# 网路地址路径
"height": 427, # 高
"width": 640, # 宽
"date_captured": "2013-11-14 17:02:52", # 数据获取日期
"flickr_url": "http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg",# flickr网路地址
"id": 397133 # 图片的ID编号(每张图片ID是唯一的)
"annotations": [
"segmentation": [ # 对象的边界点(边界多边形)
224.24,297.18,# 第一个点 x,y坐标
228.29,297.18, # 第二个点 x,y坐标
"area": 1481.3806499999994, # 区域面积
"iscrowd": 0, #
"image_id": 397133, # 对应的图片ID(与images中的ID对应)
"bbox": [217.62,240.54,38.99,57.75], # 定位边框 [x,y,w,h]
"category_id": 44, # 类别ID(与categories中的ID对应)
"id": 82445 # 对象ID,因为每一个图像有不止一个对象,所以要对每一个对象编号(每个对象的ID是唯一的)
"categories": [ # 类别描述
"supercategory": "person", # 主类别
"id": 1, # 类对应的id (0 默认为背景)
"name": "person" # 子类别
"supercategory": "vehicle",
"id": 2,
"name": "bicycle"
"supercategory": "vehicle",
"id": 3,
"name": "car"
def __init__(self, root_dir, train_set=None, resize_size=None, use_difficult=False, is_train=True,
             transform=None):
    """Index the images of the requested Pascal VOC splits.

    Args:
        root_dir: Directory containing the ``VOC<year>`` folders.
        train_set: Iterable of ``(year, split)`` string pairs; each pair
            selects ``VOC<year>/ImageSets/Main/<split>.txt``.
        resize_size: Stored as ``self.resize_size``; defaults to
            ``[800, 1333]``.
        use_difficult: Stored as ``self.use_difficult`` (consulted when
            annotations are loaded).
        is_train: Stored as ``self.train``.
        transform: Optional callable applied to each sample dict.
            NOTE(review): the tail of the original signature was not
            visible; ``transform=None`` is reconstructed from the body's
            use of ``transform`` - confirm against the full file.

    Raises:
        TypeError: If ``train_set`` is None.
    """
    if train_set is None:
        # Iterating None used to fail with an opaque "'NoneType' object is
        # not iterable"; keep the exception type but say what is expected.
        raise TypeError("train_set must be an iterable of (year, split) pairs, "
                        "e.g. [('2007', 'trainval')]")
    if resize_size is None:
        resize_size = [800, 1333]
    self.root = root_dir
    self.use_difficult = use_difficult
    self.train_set = train_set
    self.img_ids = list()
    for (year, name) in self.train_set:
        root_path = os.path.join(self.root, 'VOC' + year)
        split_file = os.path.join(root_path, 'ImageSets/Main', name + '.txt')
        # Context manager so the split file is closed deterministically.
        with open(split_file) as f:
            for line in f:
                image_name = line.strip()
                if image_name:  # a blank line would yield an unusable empty id
                    self.img_ids.append((root_path, image_name))
    # Two-way mapping between class names and integer class ids.
    self.name2id = dict(zip(VOCDataset.CLASSES_NAME, range(len(VOCDataset.CLASSES_NAME))))
    self.id2name = {v: k for k, v in self.name2id.items()}
    self.resize_size = resize_size
    self.mean = [0.485, 0.456, 0.406]
    self.std = [0.229, 0.224, 0.225]
    self.train = is_train
    self.transform = transform
    print("INFO=====>voc dataset init finished ! !")
def __len__(self):
    """Return how many image ids were collected for this dataset."""
    num_images = len(self.img_ids)
    return num_images
def __getitem__(self, index):
    """Build the sample for position ``index``.

    The image is loaded first, then its annotations; both are packed into a
    dict with keys ``'img'`` and ``'annot'``. If ``self.transform`` is
    truthy, its result for that dict is returned instead.
    """
    img_id = self.img_ids[index]
    sample = {
        'img': self.load_image(img_id),
        'annot': self.load_annotations(img_id),
    }
    return self.transform(sample) if self.transform else sample
def load_annotations(self, img_id):
    """Read the VOC XML annotation of one image into an ``(N, 5)`` array.

    Args:
        img_id: ``(root_path, image_name)`` pair; the file read is
            ``<root_path>/Annotations/<image_name>.xml``.

    Returns:
        A float64 array with one row ``[xmin, ymin, xmax, ymax, class_id]``
        per kept object; shape ``(0, 5)`` when no object is kept. Every
        coordinate has 1 subtracted, and ``class_id`` comes from
        ``self.name2id``.

    Raises:
        KeyError: If an object's name is not a key of ``self.name2id``.
    """
    # Offset subtracted from every coordinate (makes pixel indexes 0-based).
    TO_REMOVE = 1
    xml_path = os.path.join(img_id[0], 'Annotations', img_id[1] + '.xml')
    root = ET.parse(xml_path).getroot()
    rows = []
    for obj in root.iter("object"):
        difficult_node = obj.find("difficult")
        # A missing <difficult> tag is treated as "not difficult".
        difficult = difficult_node is not None and int(difficult_node.text) == 1
        if not self.use_difficult and difficult:
            continue
        _box = obj.find("bndbox")
        name = obj.find("name").text.lower().strip()
        coords = [float(_box.find(tag).text) - TO_REMOVE
                  for tag in ("xmin", "ymin", "xmax", "ymax")]
        rows.append(coords + [self.name2id[name]])
    if not rows:
        return np.zeros((0, 5))
    # Values pass through float32 before being stored as float64, matching
    # the precision of the original per-object assignment.
    return np.array(rows, dtype=np.float32).astype(np.float64)
def load_image(self, img_id):
    """Load one JPEG as an RGB float32 array scaled by 1/255.

    Args:
        img_id: ``(root_path, image_name)`` pair; the file read is
            ``<root_path>/JPEGImages/<image_name>.jpg``.

    Returns:
        The decoded image converted from BGR to RGB, cast to float32 and
        divided by 255.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    img_path = os.path.join(img_id[0], 'JPEGImages', img_id[1] + '.jpg')
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the path rather than later inside cvtColor.
        raise FileNotFoundError("cannot read image: %s" % img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img.astype(np.float32) / 255.
class Resizer_(object):
    """Rescale a sample to fit ``img_sizes`` and pad the image on the right/bottom.

    ``img_sizes`` is ``[min_side, max_side]``: the shorter image side is
    scaled to ``min_side`` unless that would push the longer side beyond
    ``max_side``, in which case the longer side is scaled to ``max_side``.
    """

    def __init__(self, img_sizes=None):
        """Store the target ``[min_side, max_side]``; defaults to ``[480, 640]``."""
        if img_sizes is None:
            img_sizes = [480, 640]
        self.img_sizes = img_sizes

    def __call__(self, sample):
        """Resize and pad ``sample['img']`` and rescale ``sample['annot']``.

        Args:
            sample: Dict with ``'img'`` (an H x W x 3 array) and ``'annot'``
                (an array whose last axis starts with x1, y1, x2, y2, or
                None).

        Returns:
            The padded image array alone when ``annot`` is None; otherwise
            a dict with ``'img'`` (float32 tensor), ``'annot'`` (tensor) and
            ``'scale'`` (the applied factor). ``annot`` is scaled in place.
        """
        min_side, max_side = self.img_sizes
        image, annots = sample['img'], sample['annot']
        height, width, _ = image.shape

        smallest_side = min(width, height)
        largest_side = max(width, height)
        scale = min_side / smallest_side
        if largest_side * scale > max_side:
            scale = max_side / largest_side

        nw, nh = int(scale * width), int(scale * height)
        image_resized = cv2.resize(image, (nw, nh))

        # Pads up to the next multiple of 32 (a full 32 when already aligned).
        pad_w = 32 - nw % 32
        pad_h = 32 - nh % 32
        # The canvas takes the image's dtype: a uint8 canvas would truncate
        # float images (e.g. values scaled to [0, 1]) on assignment.
        image_paded = np.zeros(shape=[nh + pad_h, nw + pad_w, 3], dtype=image_resized.dtype)
        image_paded[:nh, :nw, :] = image_resized

        if annots is None:
            return image_paded

        annots[..., [0, 2]] = annots[..., [0, 2]] * scale
        annots[..., [1, 3]] = annots[..., [1, 3]] * scale
        return {'img': torch.from_numpy(image_paded).to(torch.float32), 'annot': torch.from_numpy(annots),
                'scale': scale}
class Augmenter(object):
    """Randomly mirror a sample horizontally."""

    def __call__(self, sample, flip_x=0.5):
        """Flip the image and its boxes left-right with probability ``flip_x``.

        Args:
            sample: Dict with ``'img'`` (array indexed as [row, col, channel])
                and ``'annot'`` (2-D array whose columns 0 and 2 are the
                x1 and x2 box coordinates).
            flip_x: Probability of flipping.

        Returns:
            ``sample`` itself when no flip happens; otherwise a new dict
            holding only ``'img'`` (a reversed view of the input image) and
            ``'annot'`` (the same array, updated in place).
        """
        if np.random.rand() >= flip_x:
            return sample

        image, annots = sample['img'], sample['annot']
        image = image[:, ::-1, :]
        cols = image.shape[1]

        # Both right-hand sides are fresh arrays, so writing column 0 cannot
        # disturb the values needed for column 2.
        flipped_x1 = cols - annots[:, 2]
        flipped_x2 = cols - annots[:, 0]
        annots[:, 0] = flipped_x1
        annots[:, 2] = flipped_x2

        return {'img': image, 'annot': annots}
class Normalizer(object):
    """Subtract a per-channel mean from the image and divide by a per-channel std."""

    def __init__(self, mean=None, std=None):
        """Store ``mean`` and ``std`` as arrays wrapped in two extra leading axes.

        When omitted, ``mean`` is ``[0.485, 0.456, 0.406]`` and ``std`` is
        ``[0.229, 0.224, 0.225]``.
        """
        channel_mean = [0.485, 0.456, 0.406] if mean is None else mean
        channel_std = [0.229, 0.224, 0.225] if std is None else std
        self.mean = np.array([[channel_mean]])
        self.std = np.array([[channel_std]])

    def __call__(self, sample):
        """Return a new dict with the normalized image and the untouched annotations."""
        image, annots = sample['img'], sample['annot']
        centered = image.astype(np.float32) - self.mean
        normalized = centered / self.std
        return {'img': normalized, 'annot': annots}
def collate_fn(sample):
    """Collate sample dicts into one batch, padding to common sizes.

    Args:
        sample: Sequence of dicts with ``'img'`` (H x W x C tensor),
            ``'annot'`` (N x K tensor) and ``'scale'``.

    Returns:
        Dict with ``'img'`` (B x C x maxH x maxW, zero-padded on the
        right/bottom), ``'annot'`` (B x maxN x K, missing rows filled with
        -1) and ``'scale'`` (list of the per-sample scales).
    """
    imgs_list = [s['img'] for s in sample]
    annots_list = [s['annot'] for s in sample]
    scales = [s['scale'] for s in sample]

    max_h = max(int(img.shape[0]) for img in imgs_list)
    max_w = max(int(img.shape[1]) for img in imgs_list)
    # F.pad lists (before, after) pairs starting from the LAST dimension:
    # channels get (0, 0), width is padded on the right, height at the bottom.
    pad_imgs_list = [
        torch.nn.functional.pad(
            img, (0, 0, 0, int(max_w - img.shape[1]), 0, int(max_h - img.shape[0])), value=0.)
        for img in imgs_list
    ]

    max_num = max(annots.shape[0] for annots in annots_list)
    # Pad the row dimension so every sample carries max_num annotation rows.
    pad_annots_list = [
        torch.nn.functional.pad(annots, (0, 0, 0, max_num - annots.shape[0]), value=-1)
        for annots in annots_list
    ]

    # Stack to B x H x W x C, then move channels ahead of the spatial axes.
    batch_imgs = torch.stack(pad_imgs_list).permute(0, 3, 1, 2)
    batch_annots = torch.stack(pad_annots_list)
    return {'img': batch_imgs, 'annot': batch_annots, 'scale': scales}
from torchvision.datasets import CocoDetection
import torch
import numpy as np
from torchvision import transforms
import cv2
from PIL import Image
import random
import torch.nn.functional
def flip(img, annots):
    """Mirror a PIL image left-right and update the box x-coordinates to match.

    ``annots`` is modified in place: columns 0 and 2 (xmin, xmax) become
    ``width - xmax`` and ``width - xmin``. An array with zero rows is
    returned untouched. Returns ``(flipped_image, annots)``.
    """
    mirrored = img.transpose(Image.FLIP_LEFT_RIGHT)
    if annots.shape[0] == 0:
        return mirrored, annots
    width = mirrored.width
    # Fancy indexing on the right-hand side copies the columns, so the swap
    # cannot read values that were already overwritten.
    annots[:, [0, 2]] = width - annots[:, [2, 0]]
    return mirrored, annots
class COCODataset(CocoDetection):
    """COCO detection dataset yielding resized, padded images with boxes and labels.

    Images whose annotations are missing, or whose boxes are all degenerate,
    are removed from ``self.ids`` at construction time.
    """

    # Index 0 is a background placeholder; category2id maps COCO category
    # ids onto indices 1..N of this tuple.
    CLASSES_NAME = (
        '__back_ground__', 'person', 'bicycle', 'car', 'motorcycle',
        'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
        'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
        'sports ball', 'kite', 'baseball bat', 'baseball glove',
        'skateboard', 'surfboard', 'tennis racket', 'bottle',
        'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
        'banana', 'apple', 'sandwich', 'orange', 'broccoli',
        'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
        'couch', 'potted plant', 'bed', 'dining table', 'toilet',
        'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
        'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
        'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush')

    def __init__(self, imgs_path, anno_path, resize_size=None, is_train=True, transform=None):
        """Load the COCO index and keep only images with usable annotations.

        Args:
            imgs_path: Image directory, forwarded to ``CocoDetection``.
            anno_path: Annotation JSON file, forwarded to ``CocoDetection``.
            resize_size: ``[min_side, max_side]`` used when resizing;
                defaults to ``[800, 1333]``.
            is_train: Enables the random flip and ``transform`` in
                ``__getitem__``.
            transform: Optional callable ``(img, boxes) -> (img, boxes)``.
        """
        super().__init__(imgs_path, anno_path)
        if resize_size is None:
            resize_size = [800, 1333]

        print("INFO====>check annos, filtering invalid data......")
        ids = []
        for img_id in self.ids:
            ann_id = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
            ann = self.coco.loadAnns(ann_id)
            if self._has_valid_annotation(ann):
                ids.append(img_id)
        self.ids = ids

        # COCO category ids -> consecutive ids starting at 1, and back.
        self.category2id = {v: i + 1 for i, v in enumerate(self.coco.getCatIds())}
        self.id2category = {v: k for k, v in self.category2id.items()}

        self.transform = transform
        self.resize_size = resize_size
        self.mean = [0.40789654, 0.44719302, 0.47026115]
        self.std = [0.28863828, 0.27408164, 0.27809835]
        self.train = is_train

    def __getitem__(self, index):
        """Return ``{'img', 'boxes', 'classes'}`` for one image.

        ``img`` is the tensor produced by ``transforms.ToTensor`` from the
        resized and padded image, ``boxes`` holds ``[x1, y1, x2, y2]`` rows
        in the resized image's coordinates, and ``classes`` holds the mapped
        category ids. Crowd annotations are dropped.
        """
        img, ann = super().__getitem__(index)

        ann = [o for o in ann if o['iscrowd'] == 0]
        boxes = [o['bbox'] for o in ann]
        # reshape keeps the array 2-D even when every annotation was a crowd
        # region; the column indexing below would fail on a 1-D empty array.
        boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4)
        # xywh-->xyxy
        boxes[..., 2:] = boxes[..., 2:] + boxes[..., :2]
        if self.train:
            if random.random() < 0.5:
                img, boxes = flip(img, boxes)
            if self.transform is not None:
                img, boxes = self.transform(img, boxes)
        img = np.array(img)

        img, boxes = self.preprocess_img_boxes(img, boxes, self.resize_size)

        classes = [o['category_id'] for o in ann]
        classes = [self.category2id[c] for c in classes]

        img = transforms.ToTensor()(img)
        # Normalization is applied later, in collate_fn, after padding.
        boxes = torch.from_numpy(boxes)
        classes = torch.LongTensor(classes)

        return {'img': img, 'boxes': boxes, 'classes': classes}

    @staticmethod
    def preprocess_img_boxes(image, boxes, input_ksize):
        """Resize the image and boxes, then pad the image on the right/bottom.

        Args:
            image: H x W x 3 array.
            boxes: ``[N, 4]`` array of ``[x1, y1, x2, y2]`` rows, or None.
            input_ksize: ``[min_side, max_side]`` target size.

        Returns:
            ``image_paded`` alone when ``boxes`` is None, otherwise
            ``(image_paded, boxes)`` with ``boxes`` scaled in place.
        """
        min_side, max_side = input_ksize
        h, w, _ = image.shape

        smallest_side = min(w, h)
        largest_side = max(w, h)
        scale = min_side / smallest_side
        if largest_side * scale > max_side:
            scale = max_side / largest_side
        nw, nh = int(scale * w), int(scale * h)
        image_resized = cv2.resize(image, (nw, nh))

        # Pads up to the next multiple of 32 (a full 32 when already aligned).
        pad_w = 32 - nw % 32
        pad_h = 32 - nh % 32

        image_paded = np.zeros(shape=[nh + pad_h, nw + pad_w, 3], dtype=np.uint8)
        image_paded[:nh, :nw, :] = image_resized

        if boxes is None:
            return image_paded
        boxes[:, [0, 2]] = boxes[:, [0, 2]] * scale
        boxes[:, [1, 3]] = boxes[:, [1, 3]] * scale
        return image_paded, boxes

    @staticmethod
    def _has_only_empty_bbox(annot):
        """Return True when every box has a width or height of at most 1."""
        return all(any(o <= 1 for o in obj['bbox'][2:]) for obj in annot)

    def _has_valid_annotation(self, annot):
        """Return True when ``annot`` is non-empty and has a non-degenerate box."""
        if len(annot) == 0:
            return False

        if self._has_only_empty_bbox(annot):
            return False

        return True

    def collate_fn(self, sample):
        """Collate ``__getitem__`` outputs into one padded, normalized batch.

        Images (C x H x W) are zero-padded on the right/bottom to the
        largest height and width in the batch and then normalized with
        ``self.mean`` / ``self.std``. Boxes and classes are padded with -1
        up to the largest object count in the batch.

        Returns:
            Dict with ``'img'`` (B x C x maxH x maxW), ``'boxes'``
            (B x maxN x 4) and ``'classes'`` (B x maxN).
        """
        imgs_list = [s['img'] for s in sample]
        boxes_list = [s['boxes'] for s in sample]
        classes_list = [s['classes'] for s in sample]

        max_h = max(int(img.shape[1]) for img in imgs_list)
        max_w = max(int(img.shape[2]) for img in imgs_list)
        pad_imgs_list = []
        for img in imgs_list:
            padded = torch.nn.functional.pad(
                img, (0, int(max_w - img.shape[2]), 0, int(max_h - img.shape[1])), value=0.)
            pad_imgs_list.append(transforms.Normalize(self.mean, self.std, inplace=True)(padded))

        max_num = max(boxes.shape[0] for boxes in boxes_list)
        pad_boxes_list = []
        pad_classes_list = []
        for boxes, classes in zip(boxes_list, classes_list):
            pad_boxes_list.append(
                torch.nn.functional.pad(boxes, (0, 0, 0, max_num - boxes.shape[0]), value=-1))
            pad_classes_list.append(
                torch.nn.functional.pad(classes, (0, max_num - classes.shape[0]), value=-1))

        batch_boxes = torch.stack(pad_boxes_list)
        batch_classes = torch.stack(pad_classes_list)
        batch_imgs = torch.stack(pad_imgs_list)

        return {'img': batch_imgs, 'boxes': batch_boxes, 'classes': batch_classes}
if __name__ == "__main__":
    # Visual smoke test: draw every box and class name on each training image.
    # NOTE(review): the loop nesting is reconstructed from a flattened
    # source, and the script may continue past this point - confirm.
    dataset = COCODataset("H:/DataSets/COCO/coco2017/train2017", "H:/DataSets/COCO/coco2017/annotations/instances_train2017.json")
    import torch.utils.data
    import matplotlib.pyplot as plt
    import cv2
    import numpy as np

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=1)
    # One colour per class name, sampled evenly from the 'tab20b' colormap.
    cmap = plt.get_cmap('tab20b')
    colors = [cmap(i) for i in np.linspace(0, 1, len(dataset.CLASSES_NAME))]
    for num, data in enumerate(train_loader):
        # Batch of one: drop the batch axis, go to H x W x C, scale to 0..255.
        pixels = data['img'].squeeze(0).permute(1, 2, 0) * 255
        image = np.asarray(pixels, dtype='uint8')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        boxes = data['boxes'].squeeze(0)
        classes = data['classes'].squeeze(0)
        for box, label in zip(boxes, classes):
            class_index = int(label)
            top_left = (int(box[0]), int(box[1]))
            bottom_right = (int(box[2]), int(box[3]))
            # Colormap entries are 0..1 floats; scale the first three to 0..255.
            color = tuple(255 * channel for channel in colors[class_index][:3])
            cv2.rectangle(image, top_left, bottom_right, color)
            cls = "%s" % (COCODataset.CLASSES_NAME[class_index])
            cv2.putText(image, cls, (int(box[0]), int(box[1]) + 20), 0, 1, color, 2)
        cv2.imshow('img', image)