YOLOv1 main program walkthrough:
Part 1: Walkthrough of the overall training flow
1: The main program is in train.py. It first sets up a configuration for the resnet50 and vgg16_bn backbones and loads the pretrained parameters of the chosen backbone into the custom network model net.
2: It defines the loss function yoloLoss; the relevant code is:
# the loss implementation follows the paper and deserves a more detailed read-through of its own
criterion = yoloLoss(7, 2, 5, 0.5)
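A hedged reading of the four positional arguments (the names below come from the YOLOv1 paper, not from this repo's yoloLoss signature, which is not shown in this post):
# yoloLoss(7, 2, 5, 0.5)
#   7    -> S, the grid size: the image is divided into an S x S grid
#   2    -> B, the number of bounding boxes predicted per grid cell
#   5    -> lambda_coord, the weight on the coordinate regression terms
#   0.5  -> lambda_noobj, the weight on the confidence loss of cells that contain no object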
3: It defines the optimizer:
optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=5e-4)
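The params argument is built earlier in train.py and is not shown in this post. A minimal sketch of what such a per-parameter-group list can look like (the grouping rule here is purely illustrative, not necessarily what this repo does):
params = []
for name, value in net.named_parameters():
    # illustrative split: backbone layers vs. the rest; both groups use the same LR here,
    # but keeping them separate makes it easy to give either group its own learning rate
    if name.startswith('features'):
        params += [{'params': [value], 'lr': learning_rate}]
    else:
        params += [{'params': [value], 'lr': learning_rate}]
Note that the per-epoch schedule inside the training loop below overwrites every param_group's 'lr' with the same value, so any per-group difference set here would not survive past that assignment.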
4: It defines the training and validation datasets:
# training set
train_dataset = yoloDataset(root=file_root, list_file=['voc2007.txt'], train=True, transform=[transforms.ToTensor()])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# validation set
test_dataset = yoloDataset(root=test_data_path, list_file='voc2007subtest.txt', train=False,
                           transform=[transforms.ToTensor()])
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
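A quick sanity check of what the loader yields (a sketch; the shapes assume the 448 x 448 input size and the 14 x 14 x 30 target encoding described in Part 2, and batch_size is whatever was configured above):
images, target = next(iter(train_loader))
print(images.shape)   # e.g. torch.Size([batch_size, 3, 448, 448])
print(target.shape)   # e.g. torch.Size([batch_size, 14, 14, 30])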
5: It runs the training loop:
for epoch in range(num_epochs):
    net.train()
    # step the learning rate down at epochs 30 and 40
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    # optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate*0.1, momentum=0.9, weight_decay=1e-4)
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))

    total_loss = 0.
    time_begin = time.time()
    for i, (images, target) in enumerate(train_loader):
        # Variable is a no-op wrapper since PyTorch 0.4; kept as in the original code
        images = Variable(images)
        target = Variable(target)
        if use_gpu:
            images, target = images.cuda(), target.cuda()

        pred = net(images)
        loss = criterion(pred, target)
        # original implementation (PyTorch <= 0.3):
        # total_loss += loss.data[0]
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f'
                  # % (epoch+1, num_epochs, i+1, len(train_loader), loss.data[0], total_loss / (i+1)))
                  % (epoch + 1, num_epochs, i + 1, len(train_loader), loss.item(), total_loss / (i + 1)))
            num_iter += 1
            vis.plot_train_val(loss_train=total_loss / (i + 1))
    time_end = time.time()
    # elapsed wall-clock time for this epoch
    print('Epoch [%d], cost time [%f]' % (epoch + 1, (time_end - time_begin)))

    # validation
    validation_loss = 0.0
    net.eval()
    for i, (images, target) in enumerate(test_loader):
        # volatile=True was the pre-0.4 way to disable gradient tracking
        images = Variable(images, volatile=True)
        target = Variable(target, volatile=True)
        if use_gpu:
            images, target = images.cuda(), target.cuda()
        pred = net(images)
        loss = criterion(pred, target)
        # validation_loss += loss.data[0]
        validation_loss += loss.item()
    validation_loss /= len(test_loader)
    vis.plot_train_val(loss_val=validation_loss)

    # keep the best checkpoint, log the validation loss, and always save the latest weights
    if best_test_loss > validation_loss:
        best_test_loss = validation_loss
        print('get best test loss %.5f' % best_test_loss)
        torch.save(net.state_dict(), 'best.pth')
    logfile.writelines(str(epoch) + '\t' + str(validation_loss) + '\n')
    logfile.flush()
    torch.save(net.state_dict(), 'yolo.pth')
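Variable and volatile=True have been deprecated since PyTorch 0.4. A minimal sketch of the equivalent validation pass in current PyTorch, using the same names as above, would be:
net.eval()
validation_loss = 0.0
with torch.no_grad():   # replaces volatile=True: no autograd graph is built during evaluation
    for images, target in test_loader:
        if use_gpu:
            images, target = images.cuda(), target.cuda()
        pred = net(images)
        validation_loss += criterion(pred, target).item()
validation_loss /= len(test_loader)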
Part 2: Dataset processing
1: Data preprocessing
The dataset's constructor is defined as follows:
def __init__(self, root, list_file, train, transform):
    print('data init')
    self.root = root
    self.train = train
    self.transform = transform
    self.fnames = []
    self.boxes = []
    self.labels = []
    self.mean = (123, 117, 104)  # RGB
    print("yoloDataset init: list_file is {0}, type is {1}".format(list_file, type(list_file)))
    if isinstance(list_file, list):
        # Cat multiple list files together.
        # This is especially useful for voc07/voc12 combination.
        # tmp_file = '/tmp/listfile.txt'
        # raw string so that backslashes (e.g. \t in \tmp) are not treated as escape sequences
        tmp_file = r"D:\code\pycharm_workspace\pytorch-YOLO-v1-master\pytorch-YOLO-v1-master\tmp\listfile.txt"
        tem_tem_str = (' '.join(list_file), tmp_file)
        print("tem_tem_str : {0}".format(tem_tem_str))
        # tem_str = 'cat %s > %s ' % (' '.join(list_file), tmp_file)
        # print(tem_str)
        # os.system('cat %s > %s ' % (' '.join(list_file), tmp_file))
        # 'type' is the Windows equivalent of the Unix 'cat' used in the original repo
        os.system('type %s > %s ' % (' '.join(list_file), tmp_file))
        list_file = tmp_file
    # print(list_file)
    with open(list_file) as f:
        lines = f.readlines()
    for line in lines:
        splited = line.strip().split()
        self.fnames.append(splited[0])
        num_boxes = (len(splited) - 1) // 5
        box = []
        label = []
        for i in range(num_boxes):
            x = float(splited[1 + 5 * i])
            y = float(splited[2 + 5 * i])
            x2 = float(splited[3 + 5 * i])
            y2 = float(splited[4 + 5 * i])
            c = splited[5 + 5 * i]
            box.append([x, y, x2, y2])
            label.append(int(c) + 1)
        self.boxes.append(torch.Tensor(box))
        self.labels.append(torch.LongTensor(label))
    self.num_samples = len(self.boxes)
The constructor mainly reads the annotations recorded in list_file and stores them in member variables: the per-object information of every image is converted and kept in self.boxes and self.labels. The input and output data structures look like this:
Input: the image annotations recorded in listfile.txt. Each line is a filename followed by one group of box coordinates and a class index per object; for ease of presentation only a single line is used here. The contents are as follows:
Contents of listfile.txt:
002850.jpg 327 86 481 219 19 196 80 314 192 19 128 226 309 374 8
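Read against the parsing loop in __init__, this line breaks down into a filename plus three (x1, y1, x2, y2, class) groups, i.e. corner coordinates and a class index per object (the paper's (x, y, w, h) parametrization is only produced later, by the encoder):
002850.jpg   327 86 481 219 19   196 80 314 192 19   128 226 309 374 8
filename     x1  y1 x2  y2  c    x1  y1 x2  y2  c    x1  y1 x2  y2  c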
The image in question is 002850.jpg (the figure is not reproduced here).
After the yoloDataset constructor has processed this file, the following member variables hold the data used later:
# self.boxes
print(self.boxes[0].shape)
# output: torch.Size([3, 4])
print(self.boxes)
# output:
[tensor([[327.,  86., 481., 219.],
         [196.,  80., 314., 192.],
         [128., 226., 309., 374.]])]
print(self.labels[0].shape)
# output: torch.Size([3])
print(self.labels)
# output: [tensor([20, 20, 9])]
print("len(self.boxes):{0}".format(len(self.boxes)))
# output: len(self.boxes):1
# there is one entry in self.boxes per line (i.e. per image) in listfile.txt
print(self.fnames.__len__())
# output: 1
print("self.fnames:{0}".format(self.fnames))
# output: self.fnames:['002850.jpg']
That is the data preprocessing. It does two main things:
1: The labels in the txt file are turned into tensors, stored in boxes and labels. Here boxes is a list with a single element, a (3, 4) tensor, and labels is a list holding one LongTensor of shape [3] (one entry per line in listfile.txt; see the parsing sketch right after this list).
2: Some other members are initialized, such as fnames, a list of the image filenames contained in the listfile.txt training data.
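To make the mapping concrete, the same parsing logic can be run by hand on the example line above (a standalone sketch, not code from the repo):
line = '002850.jpg 327 86 481 219 19 196 80 314 192 19 128 226 309 374 8'
splited = line.strip().split()
fname = splited[0]
num_boxes = (len(splited) - 1) // 5                                   # 3 objects in this image
boxes = [[float(v) for v in splited[1 + 5 * i: 5 + 5 * i]] for i in range(num_boxes)]
labels = [int(splited[5 + 5 * i]) + 1 for i in range(num_boxes)]      # class index shifted by +1
# boxes  -> [[327.0, 86.0, 481.0, 219.0], [196.0, 80.0, 314.0, 192.0], [128.0, 226.0, 309.0, 374.0]]
# labels -> [20, 20, 9]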
2: Building the network inputs
This step takes the data prepared by the constructor and turns it into the actual network inputs and targets. The two key functions are:
# load one sample of the training dataset
def __getitem__(self, idx):
    print("__getitem__")
    fname = self.fnames[idx]
    image_path = os.path.join(self.root + fname)
    print(fname)
    print("image_path {0}".format(image_path))
    img = cv2.imread(os.path.join(self.root + fname))
    boxes = self.boxes[idx].clone()
    labels = self.labels[idx].clone()
    if self.train:
        # img = self.random_bright(img)
        img, boxes = self.random_flip(img, boxes)
        img, boxes = self.randomScale(img, boxes)
        img = self.randomBlur(img)
        img = self.RandomBrightness(img)
        img = self.RandomHue(img)
        img = self.RandomSaturation(img)
        img, boxes, labels = self.randomShift(img, boxes, labels)
        img, boxes, labels = self.randomCrop(img, boxes, labels)

    # # debug
    # box_show = boxes.numpy().reshape(-1)
    # print(box_show)
    # img_show = self.BGR2RGB(img)
    # pt1 = (int(box_show[0]), int(box_show[1])); pt2 = (int(box_show[2]), int(box_show[3]))
    # cv2.rectangle(img_show, pt1=pt1, pt2=pt2, color=(0, 255, 0), thickness=1)
    # plt.figure()
    # # cv2.rectangle(img, pt1=(10,10), pt2=(100,100), color=(0,255,0), thickness=1)
    # plt.imshow(img_show)
    # plt.show()
    # # debug

    h, w, _ = img.shape
    print(" before expand_as ")
    print(boxes)
    # normalize the corner coordinates to [0, 1] by dividing by the image width/height
    boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)
    print(" after expand_as ")
    print(boxes)
    img = self.BGR2RGB(img)  # because pytorch pretrained model use RGB
    img = self.subMean(img, self.mean)  # subtract the channel means
    img = cv2.resize(img, (self.image_size, self.image_size))
    target = self.encoder(boxes, labels)  # grid target (14x14x30 here; the paper describes 7x7x30)
    for t in self.transform:
        img = t(img)
    return img, target
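A worked example of the normalization line boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes), assuming (purely for illustration) that the augmented image is 500 pixels wide and 375 pixels high:
# w = 500, h = 375 (hypothetical values for this example)
# [327., 86., 481., 219.]  ->  [327/500, 86/375, 481/500, 219/375]  ~  [0.654, 0.229, 0.962, 0.584]
# expand_as broadcasts the [w, h, w, h] row to every box, so every corner coordinate
# ends up as a fraction of the image size, independent of the original resolution.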
# encode the boxes and labels of one image into a single target tensor
def encoder(self, boxes, labels):
    '''
    boxes (tensor) [[x1,y1,x2,y2],[]]  -- normalized corner coordinates
    labels (tensor) [...]
    return a (grid_num, grid_num, 30) target; grid_num = 14 here (the paper uses 7x7x30)
    '''
    print("encoder")
    grid_num = 14
    target = torch.zeros((grid_num, grid_num, 30))
    cell_size = 1. / grid_num
    wh = boxes[:, 2:] - boxes[:, :2]          # normalized box width/height
    cxcy = (boxes[:, 2:] + boxes[:, :2]) / 2  # normalized box centre
    for i in range(cxcy.size()[0]):
        cxcy_sample = cxcy[i]
        ij = (cxcy_sample / cell_size).ceil() - 1  # (column, row) of the grid cell containing the centre
        target[int(ij[1]), int(ij[0]), 4] = 1                   # confidence of box 1
        target[int(ij[1]), int(ij[0]), 9] = 1                   # confidence of box 2
        target[int(ij[1]), int(ij[0]), int(labels[i]) + 9] = 1  # one-hot class (labels 1..20 -> channels 10..29)
        xy = ij * cell_size  # top-left corner of the matched cell, in normalized coordinates
        delta_xy = (cxcy_sample - xy) / cell_size  # centre offset within the cell, in cell units
        target[int(ij[1]), int(ij[0]), 2:4] = wh[i]
        target[int(ij[1]), int(ij[0]), :2] = delta_xy
        target[int(ij[1]), int(ij[0]), 7:9] = wh[i]
        target[int(ij[1]), int(ij[0]), 5:7] = delta_xy
    print("encoder end")
    print("target :{0}".format(target.shape))
    # output: target :torch.Size([14, 14, 30])
    return target
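A small worked example of the cell assignment, using a hypothetical normalized box centre of (0.80, 0.40) (with grid_num = 14, so cell_size = 1/14 ~ 0.0714):
# cxcy_sample             = (0.80, 0.40)
# cxcy_sample / cell_size = (11.2, 5.6)
# ij = ceil(...) - 1      = (11, 5)           -> column 11, row 5, i.e. target[5, 11, :]
# xy = ij * cell_size     ~ (0.7857, 0.3571)  -> top-left corner of that cell
# delta_xy = (cxcy_sample - xy) / cell_size = (0.2, 0.6)
# Both predicted boxes of that cell receive the same (delta_xy, wh) target and confidence 1,
# plus a 1 in the class channel labels[i] + 9.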
These two functions carry out two steps:
Step 1: __getitem__ takes the boxes, labels and fnames built by the constructor and reads the image in. For training data it applies a series of augmentations, each with some probability: horizontal flip (random_flip), scaling (randomScale), blurring (randomBlur), brightness jitter (RandomBrightness), hue jitter (RandomHue), saturation jitter (RandomSaturation), translation (randomShift) and cropping (randomCrop). The image is then resized to the input size used in the paper, 448 x 448.
Step 2:
The box data is encoded by the encoder function. The final target is a [14, 14, 30] tensor; it plays the role of the paper's 7 x 7 x 30 target, except that this implementation uses a finer 14 x 14 grid.
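For reference, the 30 channels of each grid cell in the target follow directly from the assignments in encoder:
# channels  0-1 : (dx, dy) of box 1   -- centre offset within the cell, in cell units
# channels  2-3 : (w, h)  of box 1    -- normalized to the full image
# channel   4   : confidence of box 1 (1 if an object centre falls in this cell)
# channels  5-8 : (dx, dy, w, h) of box 2 (identical to box 1 in the target)
# channel   9   : confidence of box 2
# channels 10-29: one-hot class vector (labels run from 1 to 20, stored at channel label + 9)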