YoloV1论文代码解读

yolov1 主程序解读:

一:基本流程串讲

1:主程序在train.py中,前面使用了resnet50和vgg16_bn的一个配置,以及把这2个网络参数加载到自定义的网络模型net中,

2:定义了损失函数yoloLoss,具体代码如下:

# Loss implementation follows the paper; presumably the arguments are
# (S=7 grid size, B=2 boxes per cell, lambda_coord=5, lambda_noobj=0.5) —
# TODO confirm against the yoloLoss signature.
criterion = yoloLoss(7, 2, 5, 0.5)

3:定义了优化器:

optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=5e-4)

4:定义了训练数据集和验证数据集

# Training set: list_file may be a list of annotation files, which yoloDataset
# concatenates internally (useful for combining e.g. VOC2007 + VOC2012).
train_dataset = yoloDataset(root=file_root, list_file=['voc2007.txt'], train=True, transform=[transforms.ToTensor()])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# Validation set: single list file, no augmentation (train=False), no shuffling.
test_dataset = yoloDataset(root=test_data_path, list_file='voc2007subtest.txt', train=False,
                           transform=[transforms.ToTensor()])
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

5:开始训练:

for epoch in range(num_epochs):
    net.train()
    # Step-wise learning-rate schedule: drop the LR at epochs 30 and 40.
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))

    total_loss = 0.
    time_begin = time.time()
    for i, (images, target) in enumerate(train_loader):
        # NOTE: torch.autograd.Variable is deprecated — tensors carry autograd
        # state directly, so the Variable(...) wrappers were removed.
        if use_gpu:
            images, target = images.cuda(), target.cuda()

        pred = net(images)
        loss = criterion(pred, target)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_loader), loss.item(), total_loss / (i + 1)))
            num_iter += 1
            vis.plot_train_val(loss_train=total_loss / (i + 1))
    time_end = time.time()
    # BUG FIX: report the current epoch, not the total epoch count.
    print('Epoch [%d],cost time [%f]' % (epoch + 1, (time_end - time_begin)))

    # --- validation ---
    validation_loss = 0.0
    net.eval()
    # torch.no_grad() replaces the removed volatile=True flag: no autograd
    # graph is built, saving memory during evaluation.
    with torch.no_grad():
        for i, (images, target) in enumerate(test_loader):
            if use_gpu:
                images, target = images.cuda(), target.cuda()

            pred = net(images)
            loss = criterion(pred, target)
            validation_loss += loss.item()

    validation_loss /= len(test_loader)
    vis.plot_train_val(loss_val=validation_loss)

    # Save the best checkpoint (lowest validation loss) and always the latest weights.
    if best_test_loss > validation_loss:
        best_test_loss = validation_loss
        print('get best test loss %.5f' % best_test_loss)
        torch.save(net.state_dict(), 'best.pth')
    logfile.writelines(str(epoch) + '\t' + str(validation_loss) + '\n')
    logfile.flush()
    torch.save(net.state_dict(), 'yolo.pth')

二:数据集处理

1:数据预处理

对数据集的定义如下:

    def __init__(self, root, list_file, train, transform):
        """Parse annotation list file(s) into per-image box/label tensors.

        root: directory containing the images.
        list_file: path to an annotation file, or a list of such paths
            (they are concatenated, e.g. for VOC2007 + VOC2012).
        train: True enables augmentation in __getitem__.
        transform: list of callables applied to the image in __getitem__.

        Each annotation line is: <image name> followed by groups of
        (x1 y1 x2 y2 class_id).
        """
        print('data init')
        self.root = root
        self.train = train
        self.transform = transform
        self.fnames = []   # image file names
        self.boxes = []    # per-image (num_boxes, 4) tensors of [x1, y1, x2, y2]
        self.labels = []   # per-image (num_boxes,) LongTensors of 1-based class ids
        self.mean = (123, 117, 104)  # RGB
        print("yoloDataset init :list_file is {0},type is {1}".format(list_file, type(list_file)))

        # FIX: the original shelled out to `cat`/`type` with a hard-coded
        # Windows path to concatenate multiple list files.  Reading the files
        # directly in Python is portable and needs no temp file.
        if isinstance(list_file, list):
            files = list_file
        else:
            files = [list_file]
        lines = []
        for path in files:
            with open(path) as f:
                lines.extend(f.readlines())

        for line in lines:
            splited = line.strip().split()
            if not splited:
                continue  # tolerate blank lines
            self.fnames.append(splited[0])
            num_boxes = (len(splited) - 1) // 5
            box = []
            label = []
            for i in range(num_boxes):
                x = float(splited[1 + 5 * i])
                y = float(splited[2 + 5 * i])
                x2 = float(splited[3 + 5 * i])
                y2 = float(splited[4 + 5 * i])
                c = splited[5 + 5 * i]
                box.append([x, y, x2, y2])
                label.append(int(c) + 1)  # shift class ids to 1-based

            self.boxes.append(torch.Tensor(box))
            self.labels.append(torch.LongTensor(label))
        self.num_samples = len(self.boxes)

主要是将在list_file 文件中的数据读取出来之后处理到了自己的成员变量中,其中将图片中元素的相关信息转换存储到类成员变量boxes 和 labels中,简单贴一下输入和输出的数据结构:

输入:主要是listfile.txt中记录的图片标注信息,每个目标框包括的是 x1,y1,x2,y2,c(左上角坐标、右下角坐标与类别编号,而非论文网络输出的 x,y,w,h 形式),为了方便展示,代码中只使用了一张图片,具体内容如下:

listfile.txt中内容
002850.jpg 327 86 481 219 19 196 80 314 192 19 128 226 309 374 8

具体图片长这个样子:

002850.jpg

经过 yoloDataset(data.Dataset) 这个类的构造函数处理完成之后,包含了以下几个主要的数据供用来处理:

# boxes
print(self.boxes[0].shape)
# 输出结果: torch.Size([3, 4])
print(self.boxes)
# 输出结果:
#     [tensor([[327.,  86., 481., 219.],
#             [196.,  80., 314., 192.],
#             [128., 226., 309., 374.]])]

print(self.labels[0].shape)
# 输出结果: torch.Size([3])
print(self.labels)
# 输出结果: [tensor([20, 20,  9])]

print("len(self.boxes):{0}".format(len(self.boxes)))
# 输出结果: len(self.boxes):1
# boxes的数量即为listfile.txt 中标注图片的数量
print(self.fnames.__len__())
# 输出结果: 1
print("self.fnames:{0}".format(self.fnames))
# 输出结果: self.fnames:['002850.jpg']

以上就是数据预处理,预处理主要干的事情是以下几步 :

1:txt文件中的标签处理成tensor类型,主要是2个tensor,boxes和labels,其中boxes是一个只有一个元素的list,这一个元素是一个(3,4)的tensor,labels 是一个[3]的tensor

2:初始化了一些其他使用的元素,如fnames,fnames是一个list,这个list中包含的是listfile.txt 训练数据中包含的图片

2:处理数据入参

这一步是对初始化完成的数据进行进一步加工,具体关键步骤有以下2步:

 #加载训练数据集每一个数据
    def __getitem__(self, idx):
        """Load sample `idx`: read the image, augment (train mode only),
        normalize boxes to [0, 1], and encode them into the YOLO target tensor.

        Returns (img, target) where target has shape (grid_num, grid_num, 30).
        """
        print("__getitem__")
        img_name = self.fnames[idx]
        image_path = os.path.join(self.root + img_name)
        print(img_name)
        print("impate_path {0}".format(image_path))
        img = cv2.imread(os.path.join(self.root + img_name))
        # Clone so augmentation never mutates the cached dataset tensors.
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx].clone()

        if self.train:
            # Geometric augmentations also rewrite the box coordinates.
            img, boxes = self.random_flip(img, boxes)
            img, boxes = self.randomScale(img, boxes)
            # Photometric augmentations touch only the pixels.
            for photometric in (self.randomBlur, self.RandomBrightness,
                                self.RandomHue, self.RandomSaturation):
                img = photometric(img)
            # Shift/crop may also drop boxes, so labels are updated too.
            img, boxes, labels = self.randomShift(img, boxes, labels)
            img, boxes, labels = self.randomCrop(img, boxes, labels)

        h, w, _ = img.shape
        print(" before expand_as ")
        print(boxes)
        # Scale corner coordinates into [0, 1] relative to the (augmented) image.
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)
        print(" after expand_as ")
        print(boxes)
        img = self.BGR2RGB(img)  # pytorch pretrained models expect RGB
        img = self.subMean(img, self.mean)  # subtract per-channel mean
        img = cv2.resize(img, (self.image_size, self.image_size))
        target = self.encoder(boxes, labels)  # (grid_num, grid_num, 30)
        for t in self.transform:
            img = t(img)

        return img, target
# 将标签中的boxes 和label 合二为一,处理成target数据,target数据就是一个tensor
def encoder(self, boxes, labels):
    """Build the YOLO regression target from normalized ground-truth boxes.

    boxes  (tensor): (N, 4) corner coords [x1, y1, x2, y2] in [0, 1].
    labels (tensor): (N,) 1-based class ids.
    Returns a (14, 14, 30) target tensor.  Note: grid_num is 14 here, not the
    paper's 7; each cell's layout is [box1: xy,wh,conf | box2: xy,wh,conf | 20 classes].
    """
    print("encoder")
    grid_num = 14
    cell_size = 1. / grid_num
    target = torch.zeros((grid_num, grid_num, 30))
    # Convert corners to (center, width/height) form.
    wh = boxes[:, 2:] - boxes[:, :2]
    cxcy = (boxes[:, 2:] + boxes[:, :2]) / 2
    for k in range(cxcy.size(0)):
        center = cxcy[k]
        # Index of the grid cell containing the box center.
        cell = (center / cell_size).ceil() - 1
        row, col = int(cell[1]), int(cell[0])
        # Both predictor slots get the same ground truth; confidence targets = 1.
        target[row, col, 4] = 1
        target[row, col, 9] = 1
        target[row, col, int(labels[k]) + 9] = 1
        corner = cell * cell_size  # top-left corner of the matched cell
        offset = (center - corner) / cell_size  # center offset within the cell
        target[row, col, :2] = offset
        target[row, col, 2:4] = wh[k]
        target[row, col, 5:7] = offset
        target[row, col, 7:9] = wh[k]
    print("encoder end")
    print("target :{0}".format(target.shape))
    return target

以上2个函数主要处理2步:

第一步:拿到构造函数中的boxes 和labels 还有fnames,把图片读取进来,如果是训练数据要对训练数据进行 随机的 翻转(random_flip),缩放(randomScale),均值滤波(randomBlur),随机亮度处理(RandomBrightness),色度处理(RandomHue),随机饱和度处理(RandomSaturation),随机平移处理(randomShift),随机抠图处理(randomCrop),以上这些处理都是随机进行的 ,完成这些之后将图片resize到 论文中说的 图片处理大小,即(448 * 448) 大小的图片

第二步:

对boxes的数据进行编码,在encoder 函数中实现,最后输出的target 为 [14, 14, 30] 的tensor;注意代码中 grid_num=14,因此得到的是 14*14*30 的tensor,与论文中的 7*7*30 相比网格划分更细,并非完全一致

三:损失函数损失计算

四:数据分类

你可能感兴趣的:(YoloV1论文代码解读)