caffe-ssd: a detailed walkthrough of generating a VOC dataset, converting it to LMDB, training, and testing

Contents

  • Converting WIDER FACE to a VOC dataset
    • Walking through dataset.py
    • Dataset layout
    • Note on the annotation files
  • Converting VOC to LMDB
    • create_list.sh
    • create_data.sh
  • Training
    • ssd_pascal.py
    • model_libs.py
  • Testing

Converting WIDER FACE to a VOC dataset

Dataset homepage: http://shuoyang1213.me/WIDERFACE/
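
Each record in wider_face_*_bbx_gt.txt is a relative image path, a face count, and then one line per face whose first four numbers are x, y, w and h (the remaining fields are attribute flags such as blur and occlusion). An illustrative record:

0--Parade/0_Parade_marchingband_1_849.jpg
1
449 330 122 149 0 0 0 0 0 0

dataset.py below parses exactly this layout.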

Walking through dataset.py

import os,cv2,sys,shutil

from xml.dom.minidom import Document

def writexml(filename, saveimg, bboxes, xmlpath):
    """Write one PASCAL VOC style annotation XML.

    filename -- image file name recorded in the XML
    saveimg  -- the image array (used only for its shape)
    bboxes   -- list of (x, y, w, h) face boxes
    xmlpath  -- output path of the .xml file
    """
    doc = Document()

    annotation = doc.createElement('annotation')

    doc.appendChild(annotation)

    folder = doc.createElement('folder')

    folder_name = doc.createTextNode('widerface')
    folder.appendChild(folder_name)
    annotation.appendChild(folder)
    filenamenode = doc.createElement('filename')
    filename_name = doc.createTextNode(filename)
    filenamenode.appendChild(filename_name)
    annotation.appendChild(filenamenode)
    source = doc.createElement('source')
    annotation.appendChild(source)
    database = doc.createElement('database')
    database.appendChild(doc.createTextNode('wider face Database'))
    source.appendChild(database)
    annotation_s = doc.createElement('annotation')
    annotation_s.appendChild(doc.createTextNode('PASCAL VOC2007'))
    source.appendChild(annotation_s)
    image = doc.createElement('image')
    image.appendChild(doc.createTextNode('flickr'))
    source.appendChild(image)
    flickrid = doc.createElement('flickrid')
    flickrid.appendChild(doc.createTextNode('-1'))
    source.appendChild(flickrid)
    owner = doc.createElement('owner')
    annotation.appendChild(owner)
    flickrid_o = doc.createElement('flickrid')
    flickrid_o.appendChild(doc.createTextNode('yanyu'))
    owner.appendChild(flickrid_o)
    name_o = doc.createElement('name')
    name_o.appendChild(doc.createTextNode('yanyu'))
    owner.appendChild(name_o)

    size = doc.createElement('size')
    annotation.appendChild(size)

    width = doc.createElement('width')
    width.appendChild(doc.createTextNode(str(saveimg.shape[1])))
    height = doc.createElement('height')
    height.appendChild(doc.createTextNode(str(saveimg.shape[0])))
    depth = doc.createElement('depth')
    depth.appendChild(doc.createTextNode(str(saveimg.shape[2])))

    size.appendChild(width)

    size.appendChild(height)
    size.appendChild(depth)
    segmented = doc.createElement('segmented')
    segmented.appendChild(doc.createTextNode('0'))
    annotation.appendChild(segmented)
    # One <object> block per face; boxes arrive as (x, y, w, h)
    for i in range(len(bboxes)):
        bbox = bboxes[i]
        objects = doc.createElement('object')
        annotation.appendChild(objects)
        object_name = doc.createElement('name')
        object_name.appendChild(doc.createTextNode('face'))
        objects.appendChild(object_name)
        pose = doc.createElement('pose')
        pose.appendChild(doc.createTextNode('Unspecified'))
        objects.appendChild(pose)
        truncated = doc.createElement('truncated')
        truncated.appendChild(doc.createTextNode('1'))
        objects.appendChild(truncated)
        difficult = doc.createElement('difficult')
        difficult.appendChild(doc.createTextNode('0'))
        objects.appendChild(difficult)
        bndbox = doc.createElement('bndbox')
        objects.appendChild(bndbox)
        xmin = doc.createElement('xmin')
        xmin.appendChild(doc.createTextNode(str(bbox[0])))
        bndbox.appendChild(xmin)
        ymin = doc.createElement('ymin')
        ymin.appendChild(doc.createTextNode(str(bbox[1])))
        bndbox.appendChild(ymin)
        xmax = doc.createElement('xmax')
        xmax.appendChild(doc.createTextNode(str(bbox[0] + bbox[2])))
        bndbox.appendChild(xmax)
        ymax = doc.createElement('ymax')
        ymax.appendChild(doc.createTextNode(str(bbox[1] + bbox[3])))
        bndbox.appendChild(ymax)
    with open(xmlpath, "w") as f:
        f.write(doc.toprettyxml(indent=''))

# Root directory holding the raw WIDER FACE folders and the VOC-style output
rootdir = "/home/zero/face_ws"


def convertimgset(img_set):
    # Directory holding the raw images of this split
    imgdir = rootdir + "/WIDER_" + img_set + "/images"
    # Ground-truth file describing every image of the split
    gtfilepath = rootdir + "/wider_face_split/wider_face_" + img_set + "_bbx_gt.txt"
    # Image-set list (train.txt / val.txt) to be created
    fwrite = open(rootdir + "/ImageSets/Main/" + img_set + ".txt", 'w')

    index = 0

    with open(gtfilepath, 'r') as gtfiles:
        while True:
            filename = gtfiles.readline().strip()
            if filename == "":
                break
            # Path of the original image
            imgpath = imgdir + "/" + filename

            img = cv2.imread(imgpath)

            if img is None:  # cv2.imread returns None for a missing/unreadable image
                break


            numbbox = int(gtfiles.readline())

            bboxes = []

            print(numbbox)

            for i in range(numbbox):
                line = gtfiles.readline()
                lines = line.split(" ")
                lines = lines[0:4]

                bbox = (int(lines[0]), int(lines[1]), int(lines[2]), int(lines[3]))

                # Skip faces narrower or shorter than 40 px
                if int(lines[2]) < 40 or int(lines[3]) < 40:
                    continue

                bboxes.append(bbox)

                #cv2.rectangle(img, (bbox[0],bbox[1]),(bbox[0]+bbox[2],bbox[1]+bbox[3]),color=(255,255,0),thickness=1)
            # Replace '/' in the relative path so it becomes a flat file name
            filename = filename.replace("/", "_")

            if len(bboxes) == 0:
                print("no face")
                continue
            #cv2.imshow("img", img)
            #cv2.waitKey(0)
            # Copy the image into JPEGImages/
            cv2.imwrite("{}/JPEGImages/{}".format(rootdir, filename), img)
            # Write the stem (file name without extension) into train.txt / val.txt
            fwrite.write(filename.split(".")[0] + "\n")
            # Path of the annotation XML
            xmlpath = "{}/Annotations/{}.xml".format(rootdir, filename.split(".")[0])
            # Emit the XML content
            writexml(filename, img, bboxes, xmlpath)

            print("success number is ", index)
            index += 1

    fwrite.close()

if __name__ == "__main__":
    img_sets = ["train", "val"]
    for img_set in img_sets:
        convertimgset(img_set)
    # Rename to VOC conventions: train.txt -> trainval.txt, val.txt -> test.txt
    shutil.move(rootdir + "/ImageSets/Main/" + "train.txt", rootdir + "/ImageSets/Main/" + "trainval.txt")
    shutil.move(rootdir + "/ImageSets/Main/" + "val.txt", rootdir + "/ImageSets/Main/" + "test.txt")
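
Before running the script, the VOC-style output folders must already exist under rootdir. A minimal setup sketch, assuming the same rootdir as above:

import os

rootdir = "/home/zero/face_ws"
for sub in ("Annotations", "JPEGImages", "ImageSets/Main"):
    d = os.path.join(rootdir, sub)
    if not os.path.exists(d):
        os.makedirs(d)  # create Annotations/, JPEGImages/, ImageSets/Main/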

Dataset layout

(Figure 1: the resulting VOC-style directory layout under the data root)

Note on the annotation files

The two ground-truth files below, as downloaded inside wider_face_split from the dataset site, can break the VOC conversion:
wider_face_train_bbx_gt.txt
wider_face_val_bbx_gt.txt
Delete every line that consists entirely of zeros! These are placeholder box rows for images with zero faces; left in place, they throw the line-by-line parser in dataset.py off by one.
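
A minimal cleanup sketch, assuming the rootdir layout from dataset.py. It drops only the multi-field all-zero box rows and keeps the single "0" count lines, which the parser still needs:

rootdir = "/home/zero/face_ws"
for split in ("train", "val"):
    path = "{}/wider_face_split/wider_face_{}_bbx_gt.txt".format(rootdir, split)
    with open(path) as f:
        lines = f.readlines()
    with open(path, "w") as f:
        for line in lines:
            fields = line.split()
            # Drop "0 0 0 0 0 0 0 0 0 0" placeholder rows; keep everything else
            if len(fields) > 1 and all(x == "0" for x in fields):
                continue
            f.write(line)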

Converting VOC to LMDB

We need three files:
create_list.sh
create_data.sh
labelmap_voc.prototxt

Put them under caffe/data/widerface/:
(Figure 2: the three files under caffe/data/widerface/)
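
For a single "face" class, labelmap_voc.prototxt can be as small as the two entries below; the background entry must carry label 0, mirroring the stock VOC label map format:

item {
  name: "none_of_the_above"
  label: 0
  display_name: "background"
}
item {
  name: "face"
  label: 1
  display_name: "face"
}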

create_list.sh

#!/bin/bash

root_dir=/home/zero     # modified: the data root from the previous step
sub_dir=ImageSets/Main
bash_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for dataset in trainval test
do
  dst_file=$bash_dir/$dataset.txt
  if [ -f $dst_file ]
  then
    rm -f $dst_file
  fi
  for name in face_ws  # modified: our dataset folder name
  do
    if [[ $dataset == "test" && $name == "VOC2012" ]]
    then
      # Leftover from the stock VOC0712 script; never triggers for face_ws
      continue
    fi
    echo "Create list for $name $dataset..."
    dataset_file=$root_dir/$name/$sub_dir/$dataset.txt

    img_file=$bash_dir/$dataset"_img.txt"
    cp $dataset_file $img_file
    sed -i "s/^/$name\/JPEGImages\//g" $img_file
    sed -i "s/$/.jpg/g" $img_file

    label_file=$bash_dir/$dataset"_label.txt"
    cp $dataset_file $label_file
    sed -i "s/^/$name\/Annotations\//g" $label_file
    sed -i "s/$/.xml/g" $label_file

    paste -d' ' $img_file $label_file >> $dst_file

    rm -f $label_file
    rm -f $img_file
  done

  # Generate image name and size information.
  if [ $dataset == "test" ]
  then
    $bash_dir/../../build/tools/get_image_size $root_dir $dst_file $bash_dir/$dataset"_name_size.txt"
  fi

  # Shuffle trainval file.
  if [ $dataset == "trainval" ]
  then
    rand_file=$dst_file.random
    cat $dst_file | perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' > $rand_file
    mv $rand_file $dst_file
  fi
done
cd caffe
# Run the script
./data/widerface/create_list.sh
# This generates:
#   test.txt
#   test_name_size.txt
#   trainval.txt
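
Each line of the resulting trainval.txt / test.txt pairs an image path with its annotation, both relative to root_dir, e.g. (illustrative file name):

face_ws/JPEGImages/0_Parade_marchingband_1_849.jpg face_ws/Annotations/0_Parade_marchingband_1_849.xml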

create_data.sh

cur_dir=$(cd $( dirname ${BASH_SOURCE[0]} ) && pwd )
root_dir=$cur_dir/../..

cd $root_dir

redo=1
data_root_dir="/home/zero"
dataset_name="widerface"
mapfile="$root_dir/data/$dataset_name/labelmap_voc.prototxt"
anno_type="detection"
db="lmdb"
min_dim=0
max_dim=0
width=0
height=0

extra_cmd="--encode-type=jpg --encoded"
# Note: [ $redo ] only tests that the variable is non-empty, so set redo= (empty) to drop --redo
if [ $redo ]
then
  extra_cmd="$extra_cmd --redo"
fi
for subset in test trainval
do
  python $root_dir/scripts/create_annoset.py --anno-type=$anno_type \
    --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim \
    --resize-width=$width --resize-height=$height --check-label $extra_cmd \
    $data_root_dir $root_dir/data/$dataset_name/$subset.txt \
    $data_root_dir/$dataset_name/$db/$dataset_name"_"$subset"_"$db examples/$dataset_name
done

# Uses the test.txt / trainval.txt generated in the previous step
cd caffe
# Run the script
./data/widerface/create_data.sh

This creates two LMDB databases under the data root and two symlinks to them in caffe/examples/widerface/:
(Figure 3: the LMDB folders under /home/zero/widerface/lmdb/)
(Figure 4: the LMDB symlinks under caffe/examples/widerface/)
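
A quick sanity check, assuming the lmdb Python package is installed; the entry count should match the number of lines in trainval.txt:

import lmdb

env = lmdb.open("/home/zero/widerface/lmdb/widerface_trainval_lmdb", readonly=True)
with env.begin() as txn:
    print(txn.stat()["entries"])  # number of encoded image/annotation records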

Training

ssd_pascal.py

Located in examples/ssd/wider_face/.

ssd_pascal.py consumes:
(1) the test_name_size.txt created by create_list.sh;
(2) labelmap_voc.prototxt;
(3) the widerface_test_lmdb symlink created by create_data.sh;
(4) the widerface_trainval_lmdb symlink created by create_data.sh.
In addition, the backbone call in ssd_pascal.py was changed from model_libs.py's VGGNetBody to VGGNetBody_small (see the sketch in the model_libs.py section below).
ssd_pascal.py produces:
(1) jobs/VGGNet/wider_face/*
(2) models/VGGNet/wider_face/*

ssd_pascal.py (excerpt):

# The database file for training data. Created by data/widerface/create_data.sh
train_data = "examples/widerface/widerface_trainval_lmdb"
# The database file for testing data. Created by data/widerface/create_data.sh
test_data = "examples/widerface/widerface_test_lmdb"
# If true, use batch norm for all newly added layers.
# Currently only the non batch norm version has been tested.
use_batchnorm = False
lr_mult = 1
# Use different initial learning rate.
if use_batchnorm:
    base_lr = 0.0004
else:
    # A learning rate for batch_size = 1, num_gpus = 1.
    base_lr = 0.00004

# Modify the job name if you want.
job_name = "SSD_{}".format(resize)
# The name of the model. Modify it if you want.
model_name = "VGG_wider_face_{}".format(job_name)

# Directory which stores the model .prototxt file.
save_dir = "models/VGGNet/wider_face/{}".format(job_name)
# Directory which stores the snapshot of models.
snapshot_dir = "models/VGGNet/wider_face/{}".format(job_name)
# Directory which stores the job script and log file.
job_dir = "jobs/VGGNet/wider_face/{}".format(job_name)
# Directory which stores the detection results.
output_result_dir = "{}/data/VOCdevkit/results/wider_face/{}/Main".format(os.environ['HOME'], job_name)

# model definition files.
train_net_file = "{}/train.prototxt".format(save_dir)
test_net_file = "{}/test.prototxt".format(save_dir)
deploy_net_file = "{}/deploy.prototxt".format(save_dir)
solver_file = "{}/solver.prototxt".format(save_dir)
# snapshot prefix.
snapshot_prefix = "{}/{}".format(snapshot_dir, model_name)
# job script path.
job_file = "{}/{}.sh".format(job_dir, model_name)

# Stores the test image names and sizes. Created by data/widerface/create_list.sh
name_size_file = "data/widerface/test_name_size.txt"
# The pretrained model. We use the Fully convolutional reduced (atrous) VGGNet.
pretrain_model = "models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel"
# Stores LabelMapItem.
label_map_file = "data/widerface/labelmap_voc.prototxt"

# conv5_3 replaces fc7 from the stock SSD source-layer list
mbox_source_layers = ['conv4_3', 'conv5_3', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']
# in percent %
min_ratio = 20
max_ratio = 90
step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2)))
min_sizes = []
max_sizes = []
for ratio in xrange(min_ratio, max_ratio + 1, step):
  min_sizes.append(min_dim * ratio / 100.)
  max_sizes.append(min_dim * (ratio + step) / 100.)
min_sizes = [min_dim * 10 / 100.] + min_sizes
max_sizes = [min_dim * 20 / 100.] + max_sizes
steps = [8, 16, 32, 64, 100, 300]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
# L2 normalize conv4_3.
normalizations = [20, -1, -1, -1, -1, -1]
# variance used to encode/decode prior bboxes.
if code_type == P.PriorBox.CENTER_SIZE:
  prior_variance = [0.1, 0.1, 0.2, 0.2]
else:
  prior_variance = [0.1]
flip = True
clip = False

# Solver parameters.
# Defining which GPUs to use.
gpus = "0"
gpulist = gpus.split(",")
num_gpus = len(gpulist)

# Divide the mini-batch to different GPUs.
# Adjust these to fit your GPU memory, or training will crash!
batch_size = 16  # originally 32
accum_batch_size = 16  # originally 32
# If iter_size == 1, every forward pass is followed by a backward pass and a
# weight update; if it is 2, gradients from two forward passes are accumulated
# before one update. This lowers the memory needed per pass so weaker hardware
# can still train; a larger iter_size makes each update take longer, but the
# total number of iterations stays the same.
iter_size = accum_batch_size / batch_size
solver_mode = P.Solver.CPU
device_id = 0
batch_size_per_device = batch_size
if num_gpus > 0:
  batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus))
  iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus)))
  solver_mode = P.Solver.GPU
  device_id = int(gpulist[0])

if normalization_mode == P.Loss.NONE:
  base_lr /= batch_size_per_device
elif normalization_mode == P.Loss.VALID:
  base_lr *= 25. / loc_weight
elif normalization_mode == P.Loss.FULL:
  # Roughly there are 2000 prior bboxes per image.
  # TODO(weiliu89): Estimate the exact # of priors.
  base_lr *= 2000.

# Evaluate on whole test set.
num_test_image = 2580
test_batch_size = 8
# Ideally test_batch_size should be divisible by num_test_image,
# otherwise mAP will be slightly off the true value.
test_iter = int(math.ceil(float(num_test_image) / test_batch_size))

solver_param = {
    # Train parameters
    'base_lr': base_lr,  # base learning rate
    'weight_decay': 0.0005,  # weight decay, a regularizer against overfitting
    'lr_policy': "multistep",  # "multistep" requires the stepvalue list below
    'stepvalue': [2000, 5000, 10000],  # for a small dataset, lower these to shorten training and see results sooner
    'gamma': 0.1,
    'momentum': 0.9,  # weight of the previous gradient update
    'iter_size': iter_size,
    'max_iter': 20000,  # maximum number of iterations
    'snapshot': 100,  # save the model and solver state every 100 iterations
    'display': 10,  # print progress every 10 iterations
    'average_loss': 10,  # report the loss averaged over the last 10 iterations
    'type': "SGD",
    'solver_mode': solver_mode,
    'device_id': device_id,
    'debug_info': False,
    'snapshot_after_train': True,
    # Test parameters
    'test_iter': [test_iter],
    'test_interval': 1000,  # run a full evaluation every 1000 training iterations
    'eval_type': "detection",
    'ap_version': "11point",
    'test_initialization': False,
    }
# Build the backbone network (VGGNetBody_small instead of the stock VGGNetBody)
VGGNetBody_small(net, from_layer='data', fully_conv=True, reduced=True, dilated=True,
    dropout=False)

AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult)

mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers,
        use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes,
        aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations,
        num_classes=num_classes, share_location=share_location, flip=flip, clip=clip,
        prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult)

# Create the MultiBoxLossLayer.
name = "mbox_loss"
mbox_layers.append(net.label)
net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param,
        loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')),
        propagate_down=[True, True, False, False])

with open(train_net_file, 'w') as f:
    print('name: "{}_train"'.format(model_name), file=f)
    print(net.to_proto(), file=f)
shutil.copy(train_net_file, job_dir)

# Create test net.
net = caffe.NetSpec()
net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size,
        train=False, output_label=True, label_map_file=label_map_file,
        transform_param=test_transform_param)

VGGNetBody_small(net, from_layer='data', fully_conv=True, reduced=True, dilated=True,
    dropout=False)
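
With the script adapted, training is launched from the caffe root. ssd_pascal.py writes the prototxts and the job script and, with the stock run_soon default, starts caffe train itself:

cd caffe
python examples/ssd/wider_face/ssd_pascal.py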

model_libs.py

model_libs.py lives at caffe/python/caffe/model_libs.py and holds the network builders (VGGNetBody, CreateMultiBoxHead, CreateAnnotatedDataLayer, ...) that ssd_pascal.py calls. The VGGNetBody_small used above is our own trimmed variant added to this file.
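
VGGNetBody_small is not part of the stock model_libs.py; the sketch below is one hypothetical way to define it, copying VGGNetBody but cutting the network off after conv5_3 (no pool5/fc6/fc7) so conv5_3 can feed the mbox head directly, matching the mbox_source_layers list above. The keyword flags mirror the call in ssd_pascal.py but are unused in this trimmed body:

from caffe import layers as L, params as P

def VGGNetBody_small(net, from_layer, fully_conv=True, reduced=True,
                     dilated=True, dropout=False):
    # Standard VGG conv settings: xavier weights, doubled lr on biases.
    kwargs = {
        'param': [dict(lr_mult=1, decay_mult=1), dict(lr_mult=2, decay_mult=0)],
        'weight_filler': dict(type='xavier'),
        'bias_filler': dict(type='constant', value=0)}

    def stage(bottom, idx, nconvs, nout):
        # One VGG stage: nconvs 3x3 conv+ReLU pairs.
        top = bottom
        for i in range(1, nconvs + 1):
            cname = 'conv{}_{}'.format(idx, i)
            net[cname] = L.Convolution(net[top], num_output=nout,
                                       pad=1, kernel_size=3, **kwargs)
            net['relu{}_{}'.format(idx, i)] = L.ReLU(net[cname], in_place=True)
            top = cname
        return top

    top = stage(from_layer, 1, 2, 64)
    net.pool1 = L.Pooling(net[top], pool=P.Pooling.MAX, kernel_size=2, stride=2)
    top = stage('pool1', 2, 2, 128)
    net.pool2 = L.Pooling(net[top], pool=P.Pooling.MAX, kernel_size=2, stride=2)
    top = stage('pool2', 3, 3, 256)
    net.pool3 = L.Pooling(net[top], pool=P.Pooling.MAX, kernel_size=2, stride=2)
    top = stage('pool3', 4, 3, 512)
    net.pool4 = L.Pooling(net[top], pool=P.Pooling.MAX, kernel_size=2, stride=2)
    stage('pool4', 5, 3, 512)  # ends at conv5_3; no pool5/fc6/fc7
    return net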

Testing

Testing needs:
1. a test image
2. the trained caffemodel
3. deploy.prototxt
4. the driver script test.py

test.py reads as follows:

import os
import cv2
import sys
import numpy as np

caffe_root = "/home/zero/caffe"

os.chdir(caffe_root)

sys.path.insert(0,os.path.join(caffe_root, 'python'))

import caffe

caffe.set_device(0)
caffe.set_mode_gpu()

model_def = "/home/zero/caffe/models/VGGNet/wider_face/SSD_300x300/deploy.prototxt"
model_weight = "/home/zero/caffe/models/VGGNet/wider_face/SSD_300x300/VGG_wider_face_SSD_300x300_iter_1000.caffemodel"

img_path = "/home/zero/caffe/models/VGGNet/wider_face/SSD_300x300/30_Surgeons_Surgeons_30_90.jpg"

net = caffe.Net(model_def,model_weight,caffe.TEST)

image_data = caffe.io.load_image(img_path)

# Preprocessor matching the network's training-time input transform
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1))              # HWC -> CHW
transformer.set_mean('data', np.array([104, 117, 123]))   # BGR mean used in training
transformer.set_raw_scale('data', 255)                    # load_image gives [0,1]; rescale to [0,255]
transformer.set_channel_swap('data', (2, 1, 0))           # RGB -> BGR, as the model expects

transformed_image = transformer.preprocess('data', image_data)

net.blobs['data'].reshape(1, 3, 300, 300)

net.blobs['data'].data[...] = transformed_image

# Each detection row holds [image_id, label, confidence, xmin, ymin, xmax, ymax]
detect_out = net.forward()['detection_out']

print(detect_out)

det_label = detect_out[0,0,:,1]
det_conf  = detect_out[0,0,:,2]

det_xmin = detect_out[0,0,:,3]
det_ymin = detect_out[0,0,:,4]
det_xmax = detect_out[0,0,:,5]
det_ymax = detect_out[0,0,:,6]

top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.1]  # deliberately low threshold; raise it for a converged model

top_conf = det_conf[top_indices]

top_xmin = det_xmin[top_indices]
top_ymin = det_ymin[top_indices]
top_xmax = det_xmax[top_indices]
top_ymax = det_ymax[top_indices]

[height, width, _] = image_data.shape

# Box coordinates are normalized to [0,1]; scale them back to pixels. Convert
# the float RGB image to uint8 BGR first so OpenCV draws and displays it correctly.
image_data = cv2.cvtColor((image_data * 255).astype(np.uint8), cv2.COLOR_RGB2BGR)

for i in range(min(5, top_conf.shape[0])):
    xmin = int(top_xmin[i] * width)
    ymin = int(top_ymin[i] * height)
    xmax = int(top_xmax[i] * width)
    ymax = int(top_ymax[i] * height)

    cv2.rectangle(image_data, (xmin, ymin), (xmax, ymax), (255, 0, 0), 5)

cv2.imshow("face", image_data)

cv2.waitKey(0)
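
On a headless server where cv2.imshow has no display to talk to, a hedged alternative is to write the visualization to disk instead:

# Hypothetical replacement for the imshow/waitKey lines above:
cv2.imwrite("detection_result.jpg", image_data)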
