以前曾尝试过修改 Caffe ImageDataLayer 源码的方式来读取多个 labels - ImageMultilabelDataLayer [Caffe实践 - 基于VGG16 多标签分类的训练与部署].
修改源码的方式可能显得稍微有点繁琐, 毕竟需要重新编译.
这里尝试了一种新的方式来进行多标签自动标注.
与 [Caffe实践 - 基于VGG16 多标签分类的训练与部署] 不同的是, 前者是以 Multi-task 的方式进行处理的,每一个 task 分别是一个label的分类问题. 而这里是以多标签标注方式进行的.
数据形式如, images_labels.txt:
img1.jpg 1 0 1 ... 0
img2.jpg 0 1 0 ... 1
img3.jpg 1 1 0 ... 0
......
每一行是一个数据样本, 其对应的 multilabels 为一个 01 向量的形式. 且所有的样本所对应的 multilabels 向量长度是相同的.
可以类似于 [Caffe实践 - 基于VGG16 多标签分类的训练与部署] 来读取 multilabels 数据.
但这里采用了另外的一种方式:
首先将数据转换为两个文件, 其内容格式分别为:
imageslist.txt
img1.jpg 2
img2.jpg 5
img3.jpg 3
......
每一行对应一个数据样本, 图片 + 标签labels数 的形式.
labelslist.txt
1 0 1 ... 0
0 1 0 ... 1
1 1 0 ... 0
......
每一行对应于一个数据样本的 multilabels, 为 01 向量.
gen_label_lmdb.py:
#! --*-- coding: utf-8 --*--
import argparse
import sys

import lmdb
import numpy

from caffe.proto import caffe_pb2
def parse_args():
    """Parse command-line arguments for the label-lmdb generation script.

    Returns an argparse.Namespace with `labels`, `images` and `lmdb`
    attributes. When the script is invoked with no arguments, prints the
    usage message and exits, since all options default to None.
    """
    # Fixed description: the original said 'End-to-end inference', which
    # was copy-pasted from an unrelated script.
    parser = argparse.ArgumentParser(description='Generate a label lmdb from txt lists')
    parser.add_argument('--labels', dest='labels',
                        help='label txt file',
                        default=None, type=str)
    parser.add_argument('--images', dest='images',
                        help='image txt file, for keys of datum',
                        default=None, type=str)
    parser.add_argument('--lmdb', dest='lmdb',
                        help='label lmdb file',
                        default=None, type=str)
    if len(sys.argv) == 1:
        # No arguments: show help instead of silently running with Nones.
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()
def array_to_datum(arr, label=0):
    """Pack a 3-D numpy array into a caffe Datum.

    uint8 input is serialized as a raw byte string in datum.data; any
    other dtype is stored element-wise in datum.float_data. Raises
    ValueError for arrays that are not 3-dimensional.
    """
    if arr.ndim != 3:
        raise ValueError('Incorrect array shape.')
    datum = caffe_pb2.Datum()
    datum.label = label
    datum.channels = arr.shape[0]
    datum.height = arr.shape[1]
    datum.width = arr.shape[2]
    if arr.dtype == numpy.uint8:
        # Compact byte-string encoding for 8-bit data.
        datum.data = arr.tostring()
    else:
        # Fall back to the float repeated field for everything else.
        datum.float_data.extend(arr.flat)
    return datum
if __name__ == '__main__':
print 'Starting Generate lmdb'
args = parse_args()
labels_array = numpy.loadtxt(args.labels)
num_samples, num_classes = labels_array.shape
images_keys_list = open(args.images).readlines()
assert num_samples == len(images_keys_list)
labels_db = lmdb.open(args.lmdb, map_size=20 * 1024 * 1024 * 1024)
with labels_db.begin(write=True) as txn:
for idx in range(0, num_samples):
# label data
labels_cur = labels_array[idx,:]
num_labels = labels_cur.size
labels_cur = labels_cur.reshape(num_labels, 1, 1)
# keys for label
key_cur = images_keys_list[idx].split(' ')[0] # image name
key_cur = "{:0>8d}".format(idx) + '_' + key_cur
num_taggs = int(images_keys_list[idx].split(' ')[1])
assert num_taggs == labels_cur.sum()
# create datum
labels_datum = array_to_datum(labels_cur, num_taggs)
# write datum to lmdb
txn.put(key_cur.encode('ascii'), labels_datum.SerializeToString())
if (idx+1)%5000 == 0:
print 'Processed ', idx + 1, 'files in total.'
num_label_db = 0
with labels_db.begin() as txn:
cursor = txn.cursor()
for key, value in cursor:
n_label_db = num_label_db + 1
print 'Total # of item in label lmdb:', num_label_db
命令行运行:
python gen_label_lmdb.py --labels ./data/labelslist.txt --images ./data/imageslist.txt --lmdb ./data/output_lmdb/
name: "resnet101"
# Train/val prototxt for multilabel classification.
# Images come from an ImageData layer; the multilabel 01 vectors come from
# the label lmdb built by gen_label_lmdb.py, read by a plain Data layer.
# The image list and the lmdb must be aligned sample-by-sample and the two
# layers must use the same batch_size, otherwise images and labels drift
# out of sync.
layer {
name: "data"
type: "ImageData"
top: "data"
top: "dummylabel"
image_data_param {
source: "/path/to/train_imagelist.txt"
root_folder: "/path/to/images/"
batch_size: 16
new_height: 256
new_width: 256
}
transform_param {
mirror: true
crop_size: 224
mean_value: 104
mean_value: 117
mean_value: 123
}
include {
phase: TRAIN
}
}
# Multilabel 01 vectors stored as num_labels x 1 x 1 Datums in lmdb.
layer {
name: "label"
type: "Data"
top: "label"
data_param {
source: "/path/to/train_output_lmdb"
batch_size: 16
backend: LMDB
}
include {
phase: TRAIN
}
}
##
layer {
name: "data"
type: "ImageData"
top: "data"
top: "dummylabel"
image_data_param {
source: "/path/to/test_imagelist.txt"
root_folder: "/path/to/images/"
batch_size: 4
new_height: 256
new_width: 256
}
transform_param {
crop_size: 224
mean_value: 104
mean_value: 117
mean_value: 123
}
include {
phase: TEST
}
}
layer {
name: "label"
type: "Data"
top: "label"
data_param {
source: "/path/to/test_output_lmdb"
batch_size: 4
backend: LMDB
}
include {
phase: TEST
}
}
####### dummylabel #######
# dummylabel (the per-line tag count from the image list) is not used for
# training; the Silence layer discards it.
layer {
name: "label_silence"
type: "Silence"
bottom: "dummylabel"
}
#### network definition ####
......
####
layer {
name: "fc_labels"
type: "InnerProduct"
bottom: "Pooling_"
top: "fc_labels"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 1
decay_mult: 0
}
inner_product_param {
num_output: 30 # number of labels
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "SigmoidCrossEntropyLoss" # multilabel loss against the 01 vector
bottom: "fc_labels"
bottom: "label"
top: "loss"
}
name: "resnet101"
# Deploy prototxt: single-image input, no data/label/loss layers.
# The raw fc_labels scores must be passed through a sigmoid at inference
# time (done in the Python test script) to obtain per-label probabilities.
input: "data"
input_shape {
dim: 1
dim: 3
dim: 224
dim: 224
}
#### network definition ####
......
####
layer {
name: "fc_labels"
type: "InnerProduct"
bottom: "Pooling_"
top: "fc_labels"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 1
decay_mult: 0
}
inner_product_param {
num_output: 30 # number of labels
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0
}
}
}
#!/usr/bin/env python
# --*-- coding: utf-8 --*--
import numpy as np
from PIL import Image
import scipy.misc
import matplotlib.pyplot as plt
import sys
# Make the local caffe build importable before `import caffe`.
caffe_root = '/path/to/caffe/'
sys.path.insert(0, caffe_root + 'python')
import caffe
# Run inference on GPU 0; uncomment set_mode_cpu() for CPU-only machines.
caffe.set_mode_gpu()
caffe.set_device(0)
# caffe.set_mode_cpu()
class SimpleTransformer(object):
    """
    SimpleTransformer is a simple class for preprocessing and deprocessing
    images for caffe.
    """

    def __init__(self, mean=(128, 128, 128)):
        # Idiom fix: tuple instead of a mutable-list default argument.
        # Per-channel (BGR) mean to subtract, stored as float32 so image
        # arithmetic stays in float.
        self.mean = np.array(mean, dtype=np.float32)
        # Multiplicative scale applied after mean subtraction.
        self.scale = 1.0

    def set_mean(self, mean):
        """
        Set the mean to subtract for centering the data.
        """
        self.mean = mean

    def set_scale(self, scale):
        """
        Set the data scaling.
        """
        self.scale = scale

    def preprocess(self, im):
        """
        preprocess() emulates the pre-processing occurring in the vgg16
        caffe prototxt: RGB->BGR swap, mean subtraction, scaling, and
        H x W x C -> C x H x W transposition. Returns a float32 array.
        """
        im = np.float32(im)
        im = im[:, :, ::-1]  # change RGB to BGR
        im -= self.mean
        im *= self.scale
        im = im.transpose((2, 0, 1))  # HWC -> CHW
        return im
if __name__ == '__main__':
print 'Start...'
multilabels = ['a', 'b', 'c', ...]
test_image = '/home/sh/Pictures/upper/10.jpg'
im = np.asarray(Image.open(test_image))
im = scipy.misc.imresize(im, [224, 224])
model_def = '/path/to/deploy.prototxt'
weight_def = '/path/to/multilabel_vgg16_iter_100000.caffemodel'
net = caffe.Net(model_def, weight_def, caffe.TEST)
transformer = ImageTransformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1)) # h*w*c -> c*h*w
transformer.set_mean('data', np.array((104, 117, 123))) # mean pixel
transformer.set_raw_scale('data', 255) # the net operates on images in [0,255]
transformer.set_channel_swap('data', (2, 1, 0)) # RGB -> BGR
transformed_image = transformer.preprocess(im)
net.blobs['data'].data[...] = transformed_image
outputs = net.forward()
img_scores = outputs[net.outputs[0]]
img_scores = np.array([1./(1+np.exp(-tmp)) for tmp in img_scores[0]])
pred_labels_index = np.where(img_scores > 0.5)[0].tolist()
print 'pred_labels:', pred_labels_index, '----', len(pred_labels_index)
for tmp in pred_labels_index:
print 'pred_labels:', multilabels[tmp], '----', len(pred_labels_index)
print '--------------------------------------------'
plt.imshow(im)
plt.axis('off')
plt.show()
print 'Done.'
[1] - Caffe实践 - 基于VGG16 多标签分类的训练与部署