最近在进行多标签的数据转换,发现直接使用caffe自带的convert_imageset不是很方便,就收集了一下用python的处理方法。现整理以备后用。
使用时发现,用python写入lmdb有个问题,如果事先无法知道数据的大小,那么分配的存储空间map_size就不好确定(默认是10MB),且不会自动随写入数据的实际大小而调整,所以还是HDF5好处理一些,虽然在caffe中是提倡使用lmdb。
在此也望有经验的前辈能指教一下,用python处理lmdb时,文件的大小如何预分配,或有什么办法能让lmdb的文件大小能随写入或删除数据而自动增减?先谢过啦!
import os
import random

import caffe
import lmdb
import numpy as np
def _write_datum_lmdb(db_path, arrays, map_size):
    """Serialize each array as a Caffe Datum under a zero-padded ordinal key."""
    # py-lmdb never grows the map on its own; keep at least its documented
    # default (10 MiB) so tiny inputs do not fail with MDB_MAP_FULL.
    min_map_size = 10 * 1024 * 1024
    env = lmdb.open(db_path, map_size=max(int(map_size), min_map_size))
    try:
        with env.begin(write=True) as txn:
            for index, array in enumerate(arrays):
                datum = caffe.io.array_to_datum(array)
                # LMDB keys must be bytes on Python 3; the zero padding keeps
                # lexicographic key order equal to insertion order.
                key = '{:0>10d}'.format(index).encode('ascii')
                txn.put(key, datum.SerializeToString())
    finally:
        # Close the environment even when serialization or a put fails.
        env.close()


def convert_data_lmdb(train_data, train_label, output_data_lmdb, output_labels_lmdb):
    """Save images and their multi-label vectors to two parallel LMDB databases.

    The samples are shuffled once with a fixed seed (42) and the same order is
    used for both databases, so record ``k`` of the data LMDB belongs to
    record ``k`` of the label LMDB. Keys are the 10-digit zero-padded record
    index.

    Each image has its last axis reversed (channel order flip, e.g.
    RGB -> BGR) and is transposed from (H, W, C) to (C, H, W) before being
    serialized with ``caffe.io.array_to_datum``. Each label row is stored as a
    1 x 1 x L datum.

    Example:
        convert_data_lmdb(train_X, train_y, 'train_data_lmdb', 'train_labels_lmdb')

    Args:
        train_data: array-like of shape (N, H, W, C).
        train_label: array-like whose first axis has length N, e.g. (N, L).
        output_data_lmdb: path of the LMDB that receives the images.
        output_labels_lmdb: path of the LMDB that receives the labels.

    Raises:
        ValueError: if ``train_data`` is not 4-D or the number of labels does
            not match the number of images.
    """
    # ``np.float`` was only an alias of the builtin float (float64) and has
    # been removed from NumPy, so spell the dtype out.
    X = np.asarray(train_data).astype(np.float64)
    y = np.asarray(train_label).astype(np.float64)
    if X.ndim != 4:
        raise ValueError(
            'train_data must have shape (N, H, W, C), got {}'.format(X.shape))
    if y.ndim < 1 or y.shape[0] != X.shape[0]:
        raise ValueError(
            'train_data and train_label disagree on the number of samples: '
            '{} vs {}'.format(X.shape, y.shape))

    # The original called an un-imported ``shuffle(X, y, random_state=42)``;
    # a seeded permutation applied to both arrays keeps images and labels
    # aligned without needing that helper.
    order = np.random.RandomState(42).permutation(X.shape[0])
    X, y = X[order], y[order]

    images = (im[:, :, ::-1].transpose((2, 0, 1)) for im in X)
    _write_datum_lmdb(output_data_lmdb, images, X.nbytes * 10)

    # array_to_datum needs a 3-D array, so every label row becomes (1, 1, L).
    label_rows = (np.asarray(row).reshape(1, 1, -1) for row in y)
    _write_datum_lmdb(output_labels_lmdb, label_rows, y.nbytes * 10)
def write_hdf5(filename):
    """Build an HDF5 dataset from a text file listing images and their scores.

    Every non-blank line of ``filename`` must read
    ``<image_path> <score> [<score> ...]`` (whitespace separated), with the
    same number of scores on every line. The lines are shuffled with
    ``np.random.shuffle``; each image is loaded with ``pyplot.imread``, reduced
    to its first channel and stored as float32.

    Two files are written next to the list file, named after it without its
    extension: ``<name>.h5`` with the datasets ``data`` (N, 1, 96, 96) and
    ``label`` (N, number_of_scores), and ``<name>_h5.txt`` containing the path
    of the ``.h5`` file.

    NOTE: this file later defines ``write_hdf5(data, labels, output_filename)``,
    which rebinds the name, so as the file stands this version is shadowed.

    Args:
        filename: path of the text file listing the samples.

    Raises:
        ValueError: if a line has no score, a score is not a number, the lines
            disagree on the number of scores, or an image is not 96 x 96.
    """
    IMAGE_SIZE = (96, 96)

    # splitext copes with dots in directory names and with missing extensions,
    # where ``filename.split('.')`` failed to unpack.
    setname, _ = os.path.splitext(filename)

    with open(filename, 'r') as f:
        lines = [line for line in f if line.strip()]
    np.random.shuffle(lines)

    # split() already drops the trailing newline, so a final line without one
    # no longer loses its last character.
    rows = [line.split() for line in lines]
    n_labels = len(rows[0]) - 1 if rows else 1
    if n_labels < 1:
        raise ValueError('line has no score: {!r}'.format(' '.join(rows[0])))
    for fields in rows:
        if len(fields) - 1 != n_labels:
            raise ValueError(
                'expected {} score(s) per line, got {!r}'.format(
                    n_labels, ' '.join(fields)))
    scores = np.zeros((len(rows), n_labels), dtype=np.float32)
    for i, fields in enumerate(rows):
        scores[i] = [float(value) for value in fields[1:]]

    # Imported only after the list file has been validated, so a malformed
    # list is reported before the imaging/HDF5 stack is needed. The original
    # used ``pyplot`` without importing it.
    import h5py
    from matplotlib import pyplot

    imgs = np.zeros((len(rows), 1) + IMAGE_SIZE, dtype=np.float32)
    for i, fields in enumerate(rows):
        img = pyplot.imread(fields[0])
        if img.ndim == 3:
            # Keep only the first channel of multi-channel images.
            img = img[:, :, 0]
        if img.shape != IMAGE_SIZE:
            raise ValueError(
                'image {!r} has shape {}, expected {}'.format(
                    fields[0], img.shape, IMAGE_SIZE))
        imgs[i, 0] = img.astype(np.float32)
        if (i + 1) % 1000 == 0:
            print('processed {} images!'.format(i + 1))

    # Open the output only once all data is ready, so a failure above does not
    # leave a truncated .h5 file behind.
    h5_filename = '{}.h5'.format(setname)
    with h5py.File(h5_filename, 'w') as h:
        h.create_dataset('data', data=imgs)
        h.create_dataset('label', data=scores)
    with open('{}_h5.txt'.format(setname), 'w') as f:
        f.write(h5_filename)
def write_hdf5(data, labels, output_filename):
    """Save image data and its label(s) to an HDF5 file.

    The samples are shuffled with a fixed seed (42). Each image has its last
    axis reversed (channel order flip, e.g. RGB -> BGR) and is transposed from
    (H, W, C) to (C, H, W). Both arrays are stored as float32.

    Two files are written, named after ``output_filename`` with its extension
    dropped: ``<name>.h5`` holding the datasets ``data`` (n, c, h, w) and
    ``label`` (n, labels), and ``<name>_h5.txt`` containing the path of the
    ``.h5`` file.

    Args:
        data: array-like of shape (n, h, w, c). The earlier docstring said
            (n, c, h, w), but the axis reversal and transpose only fit
            channel-last input.
        labels: array-like of shape (n, labels); a 1-D array of length n is
            stored as a single label column.
        output_filename: output path; whatever extension it has is replaced
            by ``.h5``.

    Raises:
        ValueError: if ``data`` is not 4-D, ``labels`` is not 1-D or 2-D, or
            the two disagree on the number of samples.
    """
    X = np.asarray(data).astype(np.float32)
    y = np.asarray(labels).astype(np.float32)
    if X.ndim != 4:
        raise ValueError(
            'data must have shape (n, h, w, c), got {}'.format(X.shape))
    if y.ndim not in (1, 2):
        raise ValueError(
            'labels must have shape (n,) or (n, labels), got {}'.format(y.shape))
    if y.shape[0] != X.shape[0]:
        raise ValueError(
            'data and labels disagree on the number of samples: '
            '{} vs {}'.format(X.shape[0], y.shape[0]))

    # The original called an un-imported ``shuffle(X, y, random_state=42)``;
    # one seeded permutation applied to both arrays keeps them aligned.
    order = np.random.RandomState(42).permutation(X.shape[0])

    # Output shapes follow the input instead of the former hard-coded
    # (n, 1, 96, 96) image buffer and 30-wide label buffer.
    imgs = np.ascontiguousarray(
        X[order][:, :, :, ::-1].transpose((0, 3, 1, 2)))
    scores = y[order]
    if scores.ndim == 1:
        scores = scores[:, np.newaxis]

    # splitext copes with dots in directory names and with missing extensions,
    # where ``output_filename.split('.')`` failed to unpack.
    setname, _ = os.path.splitext(output_filename)
    h5_filename = '{}.h5'.format(setname)

    # Imported after validation so bad input is reported before HDF5 is needed.
    import h5py

    with h5py.File(h5_filename, 'w') as h:
        h.create_dataset('data', data=imgs)
        h.create_dataset('label', data=scores)
    print('processed {} images!'.format(imgs.shape[0]))

    with open('{}_h5.txt'.format(setname), 'w') as f:
        f.write(h5_filename)