LMDB格式的优点:
- 基于内存映射文件 I/O(memory-mapped I/O),数据读取速率更高
- 对大规模数据集更有效.
HDF5的特点:
- 易于读取
- 类似于 MATLAB 的 .mat 数据格式,但数据压缩性能更强
- 需要将数据全部读进内存,故单个 HDF5 文件的大小不能超过内存容量;数据量较大时,可以拆分成多个 HDF5 文件,并将各个 HDF5 子文件的路径写入一个 txt 文件中.
- I/O速率不如LMDB.
import numpy as np
import lmdb
import caffe
# Write N images and their labels into an LMDB database, one serialized
# Caffe Datum per record.
lmdb_file = '/path/to/data_lmdb'
N = 1000

# Prepare data and labels.
X = np.zeros((N, 3, 224, 224), dtype=np.uint8)  # data, indexed as (N, C, H, W) below
y = np.zeros(N, dtype=np.int64)  # labels

# map_size is the upper bound on the database size in bytes; it is set
# generously here so that the writes below do not hit the limit.
env = lmdb.open(lmdb_file, map_size=int(1e12))
try:
    # Using the transaction as a context manager commits it on normal exit
    # and aborts it on an exception. Without a commit nothing is persisted.
    with env.begin(write=True) as txn:
        for i in range(N):
            datum = caffe.proto.caffe_pb2.Datum()
            datum.channels = X.shape[1]
            datum.height = X.shape[2]
            datum.width = X.shape[3]
            datum.data = X[i].tobytes()  # or .tostring() if numpy < 1.9
            datum.label = int(y[i])
            # The five lines above can also be written as:
            #   datum = caffe.io.array_to_datum(X[i], int(y[i]))

            # Zero-padded keys keep the records in insertion order, since
            # LMDB sorts keys lexicographically.
            str_id = '{:08}'.format(i)
            # LMDB keys must be bytes; encoding works on Python 2 and 3.
            txn.put(str_id.encode('ascii'), datum.SerializeToString())
finally:
    env.close()
import numpy as np
import lmdb
import caffe
# Read every record back from the LMDB database and decode it into an
# image array and its label.
env = lmdb.open('data_lmdb', readonly=True)
try:
    # The context manager releases the read transaction when the loop ends.
    with env.begin() as txn:
        lmdb_cursor = txn.cursor()
        datum = caffe.proto.caffe_pb2.Datum()  # reused for every record
        for key, value in lmdb_cursor:
            # print() as a function call is valid on both Python 2 and 3.
            print('{},{}'.format(key, value))
            datum.ParseFromString(value)
            # np.frombuffer replaces the deprecated binary mode of
            # np.fromstring. The result is a read-only view of datum.data;
            # call .copy() on it if the array needs to be modified.
            flat_data = np.frombuffer(datum.data, dtype=np.uint8)
            data = flat_data.reshape(datum.channels, datum.height, datum.width)
            # or: data = caffe.io.datum_to_array(datum)
            labels = datum.label
finally:
    env.close()
import h5py
import numpy as np
# Create the HDF5 file.
imgsData = np.zeros((10, 3, 224, 224))  # Images
labels = list(range(10))  # Labels
# The context manager closes the file even if a write fails.
with h5py.File('HDF5_FILE.h5', 'w') as f:
    f['data'] = imgsData  # write the image data
    f['labels'] = labels  # write the label data

# Read the HDF5 file.
with h5py.File('HDF5_FILE.h5', 'r') as f:
    # Materialize the key view into a list so it stays usable after the
    # file has been closed.
    f_keys = list(f.keys())
    # [:] copies the full datasets into in-memory NumPy arrays.
    imgsData = f['data'][:]
    labels = f['labels'][:]