用 Python3 将 SVHN 数据集转换成 lmdb 格式

处理该数据集的前提是先配置好 pycaffe,Windows 下的具体方法见:《Windows 下 Pycaffe 的配置与使用》。程序的处理方法其实类似于处理 python 接口的 cifar100 数据集,只不过这里处理的是 mat 文件,可以用 scipy.io 来读取 mat 文件。

import numpy as np
import caffe
import lmdb
import scipy.io as sio
import random
from caffe.proto import caffe_pb2


def _to_nchw(images):
    """Return a view of ``images`` with axes reordered from (0, 1, 2, 3) to (3, 2, 0, 1).

    Assumes the ``X`` array in the .mat file is laid out as
    (height, width, channel, sample), so the result is
    (sample, channel, height, width) -- TODO confirm against the dataset.
    This is the same permutation as swapping axes (0, 3), then (1, 2),
    then (2, 3).
    """
    return np.transpose(images, (3, 2, 0, 1))


def _write_lmdb(path, images, labels, order, commit_every=1000):
    """Write ``images[i]`` / ``labels[i]`` for each ``i`` in ``order`` to an LMDB.

    Args:
        path: Directory of the LMDB environment to create or open.
        images: Array indexed by sample on its first axis.
        labels: Array where ``labels[i][0]`` is the label of sample ``i``.
        order: Iterable of sample indices, in the order they are stored.
        commit_every: Number of records written per transaction.

    Keys are zero-padded 8-digit running counters ("00000000", ...), so the
    key order in the database follows ``order``.
    """
    env = lmdb.open(path, map_size=images.nbytes * 10)
    try:
        txn = env.begin(write=True)
        for count, i in enumerate(order, start=1):
            label = int(labels[i][0])
            # The label value 10 is remapped to class 0.
            if label == 10:
                label = 0
            datum = caffe.io.array_to_datum(images[i], label)
            str_id = '{:08}'.format(count - 1)
            txn.put(str_id.encode("ascii"), datum.SerializeToString())

            # Commit in batches: one write transaction per record is much
            # slower and gives no benefit here.
            if count % commit_every == 0:
                print('already handled with {} pictures'.format(count))
                txn.commit()
                txn = env.begin(write=True)
        # Flush the final, possibly partial, batch.
        txn.commit()
    finally:
        # Release the environment even if a write fails part-way through.
        env.close()


def main():
    """Convert ``train_32x32.mat`` / ``test_32x32.mat`` into Caffe LMDBs.

    Reads both .mat files from the current directory and writes
    ``svhn_train_lmdb`` (samples in shuffled order) and ``svhn_test_lmdb``
    (samples in file order).
    """
    train = sio.loadmat('train_32x32.mat')
    test = sio.loadmat('test_32x32.mat')

    train_data = _to_nchw(train['X'])
    train_label = train['y']
    test_data = _to_nchw(test['X'])
    test_label = test['y']

    # Shuffle the training samples so consecutive records are not ordered
    # the way they appear in the source file.
    train_order = list(range(train_label.shape[0]))
    random.shuffle(train_order)
    _write_lmdb('svhn_train_lmdb', train_data, train_label, train_order)

    _write_lmdb('svhn_test_lmdb', test_data, test_label,
                range(test_data.shape[0]))

# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

在经过上面的程序得到 svhn_train_lmdb 和 svhn_test_lmdb 文件夹后,均值文件的话,还是直接用 caffe 自带的 compute_image_mean.exe 对 svhn_train_lmdb 进行计算来得比较快。

———————————————————

参考博客:http://blog.csdn.net/yj3254/article/details/52370767

你可能感兴趣的:(caffe学习,配置文件)