使用训练好的googlenet caffemodel进行图片分类


1. ubuntu16.04,caffe已成功安装
2. python2.7
3. 我本地的caffe_root: /home/sarahzhou/caffe

下载googlenet caffemodel文件


1 使用caffe提供的脚本,注意在caffe_root下执行

sarahzhou@lenovo:~/caffe$ sudo python scripts/download_model_binary.py models/bvlc_googlenet

脚本的详细使用说明请参考caffe官网的where to get trained models部分。脚本下载完毕的bvlc_googlenet.caffemodel存储在${caffe_root}/models/bvlc_googlenet目录下。

2 直接到berkeleyvision的网站下载


因为我们使用的googlenet caffemodel是用imagenet训练好的,所以我们使用这个model的时候,也需要知道相关的imagenet的数据信息:

sarahzhou@lenovo:~/caffe$ ./data/ilsvrc12/get_ilsvrc_aux.sh 


synsets.txt             //imagenet数据类别的ID,共有1000个类别
synset_words.txt        //类别ID和类别名字的对应关系
imagenet_mean.binaryproto //imagenet数据均值




# coding=utf-8

import numpy as np
import matplotlib.pyplot as plt
import pylab
from PIL import Image
import caffe

# Classify one image with the pretrained GoogLeNet model.
# Assumes Caffe is checked out at caffe_root and that the model weights and
# the ImageNet auxiliary files have already been downloaded.
# Every print below is a single parenthesised argument, so the script emits
# the same text on Python 2.7 and Python 3.
caffe_root = "/home/sarahzhou/caffe/"
net_file = caffe_root + "models/bvlc_googlenet/deploy.prototxt"
model = caffe_root + "models/bvlc_googlenet/bvlc_googlenet.caffemodel"

# Build the network in inference (TEST) mode from the definition + weights.
net = caffe.Net(net_file, model, caffe.TEST)

# Average the mean image over its two spatial axes, leaving one value per
# channel.
image_mean = np.load(caffe_root + "python/caffe/imagenet/ilsvrc_2012_mean.npy").mean(1).mean(1)

# print('mean-subtracted values:', list(zip('BGR', image_mean)))
# Output: mean-subtracted values: [('B', 104.0069879317889), ('G', 116.66876761696767), ('R', 122.6789143406786)]
# NOTE: Caffe's classification example labels these values in BGR order
# (the network is fed BGR images, see set_channel_swap below).

data_shape = net.blobs['data'].data.shape
# print(data_shape)
# Output: (10, 3, 224, 224) batch_size:10 channels:3 height:224 width:224

# Preprocessing that turns a loaded image into the layout the 'data' blob expects.
transformer = caffe.io.Transformer({'data': data_shape})
transformer.set_transpose('data', (2, 0, 1))      # H x W x C -> C x H x W
transformer.set_mean('data', image_mean)          # subtract the per-channel mean
transformer.set_raw_scale('data', 255)            # rescale loaded pixel values by 255
transformer.set_channel_swap('data', (2, 1, 0))   # reverse the channel order (RGB -> BGR)

image = caffe.io.load_image('/home/sarahzhou/software/mydogs/dog.jpg')
transformed_image = transformer.preprocess('data', image)
# Uncomment to preview the input image:
# plt.imshow(image)
# pylab.show()

# The single preprocessed image is broadcast into every slot of the batch;
# only the first row of the resulting probabilities is read back.
net.blobs['data'].data[...] = transformed_image
output = net.forward()
output_prob = output['prob'][0]

predicted_class = output_prob.argmax()
print(' '.join(['The predicted class is : ', str(predicted_class)]))

# The tab delimiter keeps each line of the file as a single string,
# i.e. "<synset id> <class name>".
label_file = caffe_root + "data/ilsvrc12/synset_words.txt"
labels = np.loadtxt(label_file, str, delimiter='\t')
print(' '.join(['The label is : ', str(labels[predicted_class])]))

# Indices of the five highest probabilities, best first.
top_inds = output_prob.argsort()[::-1][:5]
# list(...) because zip() is lazy on Python 3.
top_pairs = list(zip(output_prob[top_inds], labels[top_inds]))
print(' '.join(['probabilities and labels: ', str(top_pairs)]))


The predicted class is : 266
The label is : n02113712 miniature poodle
probabilities and labels: [(0.32076371, 'n02113712 miniature poodle'), (0.15544809, 'n02096051 Airedale, Airedale terrier'), (0.14952005, 'n02102973 Irish water spaniel'), (0.11370797, 'n02113624 toy poodle'), (0.04295243, 'n02093991 Irish terrier')]

可以看到利用googlenet的caffe model预测出有0.32076371的概率是 miniature poodle(迷你贵宾)

feature map 尺寸

# Print each blob (layer output) name followed by the shape of its data array.
# .items() and the parenthesised single-argument print give the same output
# on Python 2.7 and Python 3 (iteritems() and the print statement do not
# exist on Python 3).
for layer_name, feature_map in net.blobs.items():
    print(layer_name + '\t' + str(feature_map.data.shape))

上段代码输出每层的名称,和该层输出的feature map尺寸,输出结果为:

data (10, 3, 224, 224)
conv1/7x7_s2 (10, 64, 112, 112)
pool1/3x3_s2 (10, 64, 56, 56)
pool1/norm1 (10, 64, 56, 56)
conv2/3x3_reduce (10, 64, 56, 56)
conv2/3x3 (10, 192, 56, 56)
conv2/norm2 (10, 192, 56, 56)
pool2/3x3_s2 (10, 192, 28, 28)
pool2/3x3_s2_pool2/3x3_s2_0_split_0 (10, 192, 28, 28)
pool2/3x3_s2_pool2/3x3_s2_0_split_1 (10, 192, 28, 28)
pool2/3x3_s2_pool2/3x3_s2_0_split_2 (10, 192, 28, 28)
pool2/3x3_s2_pool2/3x3_s2_0_split_3 (10, 192, 28, 28)
inception_3a/1x1 (10, 64, 28, 28)
inception_3a/3x3_reduce (10, 96, 28, 28)
inception_3a/3x3 (10, 128, 28, 28)
inception_3a/5x5_reduce (10, 16, 28, 28)
inception_3a/5x5 (10, 32, 28, 28)
inception_3a/pool (10, 192, 28, 28)
inception_3a/pool_proj (10, 32, 28, 28)
inception_3a/output (10, 256, 28, 28)
inception_3a/output_inception_3a/output_0_split_0 (10, 256, 28, 28)
inception_3a/output_inception_3a/output_0_split_1 (10, 256, 28, 28)
inception_3a/output_inception_3a/output_0_split_2 (10, 256, 28, 28)
inception_3a/output_inception_3a/output_0_split_3 (10, 256, 28, 28)
inception_3b/1x1 (10, 128, 28, 28)
inception_3b/3x3_reduce (10, 128, 28, 28)
inception_3b/3x3 (10, 192, 28, 28)
inception_3b/5x5_reduce (10, 32, 28, 28)
inception_3b/5x5 (10, 96, 28, 28)
inception_3b/pool (10, 256, 28, 28)
inception_3b/pool_proj (10, 64, 28, 28)
inception_3b/output (10, 480, 28, 28)
pool3/3x3_s2 (10, 480, 14, 14)
pool3/3x3_s2_pool3/3x3_s2_0_split_0 (10, 480, 14, 14)
pool3/3x3_s2_pool3/3x3_s2_0_split_1 (10, 480, 14, 14)
pool3/3x3_s2_pool3/3x3_s2_0_split_2 (10, 480, 14, 14)
pool3/3x3_s2_pool3/3x3_s2_0_split_3 (10, 480, 14, 14)
inception_4a/1x1 (10, 192, 14, 14)
inception_4a/3x3_reduce (10, 96, 14, 14)
inception_4a/3x3 (10, 208, 14, 14)
inception_4a/5x5_reduce (10, 16, 14, 14)
inception_4a/5x5 (10, 48, 14, 14)
inception_4a/pool (10, 480, 14, 14)
inception_4a/pool_proj (10, 64, 14, 14)
inception_4a/output (10, 512, 14, 14)
inception_4a/output_inception_4a/output_0_split_0 (10, 512, 14, 14)
inception_4a/output_inception_4a/output_0_split_1 (10, 512, 14, 14)
inception_4a/output_inception_4a/output_0_split_2 (10, 512, 14, 14)
inception_4a/output_inception_4a/output_0_split_3 (10, 512, 14, 14)
inception_4b/1x1 (10, 160, 14, 14)
inception_4b/3x3_reduce (10, 112, 14, 14)
inception_4b/3x3 (10, 224, 14, 14)
inception_4b/5x5_reduce (10, 24, 14, 14)
inception_4b/5x5 (10, 64, 14, 14)
inception_4b/pool (10, 512, 14, 14)
inception_4b/pool_proj (10, 64, 14, 14)
inception_4b/output (10, 512, 14, 14)
inception_4b/output_inception_4b/output_0_split_0 (10, 512, 14, 14)
inception_4b/output_inception_4b/output_0_split_1 (10, 512, 14, 14)
inception_4b/output_inception_4b/output_0_split_2 (10, 512, 14, 14)
inception_4b/output_inception_4b/output_0_split_3 (10, 512, 14, 14)
inception_4c/1x1 (10, 128, 14, 14)
inception_4c/3x3_reduce (10, 128, 14, 14)
inception_4c/3x3 (10, 256, 14, 14)
inception_4c/5x5_reduce (10, 24, 14, 14)
inception_4c/5x5 (10, 64, 14, 14)
inception_4c/pool (10, 512, 14, 14)
inception_4c/pool_proj (10, 64, 14, 14)
inception_4c/output (10, 512, 14, 14)
inception_4c/output_inception_4c/output_0_split_0 (10, 512, 14, 14)
inception_4c/output_inception_4c/output_0_split_1 (10, 512, 14, 14)
inception_4c/output_inception_4c/output_0_split_2 (10, 512, 14, 14)
inception_4c/output_inception_4c/output_0_split_3 (10, 512, 14, 14)
inception_4d/1x1 (10, 112, 14, 14)
inception_4d/3x3_reduce (10, 144, 14, 14)
inception_4d/3x3 (10, 288, 14, 14)
inception_4d/5x5_reduce (10, 32, 14, 14)
inception_4d/5x5 (10, 64, 14, 14)
inception_4d/pool (10, 512, 14, 14)
inception_4d/pool_proj (10, 64, 14, 14)
inception_4d/output (10, 528, 14, 14)
inception_4d/output_inception_4d/output_0_split_0 (10, 528, 14, 14)
inception_4d/output_inception_4d/output_0_split_1 (10, 528, 14, 14)
inception_4d/output_inception_4d/output_0_split_2 (10, 528, 14, 14)
inception_4d/output_inception_4d/output_0_split_3 (10, 528, 14, 14)
inception_4e/1x1 (10, 256, 14, 14)
inception_4e/3x3_reduce (10, 160, 14, 14)
inception_4e/3x3 (10, 320, 14, 14)
inception_4e/5x5_reduce (10, 32, 14, 14)
inception_4e/5x5 (10, 128, 14, 14)
inception_4e/pool (10, 528, 14, 14)
inception_4e/pool_proj (10, 128, 14, 14)
inception_4e/output (10, 832, 14, 14)
pool4/3x3_s2 (10, 832, 7, 7)
pool4/3x3_s2_pool4/3x3_s2_0_split_0 (10, 832, 7, 7)
pool4/3x3_s2_pool4/3x3_s2_0_split_1 (10, 832, 7, 7)
pool4/3x3_s2_pool4/3x3_s2_0_split_2 (10, 832, 7, 7)
pool4/3x3_s2_pool4/3x3_s2_0_split_3 (10, 832, 7, 7)
inception_5a/1x1 (10, 256, 7, 7)
inception_5a/3x3_reduce (10, 160, 7, 7)
inception_5a/3x3 (10, 320, 7, 7)
inception_5a/5x5_reduce (10, 32, 7, 7)
inception_5a/5x5 (10, 128, 7, 7)
inception_5a/pool (10, 832, 7, 7)
inception_5a/pool_proj (10, 128, 7, 7)
inception_5a/output (10, 832, 7, 7)
inception_5a/output_inception_5a/output_0_split_0 (10, 832, 7, 7)
inception_5a/output_inception_5a/output_0_split_1 (10, 832, 7, 7)
inception_5a/output_inception_5a/output_0_split_2 (10, 832, 7, 7)
inception_5a/output_inception_5a/output_0_split_3 (10, 832, 7, 7)
inception_5b/1x1 (10, 384, 7, 7)
inception_5b/3x3_reduce (10, 192, 7, 7)
inception_5b/3x3 (10, 384, 7, 7)
inception_5b/5x5_reduce (10, 48, 7, 7)
inception_5b/5x5 (10, 128, 7, 7)
inception_5b/pool (10, 832, 7, 7)
inception_5b/pool_proj (10, 128, 7, 7)
inception_5b/output (10, 1024, 7, 7)
pool5/7x7_s1 (10, 1024, 1, 1)
loss3/classifier (10, 1000)
prob (10, 1000)

从上述输出结果中就可以看出每层经过运算之后输出到下一层的数据尺寸,比如inception_4b/3x3 (10, 224, 14, 14),其中batch_size:10 channels:224 height:14 width:14.


# Print each parameterised layer's name followed by the shape of its first
# parameter blob (presumably the weights, with any biases at index 1 -
# TODO confirm against the Caffe docs).
# .items() and the parenthesised single-argument print give the same output
# on Python 2.7 and Python 3.
for layer_name, kernel in net.params.items():
    print(layer_name + '\t' + str(kernel[0].data.shape))


conv1/7x7_s2 (64, 3, 7, 7)
conv2/3x3_reduce (64, 64, 1, 1)
conv2/3x3 (192, 64, 3, 3)
inception_3a/1x1 (64, 192, 1, 1)
inception_3a/3x3_reduce (96, 192, 1, 1)
inception_3a/3x3 (128, 96, 3, 3)
inception_3a/5x5_reduce (16, 192, 1, 1)
inception_3a/5x5 (32, 16, 5, 5)
inception_3a/pool_proj (32, 192, 1, 1)
inception_3b/1x1 (128, 256, 1, 1)
inception_3b/3x3_reduce (128, 256, 1, 1)
inception_3b/3x3 (192, 128, 3, 3)
inception_3b/5x5_reduce (32, 256, 1, 1)
inception_3b/5x5 (96, 32, 5, 5)
inception_3b/pool_proj (64, 256, 1, 1)
inception_4a/1x1 (192, 480, 1, 1)
inception_4a/3x3_reduce (96, 480, 1, 1)
inception_4a/3x3 (208, 96, 3, 3)
inception_4a/5x5_reduce (16, 480, 1, 1)
inception_4a/5x5 (48, 16, 5, 5)
inception_4a/pool_proj (64, 480, 1, 1)
inception_4b/1x1 (160, 512, 1, 1)
inception_4b/3x3_reduce (112, 512, 1, 1)
inception_4b/3x3 (224, 112, 3, 3)
inception_4b/5x5_reduce (24, 512, 1, 1)
inception_4b/5x5 (64, 24, 5, 5)
inception_4b/pool_proj (64, 512, 1, 1)
inception_4c/1x1 (128, 512, 1, 1)
inception_4c/3x3_reduce (128, 512, 1, 1)
inception_4c/3x3 (256, 128, 3, 3)
inception_4c/5x5_reduce (24, 512, 1, 1)
inception_4c/5x5 (64, 24, 5, 5)
inception_4c/pool_proj (64, 512, 1, 1)
inception_4d/1x1 (112, 512, 1, 1)
inception_4d/3x3_reduce (144, 512, 1, 1)
inception_4d/3x3 (288, 144, 3, 3)
inception_4d/5x5_reduce (32, 512, 1, 1)
inception_4d/5x5 (64, 32, 5, 5)
inception_4d/pool_proj (64, 512, 1, 1)
inception_4e/1x1 (256, 528, 1, 1)
inception_4e/3x3_reduce (160, 528, 1, 1)
inception_4e/3x3 (320, 160, 3, 3)
inception_4e/5x5_reduce (32, 528, 1, 1)
inception_4e/5x5 (128, 32, 5, 5)
inception_4e/pool_proj (128, 528, 1, 1)
inception_5a/1x1 (256, 832, 1, 1)
inception_5a/3x3_reduce (160, 832, 1, 1)
inception_5a/3x3 (320, 160, 3, 3)
inception_5a/5x5_reduce (32, 832, 1, 1)
inception_5a/5x5 (128, 32, 5, 5)
inception_5a/pool_proj (128, 832, 1, 1)
inception_5b/1x1 (384, 832, 1, 1)
inception_5b/3x3_reduce (192, 832, 1, 1)
inception_5b/3x3 (384, 192, 3, 3)
inception_5b/5x5_reduce (48, 832, 1, 1)
inception_5b/5x5 (128, 48, 5, 5)
inception_5b/pool_proj (128, 832, 1, 1)
loss3/classifier (1000, 1024)

卷积核和feature map的可视图

def vis_square(data):
    """Tile a stack of filters / feature maps into one square grid and draw it.

    The stack is normalised to [0, 1], padded with white so that the number
    of tiles is a perfect square (with a one-pixel white gap after each
    tile), arranged into an n x n grid and drawn on the current matplotlib
    axes with the axis decorations hidden.

    Args:
        data: array of shape (count, height, width), optionally followed by
            further trailing axes such as a colour channel.

    Returns:
        The tiled image, of shape (n * (height + 1), n * (width + 1)) plus
        any trailing axes, where n = ceil(sqrt(count)).

    Note:
        Only draws; in a non-interactive script call ``plt.show()``
        afterwards to open the window.
    """
    # Normalise for display. A constant input has zero range; divide by 1
    # instead so the result is all zeros rather than NaN.
    lowest = data.min()
    value_range = data.max() - lowest
    if value_range == 0:
        value_range = 1
    data = (data - lowest) / value_range

    # Force the number of tiles to be a perfect square.
    n = int(np.ceil(np.sqrt(data.shape[0])))
    padding = (((0, n ** 2 - data.shape[0]),
                (0, 1), (0, 1))  # one-pixel gap after every tile
               + ((0, 0),) * (data.ndim - 3))  # leave trailing axes unpadded
    data = np.pad(data, padding, mode='constant', constant_values=1)  # pad with ones (white)

    # (n*n, h, w, ...) -> (n, n, h, w, ...) -> (n, h, n, w, ...) -> (n*h, n*w, ...)
    data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, data.ndim + 1)))
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])

    # The tiled image used to be computed and then discarded; draw it and
    # hand it back to the caller.
    plt.imshow(data)
    plt.axis('off')
    return data

# Parameter blob 0 of the first convolution layer; move its second axis to
# the end before tiling, as vis_square expects any extra axis last.
filters = net.params['conv1/7x7_s2'][0].data
vis_square(np.transpose(filters, (0, 2, 3, 1)))

# Outputs of the same layer for the first item in the batch, limited to the
# first 36 entries along the next axis.
feat = net.blobs['conv1/7x7_s2'].data[0][:36]

