【开发日记】石头剪刀布之卷积神经网络

在上一篇文章《石头剪刀布之神经网络训练》中,利用预训练的MobileNet可以得到很高的精度,而且模型占用的内存很小,只有5M左右。但是,在实际应用的时候,在我的笔记本上识别一张图片需要0.5s(使用下文给出的代码),更不用说在树莓派上运行了。究其原因,是MobileNet最少需要128*128的图片输入,运算量仍然不小。因此,为了减少运行时间和模型的大小,本文将通过TensorFlow创建一个比较简单的神经网络。

以下为MobileNet识别单张图片代码:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import  numpy as np
import PIL.Image as Image
from pylab import *
import time

# Load a frozen model from disk
def load_graph(model_file):
  """Read a serialized GraphDef file and import it into a new tf.Graph.

  Args:
    model_file: Path of the binary protobuf file to read.

  Returns:
    The new tf.Graph holding the imported nodes.
  """
  with open(model_file, "rb") as pb_file:
    serialized = pb_file.read()

  frozen_def = tf.GraphDef()
  frozen_def.ParseFromString(serialized)

  imported = tf.Graph()
  with imported.as_default():
    tf.import_graph_def(frozen_def)
  return imported

# Recognize the gesture shown in one image
def recognize(jpg_path, pb_file_path, classes):
  """Classify one image with the frozen graph and print the result.

  Prints the predicted class name, followed by its score and the time
  (in seconds) spent in the single sess.run() call.

  Args:
    jpg_path: Path of the image file to classify.
    pb_file_path: Path of the frozen graph file, loaded via load_graph().
    classes: Sequence of class names, indexed by position in the model output.
  """
  # load_graph() builds its own tf.Graph and the session is bound to it
  # explicitly, so no extra default-graph context is needed here.
  graph = load_graph(pb_file_path)

  with tf.Session(graph=graph) as sess:
    # Input and output tensors of the imported graph
    input_x = graph.get_tensor_by_name("import/input:0")
    out = graph.get_tensor_by_name("import/final_result:0")

    # The model is fed 128*128 images with 3 channels. Force RGB so that
    # grayscale, palette or RGBA files do not break the reshape below, and
    # close the file handle as soon as the pixels are copied out.
    with Image.open(jpg_path) as pil_img:
      img = np.asarray(pil_img.convert("RGB").resize((128, 128)),
                       dtype=np.float32)
    # Pre-processing: shift and scale the 0..255 pixel values around zero
    img = (img - 128) * 1.0 / 128

    t1 = time.time()
    img_out = sess.run(
        out, feed_dict={input_x: np.reshape(img, [-1, 128, 128, 3])})
    t2 = time.time()

    prediction_labels = np.squeeze(np.argmax(img_out, axis=1))
    print(classes[prediction_labels])
    print('probability: %.3g, running time: %.3g'
          % (img_out[0][prediction_labels], t2 - t1))


if __name__ == "__main__":
  # Demo run: classify one sample image with the frozen model.
  pb_file_path = "model/output_graph.pb"
  jpg_path = "images/scissors/image671.jpg"
  classes = ['scissors', 'others', 'rock', 'paper']
  recognize(jpg_path=jpg_path, pb_file_path=pb_file_path, classes=classes)

本文采用的网络结构如下表所示:

类型 Kernel尺寸/步长(或注释) 输入尺寸
卷积 3*3*16/1  28*28*1
池化 2*2/2 28*28*16
卷积 3*3*32/1 14*14*16
池化 2*2/2 14*14*32
全连接 (7*7*32)*256 1*(7*7*32)
Dropout 注:随机失活 1*256
全连接 256*4 1*256
Softmax 分类输出 1*4

相应的代码如下(model.py):

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np

# Width and height used by model() when reshaping its input to [-1, w, h, 1].
w = h = np.int32(28)

def weight_variable(shape, name):
    '''Create a weight variable.

    Args:
      shape: Shape of the weight tensor.
      name: Name given to the variable.

    Returns:
      A tf.Variable whose initial value is drawn from a truncated normal
      distribution with standard deviation 0.1.
    '''
    initial_value = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial_value, name=name)

def bias_variable(shape, name):
    '''Create a bias variable.

    Args:
      shape: Shape of the bias tensor.
      name: Name given to the variable.

    Returns:
      A tf.Variable whose initial value is the constant 0.1 everywhere.
    '''
    initial_value = tf.constant(0.1, shape=shape)
    return tf.Variable(initial_value, name=name)

def model(x, keep_prob):
    '''Build the 4-class convolutional network.

    Two convolution + max-pool stages are followed by a 256-unit fully
    connected layer, dropout, and a final fully connected softmax layer.

    Args:
      x: Input tensor; it is reshaped to [-1, w, h, 1] using the
        module-level w and h.
      keep_prob: Passed to tf.nn.dropout as the keep probability applied
        to the fc1 activations.

    Returns:
      y: Softmax output of the last layer (4 columns). The op is created
        with name "output" inside the 'fc2' name scope.
    '''
    
    x_image = tf.reshape(x, [-1, w, h, 1])

    # Conv1: 3x3 convolution (1 -> 16 channels) + ReLU, then 2x2 max-pool, stride 2
    with tf.name_scope('conv1'):
        W_conv1 = weight_variable([3, 3, 1, 16], name="weight")
        b_conv1 = bias_variable([16], name='bias')
        h_conv1 = tf.nn.relu(
            tf.nn.conv2d(x_image, W_conv1, strides=[1,1,1,1], padding="SAME", name='conv')
            + b_conv1)
        h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1,2,2,1], strides=[1,2,2,1],
                                 padding="SAME", name="pool")

    # Conv2: 3x3 convolution (16 -> 32 channels) + ReLU, then 2x2 max-pool, stride 2
    with tf.name_scope('conv2'):
        W_conv2 = weight_variable([3, 3, 16, 32], name="weight")
        b_conv2 = bias_variable([32], name='bias')
        h_conv2 = tf.nn.relu(
            tf.nn.conv2d(h_pool1, W_conv2, strides=[1,1,1,1], padding="SAME", name='conv')
            + b_conv2)
        h_pool2 = tf.nn.max_pool(h_conv2, ksize=[1,2,2,1], strides=[1,2,2,1],
                                 padding="SAME", name="pool")

    # fc1: flatten the pooled maps and apply a 256-unit fully connected layer.
    # The hard-coded 7*7*32 matches w = h = 28 halved by each of the two
    # stride-2 pools; it must be updated if w or h change.
    with tf.name_scope('fc1'):
        W_fc1 = weight_variable([7*7*32, 256], name="weight")
        b_fc1 = bias_variable([256], name='bias')
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*32])
        h_fc1 = tf.nn.relu(
            tf.matmul(h_pool2_flat, W_fc1)+b_fc1)

    # Dropout
    # keep_prob is supplied by the caller rather than created here.
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # fc2: 256 -> 4 fully connected layer followed by softmax
    with tf.name_scope('fc2'):
        W_fc2 = weight_variable([256, 4], name="weight")
        b_fc2 = bias_variable([4], name='bias')
        y = tf.nn.softmax(
            tf.matmul(h_fc1_drop, W_fc2)+b_fc2, name="output")

    return y
        

这样,网络模型就构造完成了。由于我们收集的数据一般是以图片格式存放在各个文件夹中,可以利用以下代码将文件夹中的图片读入内存,等待训练(read_image.py):

from PIL import Image
import numpy as np
from PIL import Image
from pylab import *
import os
import glob


# Input width, height and channel count used for training.
# NOTE(review): c is not referenced by the code visible in this script, and
# read_img() converts every image to single-channel grayscale - confirm
# whether 3 is still intended.
w, h, c = 28, 28, 3
# Convert integer labels to one-hot vectors
def to_one_hot(label, num_classes=None):
    """Return the one-hot encoding of a sequence of integer class labels.

    Args:
      label: Sequence or array of integer class indices.
      num_classes: Number of columns of the result. When None (default) it
        is max(label) + 1, or 0 for an empty input.

    Returns:
      A float array of shape (len(label), num_classes) with a single 1 per
      row, at the column given by the corresponding label.

    Raises:
      IndexError: If a label does not fit into num_classes columns.
    """
    label = np.asarray(label, dtype=np.int64).reshape(-1)
    if num_classes is None:
        # np.max() raises on an empty array, so handle that case explicitly.
        num_classes = int(label.max()) + 1 if label.size else 0
    label_one = np.zeros((label.size, num_classes))
    # Vectorised assignment: row i gets a 1 in column label[i].
    label_one[np.arange(label.size), label] = 1
    return label_one
# Read the images and convert them to the required dimensions
def read_img(path):
    """Load the labelled .jpg images below *path* and split them 80/20.

    Every sub-directory of *path* is one class; its position in the
    alphabetically sorted directory list is the class index.

    Args:
      path: Directory that contains one sub-directory per class.

    Returns:
      (x_train, y_train, x_val, y_val): the x arrays are float32 with one
      flattened w*h grayscale image per row, the y arrays are the matching
      one-hot label rows produced by to_one_hot().

    Raises:
      ValueError: If no .jpg image is found in any class directory.
    """
    # os.listdir() returns entries in arbitrary order; sort them so the
    # folder -> label index mapping is the same on every run and machine.
    cate = [os.path.join(path, x) for x in sorted(os.listdir(path))
            if os.path.isdir(os.path.join(path, x))]
    imgs = []
    labels = []
    for idx, folder in enumerate(cate):
        for im in glob.glob(os.path.join(folder, '*.jpg')):
            print('reading the image: %s' % (im))
            # Read the image, convert it to grayscale and shrink it to the
            # training size; the file handle is closed right afterwards.
            with Image.open(im) as picture:
                img = np.asarray(picture.convert('L').resize((w, h)),
                                 dtype=np.float32)
            imgs.append(img)
            labels.append(idx)
    if not imgs:
        raise ValueError(
            "no .jpg images found in the class folders of %r" % (path,))

    data = np.asarray(imgs, np.float32)
    # NOTE: the one-hot width is max(label) + 1, so trailing class folders
    # without any image do not get a column.
    label = to_one_hot(np.asarray(labels, np.int32))

    # Shuffle images and labels with the same random permutation
    num_example = data.shape[0]
    arr = np.arange(num_example)
    np.random.shuffle(arr)
    data = data[arr]
    label = label[arr]

    # 80% for training, 20% for validation
    ratio = 0.8
    # Built-in int: the np.int alias no longer exists in current NumPy.
    s = int(num_example * ratio)
    x_train = data[:s]
    y_train = label[:s]
    x_val = data[s:]
    y_val = label[s:]

    x_train = np.reshape(x_train, [-1, w*h])
    x_val = np.reshape(x_val, [-1, w*h])

    return x_train, y_train, x_val, y_val

if __name__ == "__main__":
    # Manual check: load the data set from the default image folder.
    path = 'images/'
    x_train, y_train, x_val, y_val = read_img(path=path)

接下来,就可以进行训练了(train.py)

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import argparse
import sys
import model
import read_image
import numpy as np

# Image width and height; train() sizes the "images" placeholder as w*h.
w = h = 28

def main(args):
    """Start training with the hyper-parameters carried by *args*.

    Args:
      args: Object exposing the attributes learning_rate, batch_size,
        epoches and keep_prob.
    """
    train(args.learning_rate, args.batch_size, args.epoches, args.keep_prob)
    

def train(lr, batch_size, epoches, keep_prob_value):
    """Train the network on the images under 'images/' and print accuracies.

    Progress is printed and a checkpoint is written to "model/model.ckpt"
    every 25th epoch and once more after the last epoch, so the checkpoint
    on disk always holds the final weights.

    Args:
      lr: Learning rate passed to the Adam optimizer.
      batch_size: Number of examples per training step (converted to int).
      epoches: Number of passes over the training set (converted to int).
      keep_prob_value: Dropout keep probability fed during training steps.

    Raises:
      ValueError: If batch_size is not positive.
    """
    # The values may arrive as floats; range() and slicing need ints.
    batch_size = int(batch_size)
    epoches = int(epoches)
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Load the images
    path = 'images/'
    x_train, y_train, x_val, y_val = read_image.read_img(path)

    x = tf.placeholder(tf.float32, [None, w*h], name="images")
    y_ = tf.placeholder(tf.float32, [None, 4], name="labels")
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    y = model.model(x, keep_prob)

    # Cost function. y is a softmax output and can underflow to exactly 0;
    # clip it before the log so the loss stays finite.
    # The op name "corss_entropy" (sic) is kept unchanged.
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)),
                       reduction_indices=[1]),
        name="corss_entropy")

    train_step = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                              name="accuracy")
    saver = tf.train.Saver()

    # Ceiling division: the last batch may be short but is never empty.
    iters = (len(x_train) + batch_size - 1) // batch_size

    # Start training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(epoches):
            for j in range(iters):
                # Slicing past the end is safe, so the final (possibly
                # shorter) batch needs no special case.
                batch0 = x_train[j*batch_size:(j+1)*batch_size]
                batch1 = y_train[j*batch_size:(j+1)*batch_size]
                sess.run(train_step, feed_dict={x: batch0, y_: batch1,
                                                keep_prob: keep_prob_value})

            if iters and i % 25 == 0:
                # Dropout is disabled (keep_prob 1.0) while measuring accuracy.
                train_accuracy = sess.run(accuracy,
                                          feed_dict={x: batch0, y_: batch1,
                                                     keep_prob: 1.0})
                print("step %d, training accuracy %g" % (i, train_accuracy))

                # Periodic checkpoint
                saver_path = saver.save(sess, "model/model.ckpt")
                print("Model saved in file:", saver_path)

        # Save the final weights; the periodic checkpoints alone would miss
        # every epoch after the last multiple of 25.
        saver_path = saver.save(sess, "model/model.ckpt")
        print("Model saved in file:", saver_path)

        test_accuracy = sess.run(accuracy, feed_dict={x: x_val,
                                                      y_: y_val,
                                                      keep_prob: 1.0})
        print("test accuracy %g" % test_accuracy)
    

def parse_arguments(argv):
    """Parse the training command-line options.

    Args:
      argv: List of argument strings, without the program name.

    Returns:
      An argparse.Namespace with learning_rate (float), batch_size (int),
      epoches (int) and keep_prob (float).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--learning_rate', type=float,
                        help="learning rate", default=1e-4)
    # batch_size and epoches are used for slicing and range(), so they
    # have to be parsed as integers rather than floats.
    parser.add_argument('--batch_size', type=int,
                        help="batch_size", default=50)
    parser.add_argument('--epoches', type=int,
                        help="max iterations", default=50)
    parser.add_argument('--keep_prob', type=float,
                        help="keep prob", default=0.5)
    return parser.parse_args(argv)

if __name__ == "__main__":
    # Script entry point: parse the command line (program name dropped)
    # and start training.
    main(args=parse_arguments(argv=sys.argv[1:]))

运行上述代码,就可以进行训练了。我现在收集的数据大概每类200张左右,训练精度和验证精度都在89%左右,模型大小也在5M左右。下一步可以通过调参进一步提高精度。

接下来可以通过以下代码看识别一张图片需要多长时间(test.py)

from PIL import Image
from matplotlib.pylab import *
import numpy as np
import argparse
import tensorflow as tf
import time

# Width and height the input image is resized to before prediction.
w = h = 28
# Class names, indexed by the position of the highest model output.
classes = ['others', 'paper', 'rock', 'scissors']

def main(args):
    """Restore the trained model, classify one image and print the timing.

    Args:
      args: Namespace with `filename` (path of the image to classify) and
        `model_dir` (checkpoint prefix; "<model_dir>.meta" is imported).
    """
    # Use the command-line values instead of hard-coded paths, otherwise
    # the --filename and --model_dir options have no effect.
    filename = args.filename
    model_dir = args.model_dir

    # Restore model
    saver = tf.train.import_meta_graph(model_dir + ".meta")

    with tf.Session() as sess:
        saver.restore(sess, model_dir)
        graph = tf.get_default_graph()
        x = graph.get_tensor_by_name("images:0")
        keep_prob = graph.get_tensor_by_name("keep_prob:0")
        y = graph.get_tensor_by_name("fc2/output:0")

        # Read image: grayscale, resized to w*h, flattened into one row.
        # The file handle is closed once the pixels are copied out.
        with Image.open(filename) as picture:
            pil_im = np.asarray(picture.convert('L').resize((w, h)),
                                dtype=np.float32)
        pil_im = pil_im.reshape((1, w*h))

        time1 = time.time()
        # keep_prob 1.0 disables dropout for prediction
        prediction = sess.run(y, feed_dict={x: pil_im, keep_prob: 1.0})
        index = np.argmax(prediction)
        time2 = time.time()
        print("The classes is: %s. (the probability is %g)" % (classes[index], prediction[0][index]))

        print("Using time %g" % (time2-time1))
def parse_arguments(argv):
    """Parse the prediction command-line options.

    Args:
      argv: List of argument strings, without the program name.

    Returns:
      An argparse.Namespace with the string attributes filename and
      model_dir.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--filename', type=str,
                        help="The image name", default="images/paper/image104.jpg")
    # The help text used to be a copy-pasted "learning rate".
    parser.add_argument('--model_dir', type=str,
                        help="The checkpoint path prefix", default="model/model.ckpt")
    return parser.parse_args(argv)

if __name__=="__main__":
    # sys is not imported explicitly by this script; import it here so the
    # entry point does not depend on a name leaking in through a star import.
    import sys
    main(parse_arguments(sys.argv[1:]))

经过测试,在树莓派上的运行时间大概在25ms左右,还算比较符合要求。






你可能感兴趣的:(卷积神经网络,机器学习,人工智能,图像识别,开发日记)