(附完整python源码)基于tensorflow、opencv的入门案例_发票识别三:发票数据集制作和cnn网络训练

 

1 制作数据集合

1.1 在word上输入一行数字,我用的是Calibri字体,已经比较接近发票数字了。网友们可以自行定义字体。

 

1.2 以灰度模式读入图片并进行阈值(threshold)二值化;将图片颜色反转,使字体为白、背景为黑;再对图像进行多种尺度的膨胀,以增加数据的多样性。

 

 
  1. # encoding: utf-8

  2. import cv2

  3. import numpy as np

  4. import os

  5.  
  # Load the digit strip as a single-channel image (flag 0 = grayscale).
  img0 = cv2.imread("./number.jpg", 0)
  if img0 is None:
      # cv2.imread reports a missing/unreadable file by returning None
      # rather than raising, so fail here with a clear message.
      raise IOError("cannot read image file: ./number.jpg")

  # Binarize at threshold 100, then invert so that the glyphs become
  # white (255) on a black (0) background.
  _, img0 = cv2.threshold(img0, 100, 255, cv2.THRESH_BINARY)
  img0 = 255 - img0

  # Three rectangular structuring elements of different sizes; dilating with
  # each one yields a differently thickened variant of the glyphs.
  element1 = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 1))
  element2 = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 3))
  element3 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

  img1 = cv2.dilate(img0, element1, iterations=1)
  img2 = cv2.dilate(img0, element2, iterations=1)
  img3 = cv2.dilate(img0, element3, iterations=1)

  # Keep the original plus the three dilated variants: four images in total.
  img = [img0, img1, img2, img3]

 

1.3 计算垂直直方图,用投影法分割出各个数字。

 
  # Split each of the four variants (img0, img1, img2, img3) into its
  # per-digit crops and keep the resulting lists, one list per variant.
  img_seg = [seg_num(variant) for variant in img]

编写的函数如下:

 
  def cal_hist(img_, flag=0):
      """Return the projection histogram of ``img_`` along axis ``flag``.

      The pixel values are summed over axis ``flag``: with the default
      ``flag=0`` every column is collapsed into one total, with ``flag=1``
      every row is.
      """
      pixels = np.asarray(img_)
      return pixels.sum(axis=flag)

  3.  
  class Find_num_region:
      """Forward-only scanner that cuts consecutive non-empty runs out of an image.

      ``hist`` is a 1-D projection histogram of ``img``.  Each call to
      ``get_num_region`` continues from where the previous call stopped,
      finds the next run of bins whose value is greater than zero and
      returns the matching slice of ``img``.
      """

      def __init__(self, img, hist):
          self.cursor = -1  # index of the last histogram bin consumed so far
          self.hist = hist
          self.img = img

      def next(self):
          """Advance the cursor by one bin and return the value of that bin."""
          self.cursor = self.cursor + 1
          return self.hist[self.cursor]

      def hasNext(self):
          """Return True while at least one unread histogram bin remains."""
          return len(self.hist) > self.cursor + 1

      def find_start(self):
          """Advance to the next bin with a positive value.

          Returns the index of that bin, or None when the histogram is
          exhausted before such a bin is found.
          """
          while self.hasNext():
              if self.next() > 0:
                  return self.cursor
          return None

      def find_end(self):
          """Advance to the next bin whose value is zero.

          Returns the index of that bin, or None when the histogram is
          exhausted first (the current run touches the image border).
          """
          while self.hasNext():
              if self.next() == 0:
                  return self.cursor
          return None

      def get_num_region(self, flag=0):
          """Return a copy of the next non-empty run of ``img``.

          With ``flag == 0`` the run is a range of columns
          (``img[:, start:end]``); otherwise it is a range of rows
          (``img[start:end, :]``).  When no run is left, an empty region
          (zero columns or zero rows) is returned instead of the whole image.
          """
          start = self.find_start()
          if start is None:
              # Nothing left to cut: make the slice empty.  Slicing with
              # None:None would silently hand back the entire image.
              start = end = len(self.hist)
          else:
              end = self.find_end()
              if end is None:
                  # The run extends to the last bin.
                  end = len(self.hist)
          # Slice first and copy only the region, not the whole image.
          if flag == 0:
              return self.img[:, start:end].copy()
          return self.img[start:end, :].copy()

  33.  
  def seg_num(img_, count=10):
      """Split a one-line digit strip into ``count`` separate digit images.

      The strip is first cropped to the rows that contain ink (row
      projection), then the cropped strip is scanned column-wise and the
      next ``count`` non-empty column runs are cut out, left to right.

      Returns a list of ``count`` arrays.  The loop index is printed on each
      iteration as a progress trace.
      """
      # Row projection: keep only the band of rows that contains the digits.
      row_hist = cal_hist(img_, 1)
      img = Find_num_region(img_, row_hist).get_num_region(1)

      # Column projection on the cropped band.  The original listing computed
      # the crop but then scanned the uncropped image, so the crop was unused.
      col_hist = cal_hist(img)
      find_num = Find_num_region(img, col_hist)

      img_number = []
      for i in range(count):
          print(i)
          img_number.append(find_num.get_num_region())
      return img_number

 

1.4 将图片的size设置为(28,28),对图片进行缩放、旋转(仿射变换),然后增加随机噪声。

1.5 将数据保存为npy格式,共10000组(250轮×10个数字×4种膨胀),每个数字满足“粗细、旋转角度、缩放比例、噪声分布”的多样化。数据集制作完毕。(数据量大的话,建议使用tfrecord格式)

 
  # Build the data set: `rounds` random draws for every digit and every
  # dilation variant, i.e. 250 * 10 * 4 = 10000 samples with the values below.
  rounds = 250               # random angle/scale/noise draws per digit and variant
  n_digits = 10              # the digits "0" to "9"
  n_variants = len(img_seg)  # original image plus its dilated versions
  n_samples = rounds * n_digits * n_variants

  img_arr = np.zeros((n_samples, 28, 28))
  label_arr = np.zeros((n_samples, n_digits))  # one-hot labels
  for num in range(rounds):
      for i in range(n_digits):
          for j in range(n_variants):
              angle_ = np.random.uniform(-20, 20)
              scale_ = np.random.uniform(0.9, 1.2)
              img_c = change_(img_seg[j][i], angle_, scale_)
              noise_num = np.random.randint(8, 38)  # number of noise pixels
              img_c = add_noise(img_c, noise_num)
              # Same layout as 40*num + 4*i + j, derived from the loop bounds.
              idx = (num * n_digits + i) * n_variants + j
              img_arr[idx] = img_c
              label_arr[idx][i] = 1
  # Both arrays are already ndarrays, so they are saved without another copy.
  np.save("img.npy", img_arr)
  np.save("label.npy", label_arr)

  15.  

编写的函数如下:

 
  def change_(img, angle_, scale_, length=28):
      """Place ``img`` on a ``length`` x ``length`` canvas, then rotate and scale it.

      The image is not stretched: it is shifted so that it sits in the
      middle of the canvas, and the uncovered area is filled with zeros.
      The canvas is then rotated by ``angle_`` and scaled by ``scale_``
      around its center.

      img    -- 2-D image array
      angle_ -- rotation angle passed to cv2.getRotationMatrix2D
      scale_ -- scale factor passed to cv2.getRotationMatrix2D
      length -- side of the square output canvas (default 28)
      """
      h, w = img.shape
      # Pure translation that centers the image.  Floor division keeps the
      # offset a whole number of pixels regardless of the division semantics
      # of the interpreter (plain "/" yields x.5 offsets under true division).
      shift = np.float32([[1, 0, (length - w) // 2],
                          [0, 1, (length - h) // 2]])
      canvas = cv2.warpAffine(img, shift, (length, length))
      center = (length / 2.0, length / 2.0)
      rotation = cv2.getRotationMatrix2D(center, angle_, scale_)
      return cv2.warpAffine(canvas, rotation, (length, length))

  def add_noise(img, amout):
      """Flip ``amout`` randomly chosen pixels of ``img`` and return ``img``.

      A pixel whose value is 0 becomes 255; any other value becomes 0.  The
      same pixel may be picked more than once.  The image is modified in
      place and the same array object is returned.

      The pixel positions are drawn over the actual size of ``img`` instead
      of a fixed 28 x 28 grid, so images of any size are handled.
      """
      rows, cols = img.shape[:2]
      if rows == 0 or cols == 0:
          # No pixel to flip in an empty image.
          return img
      for _ in range(amout):
          # One uniform draw selects a flat pixel index, which is then split
          # into row and column.
          flat = int(np.random.rand() * rows * cols)
          row, col = divmod(flat, cols)
          img[row, col] = 255 if img[row, col] == 0 else 0
      return img

 

2 cnn训练

2.1  使用网络上的一个常见数字识别网络结构:2层卷积,2层全连接。

 
  1. #encoding:utf-8

  2. import tensorflow as tf

  3. import numpy as np

  def weight_variable(shape):
      """Return a new tf.Variable of ``shape`` drawn from a truncated normal (stddev 0.1)."""
      return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

  def bias_variable(shape):
      """Return a new tf.Variable of ``shape`` with every element set to 0.1."""
      return tf.Variable(tf.constant(0.1, shape=shape))

  def conv2d(x, W):
      """Convolve ``x`` with filter ``W`` using stride 1 and 'SAME' padding."""
      unit_strides = [1, 1, 1, 1]
      return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

  def pool(x):
      """Apply 2x2 max pooling with stride 2 and 'SAME' padding to ``x``."""
      window = [1, 2, 2, 1]
      step = [1, 2, 2, 1]
      return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

  def inference(x_, keep_prob):
      """Build the digit classifier graph and return its softmax output.

      Layout: two 5x5 convolution + ReLU + 2x2 max-pool stages (32 and 64
      filters), a 1024-unit fully connected ReLU layer, dropout with keep
      probability ``keep_prob``, and a 10-way softmax layer.

      x_        -- input tensor with one channel; the first fully connected
                   layer expects 7*7*64 features after the two pooling stages
      keep_prob -- dropout keep probability (tensor or placeholder)
      """
      # Convolution stage 1: 1 input channel -> 32 feature maps.
      with tf.variable_scope("layer_conv1"):
          kernel1 = weight_variable([5, 5, 1, 32])
          offset1 = bias_variable([32])
          pre_act1 = conv2d(x_, kernel1) + offset1
          stage1 = pool(tf.nn.relu(pre_act1))

      # Convolution stage 2: 32 -> 64 feature maps.
      with tf.variable_scope("layer_conv2"):
          kernel2 = weight_variable([5, 5, 32, 64])
          offset2 = bias_variable([64])
          pre_act2 = conv2d(stage1, kernel2) + offset2
          stage2 = pool(tf.nn.relu(pre_act2))

      # Fully connected layer on the flattened feature maps.
      with tf.variable_scope("layer_fc3"):
          flat_size = 7 * 7 * 64
          weights3 = weight_variable([flat_size, 1024])
          offset3 = bias_variable([1024])
          flattened = tf.reshape(stage2, [-1, flat_size])
          hidden = tf.nn.relu(tf.matmul(flattened, weights3) + offset3)

      with tf.variable_scope("dropout4"):
          dropped = tf.nn.dropout(hidden, keep_prob)

      # Output layer: 10 class probabilities.
      with tf.variable_scope("layer_fc5"):
          weights5 = weight_variable([1024, 10])
          offset5 = bias_variable([10])
          scores = tf.matmul(dropped, weights5) + offset5
          predict_ = tf.nn.softmax(scores)
      return predict_

  def train(x_train, y_train, x_test, y_test):
      """Train the network for 500 steps and save it to ./Model/model.ckpt.

      x_train, x_test -- image arrays shaped (N, 28, 28) or (N, 28, 28, 1)
      y_train, y_test -- one-hot label arrays shaped (N, 10)

      Every 50 steps the accuracy on the test set and on the current training
      batch is printed.  Raises ValueError when the training set is empty.
      """
      # Local import: the header of this listing only imports tensorflow and numpy.
      import os

      batch_size = 230  # samples per training step
      all_size = y_train.shape[0]  # number of training samples
      if all_size == 0:
          raise ValueError("training set is empty")

      # The input placeholder is 4-D with one channel; add the channel axis so
      # that (N, 28, 28) arrays can be fed as well.
      x_train = np.reshape(x_train, (-1, 28, 28, 1))
      x_test = np.reshape(x_test, (-1, 28, 28, 1))
      y_train = np.asarray(y_train)

      x_ = tf.placeholder("float", shape=[None, 28, 28, 1], name='x_input')
      y_ = tf.placeholder("float", shape=[None, 10], name='y_input')
      keep_prob = tf.placeholder("float")
      predict_ = inference(x_, keep_prob)

      # Clip the probabilities away from zero: log(0) would turn the loss, and
      # through it every weight, into NaN.
      clipped = tf.clip_by_value(predict_, 1e-10, 1.0)
      cross_entropy = -tf.reduce_sum(y_ * tf.log(clipped))
      train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

      # Accuracy: share of samples whose arg-max prediction matches the label.
      correct_prediction = tf.equal(tf.argmax(predict_, 1), tf.argmax(y_, 1))
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

      saver = tf.train.Saver()
      # Make sure the checkpoint directory exists before saving into it.
      if not os.path.isdir("./Model"):
          os.makedirs("./Model")

      with tf.Session() as sess:
          # NOTE(review): later TF 1.x releases prefer
          # tf.global_variables_initializer(); kept as written here.
          sess.run(tf.initialize_all_variables())
          for i in range(500):
              # Take batch_size consecutive samples, wrapping around the end of
              # the training set.  Modular indexing over all_size uses every
              # sample (the original all_size-1 arithmetic skipped the last).
              batch_idx = (i * batch_size + np.arange(batch_size)) % all_size
              in_x = x_train[batch_idx]
              in_y = y_train[batch_idx]
              sess.run(train_step, feed_dict={x_: in_x, y_: in_y, keep_prob: 0.5})
              if i % 50 == 0:
                  # Dropout is disabled (keep_prob 1.0) while measuring accuracy.
                  test_acc = sess.run(
                      accuracy, feed_dict={x_: x_test, y_: y_test, keep_prob: 1.0})
                  batch_acc = sess.run(
                      accuracy, feed_dict={x_: in_x, y_: in_y, keep_prob: 1.0})
                  print("第 %d 次迭代:" % i)
                  print("test集精度: %s" % test_acc)
                  print("train集精度: %s" % batch_acc)
          saver.save(sess, "./Model/model.ckpt")

  def main():
      """Load the saved data set, split it 8000/2000 and train the network."""
      x_input = np.load("img.npy")    # images
      y_input = np.load("label.npy")  # one-hot labels
      # Scale pixel values from the 0..255 range down to 0..1.
      x_input = x_input / 255.0
      # Add the channel axis: the input placeholder in train() is declared
      # as [None, 28, 28, 1], so (N, 28, 28) arrays cannot be fed as they are.
      x_input = x_input.reshape(-1, 28, 28, 1)

      x_train, y_train = x_input[0:8000], y_input[0:8000, :]
      x_test, y_test = x_input[8000:10000], y_input[8000:10000, :]
      train(x_train, y_train, x_test, y_test)


  if __name__ == "__main__":
      main()

2.2 模型很快收敛,精度100%。

你可能感兴趣的:(文本检测)