Jumping straight into FCN would be too abrupt, so this section uses a simple example to review convolutional neural networks. The next section will cover R-CNN, the one after that YOLO or SSD, and then FCN, which keeps the series coherent.
On to this section's content.
We recast captcha recognition as a classification task, so the neural network can do what it does best. This is end-to-end recognition, as opposed to the traditional pipeline of segmenting the characters first, training on them separately, and then recognizing. The task for this section is therefore simple: feed in a pile of captcha images together with their labels and train directly, with no other preprocessing; at inference time the model likewise takes an image and outputs a result. The effectiveness of this approach should be limited, though. Why? Consider captchas built from the 26 uppercase letters and the digits 1-9: there are 35^4, roughly 1.5 million, possible four-character combinations, so even ignoring distortions, a training set that fully covered them would already be very large.
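To put a number on that, here is a quick back-of-the-envelope check in plain Python (not part of the project code):

    # size of the 4-character label space over 35 symbols
    alphabet = 26 + 9            # uppercase letters plus the digits 1-9
    print(alphabet ** 4)         # 1500625 combinations if characters may repeat
    print(35 * 34 * 33 * 32)     # 1256640 if all four characters differ

The second number is the one that matters below: the generator draws characters with random.sample, which never repeats a character within one captcha.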
1. Before You Start
You need to install the captcha library: pip install captcha
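If you want to see the library in action before wiring up the full generator, a minimal sketch (the text and file name are arbitrary examples):

    from captcha.image import ImageCaptcha

    image = ImageCaptcha()           # default canvas is 160x60, matching img_w/img_h used later
    image.write('AB9C', 'AB9C.png')  # render the text and save it as a PNG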
2. Getting Started
This part draws on the MNIST_data handwritten-digit recognition program, as well as some other programs.
2.1 Generating the Training and Test Sets
In actual use the training set is large: I generated about 1.2 million images, while for captchas built from uppercase letters and the digits 1-9 the total number of combinations exceeds 1.5 million.
The test set can be 1,000 randomly generated images, or some other amount.
MY_generate_image.py
from captcha.image import ImageCaptcha
from random import sample
import os
from PIL import Image
import numpy as np
import string
image = ImageCaptcha()
characters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789")
svPath = '.\\train_image'  # switch to '.\\test_image' when generating the test set
def generate_data(digits_num, output, total):
    num = 0
    while num < total:
        # draw digits_num distinct characters (sample never repeats one)
        # and save the image with its text as the file name
        captcha_text = ''.join(sample(characters, digits_num))
        image.write(captcha_text, os.path.join(output, captcha_text + '.png'))
        num += 1

'''
Each character becomes a one-hot vector over the 35-character alphabet:
10000000000000000000000000000000000-->A
01000000000000000000000000000000000-->B
......
00000000000000000000000000000000010-->8
00000000000000000000000000000000001-->9
So:
AB9C can be represented as a single vector:
10000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000100100000000000000000000000000000000
(the four 35-bit blocks encode A, B, 9, C in turn)
'''
# generate labels
def generate_lable(batch_size, imgs):
    num = batch_size
    Y = np.zeros([num, 4, 35])
    for i in range(num):
        for j in range(4):
            # imgs[i][j] is the j-th character of the i-th file name
            Y[i, j, characters.index(imgs[i][j])] = 1
    Y = np.reshape(Y, (num, 4 * 35))
    return Y
generate_data(4, svPath, 1300000)  # draw 1,300,000 four-character captchas; since files are named by their text and random draws can repeat, the number of unique images ends up around 1.2 million, a large share of the possibilities; I am not sure how many training actually needs
imgs = os.listdir(svPath)
leng = len(imgs)
labels = generate_lable(leng, imgs)
np.savetxt(".\\new\\label.txt", labels, fmt='%d')
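As a quick sanity check of the encoding, you can run generate_lable on a single hypothetical file name and confirm the ones land where the docstring says (a sketch, to be run in the same file):

    # encode one made-up file name and decode it back
    demo = generate_lable(1, ['AB9C.png'])
    row = np.reshape(demo[0], (4, 35))
    for pos in range(4):
        idx = int(np.argmax(row[pos]))
        print(pos, characters[idx])  # expects A, B, 9, C in turn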
2.2 Building the Model
The model consists of the basic forward network and the backpropagation part (minimizing the loss function), plus the test code.
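Before reading the code, it helps to track the tensor shapes. Each of the three 2x2 max-pool layers halves the spatial size (SAME padding, so odd sizes round up), which is exactly the bookkeeping the conv_w/conv_h variables in forward_model do. A standalone check:

    import math

    w, h = 160, 60              # input width and height
    for _ in range(3):          # three conv + pool stages
        w, h = math.ceil(w / 2), math.ceil(h / 2)
    print(w, h)                 # 20 8
    print(64 * w * h)           # 10240 inputs to the first fully connected layer
    print(4 * 35)               # 140 output logits: 4 characters x 35 classes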
MY_train_model.py
# 5 layers in total: the first 3 are convolutional, layers 4 and 5 are fully
# connected, and dropout is applied after all 4 hidden layers. The structure is:
# input -> conv -> pool -> dropout -> conv -> pool -> dropout -> conv -> pool
#       -> dropout -> fully connected -> dropout -> fully connected -> output
# -*- coding: utf-8 -*-
import tensorflow as tf
import math
import os
import numpy as np
from PIL import Image, ImageFilter
import string
import sys
import random
characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789"
img_w = 160
img_h = 60
img_char_num = 4
char_tmp_len = len(characters)
batch_size = 64
svPath = './train_image'
imgs = os.listdir(svPath)
random.shuffle(imgs)
length = len(imgs)
batchNum = length // batch_size  # number of full training batches
learn_alpha = 1e-4
def get_x_y(batch_step, batch_size, img_dir=None, img_list=None):
    # read one batch of images and build the matching one-hot labels;
    # img_dir/img_list default to the training set so existing calls are unchanged
    if img_dir is None:
        img_dir = svPath
    if img_list is None:
        img_list = imgs
    X = np.zeros([batch_size, img_h, img_w, 1])
    Y = np.zeros([batch_size, img_char_num, char_tmp_len])
    for j in range(batch_size):  # the original range(1, batch_size) skipped sample 0
        name = img_list[batch_step * batch_size + j]
        img = Image.open(os.path.join(img_dir, name)).convert('L')  # grayscale
        X[j] = np.reshape(np.array(img), [img_h, img_w, 1]) / 255.0
        for k in range(img_char_num):
            # name[k] is the k-th character of the captcha text
            Y[j, k, characters.find(name[k])] = 1
    Y = np.reshape(Y, (batch_size, img_char_num * char_tmp_len))
    return X, Y
def weight_kernel(k_shape, regular):
    wkernel = tf.Variable(tf.truncated_normal(k_shape, stddev=0.1))
    #if regular != None:
    #    tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regular)(wkernel))
    return wkernel

def bias(b_shape):
    return tf.Variable(tf.random_normal(b_shape))

def conv2(X, wkernel):
    # 5x5 convolution, stride 1, SAME padding keeps the spatial size
    return tf.nn.conv2d(X, wkernel, strides=[1, 1, 1, 1], padding='SAME')

def max_pool(conv2D_R):
    # 2x2 max pooling with stride 2 halves width and height
    return tf.nn.max_pool(conv2D_R, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
def forward_model(X, keep_prob, regular):
    # stage 1: conv + max pool + dropout
    kshape1 = [5, 5, 1, 32]
    wkernel1 = weight_kernel(kshape1, regular)
    bkernel1 = bias([32])
    conv2D_R1 = conv2(X, wkernel1)
    R_Add1 = tf.nn.bias_add(conv2D_R1, bkernel1)
    act_R1 = tf.nn.relu(R_Add1)
    max_R1 = max_pool(act_R1)
    drop_R1 = tf.nn.dropout(max_R1, keep_prob)
    conv_w = math.ceil(img_w / 2)
    conv_h = math.ceil(img_h / 2)
    # stage 2: conv + max pool + dropout
    kshape2 = [5, 5, 32, 64]
    wkernel2 = weight_kernel(kshape2, regular)
    bkernel2 = bias([64])
    conv2D_R2 = conv2(drop_R1, wkernel2)
    R_Add2 = tf.nn.bias_add(conv2D_R2, bkernel2)
    act_R2 = tf.nn.relu(R_Add2)
    max_R2 = max_pool(act_R2)
    drop_R2 = tf.nn.dropout(max_R2, keep_prob)
    conv_w = math.ceil(conv_w / 2)
    conv_h = math.ceil(conv_h / 2)
    # stage 3: conv + max pool + dropout
    kshape3 = [5, 5, 64, 64]
    wkernel3 = weight_kernel(kshape3, regular)
    bkernel3 = bias([64])
    conv2D_R3 = conv2(drop_R2, wkernel3)
    R_Add3 = tf.nn.bias_add(conv2D_R3, bkernel3)
    act_R3 = tf.nn.relu(R_Add3)
    max_R3 = max_pool(act_R3)
    drop_R3 = tf.nn.dropout(max_R3, keep_prob)
    conv_w = math.ceil(conv_w / 2)
    conv_h = math.ceil(conv_h / 2)
    # fully connected layer + dropout (flatten 8 x 20 x 64 = 10240 features)
    kshape4 = [64 * conv_w * conv_h, 1024]
    wkernel4 = weight_kernel(kshape4, regular)
    bkernel4 = bias([1024])
    dropout4_flat = tf.reshape(drop_R3, [-1, 64 * conv_w * conv_h])
    mul_R4 = tf.matmul(dropout4_flat, wkernel4)
    R_Add4 = tf.nn.bias_add(mul_R4, bkernel4)
    R_fc1 = tf.nn.relu(R_Add4)
    R_fc1_drop = tf.nn.dropout(R_fc1, keep_prob)
    # output layer: 4 * 35 = 140 logits
    kshape5 = [1024, img_char_num * char_tmp_len]
    wkernel5 = weight_kernel(kshape5, regular)
    bkernel5 = bias([img_char_num * char_tmp_len])
    mul_R5 = tf.matmul(R_fc1_drop, wkernel5)
    R_Add5 = tf.add(mul_R5, bkernel5)
    return R_Add5
def backward_model():
    x = tf.placeholder(tf.float32, [None, img_h, img_w, 1])
    y_ = tf.placeholder(tf.float32, [None, img_char_num * char_tmp_len])
    keep_prob = tf.placeholder(tf.float32)
    y = forward_model(x, keep_prob, 0.)
    # sigmoid (not softmax) cross entropy: the 140 outputs are 4 independent
    # 35-way one-hot groups, so a single softmax over all 140 would be wrong
    sigExp = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=y)
    cross_entropy = tf.reduce_mean(sigExp)
    loss_ = cross_entropy  # + tf.add_n(tf.get_collection('losses'))
    train_step = tf.train.AdamOptimizer(learn_alpha).minimize(loss_)
    predict = tf.reshape(y, [-1, img_char_num, char_tmp_len])
    real = tf.reshape(y_, [-1, img_char_num, char_tmp_len])
    correct_prediction = tf.equal(tf.argmax(predict, 2), tf.argmax(real, 2))
    correct_prediction = tf.cast(correct_prediction, tf.float32)
    # note: this is per-character accuracy, not whole-captcha accuracy
    accuracy = tf.reduce_mean(correct_prediction)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step = 1
        while True:
            batch_x, batch_y = get_x_y(step, batch_size)
            _, loss = sess.run([train_step, loss_], feed_dict={x: batch_x, y_: batch_y, keep_prob: 0.7})
            print('step:%d,loss:%f' % (step, loss))
            if (step + 1) >= batchNum:  # wrap around and reuse the training examples
                step = 0
            if step % 100 == 0:
                # evaluate on a double-size batch; draw a random offset that
                # stays in range, since the eval batch is twice as large
                eval_step = random.randint(0, length // (batch_size * 2) - 1)
                batch_x_test, batch_y_test = get_x_y(eval_step, batch_size * 2)
                acc = sess.run(accuracy, feed_dict={x: batch_x_test, y_: batch_y_test, keep_prob: 1.})
                print('----------------------->step:%d, accuracy:%f' % (step, acc))
                if acc > 0.99:
                    saver.save(sess, "./train_model/train_model.ckpt")
                    break
            step += 1
test_images_path = "./test_images"
test_imgs = os.listdir(test_images_path)
test_length = len(test_imgs)

def test():
    # load the whole test set; batch_step 0 so indexing starts at the first file
    test_images, test_label = get_x_y(0, test_length, test_images_path, test_imgs)
    x = tf.placeholder(tf.float32, [None, img_h, img_w, 1])
    keep_prob = tf.placeholder(tf.float32)
    y = forward_model(x, keep_prob, 0)
    # argmax over the character axis gives an index into `characters`
    predict = tf.argmax(tf.reshape(y, [-1, img_char_num, char_tmp_len]), 2)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, "./train_model/train_model.ckpt")
        pre_list = sess.run(predict, feed_dict={x: test_images, keep_prob: 1.})
        for i in test_label:
            print(i)
        for j in pre_list:
            print(j)
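The raw printout above is just one-hot rows and index arrays. A small sketch (using the globals defined in this file) for decoding both back to readable strings:

    # decode a predicted index row and a one-hot label row back to text
    def decode_indices(idx_row):
        # idx_row: 4 integer indices into `characters`
        return ''.join(characters[k] for k in idx_row)

    def decode_onehot(label_row):
        # label_row: flat vector of length img_char_num * char_tmp_len
        groups = np.reshape(label_row, (img_char_num, char_tmp_len))
        return ''.join(characters[int(np.argmax(g))] for g in groups)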
2.3 Calling the Functions
Just import and call directly.
MY_train.py
#-*- coding:utf-8 -*-
import MY_train_model
if __name__ == '__main__':
    MY_train_model.backward_model()
MY_main.py
#-*- coding:utf-8 -*-
import MY_train_model
if __name__ == '__main__':
    MY_train_model.test()
2.4 In the Anaconda Prompt
Activate the TensorFlow environment:
activate tensorflow-gpu
Run the training script:
python MY_train.py
Once training has saved a checkpoint, run the test script the same way:
python MY_main.py
In the next section, we'll write about R-CNN.
This article is also published on my WeChat official account; subscriptions are welcome.