What is Chars74K? It is a character-recognition dataset with 62 classes (the digits 0-9 plus the upper- and lower-case English letters); a sample image of the dataset:
Below is the code along with some lessons learned. I won't describe every detail; if you have questions, leave a comment.
```python
import os
import cv2
from random import randint, uniform
import numpy
from skimage.io import imshow
from skimage import transform, filters, exposure
import cPickle
import gzip
import sys
import time

import theano
import theano.tensor as T
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv
import csv

# DIC1 maps the class indices 0-61 back to their characters;
# DIC2 is its inverse, mapping each character to its class index.
DIC1 = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7',
        8: '8', 9: '9', 10: 'A', 11: 'B', 12: 'C', 13: 'D', 14: 'E',
        15: 'F', 16: 'G', 17: 'H', 18: 'I', 19: 'J', 20: 'K', 21: 'L',
        22: 'M', 23: 'N', 24: 'O', 25: 'P', 26: 'Q', 27: 'R', 28: 'S',
        29: 'T', 30: 'U', 31: 'V', 32: 'W', 33: 'X', 34: 'Y', 35: 'Z',
        36: 'a', 37: 'b', 38: 'c', 39: 'd', 40: 'e', 41: 'f', 42: 'g',
        43: 'h', 44: 'i', 45: 'j', 46: 'k', 47: 'l', 48: 'm', 49: 'n',
        50: 'o', 51: 'p', 52: 'q', 53: 'r', 54: 's', 55: 't', 56: 'u',
        57: 'v', 58: 'w', 59: 'x', 60: 'y', 61: 'z'}
DIC2 = dict((v, k) for k, v in DIC1.items())


def load_test_data():
    data = []
    for i in range(6284, 12504):
        address = './testResized/' + str(i) + '.Bmp'
        img = cv2.imread(address, 0)       # read as grayscale
        img = img.reshape(64 * 64) / 255.  # flatten and normalise to [0, 1]
        data.append(img)
    shared_x = theano.shared(numpy.asarray(data, dtype=theano.config.floatX),
                             borrow=True)
    # print len(data)  # 6220
    return shared_x


def load_data_cv(train_path='./trainLabels.csv'):
    print('Read training data ....')
    with open(train_path, 'r') as reader:
        reader.readline()      # skip the header line
        train_data = []        # image paths
        train_label = []       # character labels
        for line in reader.readlines():
            data = line.rstrip().split(',')
            train_data.append('./trainResized/' + data[0] + '.Bmp')
            train_label.append(data[1])
    return train_data, train_label


def load_data():
    address, labels = load_data_cv()
    dic = sorted(set(labels))
    dic1 = dict()  # recovers each character from its 0-61 index
    dic2 = dict()  # maps each character to 0-61, as required by softmax
    for i in range(len(dic)):
        dic1[i] = dic[i]
    for i in dic1.keys():
        dic2[dic1[i]] = i
    for i in range(len(labels)):
        labels[i] = dic2[labels[i]]
    print labels[:20]  # print the first 20 labels as a sanity check
    data = []
    for i in address:
        img = cv2.imread(i, 0)
        img = img.reshape(64 * 64) / 255.
        data.append(img)
    shared_x = theano.shared(numpy.asarray(data, dtype=theano.config.floatX),
                             borrow=True)
    shared_y = theano.shared(numpy.asarray(labels, dtype=theano.config.floatX),
                             borrow=True)
    shared_y = T.cast(shared_y, 'int32')
    return shared_x, shared_y


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """
        # initialize the weights W as a zero matrix of shape (n_in, n_out)
        self.W = theano.shared(
            value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX),
            name='W',
            borrow=True
        )
        # initialize the biases b as a vector of n_out zeros
        self.b = theano.shared(
            value=numpy.zeros((n_out,), dtype=theano.config.floatX),
            name='b',
            borrow=True
        )

        # symbolic expression for the matrix of class-membership
        # probabilities, where column k of W is the separating hyperplane
        # for class k, row j of x is training sample j, and element k of b
        # is the free parameter of hyperplane k
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # prediction: the class whose probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the model
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
            \ell (\theta=\{W,b\}, \mathcal{D})

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that the learning rate
        is less dependent on the batch size.
        """
        # y.shape[0] is (symbolically) the number of examples n in the
        # minibatch; T.arange(y.shape[0]) is the symbolic vector
        # [0, 1, ..., n-1]; T.log(self.p_y_given_x) is a matrix of
        # log-probabilities (LP) with one row per example and one column
        # per class; LP[T.arange(y.shape[0]), y] is the vector
        # [LP[0, y[0]], LP[1, y[1]], ..., LP[n-1, y[n-1]]], and its mean
        # is the mean log-likelihood across the minibatch.
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch; zero-one loss
        over the size of the minibatch.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # T.neq returns a vector of 0s and 1s, where 1 marks a
            # prediction mistake
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        self.input = input
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
            W = theano.shared(value=W_values, name='W', borrow=True)
        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        self.params = [self.W, self.b]


class LeNetConvPoolLayer(object):
    """Pool layer of a convolutional network."""

    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
        """
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)
        """
        assert image_shape[1] == filter_shape[1]
        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" / pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
                   numpy.prod(poolsize))
        # initialize weights with random values
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(
            numpy.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                dtype=theano.config.floatX
            ),
            borrow=True
        )

        # the bias is a 1D tensor -- one bias per output feature map
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve input feature maps with filters
        conv_out = conv.conv2d(
            input=input,
            filters=self.W,
            filter_shape=filter_shape,
            image_shape=image_shape
        )

        # downsample each feature map individually, using maxpooling
        pooled_out = downsample.max_pool_2d(
            input=conv_out,
            ds=poolsize,
            ignore_border=True
        )

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias
        # will thus be broadcast across minibatches and feature map
        # width & height.
        # See http://deeplearning.net/software/theano/library/tensor/basic.html
        self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))

        # store parameters of this layer
        self.params = [self.W, self.b]


def submit_74k(nkerns=[128, 128, 256, 256], batch_size=311):
    rng = numpy.random.RandomState(23455)

    test_set = load_test_data()
    n_test_batches = test_set.get_value(borrow=True).shape[0]
    n_test_batches /= batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of [int] labels

    layer0_input = x.reshape((batch_size, 1, 64, 64))

    # first conv-pool layer:
    # filtering reduces the image size to (64-3+1, 64-3+1) = (62, 62);
    # maxpooling reduces this further to (62/2, 62/2) = (31, 31);
    # the 4D output tensor is thus of shape (batch_size, nkerns[0], 31, 31)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 64, 64),
        filter_shape=(nkerns[0], 1, 3, 3),
        poolsize=(2, 2)
    )

    # second conv-pool layer: (31-3+1)/2 = 14
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 31, 31),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(2, 2),
    )

    # third conv-pool layer: (14-3+1)/2 = 6
    layer2 = LeNetConvPoolLayer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[1], 14, 14),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2),
    )

    # fourth conv-pool layer: (6-3+1)/2 = 2
    layer3 = LeNetConvPoolLayer(
        rng,
        input=layer2.output,
        image_shape=(batch_size, nkerns[2], 6, 6),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(2, 2),
    )

    layer4_input = layer3.output.flatten(2)

    # fully-connected hidden layers
    layer4 = HiddenLayer(
        rng,
        input=layer4_input,
        n_in=nkerns[3] * 2 * 2,
        n_out=2048,
        activation=T.tanh
    )
    layer5 = HiddenLayer(
        rng,
        input=layer4.output,
        n_in=2048,
        n_out=2048,
        activation=T.tanh
    )

    # classify the values of the fully-connected layer
    layer6 = LogisticRegression(input=layer5.output, n_in=2048, n_out=62)

    layers = [layer0, layer1, layer2, layer3, layer4, layer5, layer6]

    # function that returns the model's predictions on one test minibatch
    test_model = theano.function(
        [index],
        layer6.y_pred,
        givens={
            x: test_set[index * batch_size: (index + 1) * batch_size],
        }
    )

    # restore the trained weights
    save_file = open('weightCNN', 'rb')
    for layer in layers:
        w, b = layer.params
        w.set_value(cPickle.load(save_file), borrow=True)
        b.set_value(cPickle.load(save_file), borrow=True)
    save_file.close()

    prediction = []
    for i in range(n_test_batches):
        pred = test_model(i)
        prediction.append(pred)
    return prediction


def evaluate_74k(learning_rate=0.1, n_epochs=10, nkerns=[128, 128, 256, 256],
                 batch_size=200):
    rng = numpy.random.RandomState(23455)

    train_set_x, train_set_y = load_data()
    # no separate validation/test split: validate and test on the training set
    valid_set_x, valid_set_y = train_set_x, train_set_y
    test_set_x, test_set_y = train_set_x, train_set_y

    # compute the number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    start_time = time.clock()

    # reshape the matrix of rasterized images of shape (batch_size, 64 * 64)
    # to a 4D tensor compatible with LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 64, 64))

    # the same stack of layers as in submit_74k:
    # feature map sizes shrink 64 -> 31 -> 14 -> 6 -> 2
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 64, 64),
        filter_shape=(nkerns[0], 1, 3, 3),
        poolsize=(2, 2)
    )
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 31, 31),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(2, 2),
    )
    layer2 = LeNetConvPoolLayer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[1], 14, 14),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2),
    )
    layer3 = LeNetConvPoolLayer(
        rng,
        input=layer2.output,
        image_shape=(batch_size, nkerns[2], 6, 6),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(2, 2),
    )

    layer4_input = layer3.output.flatten(2)

    layer4 = HiddenLayer(
        rng,
        input=layer4_input,
        n_in=nkerns[3] * 2 * 2,
        n_out=2048,
        activation=T.tanh
    )
    layer5 = HiddenLayer(
        rng,
        input=layer4.output,
        n_in=2048,
        n_out=2048,
        activation=T.tanh
    )
    layer6 = LogisticRegression(input=layer5.output, n_in=2048, n_out=62)

    # the cost we minimize during training is the NLL of the model
    cost = layer6.negative_log_likelihood(y)

    # list of all model parameters to be fit by gradient descent
    params = (layer6.params + layer5.params + layer4.params + layer3.params +
              layer2.params + layer1.params + layer0.params)
    layers = [layer0, layer1, layer2, layer3, layer4, layer5, layer6]

    # plain SGD updates
    grads = T.grad(cost, params)
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    # function that computes the mistakes made by the model
    test_model = theano.function(
        [index],
        layer6.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    train_model = theano.function(
        [index],
        layer6.y_pred,  # the returned value does not affect training;
                        # it is handy for printing data you want to inspect
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        },
    )

    if os.path.exists('weightCNN'):
        # weights already trained: load them and report the error rate
        save_file = open('weightCNN', 'rb')
        for layer in layers:
            w, b = layer.params
            w.set_value(cPickle.load(save_file), borrow=True)
            b.set_value(cPickle.load(save_file), borrow=True)
        save_file.close()

        test_losses = [test_model(i) for i in xrange(n_test_batches)]
        this_test_loss = numpy.mean(test_losses)
        print 'the loss rate is', this_test_loss * 100, '%'
    else:
        ###############
        # TRAIN MODEL #
        ###############
        print '... training'
        start_time = time.clock()
        for i in range(n_epochs):
            errors = []
            for j in range(n_train_batches):
                pred = train_model(j)
            pred = train_model(0)
            print pred[:20]
            # estimate the error rate on the first 10 (training) batches
            for p in range(10):
                errors.append(test_model(p))
            print 'the', i, 'epoch error rate is', numpy.mean(errors) * 100, '%'
        end_time = time.clock()
        t = (end_time - start_time) / 60.
        print 'the code running time is', t, 'minutes'

        # save the trained weights: per layer, W then b
        save_file = open('weightCNN', 'wb')
        for layer in layers:
            w, b = layer.params
            cPickle.dump(w.get_value(borrow=True), save_file, -1)
            cPickle.dump(b.get_value(borrow=True), save_file, -1)
        save_file.close()


if __name__ == '__main__':
    if os.path.exists('weightCNN'):
        evaluate_74k(batch_size=200, nkerns=[128, 128, 256, 256])
    else:
        evaluate_74k(batch_size=200, nkerns=[128, 128, 256, 256], n_epochs=50)
    '''
    prediction = submit_74k(nkerns=[128, 128, 256, 256])
    tmp = []
    for i in prediction:
        for j in i:
            tmp.append(DIC1[j])
    # the first test labels, read off by eye, used to estimate accuracy
    result = ['H','E','I','p','T','t','o','d','B','H','N','W','n','H','e','s',
              'A','R','X','S','M','1','A','8','A','S','B','U','n','n','A','W',
              'e','V','c','a','L','h','o','i','W','I','A','m','g','I','u','A',
              'i','V','u','E','i','a','W','U','b','S','n','8','0','P','E','7',
              'R','f','c','p','W','I','o','6','0','J','r','N','d','W','N','H',
              'X','M','Z','P','H','I','Y','h','N','R','A','y','L','A','F','S',
              'T','M','O','E','R','R','S','G','n','o','A','e','T','2','A','P',
              'I','m','S','A','t','S','t','E','N','V','W','m','I','5','N','C']  # 6411 = C
    e = 0.
    for i in range(len(result)):
        if result[i] != tmp[i]:
            e += 1.
    print 'the correct rate is', 100 - e / len(result) * 100., '%'
    print('Saving...')
    with open('test.csv', 'w') as writer:
        writer.write('"ID","Class"\n')
        for i in range(6284, 12504):
            writer.write(str(i) + ',"' + str(tmp[i - 6284]) + '"\n')
    '''
```

The final recognition rate:
My skills are still a bit rough and the code is not very tidy; for the final test I simply eyeballed the output to judge whether the model was overfitting...
Because building models in Theano really is cumbersome, some parts of this were done rather casually, just to get a first look at the results. I will keep optimizing, but probably not with Theano; I plan to switch to Keras. A few words on the deep learning frameworks I have tried. The first one I installed was Caffe, but its example code is so terse that, as a beginner, I could not make sense of it at all. Then I found Theano, whose tutorials are extremely detailed: they let you understand what each network layer actually means, and you can follow the mathematical derivation of every step. So I think Theano is a good entry point for learning deep learning. Its downside is exactly that everything is this explicit: every parameter has to be set by hand, and in particular, once you know how large each layer's output becomes, you have to edit the next layer's shapes manually, which makes experimenting quite laborious. Keras is a wrapper built on top of Theano and is much more convenient to use.
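To illustrate the difference, here is a minimal, untested sketch of roughly the same architecture in Keras (I am assuming the Keras 2 API here; the original code above predates it). The point is that Keras infers every intermediate shape, so none of the 31/14/6/2 bookkeeping from the Theano version is needed:

```python
# A sketch of roughly the same 4-conv + 2-dense network in Keras 2.
# Intermediate feature-map sizes are inferred automatically.
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

model = Sequential([
    Conv2D(128, (3, 3), activation='tanh', input_shape=(64, 64, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='tanh'),
    MaxPooling2D((2, 2)),
    Conv2D(256, (3, 3), activation='tanh'),
    MaxPooling2D((2, 2)),
    Conv2D(256, (3, 3), activation='tanh'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(2048, activation='tanh'),
    Dense(2048, activation='tanh'),
    Dense(62, activation='softmax'),  # 62 character classes
])
# sparse loss matches the integer 0-61 labels used above
model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
```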
The code also covers saving the trained Theano weights; if the model is ever used in a real project, the weights can be extracted and used for recognition directly. On this topic I recommend a very good article: http://blog.csdn.net/qiaofangjie/article/details/18042407 . So far I have only read part of its code; when I have time I plan to implement it myself, which should be well worth doing.
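For reference, here is a minimal sketch of reading those pickled parameters back into plain numpy arrays, e.g. for use outside Theano. It assumes the `weightCNN` file written by `evaluate_74k` above, which dumps W then b for each of the 7 layers in order:

```python
import cPickle
import numpy

def load_weights(path='weightCNN', n_layers=7):
    """Read back the (W, b) pairs dumped by evaluate_74k, in layer order."""
    params = []
    with open(path, 'rb') as f:
        for _ in range(n_layers):
            w = cPickle.load(f)  # weight tensor of one layer
            b = cPickle.load(f)  # bias vector of the same layer
            params.append((numpy.asarray(w), numpy.asarray(b)))
    return params

# usage: inspect the shapes of the restored parameters
for w, b in load_weights():
    print w.shape, b.shape
```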
Finally, a few problems I ran into:

1. The training images must be normalized, i.e. the `/255.` step. Without it the network essentially fails to converge. This bug took me quite a while to find; at first I thought my understanding was lacking and that something was wrong in the model itself.
2. The label values fed into the softmax function must be integers from 0 to N-1, which is why a section of the code uses Python dictionaries to remap the characters.
3. I remember reading an article saying that the ReLU activation recognizes better than tanh and sigmoid, and I wanted to try it. I did find ReLU described on the Theano website, but when I used it I got a "not found" error for reasons I don't understand; it is available in Keras. It is also easy to write by hand, as shown in the first sketch after this list.
4. A dropout layer also helps against overfitting, but unfortunately I could not find one in Theano either; Keras has it too. A hand-rolled version is sketched below as well.
5. Later I wanted to enlarge the dataset (rotations, scalings) to push the recognition rate up, but my code for it seems to have a problem: it is extremely slow, each epoch takes ages, and when I came back from lunch the program had exited after some stray input, which was crushing. I will revisit this later; a possible augmentation sketch follows as well.
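On item 3: in Theano versions that lack `T.nnet.relu`, ReLU can be written with `T.maximum`, which is always available. A minimal sketch, assuming it is passed as the `activation` argument of the `HiddenLayer` class above:

```python
import theano.tensor as T

# ReLU for old Theano versions without T.nnet.relu:
# elementwise max(0, x) built from T.maximum.
def relu(x):
    return T.maximum(0., x)

# usage sketch with the HiddenLayer class defined earlier:
# layer4 = HiddenLayer(rng, input=layer4_input,
#                      n_in=nkerns[3] * 2 * 2, n_out=2048,
#                      activation=relu)
```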
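On item 4: Theano has no ready-made dropout layer, but one can be sketched with its shared random streams. This is an assumption-laden sketch (the `p` and `srng` names are mine, not from the code above), not something I have actually plugged into the network:

```python
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=23455)

def dropout(layer_output, p=0.5):
    """Randomly zero a fraction p of the units (training time only).

    Surviving units are scaled by 1/(1-p) ("inverted dropout") so no
    rescaling is needed at test time.
    """
    mask = srng.binomial(n=1, p=1. - p, size=layer_output.shape,
                         dtype=theano.config.floatX)
    return layer_output * mask / (1. - p)

# usage sketch: drop half of layer4's activations before layer5
# layer5 = HiddenLayer(rng, input=dropout(layer4.output, 0.5),
#                      n_in=2048, n_out=2048, activation=T.tanh)
```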
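On item 5: the skimage modules already imported at the top of the script (`transform` among them) are enough for simple augmentation. A rough sketch of what I was attempting, with angle and scale ranges that are arbitrary choices of mine, producing a randomly rotated and rescaled copy of one 64x64 grayscale image:

```python
from random import uniform
import numpy
from skimage import transform

def augment(img):
    """Return a randomly rotated and rescaled copy of a 64x64 image.

    img: 2D numpy array with values in [0, 1].
    """
    # random rotation in [-15, 15] degrees, filling borders with the mean
    rotated = transform.rotate(img, angle=uniform(-15, 15),
                               mode='constant', cval=img.mean())
    # random rescale in [0.9, 1.1], then crop or zero-pad back to 64x64
    scaled = transform.rescale(rotated, scale=uniform(0.9, 1.1))
    out = numpy.zeros((64, 64), dtype=img.dtype)
    h = min(scaled.shape[0], 64)
    w = min(scaled.shape[1], 64)
    out[:h, :w] = scaled[:h, :w]
    return out
```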