文字识别算法总结
1.CTPN
2.CNN+RNN
3.CNN+STN+RNN算法汇总
先介绍CNN+RNN文字识别
# CNN+RNN
from tensorflow.keras import backend as K
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
import tensorflow as tf
import numpy as np

# CuDNNGRU lives in a different module depending on the TensorFlow version,
# so try the public location first and fall back to the private one.
try:
    from tensorflow.keras.layers import CuDNNGRU
except ImportError:
    from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNGRU


def FeatureExtractor(x):
    """Extract image features with five convolutional stages.

    Each stage applies two 3x3 ReLU convolutions (each followed by batch
    normalization) and then one strided 3x3 ReLU convolution (also followed
    by batch normalization). The filter count per stage is
    ``32 * 2 ** min(i, 3)``, i.e. 32, 64, 128, 256, 256.

    Args:
        x: Input tensor for ``Conv2D`` (assumed channels-last image batch --
            TODO confirm against the caller's data format).

    Returns:
        The feature tensor produced by the last stage.
    """
    for i in range(5):
        filters = 32 * 2 ** min(i, 3)
        for _ in range(2):
            x = Conv2D(filters, kernel_size=3, padding='same',
                       activation='relu')(x)
            x = BatchNormalization()(x)
        # The first two stages halve both spatial axes; later stages only
        # halve the first spatial axis so the second one keeps its length
        # (presumably the axis that becomes the RNN time axis).
        strides = 2 if i < 2 else (2, 1)
        x = Conv2D(filters, kernel_size=3, strides=strides, padding='same',
                   activation='relu')(x)
        x = BatchNormalization()(x)
    return x


def RNNFeature(x):
    """Turn the convolutional feature map into sequence features.

    Args:
        x: Feature tensor returned by ``FeatureExtractor``.

    Returns:
        The output of two stacked bidirectional ``CuDNNGRU(128)`` layers,
        both with ``return_sequences=True``.
    """
    # Swap the first two non-batch axes so the second spatial axis leads.
    x = Permute((2, 1, 3))(x)
    # Flatten everything behind the leading axis so each step is one vector
    # (the original author notes 32 steps for their configuration).
    x = TimeDistributed(Flatten())(x)
    # Bidirectional RNN built from GRU cells.
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    return x


def CRNN(model_config):
    """Build the CRNN model.

    Args:
        model_config: Mapping that provides ``'tagsize'`` (indexable, the
            first two entries are the input's spatial sizes), ``'ch'``
            (number of input channels) and ``'outputdim'`` (units of the
            softmax output layer).

    Returns:
        A Keras ``Model`` named ``"CRNN_model"`` mapping the input image to
        the per-step softmax predictions.
    """
    # Input node of the model.
    input_tensor = Input((model_config['tagsize'][0],
                          model_config['tagsize'][1],
                          model_config['ch']))
    x = FeatureExtractor(input_tensor)  # image features
    x = RNNFeature(x)  # sequence features
    # Fully connected output layer.
    y_pred = Dense(model_config['outputdim'], activation='softmax')(x)
    # For the CTC loss, the number of output steps must exceed the number of
    # labels per sample (label_len).
    print('y_pred:', y_pred.get_shape())  # author's example: (batch, 32, 66)
    # Wire the layers together into one model.
    CRNN_model = Model(inputs=input_tensor, outputs=y_pred, name="CRNN_model")
    return CRNN_model


def ctc_lambda_func(y_true, y_pred, model_config, **kwargs):
    """Compute the CTC loss for a batch.

    Args:
        y_true: Ground-truth label tensor passed to ``K.ctc_batch_cost``.
        y_pred: Model output; its second dimension is used as the sequence
            length of every sample.
        model_config: Mapping that provides ``'batchsize'`` and
            ``'label_len'``.
        **kwargs: Ignored. Per the original author's note, the function
            fails to compile under TensorFlow 2.0 without it.

    Returns:
        The tensor returned by ``K.ctc_batch_cost``.
    """
    outputstep = y_pred.get_shape()[1]  # sequence length of the predictions
    # Give every sample in the batch its own length entry. The builtin int
    # replaces the np.int alias, which NumPy 1.24 removed.
    input_length = np.asarray([[outputstep]] * model_config['batchsize'],
                              dtype=int)
    label_length = np.asarray(
        [[model_config['label_len']]] * model_config['batchsize'])
    # input_length must be greater than label_length, otherwise CTC reports
    # an invalid input.
    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
介绍CNN+STN+RNN识别复杂场景
# Text recognition: STN
# Spatial transformer layer
class STNtransformer(tf.keras.layers.Layer):
    """Keras layer that resamples an image batch through an affine transform.

    The layer is called with ``[X, transformation]``: ``X`` is a 4-D
    channels-last image batch and ``transformation`` holds six affine
    parameters per sample (reshaped internally to ``(batch, 2, 3)``).
    """

    def __init__(self, output_size, **kwargs):
        """Store the target ``(height, width)`` and initialise the base layer."""
        self.output_size = output_size
        super().__init__(**kwargs)

    def compute_output_shape(self, input_shapes):
        """Return ``(None, height, width, channels)`` for the resampled batch."""
        out_h, out_w = self.output_size
        channels = input_shapes[0][-1]
        return (None, out_h, out_w, channels)

    def call(self, inputtensors, mask=None):
        """Apply the transform; ``inputtensors`` is ``[X, transformation]``."""
        X, transformation = inputtensors
        return self._transform(X, transformation, self.output_size)

    def _transform(self, X, affine_transformation, output_size):
        """Warp ``X`` with the given affine parameters to ``output_size``."""
        channels = X.shape[-1]
        n_samples = K.shape(X)[0]
        # View the parameters as one 2x3 matrix per sample.
        theta = tf.reshape(affine_transformation, shape=(n_samples, 2, 3))
        # Target coordinates, shape (batch, 3, height * width).
        target_grid = self._make_regular_grids(n_samples, *output_size)
        print('regular_grids', K.shape(target_grid))
        # Affine-map the target coordinates to source coordinates,
        # shape (batch, 2, height * width).
        source_grid = K.batch_dot(theta, target_grid)
        # Sample the source image at the mapped coordinates.
        sampled = self._interpolate(X, source_grid, output_size)
        # Restore the image layout of the result.
        result_shape = tf.stack(
            [n_samples, output_size[0], output_size[1], channels])
        return tf.reshape(sampled, result_shape)

    def _make_regular_grids(self, batch_size, height, width):
        """Build homogeneous target coordinates, one copy per sample.

        Coordinates along both axes are spaced evenly in [-1, 1]. The result
        has shape ``(batch_size, 3, height * width)`` with rows x, y and 1.
        """
        xs = tf.linspace(-1., 1., width)
        ys = tf.linspace(-1., 1., height)
        mesh_x, mesh_y = tf.meshgrid(xs, ys)
        flat_x = K.flatten(mesh_x)
        flat_y = K.flatten(mesh_y)
        # Third row of ones makes the coordinates homogeneous.
        homogeneous = tf.concat([flat_x, flat_y, tf.ones_like(flat_x)], 0)
        # Repeat the same grid once per sample in the batch.
        repeated = K.tile(K.flatten(homogeneous), K.stack([batch_size]))
        return tf.reshape(repeated, (batch_size, 3, height * width))

    def _interpolate(self, image, sampled_grids, output_size):
        """Read pixel values at ``sampled_grids`` with four-neighbour weighting.

        Returns a ``(batch * out_h * out_w, channels)`` float32 tensor.
        """
        dyn_shape = K.shape(image)
        batch_size = dyn_shape[0]
        height = dyn_shape[1]
        width = dyn_shape[2]
        num_channels = dyn_shape[3]

        # Mapped coordinates, flattened over the whole batch.
        x = tf.cast(K.flatten(sampled_grids[:, 0:1, :]), dtype='float32')
        y = tf.cast(K.flatten(sampled_grids[:, 1:2, :]), dtype='float32')

        # Rescale from [-1, 1] to [0, width] and [0, height].
        x = .5 * (x + 1.0) * tf.cast(width, dtype='float32')
        y = .5 * (y + 1.0) * tf.cast(height, dtype='float32')

        # Integer corner coordinates surrounding each sample point.
        left = K.cast(x, 'int32')
        right = left + 1
        top = K.cast(y, 'int32')
        bottom = top + 1

        # Clamp corners that fall outside the image (uses the static shape).
        static_shape = K.int_shape(image)
        last_col = int(static_shape[2] - 1)
        last_row = int(static_shape[1] - 1)
        left = K.clip(left, 0, last_col)
        right = K.clip(right, 0, last_col)
        top = K.clip(top, 0, last_row)
        bottom = K.clip(bottom, 0, last_row)

        # Offset of each sample's first pixel in the flattened batch,
        # repeated once per output pixel.
        sample_offsets = K.arange(0, batch_size) * (height * width)
        sample_offsets = K.expand_dims(sample_offsets, axis=-1)
        pixels_per_output = output_size[0] * output_size[1]
        base = K.flatten(
            K.repeat_elements(sample_offsets, pixels_per_output, axis=1))

        # Flat indices of the four corner pixels.
        top_row = base + (top * width)
        bottom_row = base + (bottom * width)
        idx_top_left = top_row + left
        idx_bottom_left = bottom_row + left
        idx_top_right = top_row + right
        idx_bottom_right = bottom_row + right

        # Flatten the whole batch into one (pixels, channels) table.
        pixel_table = tf.cast(
            tf.reshape(image, shape=(-1, num_channels)), dtype='float32')

        # Look up the four corner pixel values.
        px_top_left = tf.gather(pixel_table, idx_top_left)
        px_bottom_left = tf.gather(pixel_table, idx_bottom_left)
        px_top_right = tf.gather(pixel_table, idx_top_right)
        px_bottom_right = tf.gather(pixel_table, idx_bottom_right)

        left_f = tf.cast(left, 'float32')
        right_f = tf.cast(right, 'float32')
        top_f = tf.cast(top, 'float32')
        bottom_f = tf.cast(bottom, 'float32')

        # Weight of each corner: area of the opposite sub-rectangle.
        # NOTE(review): the weights use the clamped corners, so for sample
        # points on or beyond the last row/column they are not the usual
        # bilinear weights -- confirm this is intended.
        w_top_left = tf.expand_dims(((right_f - x) * (bottom_f - y)), 1)
        w_bottom_left = tf.expand_dims(((right_f - x) * (y - top_f)), 1)
        w_top_right = tf.expand_dims(((x - left_f) * (bottom_f - y)), 1)
        w_bottom_right = tf.expand_dims(((x - left_f) * (y - top_f)), 1)

        # Weighted sum of the four corner pixels.
        part_a = w_top_left * px_top_left
        part_b = w_bottom_left * px_bottom_left
        part_c = w_top_right * px_top_right
        part_d = w_bottom_right * px_bottom_right
        return part_a + part_b + part_c + part_d


if __name__ == '__main__':
    # Manual check of the STN layer on a sample picture.
    import imageio
    import matplotlib.pyplot as plt

    picture = imageio.imread(r'./girl.jpg')
    plt.figure(figsize=(12, 9))
    plt.imshow(picture)
    plt.show()

    # Scale to [0, 1] and add a batch axis; the file is expected to hold
    # exactly 800 * 600 * 3 values.
    picture = picture / 255.
    picture = picture.reshape(1, 800, 600, 3)
    picture = picture.astype('float32')

    sampling_size = (400, 300)
    # Zero kernel + constant bias: the dense layer always outputs the bias,
    # i.e. a fixed affine matrix. [[1., 0, 0], [0, 1., 0]] would be the
    # identity transform.
    fixed_affine = tf.keras.initializers.constant(
        [[0.5, 0, 0.1], [0, 0.5, -0.5]])
    dense1 = tf.keras.layers.Dense(
        6, kernel_initializer='zeros', bias_initializer=fixed_affine)

    locnet = dense1(tf.zeros([1, 800 * 600 * 3]))
    print(locnet)

    warped = STNtransformer(sampling_size)([picture, locnet])
    plt.imshow((warped.numpy()[0] * 255).astype(np.uint8))
    plt.show()
CNN+RNN架构图分析
1.优化器Adam对比实验:AMSGrad优化器的综合性能优于Adam优化器(Adam在验证集上的准确率较差)
2.批量归一化(BN)和激活函数的位置关系:BN放在ReLU之后、Sigmoid之前。ReLU:输出值域为[0, +∞),对输入的符号更敏感,BN放在ReLU前面容易丢失数据;Sigmoid:有效输入区间约为[-7, 7],输入过大或过小时函数进入饱和区,无法收敛
3.广义的STN模型:不仅可以对输入的图片进行仿射变换,还可以将CNN的输出特征当做图像进行STN变换
4.将STN广义化后,可以用于神经网络的任意两层之间,对特征数据变换,从而使其表达更准确。