OCR Text Recognition Algorithm Summary

1. Learning Content:

A summary of text recognition algorithms:

1.CTPN

2.CNN+RNN

3.CNN+STN+RNN

First, CNN+RNN text recognition. The CRNN model and its CTC loss are implemented below:

# CNN+RNN
from tensorflow.keras import backend as K
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
import tensorflow as tf
import numpy as np

# Pick the CuDNNGRU import according to the TensorFlow version.
# isNewAPI was not defined in the original snippet; inferring it from the TF version is an assumption.
isNewAPI = tf.__version__.startswith('2')
if isNewAPI == True:
    from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNGRU
else:
    from tensorflow.keras.layers import CuDNNGRU


def FeatureExtractor(x):  # Convolutional network that extracts image features
    for i in range(5):
        for j in range(2):
            x = Conv2D(32 * 2 ** min(i, 3), kernel_size=3, padding='same', activation='relu')(x)
            x = BatchNormalization()(x)
        x = Conv2D(32 * 2 ** min(i, 3), kernel_size=3, strides=2 if i < 2 else (2, 1), padding='same',
                   activation='relu')(x)
        x = BatchNormalization()(x)
    return x


def RNNFeature(x):
    x = Permute((2, 1, 3))(x)  # Swap the height and width dimensions
    # Convert the feature map into a sequence suitable for the RNN
    x = TimeDistributed(Flatten())(x)  # 32 time steps
    # Bidirectional RNN built from GRU cells
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    return x


def CRNN(model_config):  # Build the CRNN model
    # Define the model's input node
    input_tensor = Input((model_config['tagsize'][0], model_config['tagsize'][1], model_config['ch']))
    x = FeatureExtractor(input_tensor)  # Extract image features
    x = RNNFeature(x)  # Convert to RNN features
    # Output layer implemented with a fully connected network
    y_pred = Dense(model_config['outputdim'], activation='softmax')(x)
    # For the CTC loss, the 32 output time steps must be greater than the label length label_len
    print('y_pred:', y_pred.get_shape())  # (batch, 32, 66)
    # Connect the layers into a model
    CRNN_model = Model(inputs=input_tensor, outputs=y_pred, name="CRNN_model")
    return CRNN_model  # Return the CRNN model


# Define the CTC loss function
def ctc_lambda_func(y_true, y_pred, model_config, **kwargs):  # Without **kwargs this fails to build under TF 2.0
    outputstep = y_pred.get_shape()[1]  # Sequence length of the model output
    # Specify the sequence length for every sample in the batch
    input_length = np.asarray([[outputstep]] * model_config['batchsize'], dtype=np.int64)
    label_length = np.asarray([[model_config['label_len']]] * model_config['batchsize'])
    # input_length must be greater than label_length, otherwise CTC reports an invalid sequence
    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
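As a minimal usage sketch (not from the original post), the CRNN can be built from a model_config dictionary and compiled with the CTC loss defined above; the concrete values for 'tagsize', 'ch', 'outputdim', 'batchsize' and 'label_len' below are illustrative assumptions.

# Minimal sketch: build and compile the CRNN with the CTC loss above.
# All config values here are assumptions chosen only for illustration.
model_config = {
    'tagsize': (64, 256),   # input image height and width
    'ch': 1,                # grayscale input
    'outputdim': 66,        # number of character classes + 1 (CTC blank)
    'batchsize': 32,
    'label_len': 16,        # must stay below the 32 output time steps
}

model = CRNN(model_config)
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=lambda y_true, y_pred: ctc_lambda_func(y_true, y_pred, model_config))
model.summary()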

Next, CNN+STN+RNN for recognizing text in complex scenes. The STN (Spatial Transformer Network) layer is implemented below:

 

# Text recognition: STN
import tensorflow as tf
import numpy as np
from tensorflow.keras import backend as K


# STN transformation layer
class STNtransformer(tf.keras.layers.Layer):
    def __init__(self, output_size, **kwargs):  # Initialization
        self.output_size = output_size
        super(STNtransformer, self).__init__(**kwargs)

    def compute_output_shape(self, input_shapes):  # Output shape
        height, width = self.output_size
        num_channels = input_shapes[0][-1]
        return (None, height, width, num_channels)

    def call(self, inputtensors, mask=None):  # Forward pass
        X, transformation = inputtensors
        output = self._transform(X, transformation, self.output_size)
        return output

    def _transform(self, X, affine_transformation, output_size):  # Transformation
        num_channels = X.shape[-1]
        batch_size = K.shape(X)[0]
        # Reshape the transformation parameters into a [2, 3] matrix
        transformations = tf.reshape(affine_transformation, shape=(batch_size, 2, 3))
        # Generate source coordinates for the output size: (batch_size, 3, height * width)
        regular_grids = self._make_regular_grids(batch_size, *output_size)
        print('regular_grids', K.shape(regular_grids))
        # Apply the affine transform to the source coordinates,
        # producing sampling coordinates of shape (batch_size, 2, height * width)
        sampled_grids = K.batch_dot(transformations, regular_grids)
        # Sample the source image at the mapped coordinates to fill the target image
        interpolated_image = self._interpolate(X, sampled_grids, output_size)
        # Set the shape of the target image
        interpolated_image = tf.reshape(
            interpolated_image, tf.stack([batch_size, output_size[0], output_size[1], num_channels]))
        return interpolated_image

    def _make_regular_grids(self, batch_size, height, width):  # Build the source grid for the output size
        # Generate coordinates for the target size (all values lie in [-1, 1])
        x_linspace = tf.linspace(-1., 1., width)
        y_linspace = tf.linspace(-1., 1., height)
        x_coordinates, y_coordinates = tf.meshgrid(x_linspace, y_linspace)
        x_coordinates = K.flatten(x_coordinates)
        y_coordinates = K.flatten(y_coordinates)
        # Stack into a 3-row matrix, the last row filled with ones
        ones = tf.ones_like(x_coordinates)
        grid = tf.concat([x_coordinates, y_coordinates, ones], 0)
        # Replicate the grid for every sample in the batch
        grid = K.flatten(grid)
        grids = K.tile(grid, K.stack([batch_size]))
        return tf.reshape(grids, (batch_size, 3, height * width))

    def _interpolate(self, image, sampled_grids, output_size):  # Bilinear sampling at the given coordinates
        batch_size = K.shape(image)[0]
        height = K.shape(image)[1]
        width = K.shape(image)[2]
        num_channels = K.shape(image)[3]
        # Extract the sampling coordinates
        x = tf.cast(K.flatten(sampled_grids[:, 0:1, :]), dtype='float32')
        y = tf.cast(K.flatten(sampled_grids[:, 1:2, :]), dtype='float32')
        # Map the coordinates back to the source image range, from [-1, 1] to [0, width] and [0, height]
        x = .5 * (x + 1.0) * tf.cast(width, dtype='float32')
        y = .5 * (y + 1.0) * tf.cast(height, dtype='float32')
        # Truncate the coordinates to integers and compute their neighbours
        x0 = K.cast(x, 'int32')
        x1 = x0 + 1
        y0 = K.cast(y, 'int32')
        y1 = y0 + 1
        # Clip out-of-range coordinates
        max_x = int(K.int_shape(image)[2] - 1)
        max_y = int(K.int_shape(image)[1] - 1)
        x0 = K.clip(x0, 0, max_x)
        x1 = K.clip(x1, 0, max_x)
        y0 = K.clip(y0, 0, max_y)
        y1 = K.clip(y1, 0, max_y)
        # Handle batching
        pixels_batch = K.arange(0, batch_size) * (height * width)
        pixels_batch = K.expand_dims(pixels_batch, axis=-1)
        flat_output_size = output_size[0] * output_size[1]
        base = K.repeat_elements(pixels_batch, flat_output_size, axis=1)
        base = K.flatten(base)  # Starting index of every image in the batch
        # Compute the indices of the 4 neighbouring pixels in the source image
        base_y0 = base + (y0 * width)
        base_y1 = base + (y1 * width)
        indices_a = base_y0 + x0
        indices_b = base_y1 + x0
        indices_c = base_y0 + x1
        indices_d = base_y1 + x1
        # Flatten the source images so all batch images are concatenated
        flat_image = tf.reshape(image, shape=(-1, num_channels))
        flat_image = tf.cast(flat_image, dtype='float32')
        # Gather the pixel values by index
        pixel_values_a = tf.gather(flat_image, indices_a)
        pixel_values_b = tf.gather(flat_image, indices_b)
        pixel_values_c = tf.gather(flat_image, indices_c)
        pixel_values_d = tf.gather(flat_image, indices_d)
        x0 = tf.cast(x0, 'float32')
        x1 = tf.cast(x1, 'float32')
        y0 = tf.cast(y0, 'float32')
        y1 = tf.cast(y1, 'float32')
        # Compute the bilinear weights of the 4 neighbouring pixels
        area_a = tf.expand_dims(((x1 - x) * (y1 - y)), 1)
        area_b = tf.expand_dims(((x1 - x) * (y - y0)), 1)
        area_c = tf.expand_dims(((x - x0) * (y1 - y)), 1)
        area_d = tf.expand_dims(((x - x0) * (y - y0)), 1)
        # Weighted sum of the pixels by area
        values_a = area_a * pixel_values_a
        values_b = area_b * pixel_values_b
        values_c = area_c * pixel_values_c
        values_d = area_d * pixel_values_d
        return values_a + values_b + values_c + values_d


if __name__ == '__main__':  # Test the STN layer
    import imageio
    import matplotlib.pyplot as plt

    im = imageio.imread(r'./girl.jpg')
    plt.figure(figsize=(12, 9))
    plt.imshow(im)
    plt.show()

    im = im / 255.
    im = im.reshape(1, 800, 600, 3)
    im = im.astype('float32')

    sampling_size = (400, 300)
    dense1 = tf.keras.layers.Dense(6, kernel_initializer='zeros',
                                   bias_initializer=tf.keras.initializers.constant(
                                       [[0.5, 0, 0.1], [0, 0.5, -0.5]]))  # identity would be [[1., 0, 0], [0, 1., 0]]
    locnet = tf.zeros([1, 800 * 600 * 3])
    locnet = dense1(locnet)
    print(locnet)
    x = STNtransformer(sampling_size)([im, locnet])
    plt.imshow((x.numpy()[0] * 255).astype(np.uint8))
    plt.show()
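To connect the pieces, here is a rough sketch of how the STNtransformer layer could sit in front of the CRNN feature pipeline: a small localization network predicts the 6 affine parameters, the STN rectifies the input image, and the rectified image is passed to FeatureExtractor and RNNFeature from the CNN+RNN code above. The wiring and layer sizes are my own assumptions, not the original author's implementation.

# Sketch of a CNN+STN+RNN model (assumed wiring).
# FeatureExtractor and RNNFeature come from the CNN+RNN snippet; STNtransformer from the code above.
def build_stn_crnn(model_config):
    inp = Input((model_config['tagsize'][0], model_config['tagsize'][1], model_config['ch']))

    # Localization network: a small CNN that regresses the 6 affine parameters.
    loc = Conv2D(16, 3, strides=2, padding='same', activation='relu')(inp)
    loc = Conv2D(32, 3, strides=2, padding='same', activation='relu')(loc)
    loc = Flatten()(loc)
    loc = Dense(64, activation='relu')(loc)
    # Bias initialized to the identity transform [1, 0, 0, 0, 1, 0] so training starts from "no warp".
    theta = Dense(6, kernel_initializer='zeros',
                  bias_initializer=tf.keras.initializers.constant([1., 0., 0., 0., 1., 0.]))(loc)

    # Rectify the input image with the STN, then reuse the CRNN feature pipeline.
    rectified = STNtransformer((model_config['tagsize'][0], model_config['tagsize'][1]))([inp, theta])
    x = FeatureExtractor(rectified)
    x = RNNFeature(x)
    y_pred = Dense(model_config['outputdim'], activation='softmax')(x)
    return Model(inputs=inp, outputs=y_pred, name="STN_CRNN")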

 

2. Architecture Diagram Analysis

CNN+RNN architecture diagram analysis

[Figure 1: CNN+RNN architecture diagram]

CNN+STN+RNN architecture diagram analysis

[Figure 2: CNN+STN+RNN architecture diagram]

3. Model Optimization

1. Optimizer comparison experiment (Adam vs. AMSGrad): overall, the AMSGrad optimizer performs better than Adam, whose accuracy on the validation set is noticeably worse.
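In tf.keras, AMSGrad is available as the amsgrad flag of the Adam optimizer, so the two variants compared above can be set up as in this short sketch (the learning rate is an assumed value):

import tensorflow as tf

# Plain Adam vs. AMSGrad (amsgrad=True turns Adam into AMSGrad).
adam_opt = tf.keras.optimizers.Adam(learning_rate=1e-3, amsgrad=False)
amsgrad_opt = tf.keras.optimizers.Adam(learning_rate=1e-3, amsgrad=True)
# e.g. model.compile(optimizer=amsgrad_opt, loss=...)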

2. Placement of batch normalization relative to the activation function: put BN after ReLU and before sigmoid. ReLU has range [0, +∞) and is sensitive to the sign of its input, so placing BN before ReLU tends to lose information; sigmoid only behaves well roughly within [-7, 7], and inputs that are too large or too small saturate it and prevent convergence.
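A small sketch of that ordering (toy shapes, purely illustrative): BN after ReLU, but before a sigmoid.

from tensorflow.keras.layers import Input, Conv2D, Activation, BatchNormalization

inp = Input((32, 32, 3))                 # assumed toy input shape
x = Conv2D(64, 3, padding='same')(inp)
x = Activation('relu')(x)
x = BatchNormalization()(x)              # BN after ReLU: ReLU still sees the raw, sign-carrying values

y = Conv2D(1, 3, padding='same')(x)
y = BatchNormalization()(y)              # BN before sigmoid: keeps its input out of the saturation zone
y = Activation('sigmoid')(y)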

3. Generalized STN model: the STN can apply an affine transformation not only to the input image, but also to the CNN's output feature maps, treating them as images.

4. Once generalized in this way, the STN can be inserted between any two layers of a neural network to transform the feature data and make its representation more accurate; see the sketch below.
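The following sketch illustrates this generalized use by reusing the STNtransformer layer defined earlier on intermediate feature maps rather than on the raw image; the shapes, layer sizes and wiring are illustrative assumptions.

# Sketch: warp intermediate CNN feature maps with the STN instead of the input image.
inp = Input((64, 256, 1))
feat = Conv2D(32, 3, padding='same', activation='relu')(inp)      # some intermediate features

# Localization branch predicts the 6 affine parameters from the features themselves.
loc = Conv2D(16, 3, strides=4, padding='same', activation='relu')(feat)
loc = Flatten()(loc)
loc = Dense(32, activation='relu')(loc)
theta = Dense(6, kernel_initializer='zeros',
              bias_initializer=tf.keras.initializers.constant([1., 0., 0., 0., 1., 0.]))(loc)

# The feature map is treated as an "image": it is warped and then fed to the next layer.
warped_feat = STNtransformer((64, 256))([feat, theta])
next_feat = Conv2D(64, 3, padding='same', activation='relu')(warped_feat)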

 

 
