


项目来源:飞桨学习赛:中文场景文字识别 - 飞桨AI Studio




















In [ ]

!cd ~/work && git clone -b develop

In [ ]

# Each `!` line runs in its own subshell, so the `cd` has to be chained with
# the commands that depend on it instead of living on a separate line.
# NOTE(review): the script name after `python` was missing; `setup.py` is the
# presumed target of `install` - confirm against the cloned repository.
!cd ~/work/PaddleOCR && pip install -r requirements.txt && python setup.py install



In [ ]

!cd ~/data/data62842/ && unzip
!cd ~/data/data62843/ && unzip

In [ ]

!cd ~/data/data62842/ && mv train_images ../ && mv train_label.csv ../
!cd ~/data/data62843/ && mv test_images ../ 

In [5]

%cd data/data62842
# The previous cell moved train_label.csv up into ~/data, so read it from there.
!head -n 10 ~/data/train_label.csv


  • 首先,考虑使用轻量模型会有一定精度损失,采用经典网络ResNet34。
  • 其次,为了进一步增强识别效果及模型泛化性,参考其他项目使用text_renderer进行数据增广。
  • 最后,使用text_renderer进行数据增广,修改text_renderer/configs/default.yaml配置,以下为更改后的模版,主要将三项做修改,分别是font_color的enable设为True,img_bg的enable设为False,seamless_clone的enable设为True。


# Small font_size will make text looks like blured/prydown
  min: 14
  max: 23

# choose Text color range
# color boundary is in R,G,B format
  enable: True
    fraction: 0.5
    l_boundary: [0,0,150]
    h_boundary: [60,60,255]
    fraction: 0.5
    l_boundary: [139,70,19]
    h_boundary: [160,82,43]

# By default, text is drawed by Pillow with (
# If `random_space` is enabled, some text will be drawed char by char with a random space
  enable: false
  fraction: 0.3
  min: -0.1 # -0.1 will make chars very close or even overlapped
  max: 0.1

# Do remap with sin()
# Currently this process is very slow!
  enable: false
  fraction: 0.3
  period: 360  # degree, sin 函数的周期
  min: 1 # sin 函数的幅值范围
  max: 5

# random crop text height
  enable: false
  fraction: 0.5

  # top and bottom will applied equally
    min: 5
    max: 10 # in pixel, this value should small than img_height
    min: 5
    max: 10 # in pixel, this value should small than img_height

# Use image in bg_dir as background for text
  enable: false
  fraction: 0.5

# Not work when random_space applied
  enable: false
  fraction: 0.5

  # lighter than word color
    enable: true
    fraction: 0.5

  # darker than word color
    enable: true
    fraction: 0.5

# Use opencv seamlessClone() to draw text on background
# For some background image, this will make text image looks more real
  enable: true
  fraction: 0.5

  max_x: 25
  max_y: 25
  max_z: 3

  enable: true
  fraction: 0.03

# If an image is applied blur, it will not be applied prydown
  enable: true
  fraction: 0.03
  max_scale: 1.5 # Image will first resize to 1.5x, and than resize to 1x

  enable: true
  fraction: 0.3

    enable: true
    fraction: 0.25

    enable: true
    fraction: 0.25

    enable: true
    fraction: 0.25

    enable: true
    fraction: 0.25

  enable: false
  fraction: 0.05

    enable: true
    fraction: 0.2

    enable: false
    fraction: 0.2

    enable: false
    fraction: 0.3

    enable: false
    fraction: 0.3

  enable: false
    fraction: 0.5
    l_boundary: [0,0,0]
    h_boundary: [64,64,64]
    fraction: 0.5
    l_boundary: [0,0,150]
    h_boundary: [60,60,255]

# These operates are applied on the final output image,
# so actually it can also be applied in training process as an data augmentation method.

# By default, text is darker than background.
# If `reverse_color` is enabled, some images will have dark background and light text
  enable: false
  fraction: 0.5

  enable: false
  fraction: 0.1

  enable: false
  fraction: 0.1

In [ ]

!cd ~/work && git clone
!cd ~/work/text_renderer && pip install -r requirements.txt
  • CRNN网络主要考虑图片高度。以4为倍数,通过统计训练集的图像尺寸,训练时图片高度设为48。

In [ ]

import glob
import os
import cv2

def get_aspect_ratio(img_set_dir):
    """Summarise the pixel sizes of every ``*.jpg`` image in a directory.

    Args:
        img_set_dir: Directory that holds the images. A trailing path
            separator is accepted but not required.

    Returns:
        A 5-tuple ``(aspect_ratio, m_width, m_height, width_dict,
        height_dict)``. ``m_width`` and ``m_height`` are the mean width and
        height in pixels, ``aspect_ratio`` is ``m_width / m_height``, and the
        two dicts map each observed width / height to the number of images
        that have it, ordered from the most to the least frequent value.

    Raises:
        ValueError: If the directory contains no readable ``*.jpg`` image.
    """
    from collections import Counter

    width_counts = Counter()
    height_counts = Counter()
    total_width = 0
    total_height = 0
    n_images = 0

    # os.path.join makes the pattern valid with or without a trailing slash.
    for path in glob.glob(os.path.join(img_set_dir, '*.jpg')):
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals an unreadable file with None rather than an
            # exception; skip it so one corrupt image does not abort the scan.
            continue
        height, width = int(img.shape[0]), int(img.shape[1])
        width_counts[width] += 1
        height_counts[height] += 1
        total_width += width
        total_height += height
        n_images += 1

    if n_images == 0:
        raise ValueError(
            "no readable *.jpg images found in {!r}".format(img_set_dir))

    m_width = total_width / n_images
    m_height = total_height / n_images
    aspect_ratio = m_width / m_height
    # most_common() orders by descending count and keeps first-seen order
    # among ties, matching a stable sort on the count.
    width_dict = dict(width_counts.most_common())
    height_dict = dict(height_counts.most_common())
    return aspect_ratio, m_width, m_height, width_dict, height_dict
# Run the size statistics over the training images, then print the aspect
# ratio, the mean width/height and the width/height frequency tables.
aspect_ratio,m_width,m_height,width_dict,height_dict = get_aspect_ratio("/home/aistudio/data/train_images/")
print("aspect ratio is: {}, mean width is: {}, mean height is: {}".format(aspect_ratio,m_width,m_height))
print("Width dict:{}".format(width_dict))
print("Height dict:{}".format(height_dict))
import pandas as pd

def Q2B(s):
    """Convert a single full-width character to its half-width form.

    Args:
        s: A one-character string.

    Returns:
        The half-width (printable ASCII) counterpart when ``s`` is a
        full-width variant or the ideographic space; otherwise ``s`` itself.
    """
    inside_code = ord(s)
    if inside_code == 0x3000:
        # The ideographic (full-width) space maps to the ASCII space.
        inside_code = 0x0020
    else:
        # Full-width ASCII variants (U+FF01..U+FF5E) sit 0xFEE0 above ASCII.
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:
        # Not a printable ASCII character after shifting: keep the original.
        return s
    return chr(inside_code)

def stringQ2B(s):
    """Return ``s`` with each character passed through ``Q2B``."""
    converted = (Q2B(ch) for ch in s)
    return "".join(converted)

def is_chinese(s):
    """Tell whether every character of ``s`` lies in U+4E00..U+9FA5.

    An empty ``s`` yields True.
    """
    return all(u'\u4e00' <= ch <= u'\u9fa5' for ch in s)

def is_number(s):
    """Tell whether every character of ``s`` is an ASCII digit ``0``-``9``.

    An empty ``s`` yields True.
    """
    return all(u'\u0030' <= ch <= u'\u0039' for ch in s)

def is_alphabet(s):
    """Tell whether every character of ``s`` is a lowercase letter ``a``-``z``.

    Upper-case letters are rejected. An empty ``s`` yields True.
    """
    return all(u'\u0061' <= ch <= u'\u007a' for ch in s)

def del_other(s):
    """Return ``s`` keeping only Chinese characters, digits and lowercase letters."""
    kept = [ch for ch in s
            if is_chinese(ch) or is_number(ch) or is_alphabet(ch)]
    return "".join(kept)

# Load the raw competition labels (the CSV is read as GBK).
df = pd.read_csv("/home/aistudio/data/train_label.csv", encoding="gbk")
# NOTE(review): "value" is the label column the original code read; "name" is
# presumed to be the image-file column - confirm against the CSV header.
name, value = list(df["name"]), list(df["value"])
for i, label in enumerate(value):
    # Full-width -> half-width
    label = stringQ2B(label)
    # Upper case -> lower case
    label = "".join([c.lower() for c in label])
    # Drop everything that is not a Chinese character, digit or lowercase letter
    label = del_other(label)
    value[i] = label

# Drop the rows whose label ended up empty
data = [row for row in zip(name, value) if row[1] != ""]
# Save the cleaned labels; UTF-8 is explicit because a later cell appends to
# this same file as UTF-8.
with open("/home/aistudio/data/train_label.txt", "w", encoding="utf-8") as f:
    for line in data:
        f.write(line[0] + "\t" + line[1] + "\n")

# Record the longest label in the training set
label_max_len = 0
with open("/home/aistudio/data/train_label.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Only the label is needed; `_` avoids clobbering the `name` list.
        _, label = line.strip().split("\t")
        if len(label) > label_max_len:
            label_max_len = len(label)

print("label max len: ", label_max_len)
def create_label_list(train_list,
                      label_list_path="/home/aistudio/data/label_list.txt",
                      chars_path="/home/aistudio/work/text_renderer/data/chars/ch.txt"):
    """Collect the distinct label characters and write the character files.

    Args:
        train_list: Path of a UTF-8 text file with one
            ``<image name>\\t<label>`` pair per line.
        label_list_path: Output file that receives one ``<char>\\t<index>``
            line per character, in sorted order.
        chars_path: Output file that receives one character per line; it is
            the character set handed to the data-augmentation step.

    Returns:
        The set of distinct characters found in the labels.

    Raises:
        ValueError: If a non-blank line is not exactly two tab-separated fields.
    """
    classSet = set()
    with open(train_list, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            img_name, label = line.split("\t")
            classSet.update(label)
    # NOTE: no extra "blank" class is appended here; only the characters that
    # actually occur in the labels are listed.
    classList = sorted(classSet)
    with open(label_list_path, "w", encoding="utf-8") as f:
        f.writelines("{}\t{}\n".format(c, idx) for idx, c in enumerate(classList))
    # Character set for the data-augmentation step
    with open(chars_path, "w", encoding="utf-8") as f:
        f.writelines("{}\n".format(c) for c in classList)
    return classSet

# Build the character set from the cleaned training labels and report how many
# distinct characters it contains.
classSet = create_label_list("/home/aistudio/data/train_label.txt")
print("classify num: ", len(classSet))
aspect ratio is: 3.451128333333333, mean width is: 165.65416, mean height is: 48.0
Height dict:{48: 50000}
label max len:  77
classify num:  3096
  • 生成字符长度为1,2,3,4,5的数据集各2000张,共10000张。

  • 部分生成图片示例如:


In [ ]

# NOTE(review): the script name after `python` was missing; `main.py` is the
# presumed text_renderer entry point - confirm against the cloned repository.
# One run per text length (1-5 characters), 2000 images each; the image width
# grows by 32 px per character while the height stays at 48 px.
!cd ~/work/text_renderer && python main.py --length 1 --img_width 32 --img_height 48 --chars_file "./data/chars/ch.txt" --corpus_mode 'random' --num_img 2000
!cd ~/work/text_renderer && python main.py --length 2 --img_width 64 --img_height 48 --chars_file "./data/chars/ch.txt" --corpus_mode 'random' --num_img 2000
!cd ~/work/text_renderer && python main.py --length 3 --img_width 96 --img_height 48 --chars_file "./data/chars/ch.txt" --corpus_mode 'random' --num_img 2000
!cd ~/work/text_renderer && python main.py --length 4 --img_width 128 --img_height 48 --chars_file "./data/chars/ch.txt" --corpus_mode 'random' --num_img 2000
!cd ~/work/text_renderer && python main.py --length 5 --img_width 160 --img_height 48 --chars_file "./data/chars/ch.txt" --corpus_mode 'random' --num_img 2000
Total fonts num: 1
Background num: 1
Generate text images in ./output/default
2000/2000 100%
Finish generate data: 5.758 s
Total fonts num: 1
Background num: 1
Generate more text images in ./output/default. Start index 2000
2000/2000 100%
Finish generate data: 8.900 s
Total fonts num: 1
Background num: 1
Generate more text images in ./output/default. Start index 4000
2000/2000 100%
Finish generate data: 11.872 s
Total fonts num: 1
Background num: 1
Generate more text images in ./output/default. Start index 6000
2000/2000 100%
Finish generate data: 16.067 s
Total fonts num: 1
Background num: 1
Generate more text images in ./output/default. Start index 8000
2000/2000 100%
Finish generate data: 19.593 s
  • 将生成的数据集与原数据集合并

In [ ]

!cp ~/work/text_renderer/output/default/*.jpg ~/data/train_images

In [ ]

import os

# Append the labels of the generated images to the training label file.
# Absolute paths are used because an earlier cell changes the working
# directory with %cd, which would break cwd-relative paths.
with open('/home/aistudio/work/text_renderer/output/default/tmp_labels.txt', 'r', encoding='utf-8') as src_label, \
        open('/home/aistudio/data/train_label.txt', 'a', encoding='utf-8') as dst_label:
    for line in src_label:
        line = line.rstrip('\r\n')
        if not line:
            continue
        # Split on the first space only, so text that itself contains a space
        # is kept intact.
        img, text = line.split(' ', 1)
        # NOTE(review): assumes each source line is "<image stem> <text>" and
        # that the generated files copied into train_images are named
        # "<image stem>.jpg" - confirm against the generator output.
        dst_label.write('{}.jpg\t{}\n'.format(img, text))


  • 加载CRNN预训练模型
  • 改变默认输入图片尺寸,变为height 48、width 256
  • 优化学习率策略,通过cosine_decay和warmup策略加快模型收敛


  • 本项目模型采用文字识别经典CRNN模型(CNN+RNN+CTC),其中部分模型代码经过PaddleOCR源码改编,完成识别模型的搭建、训练、评估和预测过程。训练时可以手动更改config配置文件(数据训练、加载、评估验证等参数),默认优化器采用Adam,使用CTC损失函数。本项目采用ResNet34作为骨干网络。




In [ ]

!cd ~/work/PaddleOCR && mkdir pretrain_weights && cd pretrain_weights && wget
--2022-09-30 17:16:20--
Resolving (,, 2409:8c04:1001:1002:0:ff:b001:368a
Connecting to (||:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 297577776 (284M) [application/x-tar]
Saving to: ‘ch_ppocr_server_v1.1_rec_pre.tar’

ch_ppocr_server_v1. 100%[===================>] 283.79M  46.8MB/s    in 5.3s    

2022-09-30 17:16:26 (53.2 MB/s) - ‘ch_ppocr_server_v1.1_rec_pre.tar’ saved [297577776/297577776]

In [ ]

!cd ~/work/PaddleOCR/pretrain_weights && tar -xf ch_ppocr_server_v1.1_rec_pre.tar
  • 在PaddleOCR/configs/rec中,添加训练配置文件 my_rec_ch_train.yml和my_rec_ch_reader.yml
  algorithm: CRNN
  use_gpu: true
  epoch_num: 201
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/my_rec_ch
  save_epoch_step: 50
  eval_batch_step: 100000000
  train_batch_size_per_card: 64
  test_batch_size_per_card: 64
  image_shape: [3, 48, 256]
  max_text_length: 80
  character_type: ch
  character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
  loss_type: ctc
  distort: true
  use_space_char: true
  reader_yml: ./configs/rec/my_rec_ch_reader.yml
  pretrain_weights: ./pretrain_weights/ch_ppocr_server_v1.1_rec_pre/best_accuracy

  function: ppocr.modeling.architectures.rec_model,RecModel

  function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
  layers: 34

  function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
  encoder_type: rnn
  fc_decay: 0.00004
    hidden_size: 256
  function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss

  function: ppocr.optimizer,AdamDecay
  base_lr: 0.0001
  l2_decay: 0.00004
  beta1: 0.9
  beta2: 0.999
    function: cosine_decay_warmup
    step_each_epoch: 1000
    total_epoch: 201
    warmup_minibatch: 2000
  num_workers: 1
  img_set_dir: /home/aistudio/data/train_images
  label_file_path: /home/aistudio/data/train_label.txt
  img_set_dir: /home/aistudio/data/train_images
  label_file_path: /home/aistudio/data/train_label.txt



4.1 训练模型

  • 根据修改后的配置文件,输入以下命令就可以开始训练。

In [33]

# NOTE(review): the script name under tools/ was missing; train.py is the
# presumed training entry point - confirm against the cloned repository.
!cd ~/work/PaddleOCR && python tools/train.py -c configs/rec/my_rec_ch_train.yml



In [34]

# NOTE(review): the script name under tools/ was missing; export_model.py is
# presumed because the cell output reports a saved inference model - confirm.
!cd ~/work/PaddleOCR && python tools/export_model.py -c configs/rec/my_rec_ch_train.yml -o Global.checkpoints=./output/my_rec_ch/iter_epoch_27 Global.save_inference_dir=./inference/CRNN_R34
2022-09-30 22:57:53,971-INFO: {'Global': {'debug': False, 'algorithm': 'CRNN', 'use_gpu': True, 'epoch_num': 201, 'log_smooth_window': 20, 'print_batch_step': 10, 'save_model_dir': './output/my_rec_ch', 'save_epoch_step': 3, 'eval_batch_step': 100000000, 'train_batch_size_per_card': 64, 'test_batch_size_per_card': 64, 'image_shape': [3, 48, 256], 'max_text_length': 80, 'character_type': 'ch', 'character_dict_path': './ppocr/utils/ppocr_keys_v1.txt', 'loss_type': 'ctc', 'distort': True, 'use_space_char': True, 'reader_yml': './configs/rec/my_rec_ch_reader.yml', 'pretrain_weights': './pretrain_weights/ch_ppocr_server_v1.1_rec_pre/best_accuracy', 'checkpoints': './output/my_rec_ch/iter_epoch_27', 'save_inference_dir': './inference/CRNN_R34', 'infer_img': None}, 'Architecture': {'function': 'ppocr.modeling.architectures.rec_model,RecModel'}, 'Backbone': {'function': 'ppocr.modeling.backbones.rec_resnet_vd,ResNet', 'layers': 34}, 'Head': {'function': 'ppocr.modeling.heads.rec_ctc_head,CTCPredict', 'encoder_type': 'rnn', 'fc_decay': 4e-05, 'SeqRNN': {'hidden_size': 256}}, 'Loss': {'function': 'ppocr.modeling.losses.rec_ctc_loss,CTCLoss'}, 'Optimizer': {'function': 'ppocr.optimizer,AdamDecay', 'base_lr': 0.0001, 'l2_decay': 4e-05, 'beta1': 0.9, 'beta2': 0.999, 'decay': {'function': 'cosine_decay_warmup', 'step_each_epoch': 1000, 'total_epoch': 201, 'warmup_minibatch': 2000}}, 'TrainReader': {'reader_function': ',SimpleReader', 'num_workers': 8, 'img_set_dir': '/home/aistudio/data/train_images', 'label_file_path': '/home/aistudio/data/train_label.txt'}, 'EvalReader': {'reader_function': ',SimpleReader', 'img_set_dir': '/home/aistudio/data/train_images', 'label_file_path': '/home/aistudio/data/train_label.txt'}, 'TestReader': {'reader_function': ',SimpleReader'}}
W0930 22:57:54.222055 22198] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 11.2, Runtime API Version: 9.0
W0930 22:57:54.227607 22198] device: 0, cuDNN Version: 7.6.
2022-09-30 22:57:57,220-INFO: Finish initing model from ./output/my_rec_ch/iter_epoch_27
inference model saved in ./inference/CRNN_R34/model and ./inference/CRNN_R34/params
save success, output_name_list: ['decoded_out', 'predicts']



import sys 
import os

from paddleocr import PaddleOCR
import numpy as np
import glob
import time
if __name__=='__main__':

    # Settings
    # The trailing '' makes the directory end with a path separator so the
    # glob pattern can be built by plain concatenation.
    img_set_dir = os.path.join('..','data','test_images','')
    # Load model
    use_gpu = True
    use_angle_cls = False
    det = False
    det_model_dir = os.path.join('PaddleOCR','inference','ch_ppocr_mobile_v1.1_det_infer')
    cls_model_dir = os.path.join('PaddleOCR','inference','ch_ppocr_mobile_v1.1_cls_infer')
    rec_model_dir = os.path.join('PaddleOCR','inference','CRNN_R34')
    ocr = PaddleOCR(use_angle_cls=use_angle_cls, lang="ch",use_gpu=use_gpu,use_space_char=False,gpu_mem=4000,
                    det = det,
                    rec_image_shape = '3, 48, 256',
                    rec_algorithm = 'CRNN',
                    max_text_length = 80,
                    det_model_dir = det_model_dir,
                    cls_model_dir = cls_model_dir,
                    rec_model_dir = rec_model_dir)

    # Load data in a folder
    images = glob.glob(img_set_dir+'*.jpg')

    log_file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    # Write the results to a timestamped text file
    with open(log_file_name+'.txt','w',encoding='utf-8') as fid:
        # Run inference on every image in the folder
        for image in images:
            image_name = os.path.basename(image)
            result = ocr.ocr(image, cls=use_angle_cls,det=det)
            if result is None:
                print('Test {} failed.'.format(image_name))
                # Nothing to iterate over for this image; move on.
                continue

            for info in result:
                # The first element of each entry is taken as the predicted text.
                pred_label = info[0]
                # NOTE(review): the original write statement was missing; the
                # "<image name>\t<label>" layout mirrors train_label.txt -
                # confirm the required output format.
                fid.write('{}\t{}\n'.format(image_name, pred_label))
    print("Finished predicting {} images!".format(len(images)))

In [35]

!python ~/work/


In [17]

#查看结果 txt文件生成
%cd /home/aistudio/
!cat 2022-09-30-22-58-06.txt | head -n 2






  • PaddleOCR官方教程
  • PaddleOCR:中文场景文字识别
