从这节开始,就正式进入GCN项目的代码环节。我们先使用PaddleOCR识别出火车票上的文字信息,并提取文字所在位置,整理后另存为一个csv文件。
项目的目的是要做信息抽取,所以对训练和测试数据,需要手动打上标签,供算法学习。
1、新建文件
# process/ocr.py from paddleocr import paddleocr, PaddleOCR import pandas as pd from glob import glob import os import cv2 from tqdm import tqdm import logging # 屏蔽调试错误 paddleocr.logging.disable(logging.DEBUG)
2、定义OCR识别类
主要功能是识别文字信息并保存为csv文件;如果需要导出带识别框标记的图片,可以传入marked_path。
class OCR():
    """Wrapper around PaddleOCR that exports recognized text boxes to CSV.

    Every recognized text line becomes one CSV row holding the top-left and
    bottom-right corner of its box (as integers) plus the recognized text.
    """

    def __init__(self):
        self.ocr = PaddleOCR()

    def scan(self, file_path, output_path, marked_path=None):
        """Run OCR on one image and save the detected boxes as a CSV file.

        Args:
            file_path: Path of the image to recognize.
            output_path: Destination path of the CSV file.
            marked_path: Optional output path. When given, a copy of the
                image with every detected box drawn on it is written there.
        """
        # Text recognition.
        info = self.ocr.ocr(file_path, cls=False)
        # An image without any detected text may come back as [None] (or an
        # empty result) depending on the PaddleOCR release; treat both as
        # "no rows" so that an empty CSV with the header is still written.
        lines = info[0] if info and info[0] is not None else []

        rows = []
        for item in lines:
            # Each box has four corner points; keep only the top-left and
            # bottom-right ones.
            ((x1, y1), _, (x2, y2), _), (text, _) = item
            rows.append([int(x1), int(y1), int(x2), int(y2), text])

        # Build the frame in one go instead of growing it row by row.
        df = pd.DataFrame(rows, columns=['x1', 'y1', 'x2', 'y2', 'text'])
        # Save the recognition result.
        df.to_csv(output_path)

        # Export the marked image only when a target path was supplied.
        if marked_path:
            self.marked(df, file_path, marked_path)

    def marked(self, df, file_path, marked_path):
        """Draw the boxes stored in ``df`` onto the image and save the result.

        Args:
            df: DataFrame whose first four columns are x1, y1, x2, y2; any
                further columns (text, label, ...) are ignored.
            file_path: Path of the source image.
            marked_path: Path the annotated image is written to.

        Raises:
            OSError: If the source image cannot be loaded.
        """
        # Load the image; cv2.imread signals failure by returning None
        # instead of raising.
        img = cv2.imread(file_path)
        if img is None:
            raise OSError('cannot read image: {}'.format(file_path))

        for row in df.values:
            # cv2.rectangle needs plain integer coordinates.
            x1, y1, x2, y2 = (int(v) for v in row[:4])
            cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 0, 255), thickness=4)
        cv2.imwrite(marked_path, img)
3、识别测试和训练图片
if __name__ == '__main__':
    # Recognize every image of both dataset splits. Each image yields one CSV
    # file and one copy of the image with the detected boxes drawn on it.
    ocr = OCR()
    for split in ('train', 'test'):
        csv_dir = '../output/' + split + '/csv/'
        marked_dir = '../output/' + split + '/imgs_marked/'
        # The writers do not create missing directories themselves, so make
        # sure the output folders exist before the first file is saved.
        os.makedirs(csv_dir, exist_ok=True)
        os.makedirs(marked_dir, exist_ok=True)

        for file_path in tqdm(glob('../input/imgs/' + split + '/' + '*.*')):
            file_name = os.path.basename(file_path)
            # The CSV keeps the full image name (extension included) and
            # appends '.csv' to it.
            output_path = csv_dir + file_name + '.csv'
            marked_path = marked_dir + file_name
            ocr.scan(file_path, output_path, marked_path)
4、手动打标签
a) 标签示例
,label ,ticket_num ,starting_station ,destination_station ,train_num ,other ,date ,seat_number ,ticket_price ,ticket_grade ,name
b) 标记后的csv文件示例
,x1,y1,x2,y2,text,label 0,814,41,1086,107,=检票:2,other 1,109,54,332,98,F049081,ticket_num 2,134,119,423,185,德州东站,starting_station 3,758,114,1053,184,温岭站,destination_station 4,530,128,660,183,G55,train_num 5,158,190,395,233,Dezhoudong,other 6,826,183,982,235,Wenling,other 7,94,247,634,289,2017年10月21日09:26开,date 8,741,244,962,288,06车08C号,seat_number 9,485,302,583,353,网折,other 10,822,300,960,350,二等座,ticket_grade 11,97,310,308,351,¥541.5元,ticket_price 12,98,369,412,408,限乘当日当次车,other 13,101,486,736,528,1329271981****0518崔厚良,name 14,223,559,751,590,买票请到12306发货请到95306,other 15,289,607,681,641,中国铁路祝您旅途愉快,other 16,105,670,749,712,66809300121022F049081德州东售,other
更多内容:请查阅「陈华编程官网」