写在前面:常见的目标检测框架,如faster-rcnn,SSD等,一般都提供了对pascal_voc数据集的读取接口。而将目标检测的框架应用到场景文字检测上,往往可以取得不错的效果。那么,接下来,我就介绍一下,将几种常见的场景文字数据集,如ICDAR2013,ICDAR2015,ICDAR2017,coco-text等转换为pascal_voc数据集的方式。
1.ICDAR2013
ICDAR2013包括四个文件夹,分别是:
训练图像集:Challenge2_Training_Task12_Images
训练标注集:Challenge2_Training_Task1_GT
测试图像集:Challenge2_Test_Task12_Images
测试标注集:Challenge2_Test_Task1_GT
标注格式:
xmin, ymin, xmax, ymax, text
注意:下面的转换代码对测试集标注按逗号切分(如下例),而对训练集标注按空格切分,即训练集每行形如:xmin ymin xmax ymax "text"。
举例:38, 43, 920, 215, "Tiredness"
准备工作:新建文件夹VOC2007,目录如下:
VOC2007
-VOC2007/Annotations
-VOC2007/ImageSets
-VOC2007/ImageSets/Main
-VOC2007/ImageSets/Main/test.txt
-VOC2007/ImageSets/Main/trainval.txt
-VOC2007/JPEGImages
转换代码如下,其中转换的结果在VOC2007文件夹下面:
#! /usr/bin/python
#coding:utf-8
import os, sys
import glob
from PIL import Image
# target dir
base_dir = "VOC2007"
target_img_dir = base_dir + "/" + "JPEGImages"
target_ann_dir = base_dir + "/" + "Annotations"
target_set_dir = base_dir + "/" + "ImageSets"
# source train dir
train_img_dir = "Challenge2_Training_Task12_Images"
train_txt_dir = "Challenge2_Training_Task1_GT"
# source test dir
test_img_dir = "Challenge2_Test_Task12_Images"
test_txt_dir = "Challenge2_Test_Task1_GT"
# prefixes prepended to the image names when they are moved to target_img_dir
train_prefix = "ICDAR2013_Train_"
test_prefix = "ICDAR2013_Test_"


def move_images(src_dir, prefix):
    """Move every file of src_dir into target_img_dir, prepending prefix."""
    for name in os.listdir(src_dir):
        os.rename(os.path.join(src_dir, name),
                  os.path.join(target_img_dir, prefix + name))


def gt_path_for(img_name):
    """Return the ground-truth txt path for a renamed image stem.

    Training images map to ``gt_<index>.txt`` and test images to
    ``gt_img_<index>.txt``, where ``<index>`` is the part of the stem after
    the last underscore.  Returns None for stems carrying neither prefix.
    """
    index = img_name.split('_')[-1]
    if img_name.startswith(train_prefix):
        return train_txt_dir + '/gt_' + index + '.txt'
    if img_name.startswith(test_prefix):
        return test_txt_dir + '/gt_img_' + index + '.txt'
    return None


def parse_gt_line(line):
    """Parse one ground-truth line into (xmin, ymin, xmax, ymax).

    Both separators handled by the original script are accepted: spaces
    (training files) and commas (test files).  Returns None for lines with
    fewer than four fields (e.g. blank lines); raises ValueError when a
    coordinate field is not an integer.
    """
    fields = line.replace(',', ' ').split(None, 4)
    if len(fields) < 4:
        return None
    return tuple(int(value) for value in fields[:4])


def read_boxes(gt_path):
    """Return the list of boxes stored in the ground-truth file gt_path."""
    import io  # local import: io.open takes an encoding on Python 2 and 3

    boxes = []
    # utf-8-sig drops a leading BOM if present; undecodable bytes can only
    # occur in the transcription, which is not used, so they are replaced.
    with io.open(gt_path, encoding='utf-8-sig', errors='replace') as gt_file:
        for line in gt_file:
            box = parse_gt_line(line)
            if box is not None:
                boxes.append(box)
    return boxes


def voc_annotation_xml(filename, width, height, boxes):
    """Build the PASCAL VOC annotation XML text for one image.

    boxes is an iterable of (xmin, ymin, xmax, ymax) tuples.
    NOTE(review): the XML tags were missing from the published listing; the
    layout below is the standard VOC2007 one and the class label 'text' is
    an assumption - adjust it to the label your detector expects.
    """
    parts = [
        '<annotation>\n',
        '    <folder>VOC2007</folder>\n',
        '    <filename>' + filename + '</filename>\n',
        '    <size>\n',
        '        <width>' + str(width) + '</width>\n',
        '        <height>' + str(height) + '</height>\n',
        '        <depth>3</depth>\n',
        '    </size>\n',
    ]
    # write the region of text on xml file
    for xmin, ymin, xmax, ymax in boxes:
        parts.extend([
            '    <object>\n',
            '        <name>text</name>\n',
            '        <pose>Unspecified</pose>\n',
            '        <truncated>0</truncated>\n',
            '        <difficult>0</difficult>\n',
            '        <bndbox>\n',
            '            <xmin>' + str(xmin) + '</xmin>\n',
            '            <ymin>' + str(ymin) + '</ymin>\n',
            '            <xmax>' + str(xmax) + '</xmax>\n',
            '            <ymax>' + str(ymax) + '</ymax>\n',
            '        </bndbox>\n',
            '    </object>\n',
        ])
    parts.append('</annotation>\n')
    return ''.join(parts)


def write_annotations():
    """Create one xml file in target_ann_dir per jpg found in target_img_dir."""
    for img_path in sorted(glob.glob(target_img_dir + '/*.jpg')):
        img = os.path.splitext(os.path.basename(img_path))[0]  # e.g. 100
        # open the corresponding txt file
        gt_path = gt_path_for(img)
        if gt_path is None:
            continue
        with Image.open(img_path) as im:
            width, height = im.size
        # write in xml file
        with open(target_ann_dir + '/' + img + '.xml', 'w') as xml_file:
            xml_file.write(voc_annotation_xml(img + '.jpg', width, height,
                                              read_boxes(gt_path)))


def write_image_sets():
    """List the annotated image stems in Main/trainval.txt and Main/test.txt."""
    img_names = sorted(os.path.splitext(os.path.basename(item))[0]
                       for item in glob.glob(target_ann_dir + '/*.xml'))
    with open(target_set_dir + "/Main/trainval.txt", 'w') as train_fd:
        with open(target_set_dir + "/Main/test.txt", 'w') as test_fd:
            for item in img_names:
                if item.startswith(train_prefix):
                    train_fd.write(item + '\n')
                elif item.startswith(test_prefix):
                    test_fd.write(item + '\n')


def main():
    """Run the whole ICDAR2013 -> VOC2007 conversion."""
    # rename and move img to target_img_dir
    move_images(train_img_dir, train_prefix)
    move_images(test_img_dir, test_prefix)
    # create annotations to target_ann_dir
    write_annotations()
    # write info into target_set_dir
    write_image_sets()


if __name__ == "__main__":
    main()
2.ICDAR2015
ICDAR2015包含三个文件夹:
训练图像集:ch4_training_images
训练标注集:ch4_training_localization_transcription_gt
测试图像集:ch4_test_images
其中,ICDAR2015不包含测试标注集,但提供了测试web接口。因此,这里只对训练集进行了转换。
标注格式:x1,y1,x2,y2,x3,y3,x4,y4,text
其中,x1,y1为左上角坐标,x2,y2为右上角坐标,x3,y3为右下角坐标,x4,y4为左下角坐标。‘###’表示text难以辨认。
举例:
377,117,463,117,465,130,378,130,Genaxis Theatre
374,155,409,155,409,170,374,170,###
创建文件夹的准备工作同上。
转换代码如下,同样,将结果保存在VOC2007文件夹中(这里位于ICDAR2015/VOC2007)。
#! /usr/bin/python
#coding:utf-8
import os, sys
import glob
from PIL import Image
import cv2
import numpy as np
# target dir
base_dir = "ICDAR2015/VOC2007"
target_img_dir = base_dir + "/" + "JPEGImages/"
target_ann_dir = base_dir + "/" + "Annotations/"
target_set_dir = base_dir + "/" + "ImageSets/"
# source train dir
train_img_dir = "ch4_training_images/"
train_txt_dir = "ch4_training_localization_transcription_gt/"
test_img_dir = "ch4_test_images"
# prefixes prepended to the image names when they are moved to target_img_dir
train_prefix = "ICDAR2015_Train_"
test_prefix = "ICDAR2015_Test_"


def move_images(src_dir, prefix):
    """Move every file of src_dir into target_img_dir, prepending prefix."""
    for name in os.listdir(src_dir):
        os.rename(os.path.join(src_dir, name),
                  os.path.join(target_img_dir, prefix + name))


def gt_path_for(img_file):
    """Return the ground-truth path of a renamed training image file name.

    ``ICDAR2015_Train_img_12.jpg`` maps to ``gt_img_12.txt`` in train_txt_dir.
    """
    stem = os.path.splitext(img_file)[0]
    return train_txt_dir + 'gt_img_' + stem.split('_')[-1] + '.txt'


def parse_quad_line(line):
    """Parse ``x1,y1,x2,y2,x3,y3,x4,y4,text`` into a VOC box.

    Returns (xmin, ymin, xmax, ymax, difficult): the axis-aligned box that
    encloses the four corner points, with difficult set to 1 when the
    transcription contains '###' and 0 otherwise.  Returns None for lines
    with fewer than nine fields (e.g. blank lines); raises ValueError when a
    coordinate field is not an integer.
    """
    # maxsplit keeps commas inside the transcription from creating extra fields
    fields = line.strip().split(',', 8)
    if len(fields) < 9:
        return None
    coords = [int(value) for value in fields[:8]]
    xs = coords[0::2]
    ys = coords[1::2]
    difficult = 1 if '###' in fields[8] else 0
    return (min(xs), min(ys), max(xs), max(ys), difficult)


def read_boxes(gt_path):
    """Return the list of boxes stored in the ground-truth file gt_path."""
    import io  # local import: io.open takes an encoding on Python 2 and 3

    boxes = []
    # utf-8-sig drops the BOM that would otherwise be glued to the first
    # coordinate (the original stripped it with a Python 2-only filter()).
    with io.open(gt_path, encoding='utf-8-sig', errors='replace') as gt_file:
        for line in gt_file:
            box = parse_quad_line(line)
            if box is not None:
                boxes.append(box)
    return boxes


def voc_annotation_xml(filename, width, height, boxes):
    """Build the PASCAL VOC annotation XML text for one image.

    boxes is an iterable of (xmin, ymin, xmax, ymax, difficult) tuples.
    NOTE(review): the XML tags were missing from the published listing; the
    layout below is the standard VOC2007 one and the class label 'text' is
    an assumption - adjust it to the label your detector expects.  The
    rotated-rectangle values the original computed were never written in
    the visible code, so only the axis-aligned box is emitted.
    """
    parts = [
        '<annotation>\n',
        '    <folder>VOC2007</folder>\n',
        '    <filename>' + filename + '</filename>\n',
        '    <size>\n',
        '        <width>' + str(width) + '</width>\n',
        '        <height>' + str(height) + '</height>\n',
        '        <depth>3</depth>\n',
        '    </size>\n',
    ]
    # write the region of text on xml file
    for xmin, ymin, xmax, ymax, difficult in boxes:
        parts.extend([
            '    <object>\n',
            '        <name>text</name>\n',
            '        <pose>Unspecified</pose>\n',
            '        <truncated>0</truncated>\n',
            '        <difficult>' + str(difficult) + '</difficult>\n',
            '        <bndbox>\n',
            '            <xmin>' + str(xmin) + '</xmin>\n',
            '            <ymin>' + str(ymin) + '</ymin>\n',
            '            <xmax>' + str(xmax) + '</xmax>\n',
            '            <ymax>' + str(ymax) + '</ymax>\n',
            '        </bndbox>\n',
            '    </object>\n',
        ])
    parts.append('</annotation>\n')
    return ''.join(parts)


def write_annotations():
    """Create one xml file in target_ann_dir per training image."""
    for img_file in sorted(os.listdir(target_img_dir)):
        # Test images live in the same folder but have no ground truth;
        # without this check they would pick up the training file that
        # happens to share their index.
        if not img_file.startswith(train_prefix):
            continue
        with Image.open(target_img_dir + img_file) as im:
            imgwidth, imgheight = im.size
        boxes = read_boxes(gt_path_for(img_file))
        # write in xml file
        xml_path = target_ann_dir + os.path.splitext(img_file)[0] + '.xml'
        with open(xml_path, 'w') as xml_file:
            xml_file.write(voc_annotation_xml(img_file, imgwidth, imgheight,
                                              boxes))


def write_image_sets():
    """List every annotated image stem in Main/trainval.txt."""
    img_names = sorted(os.path.splitext(os.path.basename(item))[0]
                       for item in glob.glob(target_ann_dir + '*.xml'))
    with open(os.path.join(target_set_dir, "Main", "trainval.txt"), 'w') as train_fd:
        for item in img_names:
            train_fd.write(item + '\n')


def main():
    """Run the whole ICDAR2015 -> VOC2007 conversion."""
    # rename and move img to target_img_dir
    move_images(train_img_dir, train_prefix)
    move_images(test_img_dir, test_prefix)
    # create annotations to target_ann_dir
    write_annotations()
    # write info into target_set_dir
    write_image_sets()


if __name__ == "__main__":
    main()
3.ICDAR2017-Chinese
ICDAR2017-Chinese包含两个文件夹:
训练集:icdar2017rctw_train_v1.2
-icdar2017rctw_train_v1.2/train
训练图像文件和训练标注文件都放在了icdar2017rctw_train_v1.2/train文件夹中。
标注格式:x1,y1,x2,y2,x3,y3,x4,y4,difficult,text
其中,x1,y1为左上角坐标,x2,y2为右上角坐标,x3,y3为右下角坐标,x4,y4为左下角坐标。difficult为1表示text难以辨认,0表示容易辨认。
举例:390,902,1856,902,1856,1225,390,1225,0,"金氏眼镜"
测试集:icdar2017rctw_test
创建文件夹的准备工作同上。
转换代码如下,同样,将结果保存在VOC2007文件夹中(这里位于ICDAR2017/VOC2007)。
#! /usr/bin/python
#coding:utf-8
import os, sys
import glob
from PIL import Image
import cv2
import numpy as np
# target dir
base_dir = "ICDAR2017/VOC2007"
target_img_dir = base_dir + "/" + "JPEGImages/"
target_ann_dir = base_dir + "/" + "Annotations/"
target_set_dir = base_dir + "/" + "ImageSets/"
# source train dir
train_img_dir = "icdar2017rctw_train_v1.2/train/"
train_txt_dir = "icdar2017rctw_train_v1.2/train/"
# prefix prepended to the image names when they are moved to target_img_dir
train_prefix = "ICDAR2017_Train_"


def is_jpg(file_name):
    """Return True when file_name has the exact extension 'jpg'."""
    return file_name.split('.')[-1] == 'jpg'


def move_images(src_dir, prefix):
    """Move the jpg files of src_dir into target_img_dir, prepending prefix.

    The txt ground-truth files share the source folder with the images and
    are deliberately left in place: they are read from train_txt_dir later,
    so moving them (as the original loop did) makes them impossible to find.
    """
    for name in os.listdir(src_dir):
        if is_jpg(name):
            os.rename(os.path.join(src_dir, name),
                      os.path.join(target_img_dir, prefix + name))


def gt_path_for(img_file):
    """Return the ground-truth path of a renamed training image file name.

    ``ICDAR2017_Train_image_12.jpg`` maps to ``image_12.txt`` in train_txt_dir.
    """
    stem = os.path.splitext(img_file)[0]
    return train_txt_dir + 'image_' + stem.split('_')[-1] + '.txt'


def parse_quad_line(line):
    """Parse ``x1,y1,x2,y2,x3,y3,x4,y4,difficult,text`` into a VOC box.

    Returns (xmin, ymin, xmax, ymax, difficult): the axis-aligned box that
    encloses the four corner points plus the difficult flag as an int.
    Returns None for lines with fewer than nine fields (e.g. blank lines);
    raises ValueError when a numeric field is not an integer.
    """
    # maxsplit keeps commas inside the transcription from creating extra fields
    fields = line.strip().split(',', 9)
    if len(fields) < 9:
        return None
    coords = [int(value) for value in fields[:8]]
    xs = coords[0::2]
    ys = coords[1::2]
    return (min(xs), min(ys), max(xs), max(ys), int(fields[8]))


def read_boxes(gt_path):
    """Return the list of boxes stored in the ground-truth file gt_path."""
    import io  # local import: io.open takes an encoding on Python 2 and 3

    boxes = []
    # utf-8-sig drops a leading BOM if present; undecodable bytes can only
    # occur in the transcription, which is not used, so they are replaced.
    with io.open(gt_path, encoding='utf-8-sig', errors='replace') as gt_file:
        for line in gt_file:
            box = parse_quad_line(line)
            if box is not None:
                boxes.append(box)
    return boxes


def voc_annotation_xml(filename, width, height, boxes):
    """Build the PASCAL VOC annotation XML text for one image.

    boxes is an iterable of (xmin, ymin, xmax, ymax, difficult) tuples.
    NOTE(review): the XML tags were missing from the published listing; the
    layout below is the standard VOC2007 one and the class label 'text' is
    an assumption - adjust it to the label your detector expects.  The
    rotated-rectangle values the original computed were never written in
    the visible code, so only the axis-aligned box is emitted.
    """
    parts = [
        '<annotation>\n',
        '    <folder>VOC2007</folder>\n',
        '    <filename>' + filename + '</filename>\n',
        '    <size>\n',
        '        <width>' + str(width) + '</width>\n',
        '        <height>' + str(height) + '</height>\n',
        '        <depth>3</depth>\n',
        '    </size>\n',
    ]
    # write the region of text on xml file
    for xmin, ymin, xmax, ymax, difficult in boxes:
        parts.extend([
            '    <object>\n',
            '        <name>text</name>\n',
            '        <pose>Unspecified</pose>\n',
            '        <truncated>0</truncated>\n',
            '        <difficult>' + str(difficult) + '</difficult>\n',
            '        <bndbox>\n',
            '            <xmin>' + str(xmin) + '</xmin>\n',
            '            <ymin>' + str(ymin) + '</ymin>\n',
            '            <xmax>' + str(xmax) + '</xmax>\n',
            '            <ymax>' + str(ymax) + '</ymax>\n',
            '        </bndbox>\n',
            '    </object>\n',
        ])
    parts.append('</annotation>\n')
    return ''.join(parts)


def write_annotations():
    """Create one xml file in target_ann_dir per jpg in target_img_dir."""
    for img_file in sorted(os.listdir(target_img_dir)):
        if not is_jpg(img_file):
            continue
        print(img_file)
        with Image.open(target_img_dir + img_file) as im:
            imgwidth, imgheight = im.size
        boxes = read_boxes(gt_path_for(img_file))
        # write in xml file
        xml_path = target_ann_dir + os.path.splitext(img_file)[0] + '.xml'
        with open(xml_path, 'w') as xml_file:
            xml_file.write(voc_annotation_xml(img_file, imgwidth, imgheight,
                                              boxes))


def write_image_sets():
    """List every annotated image stem in Main/trainval.txt."""
    img_names = sorted(os.path.splitext(os.path.basename(item))[0]
                       for item in glob.glob(target_ann_dir + '*.xml'))
    with open(os.path.join(target_set_dir, "Main", "trainval.txt"), 'w') as train_fd:
        for item in img_names:
            train_fd.write(item + '\n')


def main():
    """Run the whole ICDAR2017 (RCTW) -> VOC2007 conversion."""
    # rename and move img to target_img_dir
    move_images(train_img_dir, train_prefix)
    # create annotations to target_ann_dir
    write_annotations()
    # write info into target_set_dir
    write_image_sets()


if __name__ == "__main__":
    main()